In [32]:
import re
from collections import Counter

import hazm
import keras
import numpy as np
import tensorflow as tf
from keras.models import load_model

# **Text Preprocessing and Data Preparation for Machine Learning Models Using Hazm**

This code preprocesses Persian textual data and prepares it for machine learning models, such as RNNs or LSTMs.

---

## **Code Overview**

The following steps are performed:

1. **Reading Data from a File**  
   The text is read from `data.txt` and stored in a variable.

2. **Removing Empty Lines**  
   Extra empty lines (`\n\n`) are removed from the text.

3. **Stemming and Lemmatization**  
   The `hazm` library is used to normalize the text:
   - **Stemming:** Reduces words to their root form.
   - **Lemmatization:** Converts words to their base dictionary form.

4. **Tokenization and POS Tagging**  
   - The text is tokenized into individual words.  
   - POS (Part-of-Speech) tagging is applied using the `pos_tagger.model`.  
   - Each word is tagged in the format `[word-POS_tag]`.

5. **Creating Unique Tokens and Counting Frequencies**  
   - A list of unique tokens is created.  
   - The frequency of each token in the text is calculated.

6. **Token-to-Index Conversion**  
   Each token of the seed sentence (`text`) is replaced with its index in the corpus vocabulary (`token`) to produce numerical input for the model.

7. **Data Preparation for Model Training**  
   A sliding window of size 10 is applied to the index-encoded seed sentence (`text`):
   - `X_train` contains sequences of 10 consecutive words (as indices).  
   - `Y_train` contains the next word following each sequence.


In [11]:
# Read the whole corpus file into memory as a single string.
# The notebook processes Persian text, so the encoding is pinned instead of
# relying on the platform default (which is not UTF-8 everywhere).
# NOTE(review): assumes data.txt is UTF-8 encoded — confirm against the file.
with open("data.txt", "r", encoding="utf-8") as file:
    contact = file.read()

In [12]:
# Seed sentence (Persian). It goes through the same preprocessing cells as the
# corpus and later supplies the first input window for text generation.
text="زن آه کشید و گفت: «همه چیز به این قضیه مربوط است!»"

In [13]:
# Drop every "\n\n" separator from the corpus and from the seed sentence.
# NOTE(review): the separator is replaced by the empty string, so the text on
# either side of it is joined without a space — confirm this is intended.
contact, text = (raw.replace("\n\n", "") for raw in (contact, text))

In [14]:
# Pass the corpus and the seed sentence through hazm's stemmer.
# NOTE(review): each call receives the entire string rather than one word at a
# time — presumably Stemmer.stem expects a single word; verify against the hazm
# docs. Changing this would alter the vocabulary, so check it against the
# preprocessing the saved model was trained with first.
stemmer = hazm.Stemmer()
contact, text = stemmer.stem(contact), stemmer.stem(text)

In [15]:
# Pass the corpus and the seed sentence through hazm's lemmatizer.
# NOTE(review): as with the stemmer, the whole string is handed over in one
# call — presumably Lemmatizer.lemmatize expects a single word; verify against
# the hazm docs and the training-time preprocessing before changing it.
lemmatizer = hazm.Lemmatizer()
contact, text = lemmatizer.lemmatize(contact), lemmatizer.lemmatize(text)

In [16]:
def merge(arr):
    """Format (word, tag) pairs as "[word-tag]" strings.

    Args:
        arr: Iterable of indexable pairs; item 0 is used as the word and
            item 1 as its tag. Any further items are ignored.

    Returns:
        A new list with one "[word-tag]" string per input pair, in input order.
    """
    return [f"[{pair[0]}-{pair[1]}]" for pair in arr]

In [17]:
# Split both strings into word tokens with hazm; from this cell on, `contact`
# and `text` hold token sequences instead of raw strings.
contact, text = hazm.word_tokenize(contact), hazm.word_tokenize(text)

In [18]:
# Tag every token using the model file 'pos_tagger.model', then collapse each
# (word, tag) pair into one "[word-tag]" string via merge().
spacy_posTagger = hazm.POSTagger(model='pos_tagger.model')
contact, text = (
    merge(spacy_posTagger.tag(tokens = sequence)) for sequence in (contact, text)
)

In [19]:
# token: vocabulary list, filled with the distinct tagged tokens of the corpus.
# count: maps a vocabulary entry to its number of occurrences in the corpus.
token, count = [], {}

In [20]:
# Append every tagged corpus token that is not yet in the vocabulary, in order
# of first appearance; a token's position in `token` is later used as its id.
# A set mirrors the list for membership checks, because testing
# `value in token` on the list rescans the vocabulary for every corpus token.
seen = set(token)
for value in contact:
    if value not in seen:
        seen.add(value)
        token.append(value)

In [21]:
# Record how often each vocabulary entry occurs in the corpus.
# A single Counter pass over the corpus replaces one full
# `contact.count(...)` scan per vocabulary entry.
occurrences = Counter(contact)
for value in token:
    # Counter returns 0 for a missing key, matching list.count().
    count[value] = occurrences[value]

In [22]:
# Replace each tagged token of the seed sentence with its position in `token`.
# setdefault keeps the FIRST position of an entry, matching list.index().
token_to_index = {}
for position, entry in enumerate(token):
    token_to_index.setdefault(entry, position)

# Encode into a separate list first, so that an unknown token leaves `text`
# untouched instead of half strings / half ids.
encoded = []
for value in text:
    if value not in token_to_index:
        raise ValueError(
            f"seed token {value!r} does not occur in the corpus vocabulary"
        )
    encoded.append(token_to_index[value])

# Slice assignment updates the existing list object in place.
text[:] = encoded

In [23]:
# X_train collects the input windows, Y_train the id that follows each window.
X_train, Y_train = [], []

In [24]:
# Slide a fixed-size window over the encoded seed sentence: each window becomes
# one input row and the id immediately after it becomes that row's target.
# NOTE(review): presumably WINDOW_SIZE must equal the sequence length the saved
# model expects — verify before changing it.
WINDOW_SIZE = 10  # number of ids per input window

# Only starts that leave room for a target are visited; when the sentence has
# WINDOW_SIZE ids or fewer the range is empty and nothing is appended.
for start in range(len(text) - WINDOW_SIZE):
    X_train.append(text[start:start + WINDOW_SIZE])
    Y_train.append(text[start + WINDOW_SIZE])

In [25]:
# Convert the collected windows and targets into NumPy arrays.
X_train, Y_train = np.array(X_train), np.array(Y_train)

# **Text Generation Using Trained LSTM Model**

This code uses a trained LSTM model to generate text. The model predicts the next words based on previously learned data.

---

## **Code Overview**

### **Code Functionality:**
1. **Loading the Model:** The saved model `best_model.keras` is loaded.  
2. **Array to Text Conversion:** The function `arr_to_text` converts an array of tokens into text.  
3. **Text Cleaning:** The function `clean_text` cleans the generated text by removing Latin letters (such as the POS tags), square brackets, and hyphens left over from the `[word-POS_tag]` token format.  
4. **Generating Next Words:**
   - The model predicts the next word and updates the sequence accordingly.  
   - The predicted word is appended to the sequence, and the first word is removed.  
5. **Output Display:** The generated text is printed at each step.


In [33]:
# Load the saved Keras model from the file "best_model.keras".
model = load_model("best_model.keras")

In [34]:
def arr_to_text(sequence):
    """Turn a sequence of vocabulary ids into one space-separated string.

    Each id is cast with int() and looked up in the module-level `token` list.
    Every entry is preceded by a single space, so a non-empty result starts
    with a space; an empty sequence yields "".
    """
    return "".join(" " + token[int(token_index)] for token_index in sequence)

In [35]:
def clean_text(sequence, output):
    """Render the current window plus the predicted token as display text.

    Args:
        sequence: Vocabulary ids of the current input window.
        output: Model scores; the position of the largest value (np.argmax
            over the flattened array) is used as the predicted vocabulary id.

    Returns:
        The window text followed by the predicted token, with runs of ASCII
        letters, square brackets and hyphens removed.
    """
    window_text = arr_to_text(sequence)
    predicted_token = token[np.argmax(output)]
    raw_text = window_text + " " + predicted_token
    # First strip ASCII letters, then the "[", "]" and "-" characters.
    without_letters = re.sub(r'[a-zA-Z]+', '', raw_text)
    return re.sub(r'[\[\]-]', '', without_letters)


In [36]:
# Start generation from the first window built from the seed sentence.
# NOTE(review): X_train is empty when the seed sentence yields 10 tokens or
# fewer, in which case this indexing presumably fails — confirm.
current_sequence = X_train[0]

In [None]:
# Generate 100 steps: predict the next id, print the current window together
# with the prediction, then slide the window forward by one id.
for _ in range(100):
    # Wrap the window in an outer array so the model receives a batch of one.
    input_sequence = np.array([current_sequence])
    # verbose=0 stops Keras from printing a progress bar on every iteration,
    # which would otherwise bury the generated text in the cell output.
    predicted_output = model.predict(input_sequence, verbose=0)
    print(clean_text(current_sequence, predicted_output))

    # Update the sequence for the next prediction: drop the oldest id and
    # append the highest-scoring predicted id.
    next_index = np.argmax(predicted_output)
    current_sequence = np.append(current_sequence[1:], next_index)