In [None]:
import re
from collections import Counter

import tensorflow as tf
import numpy as np
import keras
import hazm
from keras.models import load_model

# **Text Preprocessing and Data Preparation for Machine Learning Models Using Hazm**

This code preprocesses Persian textual data and prepares it for machine learning models, such as RNNs or LSTMs.

---

## **Code Overview**

The following steps are performed:

1. **Reading Data from a File**  
   The text is read from `data.txt` and stored in a variable.

2. **Removing Empty Lines**  
   Blank lines (each pair of consecutive newlines, `\n\n`) are removed from the text.

3. **Stemming and Lemmatization**  
   The `hazm` library is used to normalize the text:
   - **Stemming:** Reduces words to their root form.
   - **Lemmatization:** Converts words to their base dictionary form.

4. **Tokenization and POS Tagging**  
   - The text is tokenized into individual words.  
   - POS (Part-of-Speech) tagging is applied using the `pos_tagger.model`.  
   - Each word is tagged in the format `[word-POS_tag]`.

5. **Creating Unique Tokens and Counting Frequencies**  
   - A list of unique tokens is created.  
   - The frequency of each token in the text is calculated.

6. **Token-to-Index Conversion**  
   Tokens are replaced with their respective indices to prepare numerical input for models.

7. **Data Preparation for Model Training**  
   A sliding window of size 10 is applied:
   - `X_train` contains sequences of 10 consecutive words (as indices).  
   - `Y_train` contains the next word following each sequence.


In [2]:
# Read the whole corpus file into memory as one string.
# The encoding is passed explicitly so decoding does not depend on the platform
# default (which is not UTF-8 everywhere) -- assumes data.txt is UTF-8; confirm.
with open("data.txt", "r", encoding="utf-8") as file:
    contact = file.read()

In [3]:
# Sample Persian sentence. It goes through the same preprocessing cells as the
# corpus and is later converted to vocabulary indices and sliding windows that
# are fed to the model.
text="زن آه کشید و گفت: «همه چیز به این قضیه مربوط است!»"

In [4]:
# Delete every "\n\n" pair from both the corpus and the sample sentence.
# Nothing is inserted in its place, so the text on either side is joined directly.
contact, text = (raw.replace("\n\n", "") for raw in (contact, text))

In [5]:
# One hazm stemmer instance is shared by the corpus and the sample sentence.
# NOTE(review): each call receives the entire string rather than individual
# words; hazm documents Stemmer.stem as a per-word operation -- confirm this is
# intended before changing it, since the vocabulary is derived from this output.
stemmer = hazm.Stemmer()
contact, text = stemmer.stem(contact), stemmer.stem(text)

In [6]:
# One hazm lemmatizer instance is shared by the corpus and the sample sentence.
# NOTE(review): the whole string is passed in a single call; hazm documents
# Lemmatizer.lemmatize as a per-word operation -- confirm this is intended.
lemmatizer = hazm.Lemmatizer()
contact, text = lemmatizer.lemmatize(contact), lemmatizer.lemmatize(text)

In [7]:
def merge(arr):
    """Format each pair in ``arr`` as the string ``"[first-second]"``.

    Args:
        arr: Iterable of indexable items; only positions 0 and 1 of each item
            are read (the POS-tagging cell passes the tagger's output here).

    Returns:
        A new list with one ``"[{item[0]}-{item[1]}]"`` string per input item,
        in the same order.
    """
    return [f"[{pair[0]}-{pair[1]}]" for pair in arr]

In [8]:
# Tokenize the corpus and the sample sentence with hazm.word_tokenize; both
# names are rebound from a string to the tokenizer's output.
contact, text = map(hazm.word_tokenize, (contact, text))

In [9]:
# Load the POS tagger from pos_tagger.model, tag both token sequences and
# collapse every tagged pair into a single "[word-TAG]" string via merge().
spacy_posTagger = hazm.POSTagger(model='pos_tagger.model')
contact, text = (
    merge(spacy_posTagger.tag(tokens=words)) for words in (contact, text)
)

In [10]:
# Empty containers filled by the following cells:
#   token -- list of distinct tagged tokens (position == vocabulary index)
#   count -- dict mapping a tagged token to its number of occurrences in the corpus
token, count = [], {}

In [11]:
# Build the vocabulary: distinct tagged tokens in order of first appearance.
# dict.fromkeys keeps insertion order and de-duplicates in a single pass,
# replacing the quadratic "scan the list for every token" membership test.
token = list(dict.fromkeys(contact))

In [12]:
# Record how often each vocabulary entry occurs in the corpus.
# The corpus is counted once up front instead of re-scanning it with
# list.count() for every vocabulary entry.
frequencies = Counter(contact)
for value in token:
    # Counter yields 0 for a missing key, matching what list.count() would return.
    count[value] = frequencies[value]

In [13]:
# Replace every tagged token of the sample sentence by its vocabulary index.
# A dict lookup replaces the linear token.index() scan per token; setdefault
# keeps the first position of an entry, which is what token.index() returns.
token_to_index = {}
for position, entry in enumerate(token):
    token_to_index.setdefault(entry, position)

# Entries that are already ints are left untouched so that re-running this cell
# does not fail after `text` has been converted once.
unknown = [
    entry for entry in text
    if not isinstance(entry, int) and entry not in token_to_index
]
if unknown:
    # Same exception type that token.index() raised, but listing every offender.
    raise ValueError(f"tokens missing from the vocabulary: {unknown}")

text = [
    entry if isinstance(entry, int) else token_to_index[entry]
    for entry in text
]

In [15]:
# Start with empty lists for the input windows (X_train) and their targets (Y_train).
X_train, Y_train = [], []

In [16]:
# Slide a fixed-size window over the index sequence: every window is one input
# row and the index immediately after it is the matching target.
# Both lists are rebuilt from scratch, so re-running the cell never appends
# duplicate rows.
WINDOW_SIZE = 10  # number of token indices per input window
window_starts = range(len(text) - WINDOW_SIZE)  # empty when text is too short
X_train = [text[start:start + WINDOW_SIZE] for start in window_starts]
Y_train = [text[start + WINDOW_SIZE] for start in window_starts]

In [18]:
# Convert the window/target containers to NumPy arrays.
X_train, Y_train = np.array(X_train), np.array(Y_train)

In [None]:


# Load the Keras model stored in best_model.keras; the cells below call
# model.predict() on windows of token indices.
model = load_model("best_model.keras")

In [None]:
# Convert to NumPy arrays (repeats the conversion done in an earlier cell, so
# this cell also works if that one was skipped) and display the input shape.
X_train, Y_train = (np.array(values) for values in (X_train, Y_train))
X_train.shape

In [None]:
def arrtotext(arr):
    """Decode a sequence of vocabulary indices into one space-separated string.

    Args:
        arr: Iterable of values convertible with ``int()``; each is used as a
            position in the notebook-level ``token`` list.

    Returns:
        The looked-up entries, each preceded by a single space (so a non-empty
        result starts with a space); ``""`` for an empty input.
    """
    return "".join(" " + token[int(position)] for position in arr)

In [21]:
def clean_text(inputq, prediction=None):
    """Decode a window of indices plus the predicted next token into plain text.

    Args:
        inputq: Iterable of vocabulary indices, decoded with ``arrtotext``.
        prediction: Array whose ``np.argmax`` selects the entry of ``token`` to
            append. When omitted, the notebook-level variable ``out`` is used,
            which keeps existing single-argument calls working.

    Returns:
        The decoded string after removing every run of ASCII letters
        (presumably the POS-tag part of the ``[word-TAG]`` entries -- confirm)
        and every ``[``, ``]`` and ``-`` character.
    """
    if prediction is None:
        # Backward-compatible fallback to the global set by the prediction cells.
        prediction = out
    text = arrtotext(inputq) + " " + token[np.argmax(prediction)]
    text = re.sub(r'[a-zA-Z]+', '', text)
    text = re.sub(r'[\[\]-]', '', text)
    return text


In [None]:
# Seed text generation with the first window and run an initial prediction on
# it, passed as a batch containing that single window.
inputw = X_train[0]
out = model.predict(X_train[:1])

In [None]:
# Generate 100 tokens autoregressively: predict on the current window, print
# the decoded window plus prediction, then shift the window one step forward.
for step in range(100):
    inputq = np.array([inputw])  # wrap the window as a batch of one
    out = model.predict(inputq)
    print(clean_text(inputw))    # clean_text reads the global `out` set above
    # Drop the oldest index and append the index of the highest-scoring output.
    inputw = np.append(inputw[1:], np.argmax(out))