In [1]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# from ktext.preprocess import processor

In [2]:
# pip install ktext

In [3]:
df = pd.read_csv("eng_-french.csv")

In [4]:
df

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


In [5]:
df["English words/sentences count"]=df["English words/sentences"].str.split().str.len()

In [6]:
df["English words/sentences"].str.split().str.len()

0          1
1          1
2          1
3          1
4          1
          ..
175616    34
175617    34
175618    37
175619    43
175620    44
Name: English words/sentences, Length: 175621, dtype: int64

In [7]:
df["French words/sentences count"]=df["French words/sentences"].str.split().str.len()

In [8]:
df

Unnamed: 0,English words/sentences,French words/sentences,English words/sentences count,French words/sentences count
0,Hi.,Salut!,1,1
1,Run!,Cours !,1,2
2,Run!,Courez !,1,2
3,Who?,Qui ?,1,2
4,Wow!,Ça alors !,1,3
...,...,...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç...",34,47
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...,34,33
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...,37,47
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...,43,49


In [9]:
df=df[(df["English words/sentences count"]<6) & (df["French words/sentences count"]<6)]

In [10]:
df

Unnamed: 0,English words/sentences,French words/sentences,English words/sentences count,French words/sentences count
0,Hi.,Salut!,1,1
1,Run!,Cours !,1,2
2,Run!,Courez !,1,2
3,Who?,Qui ?,1,2
4,Wow!,Ça alors !,1,3
...,...,...,...,...
153098,Tom's great-great-grandfather was a pirate.,L'arrière-arrière-grand-père de Tom était pirate.,5,5
153107,"Unfortunately, the information is accurate.","Malheureusement, l'information est exacte.",5,4
154934,They're having a going-out-of-business sale.,Ils ont une liquidation.,5,4
154962,This theory is scientifically controversial.,Cette théorie est scientifiquement controversée.,5,5


In [11]:
df= df.drop_duplicates()

In [12]:
df

Unnamed: 0,English words/sentences,French words/sentences,English words/sentences count,French words/sentences count
0,Hi.,Salut!,1,1
1,Run!,Cours !,1,2
2,Run!,Courez !,1,2
3,Who?,Qui ?,1,2
4,Wow!,Ça alors !,1,3
...,...,...,...,...
153098,Tom's great-great-grandfather was a pirate.,L'arrière-arrière-grand-père de Tom était pirate.,5,5
153107,"Unfortunately, the information is accurate.","Malheureusement, l'information est exacte.",5,4
154934,They're having a going-out-of-business sale.,Ils ont une liquidation.,5,4
154962,This theory is scientifically controversial.,Cette théorie est scientifiquement controversée.,5,5


## Diving  Data  into Train Test validate:

In [13]:
from sklearn.model_selection import train_test_split

train,test = train_test_split(df,test_size=0.2)

In [14]:
train["English words/sentences count"].max()

5

In [15]:
train["French words/sentences count"].max()

5

In [16]:
train , valid  = train_test_split(train , test_size=0.2)

In [17]:
print(train.shape)
print(test.shape)
print(valid.shape)


(34695, 4)
(10843, 4)
(8674, 4)


## Word Frequency 

In [18]:
freq_x = train["English words/sentences"].str.split(expand=True).stack().value_counts().reset_index()
freq_y = train["French words/sentences"].str.split(expand=True).stack().value_counts().reset_index()

In [19]:
freq_x.to_csv("English_freq.csv",index=False)
freq_y.to_csv("French_freq.csv",index=False)

In [20]:
freq_x = pd.read_csv("English_freq.csv")
freq_y = pd.read_csv("French_freq.csv")

In [21]:
freq_x

Unnamed: 0,index,0
0,I,5981
1,a,3401
2,you,3379
3,is,2913
4,the,2303
...,...,...
10473,listens.,1
10474,jackpot.,1
10475,wrench.,1
10476,footsteps,1


In [22]:
freq_y

Unnamed: 0,index,0
0,?,5147
1,Je,4613
2,est,2314
3,Il,2295
4,pas,2171
...,...,...
16345,fonctionna,1
16346,toussota.,1
16347,machiniste.,1
16348,Compare,1


## Data Preparation

In [23]:
def get_data(raw_line):
    text=[]
    for raw_line in raw_line:
        text.append("<start> " + raw_line + " <end>")
    return text

In [24]:
english_train = get_data(list(train["English words/sentences"]))
english_train


['<start> Nothing will stop his going. <end>',
 "<start> It's right. <end>",
 '<start> I joined the army. <end>',
 '<start> I still like Tom. <end>',
 '<start> The cup has a crack. <end>',
 '<start> Tom looks great. <end>',
 '<start> Are you in danger? <end>',
 '<start> I am afraid of dying. <end>',
 '<start> They feared you. <end>',
 '<start> This is the best method. <end>',
 '<start> They never gave up. <end>',
 '<start> I thought I was happy. <end>',
 "<start> I'm not kidding. <end>",
 '<start> How did you do that? <end>',
 "<start> I hadn't thought of that. <end>",
 '<start> Shake my hand. <end>',
 '<start> There were no consequences. <end>',
 '<start> What are you waiting for? <end>',
 '<start> I like to draw pictures. <end>',
 "<start> They're going to try. <end>",
 "<start> That's OK. <end>",
 "<start> We've had a complaint. <end>",
 '<start> Tie your shoelaces. <end>',
 '<start> This is sweet. <end>',
 "<start> You're very courageous. <end>",
 '<start> I want to confess. <end>'

In [25]:
frence_train = get_data(list(train["French words/sentences"]))
frence_train

["<start> Rien ne l'arrêtera. <end>",
 "<start> C'est juste. <end>",
 "<start> J'ai rejoint l'armée. <end>",
 "<start> J'aime toujours Tom. <end>",
 '<start> La tasse a une fêlure. <end>',
 '<start> Tom est superbe. <end>',
 '<start> Es-tu en danger\xa0? <end>',
 "<start> J'ai peur de mourir. <end>",
 '<start> Elles te craignaient. <end>',
 "<start> C'est la meilleur méthode. <end>",
 "<start> Ils n'ont jamais abandonné. <end>",
 "<start> Je pensais que j'étais heureuse. <end>",
 '<start> Je ne blague pas. <end>',
 '<start> Comment as-tu fait cela\u202f? <end>',
 "<start> Je n'y avais pas songé. <end>",
 '<start> Serre-moi la main. <end>',
 "<start> Il n'y eut aucune conséquence. <end>",
 "<start> Qu'attends-tu ? <end>",
 "<start> J'aime dessiner. <end>",
 '<start> Elles vont essayer. <end>',
 '<start> Pas de problème. <end>',
 '<start> Nous avons eu une plainte. <end>',
 '<start> Attache tes lacets. <end>',
 "<start> C'est gentil. <end>",
 '<start> Vous êtes fort courageuse. <end>',
 

In [26]:
english_valid = get_data(list(valid["English words/sentences"]))
english_valid

['<start> Watch out for that man. <end>',
 '<start> This is unacceptable. <end>',
 '<start> She was crying with pain. <end>',
 '<start> They were tired of waiting. <end>',
 "<start> We aren't always right. <end>",
 '<start> You have my respect. <end>',
 '<start> He picked up a stone. <end>',
 '<start> Do you like to sing? <end>',
 "<start> It wasn't enough. <end>",
 "<start> Let's establish some ground rules. <end>",
 '<start> I take it back. <end>',
 '<start> He loves toys. <end>',
 '<start> Tom found your cap. <end>',
 '<start> Give me an example. <end>',
 '<start> Tom is making a mistake. <end>',
 "<start> It was wonderful, wasn't it? <end>",
 '<start> Everything is ready here. <end>',
 '<start> Is that bothering you? <end>',
 '<start> Do you have enough time? <end>',
 '<start> Everyone waited. <end>',
 '<start> I took a bath. <end>',
 '<start> This dictionary is mine. <end>',
 '<start> The bridge is closed. <end>',
 '<start> This is inefficient. <end>',
 '<start> My family is small

In [27]:
frence_valid = get_data(list(valid["French words/sentences"]))
frence_valid

['<start> Occupe-toi de cet homme-là. <end>',
 "<start> C'est inacceptable. <end>",
 '<start> Elle pleurait de douleur. <end>',
 "<start> Elles étaient fatiguées d'attendre. <end>",
 "<start> Nous n'avons pas toujours raison. <end>",
 '<start> Vous avez mon respect. <end>',
 '<start> Il ramassa une pierre. <end>',
 '<start> Vous aimez chanter ? <end>',
 "<start> Ce n'était pas suffisant. <end>",
 '<start> Établissons quelques règles de base. <end>',
 '<start> Je le reprends. <end>',
 '<start> Il adore les jouets. <end>',
 '<start> Tom a retrouvé ta casquette. <end>',
 '<start> Donnez-moi un exemple. <end>',
 '<start> Tom fait une erreur. <end>',
 "<start> C'était merveilleux, n'est-ce pas\xa0? <end>",
 '<start> Tout est prêt ici. <end>',
 '<start> Cela te tracasse-t-il ? <end>',
 '<start> As-tu assez de temps ? <end>',
 '<start> Tout le monde a attendu. <end>',
 "<start> J'ai pris un bain. <end>",
 '<start> Ce dictionnaire est à moi. <end>',
 '<start> Le pont est fermé. <end>',
 "<star

##  Text Preprocessing 

In [28]:
# Tokenizating and padding input language
[]
fre_token = Tokenizer(filters="",lower=False)
fre_token.fit_on_texts(frence_train)

fre_tokenized = fre_token.texts_to_sequences(frence_train)
fre_padded = pad_sequences(fre_tokenized,padding="post")


# Tokenizating and padding output language:
eng_token = Tokenizer(filters="",lower=False)
eng_token.fit_on_texts(english_train)

eng_tokenized = eng_token.texts_to_sequences(english_train)
eng_padded = pad_sequences(eng_tokenized,padding="post")

In [29]:
# number of a  unique tokens   in input  adn output language:
num_op_token  = len(fre_token.word_index)
num_in_token = len(eng_token.word_index)


#maximum length of a sentence  inn both language:
max_len_op = fre_padded.shape[1]  # french
max_len_ip = eng_padded.shape[1]  #enlgish

In [30]:
frence_train[0]

"<start> Rien ne l'arrêtera. <end>"

In [31]:
num_op_token

17268

In [32]:
num_in_token

10480

In [33]:
max_len_ip

7

In [34]:
max_len_op

7

In [35]:
fre_padded[0]

array([   1,  396,   18, 8053,    2,    0,    0])

In [36]:
frence_train[0]

"<start> Rien ne l'arrêtera. <end>"

In [37]:
english_train[0]

'<start> Nothing will stop his going. <end>'

In [38]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Define the parameters for TextVectorization
max_words = 11148
max_sequence_length = 7

# Initialize TextVectorization layer
english_pp = TextVectorization(max_tokens=max_words, output_sequence_length=max_sequence_length)

# Fit the TextVectorization layer on your training data
english_pp.adapt(english_train)

# Transform your training data using the fitted TextVectorization layer
english_train_vecs = english_pp(english_train)

In [39]:
english_train_vecs

<tf.Tensor: shape=(34695, 7), dtype=int64, numpy=
array([[   2,  200,   53, ...,   48,   79,    3],
       [   2,   26,   82, ...,    0,    0,    0],
       [   2,    4, 1652, ..., 2526,    3,    0],
       ...,
       [   2,   16,  304, ...,    5,    3,    0],
       [   2,   19,    6, ...,    3,    0,    0],
       [   2,   15,   17, ...,    6, 1091,    3]], dtype=int64)>

In [40]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Define the parameters for TextVectorization
max_words = 18825
max_sequence_length = 7

# Initialize TextVectorization layer with append_indicator and padding options
french_pp = TextVectorization(
    max_tokens=max_words,
    output_sequence_length=max_sequence_length,
    output_mode='int',  # You may adjust the output_mode as needed
    pad_to_max_tokens=True,  # You may adjust the pad_value as needed
)

# Fit the TextVectorization layer on your training data
french_pp.adapt(frence_train)

# Transform your training data using the fitted TextVectorization layer
french_train_vec = french_pp(frence_train)

In [41]:
french_train_vec

<tf.Tensor: shape=(34695, 7), dtype=int64, numpy=
array([[   2,   76,   15, ...,    3,    0,    0],
       [   2,   12,  132, ...,    0,    0,    0],
       [   2,   17, 1846, ...,    3,    0,    0],
       ...,
       [   2,  183,  113, ...,   13,    6,    3],
       [   2,  264,   19, ...,    3,    0,    0],
       [   2,   68,   16, ..., 1204,    3,    0]], dtype=int64)>

In [51]:
import dill as dpickle
import pickle

In [43]:
# pip install dill

In [53]:
# Define a filename for your model
model_filename = "english_pp.dpkl.pkl"

# Use pickle to save the model to a file
with open(model_filename, 'wb') as file:
    pickle.dump(english_pp, file)


InvalidArgumentError: Cannot convert a Tensor of dtype resource to a NumPy array.

In [None]:
    
# Define a filename for your model
model_filename = "french_pp.dpkl"

# Use pickle to save the model to a file
with open(model_filename, 'wb') as file:
    pickle.dump(french_pp, file)

In [48]:
# # save the preprossec:
with open("english_pp.dpkl","wb") 
    dpickle.dump(english_pp,f)
    
with open("french_pp.dpkl","wb") 
    dpickle.dump(french_pp,f)
    
    
#save the processed data:
np.save("French_train_vecs.npy",french_train_vec)
np.save("English_train_vecs.npy", english_train_vecs)

In [50]:
def load_decoder_inputs(decoder_np_vecs="French_train_vecs.npy"):
    vectorized_title = np.load(decoder_np_vecs)
    
    decoder_input_data = vectorized_title[:,:-1]
    
    decoder_target_data = vectorized_title[:,1:]
    
    print(f'Shape of decoder input:{decoder_input_data.shape}')
    print(f'Shape of the decoder target: {decoder_target_data}')
    return decoder_input_data , decoder_target_data
          
def load_encoder_input(encoder_np_vecs = "English_train_vecs.npy"):
    vectorized_body = np.load(encoder_np_vecs)
          
    encoder_input_data = vectorized_body
    doc_length = encoder_input_data.shape[1]
    print(f"Shape of the encoder input: {encoder_input_data.shape}")
    return encoder_input_data , doc_length

In [None]:
def load_text_processor()