In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import collections
import numpy as np
import pandas as pd
import keras
import tensorflow as tf


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras import Sequential

In [None]:
# Load the English/French sentence pairs and show the last rows as a sanity check.
DATASET_CSV = "../input/language-translation-englishfrench/eng_-french.csv"
data = pd.read_csv(DATASET_CSV)
data.tail()

In [None]:
# Pull the two text columns out of the frame: source (English) and target (French).
english, french = (
    data["English words/sentences"],
    data["French words/sentences"],
)

In [None]:
from nltk.tokenize import RegexpTokenizer

# Keep only runs of word characters, which drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')

# Normalise every English sentence in one pass and rebind `english` to the
# result. The previous version assigned `english[i] = ...` inside a loop over
# `english` itself, which wrote element by element into a Series taken from
# `data` (chained assignment) and relied on the positional counter matching
# the index labels.
# Each word is lower-cased and followed by a single space, so a cleaned
# sentence keeps the trailing space the loop version produced.
english = english.map(
    lambda text: "".join(word.lower() + " " for word in tokenizer.tokenize(text))
)

In [None]:
from nltk.tokenize import RegexpTokenizer

# Keep only runs of word characters, which drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')

# Normalise every French sentence in one pass and rebind `french` to the
# result. The previous version assigned `french[i] = ...` inside a loop over
# `french` itself, which wrote element by element into a Series taken from
# `data` (chained assignment) and relied on the positional counter matching
# the index labels.
# Each word is lower-cased and followed by a single space, so a cleaned
# sentence keeps the trailing space the loop version produced.
french = french.map(
    lambda text: "".join(word.lower() + " " for word in tokenizer.tokenize(text))
)

In [None]:
# Show the first ten cleaned sentences of each language.
print(english[0:10], french[0:10], sep="\n")

In [None]:
# Range of sentence pairs printed below as a spot check.
n1 = 9900
n2 = 10000

# Upper bound on the number of sentence pairs kept for training.
MAX_SENTENCES = 175000

# Convert the cleaned Series to NumPy string arrays and keep the first
# MAX_SENTENCES pairs. The earlier version copied the sentences with
# `for data in english: ...`, which rebound the name `data` (the DataFrame
# loaded from the CSV) to a plain string and broke any re-run of the cell
# that reads columns from `data`.
eng = np.asarray(english.tolist())[0:MAX_SENTENCES]
fre = np.asarray(french.tolist())[0:MAX_SENTENCES]

for i in range(n1, n2):
  print(eng[i] + "\t->\t" + fre[i] + "\n")

In [None]:
# Report how many sentences each array holds.
english_shape_text = str(eng.shape)
french_shape_text = str(fre.shape)
print("English Shape is " + english_shape_text)
print("French Shape is " + french_shape_text)

In [None]:
# Count word frequencies per language.
# `split()` (no argument) is used on purpose: splitting on a literal " " turns
# the trailing space of each cleaned sentence into an empty-string token, which
# was then counted as a "word" once per sentence and distorted both the
# unique-word count and the most-common list.
english_word_counter = collections.Counter(word for sentence in eng for word in sentence.split())
french_word_counter = collections.Counter(word for sentence in fre for word in sentence.split())

# Total word occurrences are the sum of the per-word counts.
print('{} English words.'.format(sum(english_word_counter.values())))
print('{} French words.'.format(sum(french_word_counter.values())))
print("\n")
print('{} unique English words.'.format(len(english_word_counter)))
print('{} unique French words.'.format(len(french_word_counter)))
print("\n")
# Building the word list with a comprehension also works when a counter is
# empty, where indexing the result of zip(*...) would raise IndexError.
print('10 Most common words in the English dataset:')
print('"' + '" "'.join([word for word, _ in english_word_counter.most_common(10)]) + '"')
print("\n")
print('10 Most common words in the French dataset:')
print('"' + '" "'.join([word for word, _ in french_word_counter.most_common(10)]) + '"')

In [None]:
def tokenize(x):
  """Fit a word-level Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

  Args:
    x: iterable of sentences (strings) used both to build the vocabulary
      and as the texts to encode.

  Returns:
    A ``(sequences, tokenizer)`` tuple: the integer-encoded sentences and
    the fitted tokenizer, whose out-of-vocabulary token is a single space.
  """
  fitted = Tokenizer(oov_token=" ", char_level=False)
  fitted.fit_on_texts(x)
  sequences = fitted.texts_to_sequences(x)
  return sequences, fitted

# Try `tokenize` on three hand-written sentences and show the encoded
# sequences together with the learned word -> id mapping.
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .',
]

text, tokenizer = tokenize(text_sentences)
print(text, tokenizer.word_index, sep="\n")

In [None]:
def pad(x, length=None):
  """Pad every sequence in ``x`` at the end so all share one length.

  Args:
    x: list of integer sequences (e.g. the output of ``tokenize``).
    length: target length; when ``None`` the length of the longest
      sequence in ``x`` is used.

  Returns:
    The result of ``pad_sequences(x, maxlen=length, padding="post")``.
  """
  # Identity test, not `== None`: equality can be overloaded (NumPy values
  # compare element-wise), `is None` cannot.
  if length is None:
    length = max(len(sentence) for sentence in x)
  return pad_sequences(x, maxlen=length, padding="post")

# Pad the demo sequences and print each one before and after padding.
test_pad = pad(text)
for seq_no, (token_sent, pad_sent) in enumerate(zip(text, test_pad), start=1):
    print('Sequence {} in x'.format(seq_no))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

In [None]:
def preprocess(x,y):
  """Tokenize and pad the source (``x``) and target (``y``) sentences.

  Each side gets its own tokenizer and is padded to its own longest
  sentence. The padded targets receive one extra trailing axis of size 1.

  Returns:
    ``(padded_x, padded_y, x_tokenizer, y_tokenizer)``.
  """
  x_sequences, x_tk = tokenize(x)
  y_sequences, y_tk = tokenize(y)

  padded_x = pad(x_sequences)
  padded_y = pad(y_sequences)

  # Append a length-1 axis to the targets.
  target_shape = padded_y.shape + (1,)
  padded_y = padded_y.reshape(target_shape)

  return padded_x, padded_y, x_tk, y_tk

In [None]:
# Encode the whole corpus and record the sizes needed to build the model.
pre_eng, pre_fre, eng_tk, fre_tk = preprocess(eng, fre)

# Axis 1 of each padded array is the sequence length.
max_eng_seq_len, max_fr_seq_len = pre_eng.shape[1], pre_fre.shape[1]

# Vocabulary size = number of entries in each tokenizer's word index.
english_vocab_size, french_vocab_size = (
    len(tk.word_index) for tk in (eng_tk, fre_tk)
)

print('Data Preprocessed')
for label, value in (
    ("Max English sentence length:", max_eng_seq_len),
    ("Max French sentence length:", max_fr_seq_len),
    ("English vocabulary size:", english_vocab_size),
    ("French vocabulary size:", french_vocab_size),
):
    print(label, value)

In [None]:
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile an embedding + bidirectional-GRU translation model.

    Layers, in order: Embedding (128 dims) -> Bidirectional GRU (128 units,
    returning the full sequence) -> Dropout (0.25) -> per-timestep softmax
    over the French vocabulary. The model summary is printed as a side effect.

    Args:
        input_shape: shape of the padded input array; only ``input_shape[1]``
            (the sequence length) is read.
        output_sequence_length: accepted for interface compatibility; not
            used by this architecture.
        english_vocab_size: number of English word ids, not counting the
            padding id 0.
        french_vocab_size: number of French word ids, not counting the
            padding id 0.

    Returns:
        The compiled model (sparse categorical cross-entropy, Adam with
        learning rate 1e-3, accuracy metric).
    """
    learning_rate = 1e-3
    model = keras.Sequential([
        # +1 because id 0 is the padding value and word ids start at 1.
        Embedding(english_vocab_size + 1, 128, input_length=input_shape[1]),
        Bidirectional(GRU(128, return_sequences=True)),
        # Use the `keras` Dropout so every layer comes from the same package
        # as `keras.Sequential` (this was the only `tf.keras` layer).
        keras.layers.Dropout(0.25),
        # The labels range over 0..french_vocab_size inclusive (0 = padding),
        # so the softmax needs french_vocab_size + 1 units; with only
        # french_vocab_size units the highest word id is out of range for
        # the sparse cross-entropy loss.
        TimeDistributed(Dense(french_vocab_size + 1, activation='softmax'))
    ])
    model.summary()

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])

    return model

# Re-pad the English input to the French sequence length, then build, train
# and save the model.
tmp_x = pad(pre_eng, max_fr_seq_len)
rnn_model = embed_model(
    input_shape=tmp_x.shape,
    output_sequence_length=max_fr_seq_len,
    english_vocab_size=english_vocab_size,
    french_vocab_size=french_vocab_size,
)
rnn_model.fit(
    x=tmp_x,
    y=pre_fre,
    batch_size=1024,
    epochs=50,
    validation_split=0.2,
)
rnn_model.save_weights("rnn_model_weights.h5")
# Alternative to training: restore previously saved weights.
#rnn_model.load_weights("/content/rnn_model_weights.h5")

In [None]:
def logits_to_text(logits, tokenizer):
  """Decode per-timestep scores into a space-separated string of words.

  Args:
    logits: 2-D array-like of scores, one row per timestep and one column
      per token id.
    tokenizer: object exposing a ``word_index`` mapping of word -> id.

  Returns:
    The highest-scoring word of every row joined by single spaces; id 0 is
    rendered as ``'<PAD>'``.
  """
  vocabulary = tokenizer.word_index
  # Invert word -> id into id -> word, then register the padding id.
  lookup = dict(zip(vocabulary.values(), vocabulary.keys()))
  lookup[0] = '<PAD>'

  best_ids = np.argmax(logits, axis=1)
  return ' '.join(lookup[token_id] for token_id in best_ids)

In [None]:
# Translate five consecutive training sentences with a single batched call.
# `predict` expects a batch of sequences; the earlier code passed one 1-D row
# (`tmp_x[1000]`) and re-ran `predict` on every loop iteration, so each
# printed line was not the translation of a whole sentence.
batch_start = 1000
predictions = rnn_model.predict(tmp_x[batch_start:batch_start + 5])
for i in range(5):
  print(logits_to_text(predictions[i], fre_tk))
print(tmp_x[10000])

i = 10000
print(eng[i] + "\t->\t" + fre[i] + "\n")


#[24 ,80 ,159 ,0 ,  0 ,  0,   0,   0,   0,   0,   0]

In [None]:
sentence = "is"
sentence = sentence.lower()

# Encode with the fitted English tokenizer instead of indexing `word_index`
# directly, so a word missing from the vocabulary is encoded as the
# tokenizer's out-of-vocabulary token rather than raising KeyError.
encoded = eng_tk.texts_to_sequences([sentence])

# Pad to the sequence length the model was trained on (second axis of
# `tmp_x`) instead of a hard-coded 11. The result is a batch of one sentence.
# Sentences longer than that length are truncated by `pad_sequences`.
num_arr = pad(encoded, tmp_x.shape[1])

# One forward pass; [0] drops the batch axis and leaves one row of scores
# per timestep.
prediction = rnn_model.predict(num_arr)[0]

# Decode every timestep separately so the words can be inspected one by one.
arra = [logits_to_text(prediction[step:step + 1], fre_tk)
        for step in range(len(prediction))]

# Keep words up to the first padding token; each word is followed by a space.
sent = ""
for word in arra:
  if word == "<PAD>":
    break
  sent = sent + word + " "

print(sent)