In [1]:
from numpy import array
import pandas as pd
import re
import string
from keras.preprocessing.text import Tokenizer
from numpy import argmax
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from nltk.translate.bleu_score import corpus_bleu
import json
import pickle

def createParallelCorpra(sourceFile,targetFile,masterFile):
    fileEn = open(sourceFile,mode = 'r',encoding='utf8') 
    linesEn = fileEn.readlines()
    
    fileSh = open(targetFile,mode = 'r',encoding ='utf8')
    linesSh = fileSh.readlines()
    
    temp_df = pd.DataFrame(columns = ['English', 'Shakespeare']) 
    
    temp_df['English'] = [cleaning_spliting(line) for line in linesEn]
    temp_df['Shakespeare'] = [re.sub(r'[^a-z ]',r'',line.lower()).split() for line in linesSh]
    
    master_df = pd.read_json(masterFile)
    master_df = master_df.append(temp_df,ignore_index=True)
    master_df.to_json(masterFile)
    master_df.head()
    
def cleaning_spliting(sentence):
    return re.sub(r'[^a-z ]',r'',sentence.lower()).split()

def create_tokenizer(lines,lang_name):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    with open(lang_name+'_tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return tokenizer

def get_saved_tokenizer(lang_name):
    with open(lang_name+'_tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    return tokenizer

def max_length(lines):
    return max(len(line) for line in lines)

def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist,dtype='np.int16')
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

# generate target given source sequence
def predict_sequence(model, target_tokenizer, encoded_source):
    prediction = model.predict(encoded_source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, target_tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)
    
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model
    
def get_language_info(lang_name):
    with open(lang_name+'Info.json', 'r') as fp:
        lang_info = json.load(fp)
    return lang_info
    
    
# evaluate the skill of the model
def evaluate_model(model, target_tokenizer, encoded_sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(encoded_sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, target_tokenizer, source)
        raw_src = raw_dataset.iloc[i,0]
        raw_target = raw_dataset.iloc[i,1]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append([raw_target])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

    

def train_model(corpusFileName='parallelCorpus',source_language='english',target_language='shakespearen'): 
    
    df = pd.read_json(corpusFileName+'.json')
    
    source_tokenizer = create_tokenizer(df.iloc[:, 0],source_language)
    source_vocab_size = len(source_tokenizer.word_index) + 1
    source_length = max_length(df.iloc[:, 0])
    
    sourceInfo = {'vocab_size':source_vocab_size,
                  'sentence_length':source_length
    }
    
    with open(source_language+'Info.json', 'w') as fs:
        json.dump(sourceInfo, fs)
        
    print(source_language+' Vocabulary Size: %d' % source_vocab_size)
    print(source_language+' Max sentence Length: %d' % (source_length))

    # prepare target tokenizer
    target_tokenizer = create_tokenizer(df.iloc[:, 1],target_language)
    target_vocab_size = len(target_tokenizer.word_index) + 1
    target_length = max_length(df.iloc[:, 1])
    
    print(target_language+' Vocabulary Size: %d' % target_vocab_size)
    print(target_language+' Max Length: %d' % (target_length))
    
    targetInfo = {'vocab_size':target_vocab_size,
                  'sentence_length':target_length
    }
    
    with open(target_language+'Info.json', 'w') as ft:
        json.dump(targetInfo, ft)

    trainX = encode_sequences(source_tokenizer, source_length, df.iloc[:, 0])
    trainY = encode_sequences(target_tokenizer, target_length, df.iloc[:, 1])
    trainY = encode_output(trainY, target_vocab_size)

    model = define_model(source_vocab_size, target_vocab_size, source_length, target_length, 256)
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    model.fit(trainX, trainY, epochs=1, batch_size=64, verbose=2)
    model.save(source_language+target_language+'translator.h5')
    evaluate_model(model, target_tokenizer, trainX, df.iloc[0:5,])
    print(model.summary())
    return model

def test_model(source_language='english',target_language='shakespearen'):
    
    model = load_model(source_language+target_language+'translator.h5')
    target_tokenizer = get_saved_tokenizer(target_language)
    source_tokenizer = get_saved_tokenizer(source_language)
    sourceInfo = get_language_info(source_language)
    
    source_sentence = input('Enter sentence in '+source_language+ ':')
    source_sentence = cleaning_spliting(source_sentence)
    sentences = list()
    sentences.append(source_sentence)
    
    
    encoded_source = encode_sequences(source_tokenizer,sourceInfo['sentence_length'],sentences)
    translated_sentence = predict_sequence(model,target_tokenizer,encoded_source)
    print('translated: '+translated_sentence)
    
train_model()

Using TensorFlow backend.


english Vocabulary Size: 8539
english Max sentence Length: 88
shakespearen Vocabulary Size: 10269
shakespearen Max Length: 98


MemoryError: Unable to allocate 3.84 MiB for an array with shape (98, 10269) and data type float32

In [8]:
import pickle

# saving
with open('shake_tokenizer.pickle', 'wb') as handle:
    pickle.dump(shake_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
with open('shake_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
    
evaluate_model(model,tokenizer, trainX, df.iloc[0:5,])

src=[['of', 'course']], target=[['excellent', 'well']], predicted=[i my]
src=[['you', 'sell', 'fish']], target=[['you', 'are', 'a', 'fishmonger']], predicted=[i my lord]
src=[['no', 'not', 'me', 'sir']], target=[['not', 'i', 'my', 'lord']], predicted=[i my my]
src=[['in', 'that', 'case', 'i', 'wish', 'you', 'were', 'as', 'good', 'a', 'man', 'as', 'a', 'fish', 'seller']], target=[['then', 'i', 'would', 'you', 'were', 'so', 'honest', 'a', 'man']], predicted=[i the the the the the the]
src=[['good', 'sir']], target=[['honest', 'my', 'lord']], predicted=[i my]
BLEU-1: 0.175339
BLEU-2: 0.104347
BLEU-3: 0.000000
BLEU-4: 0.000000


In [26]:
def execute():
    sourceFile=r'C:\Users\ser\Desktop\StatisNLP\ShakespereanTranslator\Data\romeojuliet_modern.txt'
    targetFile=r'C:\Users\ser\Desktop\StatisNLP\ShakespereanTranslator\Data\romeojuliet_original.txt'
    masterFile=r'parallelCorpus.json'
    createParallelCorpra(sourceFile,targetFile,masterFile)
execute()


In [27]:
pd.read_json('parallelCorpus.json').shape

(13641, 2)