In [85]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
import pandas as pd
#import os

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, mode='rt', encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# split a loaded document into sentences
def to_pairs(doc):
	lines = doc.strip().split('\n')
	pairs = [line.split('\t') for line in  lines]
	return pairs

# clean a list of lines
def clean_pairs(lines):
	cleaned = list()
	# prepare regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for pair in lines:
		clean_pair = list()
		for line in pair:
			# normalize unicode characters
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
			# tokenize on white space
			line = line.split()
			# convert to lowercase
			line = [word.lower() for word in line]
			# remove punctuation from each token
			line = [word.translate(table) for word in line]
			# remove non-printable chars form each token
			line = [re_print.sub('', w) for w in line]
			# remove tokens with numbers in them
			line = [word for word in line if word.isalpha()]
			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)

	return array(cleaned) 

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

# load dataset
filename = 'fre.txt'
doc = load_doc(filename)
#print(doc)
# split into english-german pairs
pairs = to_pairs(doc)
#temp=array(pairs)
# clean sentences
clean_pairs_ = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs_, 'english-german.pkl')
# spot check
for i in range(100):
	print(clean_pairs_[i,:])


Saved: english-german.pkl
['new jersey is sometimes quiet during autumn and it is snowy in april'
 'new jersey est parfois calme pendant l automne et il est neigeux en avril']
['the united states is usually chilly during july and it is usually freezing in november'
 'les etatsunis est generalement froid en juillet et il gele habituellement en novembre']
['california is usually quiet during march and it is usually hot in june'
 'california est generalement calme en mars et il est generalement chaud en juin']
['the united states is sometimes mild during june and it is cold in september'
 'les etatsunis est parfois legere en juin et il fait froid en septembre']
['your least liked fruit is the grape but my least liked is the apple'
 'votre moins aime fruit est le raisin mais mon moins aime est la pomme']
['his favorite fruit is the orange but my favorite is the grape'
 'son fruit prefere est lorange mais mon prefere est le raisin']
['paris is relaxing during december but it is usually chil

In [87]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-german.pkl')

# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]
# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


In [88]:
filename='input.txt'
#doc = load_doc(filename)

# split into english-german pairs
#pairs = to_pairs(doc)
# clean sentences
#clean_pairs_t = clean_pairs(pairs)
# save clean pairs to file
#save_clean_data(clean_pairs_t, 'english-german-temp.pkl')
# spot check
# open the file as read only
file = open(filename, mode='rt', encoding='utf-8')
# read all text
text = file.read()
# close the file
file.close()
text=text.split('\n')  
print(len(text))
del(text[-1])
#for i in range(len(text)):
 #   print(text[i])
temp=[]
#for i in text:
 #   x=i.split()
  #  l.append(x[0])
   # l.append(x[1])
   # temp.append(l)
for i in text:
    l=i.split('-')
    temp.append(l)

    

21


In [89]:
from pickle import load
import pydot
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

# prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
#plot_model(model, to_file='model.png', show_shapes=True)    "error(pydot can't import)"
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

English Vocabulary Size: 199
English Max Length: 15
German Vocabulary Size: 315
German Max Length: 19
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 19, 256)           80640     
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 15, 256)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 15, 256)           525312    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 15, 199)           51143     
Total params: 1,182,407
Trainable params: 1,182,407
Non-trainable params: 0
_________________________________________________________________
None
Train on 9000 samples, va

<keras.callbacks.History at 0x1949a387a90>

In [137]:
from pickle import load
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append(raw_target.split())
		predicted.append(translation.split())
	#calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
test_t=load_clean_sentences('english-german-test-t.pkl')
#test_t=load_clean_sentences('english-german-test-t.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

#our input
# load model
model = load_model('model.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('trainX')
evaluate_model(model, eng_tokenizer, testX, test)


train
[['the peach is their least favorite fruit but the orange is his least favorite'
  'la peche est leur fruit prefere moins mais lorange est son moins prefere']
 ['his favorite fruit is the pear but her favorite is the lime'
  'son fruit prefere est la poire mais son prefere est la chaux']
 ['the orange is our least liked fruit but the apple is her least liked'
  'lorange est notre fruit moins aime mais la pomme elle est moins aime']
 ...
 ['new jersey is snowy during march but it is sometimes wonderful in november'
  'new jersey est la neige en mars mais il est parfois merveilleux en novembre']
 ['california is usually chilly during spring but it is wonderful in march'
  'californie est generalement froid au printemps mais il est merveilleux en mars']
 ['new jersey is sometimes wonderful during march but it is sometimes rainy in june'
  'new jersey est parfois merveilleux au mois de mars mais il est parfois pluvieux en juin']]
src=[la peche est leur fruit prefere moins mais lorang

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.002897
BLEU-2: 0.053825
BLEU-3: 0.173217
BLEU-4: 0.232002
trainX
src=[la france est jamais calme en juillet et il gele habituellement en hiver], target=[france is never quiet during july and it is usually freezing in winter], predicted=[france is never quiet during july and it is usually freezing in winter]
src=[la fraise est son fruit prefere moins mais lorange est notre moins prefere], target=[the strawberry is his least favorite fruit but the orange is our least favorite], predicted=[the strawberry is his least favorite fruit but the orange is our least favorite]
src=[chine est jamais agreable en ete mais il est generalement chaud en octobre], target=[china is never nice during summer but it is usually hot in october], predicted=[china is never nice during summer but it is usually hot in october]
src=[nous detestons les bananes et les citrons verts], target=[we dislike bananas and limes], predicted=[we dislike bananas and limes]
src=[l inde est agreable a lautomne mais il 

In [131]:
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
    target.append(word)
    return ' '.join(target)

In [133]:
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    predict_sequence(model, eng_tokenizer, sources)
#     for i, source in enumerate(sources):
#         print(i,source)
		# translate encoded source text
		#source = source.reshape((1, source.shape[0]))
   # translation =predict_sequence(model, eng_tokenizer, sources)
		#raw_target, raw_src = raw_dataset[i]
		#if i < 10:
			#print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		#actual.append(raw_target.split())
		#predicted.append(translation.split())
	#calculate BLEU score
	#print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	#print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	#print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	#print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))



In [134]:
input1='paris est parfois belle à l automne , mais il est neigeux en juillet'

In [135]:
evaluate_model(model, eng_tokenizer, input1)


AttributeError: 'str' object has no attribute 'ndim'