## Part 2: Extra Credit 1: Model for All Languages ##

### Importing Libraries ###

In [None]:
from itertools import cycle

from keras.layers import Activation, Dense, LSTM
from keras.models import Sequential, load_model
from keras.optimizers import RMSprop

from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
import sys
import os
import numpy as np
import pandas as pd

from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Seed handed to train_test_split so the train/test partition is repeatable.
RANDOM_STATE = 316

# Raise the interpreter recursion limit in case a saved model is loaded.
sys.setrecursionlimit(10000)

# Seed both NumPy and TensorFlow RNGs for reproducibility.
np.random.seed(100)
tf.random.set_seed(100)

# --- data split ---------------------------------------------------------
val_frac = 0.2                   # share of the training data held out for validation
test_frac = 0.2                  # share of all data held out as the test set

# --- languages ----------------------------------------------------------
TOTAL_LANGUAGES = ['dut', 'eng', 'frn', 'ger', 'grk', 'por', 'spn']

num_languages = 3                # only the first `num_languages` codes are used
LANGUAGES = TOTAL_LANGUAGES[:num_languages]

# Confusion-matrix labels: rows are the true language, columns the prediction.
rlabels = ['{}.true'.format(code) for code in LANGUAGES]
clabels = list(LANGUAGES)

# --- sequence windows ---------------------------------------------------
maxlen = 20                      # number of characters fed to the LSTM per sample
step = 3                         # stride between consecutive (overlapping) windows

# --- evaluation ---------------------------------------------------------
len_test = 5                     # length of each test substring
num_test = 100                   # test substrings drawn per language

# --- training -----------------------------------------------------------
EPOCHS = 5                       # number of training epochs

In [None]:
def read_file(lang):
    """Load the lower-cased corpus of one language and list its characters.

    Reads ``./subset_all/<lang>.txt`` as UTF-8, lower-cases the content and
    prints the corpus length and the number of distinct characters.

    Args:
        lang: language code used as the file stem (e.g. ``'eng'``).

    Returns:
        A ``(text, chars)`` tuple: the lower-cased text and a sorted list of
        the distinct characters that occur in it.

    Raises:
        OSError: if the corpus file cannot be opened.
    """
    path = './subset_all/{}.txt'.format(lang)

    # Close the handle deterministically instead of leaving it to the
    # garbage collector.
    with open(path, encoding='utf-8') as corpus:
        text = corpus.read().lower()
    print('\n\n{} corpus length: {}'.format(lang, len(text)))

    chars = sorted(set(text))
    print('unique chars:', len(chars))

    return text, chars

In [None]:
def cut_sequences(text):
    """Slice *text* into overlapping windows plus the character after each.

    Windows are ``maxlen`` characters long and start every ``step``
    characters; a window is only produced when a following character exists.
    Prints the number of windows produced.

    Args:
        text: the corpus string to cut.

    Returns:
        A ``(sentences, next_chars)`` tuple of equal-length lists, where
        ``next_chars[k]`` is the character that follows ``sentences[k]``.
    """
    window_starts = range(0, len(text) - maxlen, step)

    sentences = [text[start:start + maxlen] for start in window_starts]
    next_chars = [text[start + maxlen] for start in window_starts]

    print('nb sequences:', len(sentences))

    return sentences, next_chars

In [None]:
def vectorize(charset, sentences, next_chars):
    """One-hot encode character windows and their follow-up characters.

    Args:
        charset: iterable of the known characters. A character's one-hot
            column is its position in the iteration order of ``charset``.
        sentences: sequence of strings, each at most ``maxlen`` characters.
        next_chars: sequence with the character following each sentence.

    Returns:
        A ``(X, y)`` tuple of boolean arrays: ``X`` has shape
        ``(len(sentences), maxlen, len(charset))`` and ``y`` has shape
        ``(len(sentences), len(charset))``.

    Raises:
        KeyError: if a character is not contained in ``charset``.
    """
    print('Vectorizing text...')

    n_chars = len(charset)
    # Builtin ``bool`` is used because the ``np.bool`` alias was removed in
    # NumPy 1.24 and raises AttributeError there.
    X = np.zeros((len(sentences), maxlen, n_chars), dtype=bool)
    y = np.zeros((len(sentences), n_chars), dtype=bool)

    char_indices = {c: i for i, c in enumerate(charset)}

    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = True
        y[i, char_indices[next_chars[i]]] = True

    return X, y

In [None]:
def split_n_save(X, y, charset, lang):
    """Split the data into train/test parts and store them on disk.

    The split uses ``test_frac`` as the test share and ``RANDOM_STATE`` as
    the seed. The four arrays and ``charset`` are written compressed to
    ``./splits_all/<lang>.npz`` under the keys ``xtrain``, ``xtest``,
    ``ytrain``, ``ytest`` and ``charset``.

    Args:
        X: input samples.
        y: targets aligned with ``X``.
        charset: the character set, stored alongside the arrays.
        lang: language code used as the output file stem.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_frac, random_state=RANDOM_STATE)

    # Create the output directory on first use so the save cannot fail just
    # because the folder is missing.
    out_dir = './splits_all'
    os.makedirs(out_dir, exist_ok=True)

    np.savez_compressed('{}/{}.npz'.format(out_dir, lang),
                        xtrain=X_train,
                        xtest=X_test,
                        ytrain=y_train,
                        ytest=y_test,
                        charset=charset)

    return X_train, X_test, y_train, y_test

In [None]:
def build_model(lang, charset):
    """Create and compile a single-LSTM next-character model.

    The network is an LSTM with 128 units over windows of shape
    ``(maxlen, len(charset))``, followed by a dense layer with one output per
    character and a softmax activation. It is compiled with categorical
    cross-entropy loss and RMSprop at learning rate 0.01.

    Args:
        lang: language code, used as the model's name.
        charset: the character set; only its size is used.

    Returns:
        The compiled ``Sequential`` model.
    """
    n_chars = len(charset)

    layers = [
        LSTM(128, input_shape=(maxlen, n_chars)),
        Dense(n_chars),
        Activation('softmax'),
    ]
    model = Sequential(layers, name=lang)

    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(learning_rate=0.01))

    return model

In [None]:
def normalize_preds(preds, temperature=1.0):
    """Rescale a probability vector by a temperature and renormalise it.

    Computes ``exp(log(p) / temperature)`` element-wise in float64 and
    divides by the total so the result sums to one.

    Args:
        preds: array-like of probabilities.
        temperature: divisor applied to the log-probabilities.

    Returns:
        A float64 NumPy array with the same shape as ``preds``.
    """
    log_probs = np.log(np.asarray(preds).astype('float64'))
    weights = np.exp(log_probs / temperature)

    return weights / np.sum(weights)

In [None]:
def train_n_save(model, X, y):
    """Train *model* and save it to ``./models_all/<model.name>.h5``.

    Training runs for ``EPOCHS`` epochs with batch size 128 and holds out
    ``val_frac`` of the data for validation.

    Args:
        model: a compiled model exposing ``fit``, ``save`` and ``name``.
        X: training inputs.
        y: training targets.
    """
    # Make sure the output directory exists *before* training, so a missing
    # folder is reported up front rather than after the whole fit has run.
    out_dir = './models_all'
    os.makedirs(out_dir, exist_ok=True)

    model.fit(X, y, batch_size=128, epochs=EPOCHS, verbose=2, validation_split=val_frac)

    model.save('{}/{}.h5'.format(out_dir, model.name))

In [None]:
# Pass 1: build one alphabet shared by every language model.
charset = set()
for lang in LANGUAGES:
    _, chars = read_file(lang)
    charset.update(chars)

# Freeze the alphabet into a sorted list. Iteration order of a set of strings
# depends on the per-process hash seed, so leaving it as a set would give
# each run a different character-to-column mapping. A list is also stored in
# the .npz as a plain string array.
charset = sorted(charset)

# Pass 2: vectorise each corpus, save its train/test split, then train and
# save one model per language.
for lang in LANGUAGES:
    text, _ = read_file(lang)

    sentences, next_chars = cut_sequences(text)

    X, y = vectorize(charset, sentences, next_chars)

    X_train, X_test, y_train, y_test = split_n_save(X, y, charset, lang)

    model = build_model(lang, charset)
    train_n_save(model, X_train, y_train)

In [None]:
def load_the_model(lang):
    """Load the saved model and the saved data split of one language.

    Reads ``./models_all/<lang>.h5`` and ``./splits_all/<lang>.npz``.

    Args:
        lang: language code used as the file stem.

    Returns:
        ``(model, X_train, X_test, y_train, y_test)``.
    """
    model = load_model('./models_all/{}.h5'.format(lang))

    # NpzFile keeps the archive open until closed; the context manager
    # releases it once the arrays have been read into memory.
    with np.load('./splits_all/{}.npz'.format(lang)) as data:
        X_train = data['xtrain']
        X_test = data['xtest']
        y_train = data['ytrain']
        y_test = data['ytest']

    return model, X_train, X_test, y_train, y_test

In [None]:
def select_n_subs(X_test):
    """Randomly pick ``num_test`` substrings of ``len_test`` characters.

    Cycles over the rows of ``X_test``; each visited row is used with
    probability 0.5, and a random window of ``len_test`` consecutive
    time steps is copied out of it.

    Args:
        X_test: array of one-hot encoded windows, indexed as
            ``[sample, time step, character]``.

    Returns:
        Boolean array of shape ``(num_test, len_test, len(charset))``.
        If ``X_test`` has no rows the loop never runs and the array is
        returned all ``False``.
    """
    # Builtin ``bool``: the ``np.bool`` alias was removed in NumPy 1.24.
    test = np.zeros((num_test, len_test, len(charset)), dtype=bool)
    counter = 0

    for line in cycle(X_test):
        # Coin flip: skip roughly half of the visited rows.
        if np.random.rand() >= 0.5:
            # NOTE(review): choice(n) draws from 0..n-1, so the last possible
            # window start (len(line) - len_test) is never selected.
            i = np.random.choice(len(line) - len_test)
            # Window length follows len_test rather than a hard-coded 5.
            test[counter, :, :] = line[i:i + len_test, :]
            counter += 1
        if counter == num_test:
            break
    return test

In [None]:
# Create the test substring dataset and the labels.
# `test` stacks num_test substrings per language in LANGUAGES order;
# `labels_test` holds the matching one-hot language label for every row.

# Builtin ``bool``: the ``np.bool`` alias was removed in NumPy 1.24. The empty
# seed array keeps the result well-formed even if LANGUAGES is empty.
test_parts = [np.empty((0, len_test, len(charset)), dtype=bool)]
labels_test = np.zeros((num_test * len(LANGUAGES), len(LANGUAGES)), dtype=np.int64)
for i, lang in enumerate(LANGUAGES):
    # Close the archive as soon as the test substrings have been drawn.
    with np.load('./splits_all/{}.npz'.format(lang)) as data:
        test_parts.append(select_n_subs(data['xtest']))
    labels_test[i * num_test:(i + 1) * num_test, i] = 1

# Stack once at the end instead of re-copying the growing array every pass.
test = np.vstack(test_parts)

In [None]:
# load the models
def load_all_models():
    """Load the saved model of every language in ``LANGUAGES``.

    Prints a progress line per model. Only the model files under
    ``./models_all/`` are read; the saved data splits are not needed here,
    so they are not loaded.

    Returns:
        List of models in the same order as ``LANGUAGES``.
    """
    models = []
    for count, lang in enumerate(LANGUAGES, start=1):
        print("loading model no: {}".format(count))
        models.append(load_model('./models_all/{}.h5'.format(lang)))
    return models

In [None]:
# make prediction on the test sets
def predict(models, test):
    """Score every test substring under every language model.

    For each substring and model, the model is fed a window of ``maxlen``
    time steps that starts out uniform over the characters. One character at
    a time, the probability the model assigns to the true next character is
    recorded and that character is then shifted into the window. The score is
    the sum of the log-probabilities of the ``len_test`` characters.

    Args:
        models: sequence of models exposing ``predict(batch, verbose=0)``.
        test: one-hot array indexed as ``[sample, position, character]``.

    Returns:
        float64 array of shape ``(len(test), len(models))`` holding the
        log-likelihood of each sample under each model.
    """
    n_samples = len(test)
    n_chars = test.shape[-1]   # taken from the input, not from a global
    likelihoods = np.zeros((n_samples, len(models)), dtype=np.float64)

    for i, sample in enumerate(test):
        if (i + 1) % 100 == 0:
            print("predicting on sample: {} out of {} samples".format(i + 1, n_samples))

        for model_index, model in enumerate(models):
            # probability the model gives to each true character of the sample
            model_preds = np.zeros((len_test,), dtype=np.float64)

            # uninformative context: every character equally likely everywhere
            start_seq = np.full((1, maxlen, n_chars), 1.0 / n_chars, dtype=np.float64)

            for j in range(len_test):
                # distribution over the next character given the current window
                preds = normalize_preds(model.predict(start_seq, verbose=0)[0])

                # Look the true character up by its index. A boolean-mask
                # lookup would return a size-1 array, and assigning that to a
                # scalar slot is deprecated in recent NumPy.
                true_char = int(np.argmax(sample[j]))
                model_preds[j] = preds[true_char]

                # drop the oldest time step and append the true character so
                # the next prediction is conditioned on it
                start_seq[:, :-1, :] = start_seq[:, 1:, :]
                start_seq[:, -1, :] = sample[j]

            likelihoods[i, model_index] = np.sum(np.log(model_preds))

    return likelihoods
    

In [None]:
def confusion_table():
    """Evaluate all language models on the test substrings.

    Loads the models, scores the global ``test`` set, assigns each sample to
    the model with the highest likelihood and tallies the results against
    ``labels_test``. Prints the overall accuracy and the confusion matrix
    (rows are true languages, columns are predictions, with totals added).

    Returns:
        ``(likelihoods, accuracy, cm)`` where ``cm`` is the confusion matrix
        as a DataFrame including the ``Total`` row and column.
    """
    likelihoods = predict(load_all_models(), test)

    n_samples = len(test)
    predicted = np.argmax(likelihoods[:n_samples], axis=1)
    real = np.argmax(labels_test[:n_samples], axis=1)

    # Count one hit per (true, predicted) pair; add.at accumulates repeats.
    counts = np.zeros((len(LANGUAGES), len(LANGUAGES)), dtype=np.int64)
    np.add.at(counts, (real, predicted), 1)
    cm = pd.DataFrame(counts, index=rlabels, columns=clabels)

    # Correct predictions sit on the diagonal.
    accuracy = (np.trace(cm)) / len(labels_test)

    cm['Total'] = cm.sum(axis=1)
    cm.loc['Total'] = cm.sum()

    print("The overall accuracy is = {:3.1f} %".format(accuracy*100))
    print("The Confusion Matrix is below:")
    print(cm)

    return (likelihoods, accuracy, cm)

In [None]:
# Run the evaluation: loads the trained models, scores the test substrings,
# and prints the overall accuracy and the confusion matrix.
likelihoods, accuracy, cm = confusion_table()