In [None]:
from io import open
import glob
import os
import unicodedata
import string
import numpy as np

from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline

from IPython.display import HTML
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import SimpleRNN, LSTM
from keras.regularizers import l2

### Load Data

In [None]:
# inspect the data directory
def findFiles(path):
    """Return the paths matching the glob pattern ``path``, sorted.

    ``glob.glob`` returns its matches in an arbitrary, filesystem-dependent
    order. The order in which the name files are read determines the row
    order of the loaded data and, through it, the language -> index mapping,
    so the matches are sorted to make every run see the same order.

    Parameters:
    path: glob pattern, e.g. 'data/names/*.txt'

    Returns:
    Sorted list of matching paths (empty list if nothing matches).
    """
    return sorted(glob.glob(path))

print(findFiles('data/names/*.txt'))

In [None]:
# Read a file and split into lines
def readLines(filename):
    """Return the lines of a UTF-8 encoded text file as a list of strings.

    Leading and trailing whitespace of the whole file content is stripped
    before splitting on newlines, so a trailing newline does not produce an
    empty last entry. An empty file yields [''].

    Parameters:
    filename: path of the file to read

    Returns:
    List with one string per line.
    """
    # The context manager closes the file handle as soon as it has been read.
    with open(filename, encoding='utf-8') as handle:
        content = handle.read()
    return content.strip().split('\n')

def load_data(categories=None):
    """Load the name files under ``data/names/`` into a single DataFrame.

    Every ``*.txt`` file contributes one row per line; the base name of the
    file (without extension) is used as the language label of its rows.

    Parameters:
    categories: optional collection of language labels to keep. If it is
        None or empty, all files are loaded.

    Returns:
    DataFrame with the columns "name" and "lang". If no file matches, the
    frame is empty but still has both columns.
    """
    names_list = []
    for filename in findFiles('data/names/*.txt'):
        category = os.path.splitext(os.path.basename(filename))[0]
        if not categories or category in categories:
            names_list.extend((name, category) for name in readLines(filename))
    # Naming the columns in the constructor (rather than assigning df.columns
    # afterwards) also works when no rows were collected; the assignment
    # raises a ValueError on a frame that has no columns.
    return pd.DataFrame(names_list, columns=["name", "lang"])
    

# Languages to load; load_data keeps a file when its base name (without
# extension) is contained in this list.
languages = ["English","French","Italian","German","Spanish"]
names = load_data(languages)
# Show the first rows as a quick sanity check.
names.head()

In [None]:
# Length of the longest name in the data set.
maxlen = np.max(names["name"].map(len).to_numpy())
print("Maximum name length: ", maxlen)
# Number of names per language.
names.groupby('lang').name.count()

### Alphabet

Extract all characters that occur in the list of names. They form the basis for a vector space representation of the characters as one-hot vectors.

Reserve a suitable symbol to mark the end of a word, e.g. 'END'.

In [None]:
### START YOUR CODE
# Build `alphabet` from the characters occurring in the names. It has to be a
# list, because `append` is called on it right after this section.
alphabet = 
...

### END YOUR CODE
alphabet.append('-') # extra symbol stored as the last entry; NOTE(review): presumably the end-of-word / padding character -- confirm
len_alphabet = len(alphabet)
# Maps every character to its position in `alphabet`.
char_index = dict((c, i) for i, c in enumerate(alphabet))
print("Size of alphabet: ",len_alphabet)
print(alphabet)

### Vector Representations

In [None]:
# Class indices follow the order in which the languages first appear in `names`.
language_to_index = {lang: idx for idx, lang in enumerate(names.lang.unique())}
# Inverse lookup (class index -> language), derived from the forward mapping.
index_to_language = {idx: lang for lang, idx in language_to_index.items()}

def onehot(i, length):
    """Return a float vector of the given length with a 1 at position i.

    Parameters:
    i: position that is set to 1
    length: number of entries of the vector

    Returns:
    1-D numpy array of zeros, except for entry i.
    """
    vec = np.zeros(length)
    vec[i] = 1
    return vec

def name_representation(name, maxlen, char_index):
    '''
    Create a vector representation for the given name. A list of length maxlen, each element being a 
    numpy array of length len_vocab.
    NOTE(review): `len_vocab` is not defined in this notebook; presumably it is the size of the
    alphabet, i.e. len(char_index) -- confirm.
    
    Parameters:
    name: the name to encode
    maxlen: length of the list
    char_index: dict that returns the index for a given character

    Returns:
    The representation, which has to be stored in the local variable `vector`.
    '''
    ### START YOUR CODE

    
    
    
    ### END YOUR CODE
    return vector

def lang_representation(language, language_to_index):
    """Encode a language label as a one-hot vector.

    Parameters:
    language: label to encode; must be a key of language_to_index
    language_to_index: dict mapping each label to its class index

    Returns:
    1-D float numpy array with one entry per label in the dict and a 1 at
    the index of the given language.
    """
    n_classes = len(language_to_index)
    target = np.zeros(n_classes)
    position = language_to_index[language]
    target[position] = 1
    return target

def lang_from_output(score):
    """Return the language whose entry in `score` is the largest.

    The position of the maximum is looked up in the module-level
    `index_to_language` dict.
    """
    best_index = np.argmax(score)
    return index_to_language[best_index]

def predict(name, model):
    """Predict the language of a single name with the given model.

    The name is encoded with name_representation (using the module-level
    `maxlen` and `char_index`), wrapped into a batch of size one and passed
    to model.predict; the first row of the result is mapped to a language.
    """
    encoded = name_representation(name, maxlen, char_index)
    batch = np.array([encoded])
    scores = model.predict(batch)
    return lang_from_output(scores[0])

In [None]:
# Sanity check: show the one-hot target vector produced for "French".
print(lang_representation("French",language_to_index))

In [None]:
# Random split: each name goes to the training set with probability 0.8,
# otherwise to the test set. A dedicated, seeded generator makes the split
# identical on every run (Restart & Run All) without touching numpy's
# global random state.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)
msk = split_rng.rand(len(names)) < 0.8
train = names[msk]
test = names[~msk]

In [None]:
# Encode the training names (inputs) and their languages (one-hot targets).
X_train = np.asarray(
    [name_representation(name, maxlen, char_index) for name in train.name]
)
Y_train = np.asarray(
    [lang_representation(lang, language_to_index) for lang in train.lang]
)
print(X_train.shape,Y_train.shape)

In [None]:
# Encode the test names (inputs) and their languages (one-hot targets).
X_test = np.asarray(
    [name_representation(name, maxlen, char_index) for name in test.name]
)
Y_test = np.asarray(
    [lang_representation(lang, language_to_index) for lang in test.lang]
)
print(X_test.shape,Y_test.shape)

### Define and Train the Model

Create an RNN consisting of a single layer with a SimpleRNN (keras) and a softmax.

Then train the model. Experiment with different numbers of hidden units in the layer to obtain a good accuracy.

In [None]:
### START YOUR CODE
# Number of output classes. NOTE(review): presumably len(language_to_index) -- confirm.
n_languages = 
# Number of hidden units of the recurrent layer (the value to experiment with).
n_hidden = 

# `Sequential`, `SimpleRNN` and `Dense` are imported at the top of the notebook.
model = 
model.add(SimpleRNN(...))
...

### END YOUR CODE
# Print the layer overview of the model built above.
model.summary()

In [None]:
### START YOUR CODE
# Compile and train the model here. Later cells unpack model.evaluate(...)
# into exactly two values (loss and one metric) and plot the training and
# validation accuracy entries of `log.history`, so: use a single accuracy
# metric, train with validation data, and keep the object returned by the
# training call in a variable named `log`.
...


### END YOUR CODE

In [None]:
# Evaluate on the held-out test set. Unpacking into two values requires that
# evaluate() returns exactly [loss, metric].
# NOTE(review): presumably the model was compiled with a single accuracy metric -- confirm.
score, acc = model.evaluate(X_test, Y_test)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
# Accuracy per epoch for the training and the validation data.
# Depending on the Keras version and on the metric name passed to compile(),
# the accuracy is logged under 'acc' or under 'accuracy' (validation:
# 'val_acc' / 'val_accuracy'); use whichever key is present.
acc_key = 'acc' if 'acc' in log.history else 'accuracy'
plt.plot(log.history[acc_key], label='Training')
plt.plot(log.history['val_' + acc_key], label='Testing')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid()

### Some additional outputs: Classify the last names of all the students and teaching personnel in the class.

In [None]:
# Load the class list. Assigning two column names below requires that the
# CSV file has exactly two columns.
df_samples = pd.read_csv("stud_names.csv")
df_samples.columns=["First","Last"]
# Disabled alternative that converts each name with unicodeToAscii first.
# NOTE(review): unicodeToAscii is not defined in the cells shown here.
#pred_langs = [predict(unicodeToAscii(name), model) for name in df_samples.Last]
# Predict a language for every last name and store it as a new column.
pred_langs = [predict(name, model) for name in df_samples.Last]
df_samples["pred0"] = pred_langs
df_samples

### Confusion matrix

Compute the confusion matrix and judge for which languages the names are hard to classify.

In [None]:
### START YOUR CODE
# `confusion_matrix` from sklearn.metrics is imported at the top of the
# notebook. The plotting cell below passes `cm` to matshow and labels its rows
# and columns with the language names, so `cm` should be a 2-D array with one
# row and one column per language.
...

cm = 
### END YOUR CODE
print(cm)

In [None]:
# Set up plot
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
fig.colorbar(cax)

# Label the axes in class-index order. The one-hot targets are indexed through
# language_to_index, whose order comes from the loaded data and need not match
# the hand-written `languages` list, so the labels are taken from
# index_to_language instead.
# NOTE(review): assumes row/column i of `cm` belongs to class index i --
# confirm against how `cm` is computed.
class_labels = [index_to_language[i] for i in range(len(index_to_language))]
tick_positions = np.arange(len(class_labels))

# Explicit tick positions keep every label attached to its own row/column,
# instead of relying on a dummy '' label for an off-image tick.
ax.set_xticks(tick_positions)
ax.set_xticklabels(class_labels, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(class_labels)

plt.show()