In [21]:
import pandas as pd
import numpy as np
import os
import re
from nltk.tokenize import sent_tokenize

In [22]:
languages = {
    0: 'Danish', 1: 'German',
    2: 'Greek', 3: 'English',
    4: 'Spanish', 5: 'Finnish',
    6: 'French', 7: 'Italian',
    8: 'Dutch', 9: 'Portuguese',
    10: 'Swedish', 11: 'Bulgarian',
    12: 'Czech', 13: 'Estonian',
    14: 'Hungarian', 15: 'Lithuanian',
    16: 'Latvian', 17: 'Polish',
    18: 'Romanian', 19: 'Slovak',
    20: 'Slovenian'
}

In [23]:
def extract_language(language):
    with open(os.getcwd() + '/dataset/' + language +".txt") as outfile:
        lang = outfile.read()
    return lang

def clean(language):
    pattern = r'<(!?).*>'    
    
    language = re.sub(pattern, '', language)
    
    language = ''.join([i for i in language if not i.isdigit()])
    language = ''.join([i for i in language if i not in "(){}[]\n,'"])
    
    language = sent_tokenize(language)
    language = [i for i in language if len(i)> 4]
    return language
    
def stack(sentences, langauge_id, language):
    length = len(sentences)
    
    target = [langauge_id] * length
    lang = [language] * length
    
    df = pd.DataFrame(np.c_[sentences, target, lang], columns=['Sentences','Target', 'Language'])
    return df

def shuffle(dataframe):
    return dataframe.sample(frac=1).reset_index(drop=True)

def preprocess():
    data = pd.DataFrame([])
    for code,language in languages.items():
        extracted = extract_language(language.lower())
        cleaned = clean(extracted)
        dataframe = stack(cleaned, code, language)
        
        data = data.append(dataframe, ignore_index=True)
    data = shuffle(data)
    data['Target'] = data['Target'].astype(int)
    return data

In [24]:
def total_lines():
    sum = 0
    for code, lang in languages.items():
        extracted = extract_language(lang.lower())
        cleaned = clean(extracted)
        sum += len(cleaned)
    return sum

In [25]:
total_lines()

16199

In [26]:
data = preprocess()

In [27]:
data

Unnamed: 0,Sentences,Target,Language
0,Co to dokładnie znaczy?,17,Polish
1,Kära kolleger!,10,Swedish
2,Je snadnější zemi zruinovat než ji přestavět.,12,Czech
3,Por quanto tempo poderemos nós aceitar um dese...,9,Portuguese
4,Pod tym względem oczekujemy że stosownie do wy...,17,Polish
...,...,...,...
16194,A koppenhágai csúcstalálkozó ahova az éghajlat...,14,Hungarian
16195,Basta soffermarsi sui dati contenuti nella ses...,7,Italian
16196,To nagrado je prejel zaradi zagovarjanja svobo...,20,Slovenian
16197,Včera na slyšení ve výboru jste uvedla že člen...,12,Czech


In [28]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

In [29]:
data['Target'].max()

20

In [30]:
y = tf.keras.utils.to_categorical(data['Target'], num_classes=21)

In [31]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words=40000)

In [32]:
tok.fit_on_texts(data['Sentences'])

In [33]:
x = tok.texts_to_sequences(data['Sentences'])

In [34]:
vocab = len(tok.word_index) + 1
vocab

93722

In [35]:
pad = tf.keras.preprocessing.sequence.pad_sequences(x,maxlen=(120))

In [124]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab,
                              output_dim=100,
                             input_length=120),
    tf.keras.layers.LSTM(200),
    #tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(21, activation=tf.nn.softmax)
])

In [125]:
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 120, 100)          9372200   
_________________________________________________________________
lstm_5 (LSTM)                (None, 200)               240800    
_________________________________________________________________
dense_13 (Dense)             (None, 21)                4221      
Total params: 9,617,221
Trainable params: 9,617,221
Non-trainable params: 0
_________________________________________________________________


In [126]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [127]:
from sklearn.model_selection import train_test_split

In [128]:
X_train, X_test, y_train, y_test = train_test_split(pad, y, test_size=0.1, random_state=42)

In [129]:
model.fit(X_train,y_train,epochs=10, batch_size=512, validation_data=(X_test, y_test),
          callbacks=[tf.keras.callbacks.EarlyStopping(), tf.keras.callbacks.TensorBoard(log_dir='./graph', write_graph=True, write_images=True)])

Train on 14579 samples, validate on 1620 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


<tensorflow.python.keras.callbacks.History at 0x7f74490a2358>

In [130]:
model.evaluate(X_test, y_test)



[0.12001513737587281, 0.97037035]

In [79]:
#model.save('model_one.h5')

In [None]:
## Just testing :) 

In [167]:
test_text = ['']

In [168]:
text = tf.keras.preprocessing.sequence.pad_sequences(tok.texts_to_sequences(test_text), maxlen=120)

In [169]:
languages.get(model.predict_classes(text)[0])

'Latvian'