In [6]:
import pandas as pd
import numpy as np
import os
import re
from nltk.tokenize import sent_tokenize

In [7]:
languages = {
    0: 'Danish', 1: 'German',
    2: 'Greek', 3: 'English',
    4: 'Spanish', 5: 'Finnish',
    6: 'French', 7: 'Italian',
    8: 'Dutch', 9: 'Portuguese',
    10: 'Swedish', 11: 'Bulgarian',
    12: 'Czech', 13: 'Estonian',
    14: 'Hungarian', 15: 'Lithuanian',
    16: 'Latvian', 17: 'Polish',
    18: 'Romanian', 19: 'Slovak',
    20: 'Slovenian'
}

In [8]:
def extract_language(language):
    with open(os.getcwd() + '/dataset/' + language +".txt") as outfile:
        lang = outfile.read()
    return lang

def clean(language):
    pattern = r'<(!?).*>'    
    
    language = re.sub(pattern, '', language)
    
    language = ''.join([i for i in language if not i.isdigit()])
    language = ''.join([i for i in language if i not in "(){}[]\n,'"])
    
    language = sent_tokenize(language)
    language = [i for i in language if len(i)> 4]
    return language
    
def stack(sentences, langauge_id, language):
    length = len(sentences)
    
    target = [langauge_id] * length
    lang = [language] * length
    
    df = pd.DataFrame(np.c_[sentences, target, lang], columns=['Sentences','Target', 'Language'])
    return df

def shuffle(dataframe):
    return dataframe.sample(frac=1).reset_index(drop=True)

def preprocess():
    data = pd.DataFrame([])
    for code,language in languages.items():
        extracted = extract_language(language.lower())
        cleaned = clean(extracted)
        dataframe = stack(cleaned, code, language)
        
        data = data.append(dataframe, ignore_index=True)
    data = shuffle(data)
    data['Target'] = data['Target'].astype(int)
    return data

In [9]:
def total_lines():
    sum = 0
    for code, lang in languages.items():
        extracted = extract_language(lang.lower())
        cleaned = clean(extracted)
        sum += len(cleaned)
    return sum

In [10]:
total_lines()

993071

In [11]:
data = preprocess()

In [12]:
data

Unnamed: 0,Sentences,Target,Language
0,È assolutamente scandaloso quale che sia il me...,7,Italian
1,Има обаче един факт който не може да бъде прен...,11,Bulgarian
2,Het gaat om een echte monopolierichtlijn want ...,8,Dutch
3,Puola on yksi suurista toivoa herättävistä mer...,5,Finnish
4,Prav tako to ni vprašanje napredka in šele poz...,20,Slovenian
...,...,...,...
993066,Meidän täällä parlamentissa on torjuttava uusi...,5,Finnish
993067,To môže byť prípad budov ktoré samozrejme pred...,19,Slovak
993068,Indholdet er nemlig ikke længere kun afhængig ...,0,Danish
993069,În calitate de deputați în Parlamentul Europea...,18,Romanian


In [13]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

In [15]:
data['Target'].max()

20

In [17]:
#y = tf.keras.utils.to_categorical(data['Target'], num_classes=21)
y = data['Target']
y

0          7
1         11
2          8
3          5
4         20
          ..
993066     5
993067    19
993068     0
993069    18
993070     9
Name: Target, Length: 993071, dtype: int64

In [18]:
tok = tf.keras.preprocessing.text.Tokenizer()

In [19]:
tok.fit_on_texts(data['Sentences'])

In [20]:
texts = tok.texts_to_sequences(data['Sentences'])

In [21]:
vocab = len(tok.word_index) + 1
vocab

1050037

In [23]:
pad = tf.keras.preprocessing.sequence.pad_sequences(texts,maxlen=(100))

In [26]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab,
                              output_dim=128,
                             input_length=100),
    #tf.keras.layers.LSTM(200),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(21, activation=tf.nn.softmax)
])

In [27]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 128)          134404736 
_________________________________________________________________
flatten_1 (Flatten)          (None, 12800)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 21)                268821    
Total params: 134,673,557
Trainable params: 134,673,557
Non-trainable params: 0
_________________________________________________________________


In [28]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [29]:
from sklearn.model_selection import train_test_split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(pad, np.array(y), test_size=0.1, random_state=42)

In [33]:
model.fit(X_train,y_train,epochs=100, batch_size=512, validation_data=(X_test, y_test),
          callbacks=[tf.keras.callbacks.EarlyStopping(),
                     tf.keras.callbacks.TensorBoard(log_dir='./graph', write_graph=True, write_images=True)])

Train on 893763 samples, validate on 99308 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


<tensorflow.python.keras.callbacks.History at 0x7f75e9acec50>

In [34]:
model.save('model_two.h5')

In [38]:
import pickle

In [44]:
with open('tokenizer_two.json', 'wb') as handle:
    pickle.dump(tok, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [45]:
with open('tokenizer_two.json', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
## Just testing :) 

In [35]:
test_text = ['hello world']

In [36]:
text = tf.keras.preprocessing.sequence.pad_sequences(tok.texts_to_sequences(test_text), maxlen=100)

In [37]:
languages.get(model.predict_classes(text)[0])

'English'

In [43]:
tokenizer.texts_to_sequences(test_text)

[[515459, 2546]]

In [50]:
text = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(test_text), maxlen=100)

In [52]:
languages.get(model.predict_classes(text)[0])

'English'