In [1]:
import pandas as pd
import numpy as np
import os
import re
from nltk.tokenize import sent_tokenize

In [2]:
languages = {
    0: 'Danish', 1: 'German',
    2: 'Greek', 3: 'English',
    4: 'Spanish', 5: 'Finnish',
    6: 'French', 7: 'Italian',
    8: 'Dutch', 9: 'Portuguese',
    10: 'Swedish', 11: 'Bulgarian',
    12: 'Czech', 13: 'Estonian',
    14: 'Hungarian', 15: 'Lithuanian',
    16: 'Latvian', 17: 'Polish',
    18: 'Romanian', 19: 'Slovak',
    20: 'Slovenian'
}

In [3]:
def extract_language(language):
    with open(os.getcwd() + '/dataset/' + language +".txt") as outfile:
        lang = outfile.read()
    return lang.lower()

def clean(language):
    pattern = r'<(!?).*>'    
    
    language = re.sub(pattern, '', language)
    
    language = ''.join([i for i in language if not i.isdigit()])
    language = ''.join([i for i in language if i not in "(){}[]\n,'"])
    
    language = sent_tokenize(language)
    language = [i for i in language if len(i)> 4]
    return language
    
def stack(sentences, langauge_id, language):
    length = len(sentences)
    
    target = [langauge_id] * length
    lang = [language] * length
    
    df = pd.DataFrame(np.c_[sentences, target, lang], columns=['Sentences','Target', 'Language'])
    return df

def shuffle(dataframe):
    return dataframe.sample(frac=1).reset_index(drop=True)

def preprocess():
    data = pd.DataFrame([])
    for code,language in languages.items():
        extracted = extract_language(language.lower())
        cleaned = clean(extracted)
        dataframe = stack(cleaned, code, language)
        
        data = data.append(dataframe, ignore_index=True)
    data = shuffle(data)
    data['Target'] = data['Target'].astype(int)
    return data

In [4]:
def total_lines():
    sum = 0
    for code, lang in languages.items():
        extracted = extract_language(lang.lower())
        cleaned = clean(extracted)
        sum += len(cleaned)
    return sum

In [5]:
total_lines()

992612

In [6]:
data = preprocess()

In [7]:
data

Unnamed: 0,Sentences,Target,Language
0,enako velja tudi za mala in srednje velika pod...,20,Slovenian
1,az egyes árutételekhez tartozó élelmiszereket ...,14,Hungarian
2,det är en bra idé att utarbeta en indikator fö...,10,Swedish
3,tie būs ļoti nozīmīgi šim ziņojumam.,16,Latvian
4,nogmaals het moet opgehelderd worden en ik ben...,8,Dutch
...,...,...,...
992607,šiame pranešime taip pat pabrėžiamas poreikis ...,15,Lithuanian
992608,les rapports à venir nous informeront sur vos ...,6,French
992609,ωστόσο είναι ικανοποιητικός ο συμβιβασμός που ...,2,Greek
992610,czy taki produkt wzbudziłby państwa zaufanie?,17,Polish


In [8]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

In [9]:
data['Target'].max()

20

In [10]:
#y = tf.keras.utils.to_categorical(data['Target'], num_classes=21)
y = data['Target']
y

0         20
1         14
2         10
3         16
4          8
          ..
992607    15
992608     6
992609     2
992610    17
992611    14
Name: Target, Length: 992612, dtype: int64

In [11]:
tok = tf.keras.preprocessing.text.Tokenizer()

In [12]:
tok.fit_on_texts(data['Sentences'])

In [13]:
texts = tok.texts_to_sequences(data['Sentences'])

In [16]:
vocab = len(tok.word_index) + 1
vocab

1050038

In [15]:
pad = tf.keras.preprocessing.sequence.pad_sequences(texts,maxlen=(100))

In [18]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab,
                              output_dim=128,
                             input_length=100),
    #tf.keras.layers.LSTM(200),
    tf.keras.layers.Flatten(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(21, activation=tf.nn.softmax)
])

In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 128)          134404864 
_________________________________________________________________
flatten (Flatten)            (None, 12800)             0         
_________________________________________________________________
batch_normalization (BatchNo (None, 12800)             51200     
_________________________________________________________________
dense (Dense)                (None, 21)                268821    
Total params: 134,724,885
Trainable params: 134,699,285
Non-trainable params: 25,600
_________________________________________________________________


In [20]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(pad, np.array(y), test_size=0.1, random_state=42)

In [23]:
model.fit(X_train,y_train,epochs=100, batch_size=2048, validation_data=(X_test, y_test),
          callbacks=[tf.keras.callbacks.EarlyStopping(),
                     tf.keras.callbacks.TensorBoard(log_dir='./graph', write_graph=True, write_images=True)])

Train on 893350 samples, validate on 99262 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


<tensorflow.python.keras.callbacks.History at 0x7f77e6d0e9b0>

In [24]:
model.save('model_three.h5')

In [25]:
import pickle

In [26]:
with open('tokenizer_three.json', 'wb') as handle:
    pickle.dump(tok, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [27]:
with open('tokenizer_three.json', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [28]:
## Just testing :) 

In [35]:
test_text = ['hello world']

In [36]:
text = tf.keras.preprocessing.sequence.pad_sequences(tok.texts_to_sequences(test_text), maxlen=100)

In [37]:
languages.get(model.predict_classes(text)[0])

'English'

In [43]:
tokenizer.texts_to_sequences(test_text)

[[515459, 2546]]

In [50]:
text = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(test_text), maxlen=100)

In [52]:
languages.get(model.predict_classes(text)[0])

'English'