In [1]:
import pandas as pd
import numpy as np
import os
import re
from nltk.tokenize import sent_tokenize

In [2]:
# Class id -> language name. The id is used as the model target and the
# lower-cased name as the dataset file stem (see preprocess()).
languages = dict(enumerate([
    'Danish', 'German', 'Greek',
    'English', 'Spanish', 'Finnish',
    'French', 'Italian', 'Dutch',
    'Portuguese', 'Swedish', 'Bulgarian',
    'Czech', 'Estonian', 'Hungarian',
    'Lithuanian', 'Latvian', 'Polish',
    'Romanian', 'Slovak', 'Slovenian',
]))

In [3]:
def extract_language(language):
    """Read the raw corpus text for one language.

    Parameters
    ----------
    language : str
        File stem (no extension) of the corpus file inside the ``dataset``
        directory under the current working directory, e.g. ``'german'``.

    Returns
    -------
    str
        The entire contents of ``dataset/<language>.txt``.

    Raises
    ------
    OSError
        If the file does not exist or cannot be opened.
    UnicodeDecodeError
        If the file is not valid UTF-8.
    """
    path = os.path.join(os.getcwd(), 'dataset', language + ".txt")
    # Decode explicitly as UTF-8 instead of the platform default, so the
    # non-Latin corpora (Greek, Bulgarian, ...) load the same on every OS.
    # NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm.
    with open(path, encoding='utf-8') as infile:
        return infile.read()

def clean(language):
    """Strip markup and noise from raw corpus text and split it into sentences.

    Steps, in order:

    1. Remove every match of ``<(!?).*>``. The ``.*`` is greedy and ``.``
       does not match a newline, so each match runs from a ``<`` to the
       last ``>`` on the same line.
    2. Drop every character for which ``str.isdigit()`` is true, as well as
       brackets, braces, parentheses, commas, apostrophes and newlines.
       Newlines are removed outright (not replaced by a space), so text on
       adjacent lines ends up directly joined.
    3. Split the remaining text with NLTK's ``sent_tokenize``.
    4. Keep only the sentences longer than 4 characters.

    Parameters
    ----------
    language : str
        Raw text of one corpus file.

    Returns
    -------
    list of str
        The cleaned sentences.
    """
    tag_pattern = re.compile(r'<(!?).*>')
    dropped_chars = "(){}[]\n,'"

    without_tags = tag_pattern.sub('', language)

    kept = []
    for ch in without_tags:
        if ch.isdigit() or ch in dropped_chars:
            continue
        kept.append(ch)

    candidates = sent_tokenize(''.join(kept))
    return [sentence for sentence in candidates if len(sentence) > 4]
    
def stack(sentences, langauge_id, language):
    """Build a labelled frame with one row per sentence.

    Parameters
    ----------
    sentences : sequence of str
        Sentences belonging to a single language.
    langauge_id : int
        Class id repeated in the ``Target`` column of every row.
    language : str
        Language name repeated in the ``Language`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sentences``, ``Target`` and ``Language``. The three
        columns are assembled through a single NumPy array, so they share
        one dtype; ``preprocess`` casts ``Target`` back to ``int`` later.
    """
    n_rows = len(sentences)
    id_column = [langauge_id for _ in range(n_rows)]
    name_column = [language for _ in range(n_rows)]

    # np.c_ lays the three equally long sequences side by side as columns.
    table = np.c_[sentences, id_column, name_column]
    return pd.DataFrame(table, columns=['Sentences', 'Target', 'Language'])

def shuffle(dataframe, random_state=None):
    """Return the rows of ``dataframe`` in random order with a fresh index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to shuffle; it is not modified.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to ``DataFrame.sample``. Pass a fixed value to make the
        permutation reproducible across runs; the default ``None`` keeps
        the previous, unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        A new frame containing every row exactly once, indexed ``0..n-1``.
    """
    # frac=1 samples all rows without replacement, i.e. a permutation.
    permuted = dataframe.sample(frac=1, random_state=random_state)
    return permuted.reset_index(drop=True)

def preprocess():
    """Load, clean and label every language, then return one shuffled frame.

    For each ``(code, name)`` pair in ``languages`` the corpus file named
    after the lower-cased language is read, cleaned into sentences and
    turned into a labelled frame by ``stack``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sentences``, ``Target`` (int class id) and ``Language``,
        with the rows in random order and a fresh ``0..n-1`` index.
    """
    frames = []
    for code, language in languages.items():
        raw_text = extract_language(language.lower())
        sentences = clean(raw_text)
        frames.append(stack(sentences, code, language))

    # Concatenate once: DataFrame.append was removed in pandas 2.0, and
    # growing a frame inside the loop re-copies all earlier rows each time.
    data = pd.concat(frames, ignore_index=True)
    data = shuffle(data)
    # stack() assembles its columns through one NumPy array, so the class
    # ids do not come back as integers; restore the integer dtype here.
    data['Target'] = data['Target'].astype(int)
    return data

In [4]:
def total_lines():
    """Count the cleaned sentences across every language in ``languages``.

    Each corpus file is read and cleaned again here, independently of
    ``preprocess``.

    Returns
    -------
    int
        Sum of the number of sentences ``clean`` yields for each language.
    """
    return sum(
        len(clean(extract_language(name.lower())))
        for name in languages.values()
    )

In [5]:
# Sanity check: number of cleaned sentences summed over all languages.
total_lines()

16199

In [6]:
# Build the shuffled, labelled sentence table covering every language.
data = preprocess()

In [7]:
# Preview the assembled frame (Sentences / Target / Language).
data

Unnamed: 0,Sentences,Target,Language
0,Niinpä voitte tässä asiassa olla tyytyväinen s...,5,Finnish
1,Trotz der bisherigen Maßnahmen sind diese Ungl...,1,German
2,Kahjuks ei ole Kambodža kiire majandusarenguga...,13,Estonian
3,Εάν επιτραπεί στις αρχές αυτές να καταρτίσουν ...,2,Greek
4,Казваме че има млн.,11,Bulgarian
...,...,...,...
16194,Todėl paprasčiausiai nesuprantu kodėl negalime...,15,Lithuanian
16195,But I believe that we should do all we can to ...,3,English
16196,Ze moet alles in het werk stellen om ervoor te...,8,Dutch
16197,Ich möchte Sie auf einen Fall aufmerksam mache...,1,German


In [8]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

In [9]:
# Highest class id present; ids in `languages` start at 0, so the number
# of classes is this value plus one.
data['Target'].max()

20

In [10]:
# One-hot encode the class ids; 21 equals the number of entries in `languages`.
y = tf.keras.utils.to_categorical(data['Target'], num_classes=21)

In [11]:
# Word-level tokenizer. NOTE(review): num_words presumably caps how many of
# the most frequent words are kept when texts are converted to sequences --
# confirm against the Keras Tokenizer documentation.
tok = tf.keras.preprocessing.text.Tokenizer(num_words=40000)

In [12]:
# Build the tokenizer vocabulary from every sentence.
# NOTE(review): this runs on the full dataset, before the train/test split
# further down, so held-out sentences also contribute to the vocabulary.
tok.fit_on_texts(data['Sentences'])

In [13]:
# Encode each sentence as a list of integer word indices.
x = tok.texts_to_sequences(data['Sentences'])

In [14]:
# Number of distinct words seen by the tokenizer, plus one; this value is
# passed as input_dim to the Embedding layer.
# NOTE(review): the tokenizer was created with num_words=40000. If encoded
# sequences only ever contain indices below num_words, the embedding rows
# beyond that are never used and input_dim could be capped -- confirm.
vocab = len(tok.word_index) + 1
vocab

93722

In [15]:
# Bring every sequence to a fixed length of 120, the same value that is
# passed as input_length to the Embedding layer.
pad = tf.keras.preprocessing.sequence.pad_sequences(x,maxlen=(120))

In [16]:
# Classifier: word embedding -> flatten -> softmax over the 21 languages.
model = tf.keras.models.Sequential()
# One 200-dimensional vector per word index, for sequences of length 120.
model.add(
    tf.keras.layers.Embedding(input_dim=vocab, output_dim=200, input_length=120)
)
# Collapse the (sequence, embedding) axes into one feature vector per sample.
model.add(tf.keras.layers.Flatten())
# One output unit per language class.
model.add(tf.keras.layers.Dense(21, activation=tf.nn.softmax))

In [17]:
# categorical_crossentropy pairs with the one-hot targets in `y` and the
# softmax output layer; accuracy is tracked as the reporting metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 10% of the padded sequences for evaluation; the fixed
# random_state makes the split itself repeatable.
X_train, X_test, y_train, y_test = train_test_split(pad, y, test_size=0.1, random_state=42)

In [20]:
# Train for 3 epochs on the training split. No validation data is passed,
# so the only held-out measurement is the evaluate() call that follows.
model.fit(X_train,y_train,epochs=3)

Train on 14579 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fd57c7c0090>

In [21]:
# Loss and accuracy (the metric configured in compile) on the held-out split.
model.evaluate(X_test, y_test)



[0.0422817061436765, 0.995679]

In [22]:
# Persist the trained model to 'language2.h5'.
# NOTE(review): the fitted tokenizer `tok` is not saved in the visible
# cells; it would be needed to encode new text the same way at inference.
model.save('language2.h5')