In [1]:
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score,confusion_matrix
import numpy as np
import pickle

  import pandas.util.testing as tm


In [2]:
# read the data
file_nl = open('nl.json',)
data_nl = pd.DataFrame.from_dict(json.load(file_nl))
data_nl['Language'] = ['nl'] * len(data_nl)

file_en = open('en.json',)
data_en = pd.DataFrame.from_dict(json.load(file_en))
data_en['Language'] = ['nonl'] * len(data_en)

file_da = open('da.json',)
data_da = pd.DataFrame.from_dict(json.load(file_da))
data_da['Language'] = ['nonl'] * len(data_da)

file_it = open('it.json',)
data_it = pd.DataFrame.from_dict(json.load(file_it))
data_it['Language'] = ['nonl'] * len(data_it)

file_de = open('de.json',)
data_de = pd.DataFrame.from_dict(json.load(file_de))
data_de["Language"] = ['nonl'] * len(data_de)

file_af = open('af.json')
data_af = pd.DataFrame.from_dict(json.load(file_af))
data_af["Language"] = ['af'] * len(data_af)

In [18]:
data_nl_balanced = pd.concat([data_nl[data_nl['siteUrl'].str.contains('.com/')],
                         data_nl[data_nl['siteUrl'].str.contains('.org/')],
                         data_nl[~data_nl['siteUrl'].str.contains('.com/|.org/')].sample(6000)],
                        ignore_index = True)

# put dutch and other languages data in one dataframe (except for african)
data = pd.concat([data_nl_balanced, data_en, data_da, data_it, data_de], ignore_index = True)
#len(data_nl_balanced)

In [19]:
#Filter by text language
lang = ['nl', 'nonl']
data = data[data['Language'].isin(lang)]

#Select 10000 rows for dutch and 20000 for non-dutch
data_trim = pd.DataFrame(columns=['siteUrl','Language'])

lang_trim_nl = data[data['Language'] == 'nl'].sample(10000,random_state = 100)
lang_trim_nonl = data[data['Language'] == 'nonl'].sample(20000,random_state = 100)
data_trim = data_trim.append(lang_trim_nl)
data_trim = data_trim.append(lang_trim_nonl)

#Create a random train, valid, test split
data_shuffle = data_trim.sample(frac=1)
data_shuffle.drop(["sample"], axis = 1)

train = data_shuffle[0:25000]
valid = data_shuffle[20000:25000]
test = data_shuffle[25000:30000]

In [20]:
def get_trigrams(corpus,n_feat=300):
    """
    Returns a list of the N most common character trigrams from a list of sentences
    params
    ------------
        corpus: list of strings
        n_feat: integer
    """
    
    #fit the n-gram model
    vectorizer = CountVectorizer(analyzer='char',
                            ngram_range=(3,3)
                            ,max_features=n_feat)
    
    X = vectorizer.fit_transform(corpus)
    
    #Get model feature names
    feature_names = vectorizer.get_feature_names()
    
    return feature_names

In [21]:
#obtain trigrams from each language
features = {}
features_set = set()

for l in lang:
    
    #get corpus filtered by language
    corpus = train[train.Language==l]['siteUrl']
    
    #get 400 most frequent trigrams
    trigrams = get_trigrams(corpus)
    
    #add to dict and set
    features[l] = trigrams 
    features_set.update(trigrams)

    
#create vocabulary list using feature set
vocab = dict()
for i,f in enumerate(features_set):
    vocab[f]=i
    


In [22]:
#train count vectoriser using vocabulary
vectorizer = CountVectorizer(analyzer='char',
                             ngram_range=(3,3),
                            vocabulary=vocab)

#create feature matrix for training set
corpus = train['siteUrl']   
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names()

train_feat = pd.DataFrame(data=X.toarray(),columns=feature_names)

In [23]:
#Scale feature matrix 
train_min = train_feat.min()
train_max = train_feat.max()
train_feat = (train_feat - train_min)/(train_max-train_min)

#Add target variable 
train_feat['Language'] = list(train['Language'])

In [24]:
#create feature matrix for validation set
corpus = valid['siteUrl']   
X = vectorizer.fit_transform(corpus)

valid_feat = pd.DataFrame(data=X.toarray(),columns=feature_names)
valid_feat = (valid_feat - train_min)/(train_max-train_min)
valid_feat['Language'] = list(valid['Language'])

#create feature matrix for test set
corpus = test['siteUrl']   
X = vectorizer.fit_transform(corpus)

test_feat = pd.DataFrame(data=X.toarray(),columns=feature_names)
test_feat = (test_feat - train_min)/(train_max-train_min)
test_feat['Language'] = list(test['Language'])

In [25]:
#Fit encoder
encoder = LabelEncoder()
encoder.fit(['nl', 'nonl'])

def encode(y):
    """
    Returns a list of one hot encodings
    Params
    ---------
        y: list of language labels
    """
    
    y_encoded = encoder.transform(y)
    y_dummy = np_utils.to_categorical(y_encoded)
    
    return y_dummy

In [32]:
#Get training data
x = train_feat.drop('Language',axis=1)
y = encode(train_feat['Language'])

#Define model
model = Sequential()
model.add(Dense(500, input_dim=len(vocab), activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(2, activation='sigmoid'))
model.compile(loss='categorical_crossentropy',
              optimizer= keras.optimizers.Adam(learning_rate=0.0001),
              metrics=['accuracy'])

#Train model
model.fit(x, y, epochs=10, batch_size=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21424125048>

In [33]:
x_test = test_feat.drop('Language',axis=1)
y_test = test_feat['Language']

#Get predictions on test set
labels = model.predict(x_test)
predictions = np.argmax(labels, axis=1)
predictions = np.where(predictions, 'nonl', 'nl')

#Accuracy on test set
accuracy = accuracy_score(y_test,predictions)
print("accuray: ", accuracy)

#Create confusion matrix
lang = ['nl', 'nonl']
conf_matrix = confusion_matrix(y_test,predictions)
conf_matrix_df = pd.DataFrame(conf_matrix,columns=lang,index=lang)
print(conf_matrix_df)

accuray:  0.8836
        nl  nonl
nl    1418   247
nonl   335  3000


In [34]:
# Save the model for later use
model.save('nn_trigram_balanced')

train_min.to_pickle('trainmin_balanced.pkl')
train_max.to_pickle('trainmax_balanced.pkl')

file = open("vocabulary_balanced.pkl", 'wb')
pickle.dump(vocab, file)
file.close()

INFO:tensorflow:Assets written to: nn_trigram_balanced\assets
