In [None]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from pure_sklearn.map import convert_estimator
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.utils import resample

In [None]:
nlp  = spacy.load('nl_core_news_lg') 

In [None]:
data = pd.read_pickle('./data/processed/woorden_met_hetofde.pickle')
data.head()

In [None]:
data.det.value_counts()

In [None]:
# Gap between the two class sizes, computed from the data rather than typed in
# by hand so it stays correct if the dataset changes.
# NOTE(review): presumably the original literals 16675 and 5137 were the 'de'
# and 'het' counts from value_counts() — confirm.
det_counts = data.det.value_counts()
det_counts['de'] - det_counts['het']

In [None]:
# Separate majority and minority classes
df_majority = data[data.det == 'de']
df_minority = data[data.det == 'het']

# Upsample the minority class (sampling with replacement) until it has as many
# rows as the majority class. The target size is taken from the data instead of
# a hard-coded count, so the classes stay balanced if the dataset changes.
# NOTE(review): the train/test split happens after this step, so duplicated
# 'het' rows can land in both the training and the test set and inflate the
# reported accuracy. Consider splitting first and upsampling only the train part.
df_minority_upsampled = resample(
    df_minority,
    replace=True,                  # sample with replacement
    n_samples=len(df_majority),    # match the majority class size
    random_state=123,              # reproducible results
)

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
df_upsampled.det.value_counts()

In [None]:
df_upsampled.head()

In [None]:
X = df_upsampled.woord_vec.tolist()
y = df_upsampled.det.tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## KNN

In [None]:
neigh = KNeighborsClassifier(n_neighbors=3,n_jobs=-1)

In [None]:
%%timeit
neigh.fit(X_train, y_train)

In [None]:
predicted = neigh.predict(X_test)

In [None]:
np.mean(predicted==y_test)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

In [None]:
clf.fit(X_train, y_train)

In [None]:
predicted = clf.predict(X_test)

In [None]:
np.mean(predicted==y_test)

In [None]:

# Export the trained random forest as a pure-python estimator, but only when
# the dump does not exist yet. A later cell loads this file, so on a fresh
# checkout the notebook would otherwise stop with FileNotFoundError. An
# existing dump is left untouched, as it was while this export was commented out.
rf_dump_path = Path("./data/dumps/random_forest_clf.pkl")
if not rf_dump_path.exists():
    rf_dump_path.parent.mkdir(parents=True, exist_ok=True)
    # convert to pure python estimator
    clf_pure_predict = convert_estimator(clf)
    with rf_dump_path.open("wb") as f:
        pickle.dump(clf_pure_predict, f)

In [None]:
validation_data = [(token.text, token.vector) for token in doc if token.pos_ == "NOUN"]


In [None]:
vector_list = [v[1] for v in validation_data]
word_list   = [v[0] for v in validation_data]

In [None]:
# load pickled model
with open("./data/dumps/random_forest_clf.pkl", "rb") as f:
    clf = pickle.load(f)

In [None]:
# make prediction with pure-predict object
predictions = clf.predict_proba(vector_list)

In [None]:
json_result_list = []
for prediction, word in zip(predictions,word_list):
    json_result = {}
    json_result['woord'] = word
    json_result['probability'] = {'de' : prediction[0] , 'het' :   prediction[1]}
    json_result_list.append(json_result)

In [None]:
json_result_list

## MLP 

In [None]:
clf = MLPClassifier(random_state=1, learning_rate='adaptive',
                    max_iter=200).fit(X_train, y_train)

In [None]:
import pickle
# convert to pure python estimator
clf_pure_predict = convert_estimator(clf)
with open("./data/dumps/mlp_classifier.pickle","wb") as f:
    pickle.dump(clf_pure_predict, f)

In [None]:
predicted = clf.predict(X_test)

In [None]:
np.mean(predicted==y_test)

## SGD

In [None]:
%%timeit
clf = linear_model.SGDClassifier().fit(X_train, y_train)

In [None]:
%%timeit
predicted = clf.predict(X_test)

In [None]:
%%timeit
np.mean(predicted==y_test)