In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn import linear_model
from sklearn.metrics import classification_report
import spacy
from pure_sklearn.map import convert_estimator
import pickle
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import balanced_accuracy_score,  make_scorer
from sklearn import preprocessing


# Shared scorer for the cross-validation runs below: balanced accuracy with
# scikit-learn's `adjusted=True` option switched on.
balanced_accuracy = make_scorer(
    balanced_accuracy_score,
    adjusted=True,
)

In [None]:
nlp  = spacy.load('nl_core_news_md') 

In [None]:
data = pd.read_pickle('./data/processed/woorden_met_hetofde.pickle')
data.head()

In [None]:
data.det.value_counts()

In [None]:
le = preprocessing.LabelEncoder()

In [None]:
selected_data = data[['det','woord_vec']]
# y_transformed = le.fit_transform(y)

train, validation = train_test_split(selected_data, test_size=0.10, stratify=selected_data['det'], random_state=42)

In [None]:
train.det.value_counts()

In [None]:
# Separate majority and minority classes.
# The masks are built from `train` itself: the original masked `train` with
# `data.det`, a boolean Series indexed on the *full* frame, which only works
# because pandas re-aligns it (with a warning).
df_majority = train[train["det"] == "de"]
df_minority = train[train["det"] == "het"]

# Upsample the minority class until it is as large as the majority class.
df_minority_upsampled = resample(
    df_minority,
    replace=True,                # sample with replacement
    n_samples=len(df_majority),  # follow the data instead of a hard-coded count
    random_state=123,            # reproducible results
)

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
df_upsampled["det"].value_counts()

In [None]:
# Features and labels taken from the class-balanced training frame.
X = df_upsampled["woord_vec"].tolist()
y = df_upsampled["det"].tolist()

# NOTE(review): this split is made *after* upsampling with replacement, so copies
# of one minority row can end up in both parts; scores on X_test are optimistic.
# The `validation` frame was split off before resampling and is the cleaner check.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Parameter set noted from an earlier run (presumably a grid search; verify):
# {'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 1000}

clf_random_forest_untuned = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=2000,
    max_depth=5,
    class_weight='balanced',
    random_state=42,  # fixed seed so the reports below can be reproduced
)
clf_random_forest_untuned.fit(X_train, y_train)

## Predict on the test split of the upsampled data

In [None]:
predicted = clf_random_forest_untuned.predict(X_test)


In [None]:
print(classification_report(y_test, predicted))

## Predict on the validation data

In [None]:
X_validation = validation['woord_vec'].tolist()
y_validation = validation['det'].tolist()

In [None]:
prediction_on_validation = clf_random_forest_untuned.predict( X_validation )
print(classification_report(y_validation, prediction_on_validation))

In [None]:
# Retrain with full dataset
# clf_random_forest_untuned.fit(X,y)

In [None]:
# Convert the fitted forest with pure_sklearn's `convert_estimator` and pickle
# the converted object.
clf_pure_predict = convert_estimator(clf_random_forest_untuned)
with open("./data/dumps/random_forest.pickle", mode="wb") as dump_file:
    pickle.dump(clf_pure_predict, dump_file)

In [None]:

# Grid search over forest size and depth.
clf_random_forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced')

parameters = {
    'n_estimators': [10, 500, 1000],
    'max_depth': [5, 10, 50, 100],
    # 'sqrt' is what 'auto' meant for classifiers; current scikit-learn
    # releases reject 'auto'.
    'max_features': ['sqrt'],
    'criterion': ['gini'],
}

# The labels are the strings 'de'/'het'. The built-in 'f1' scorer looks for a
# positive label of 1 and fails on them, so the notebook's own chance-adjusted
# balanced-accuracy scorer is used instead (same metric as the other models).
grid_clf_random_forest = GridSearchCV(
    clf_random_forest,
    param_grid=parameters,
    scoring=balanced_accuracy,
    n_jobs=-1,
)
grid_clf_random_forest.fit(X_train, y_train)


In [None]:
print("Best parameters set found on development set:")
print()
print(grid_clf_random_forest.best_params_)
print()
print("Grid scores on development set:")
print()
means = grid_clf_random_forest.cv_results_['mean_test_score']
stds = grid_clf_random_forest.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_clf_random_forest.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, grid_clf_random_forest.predict(X_test)
print(classification_report(y_true, y_pred))
print()


In [None]:
predicted = grid_clf_random_forest.predict(X_test)
np.mean(predicted==y_test)

In [None]:
## AdaBoost with LogisticRegression

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
base_classifier = LogisticRegression(n_jobs=-1,
                         penalty='l2',
                         C= 0.001, 
                         random_state=42,
                         class_weight='balanced')

In [None]:
adaboost_clf_untuned = AdaBoostClassifier(base_classifier, n_estimators=100, random_state=0)

In [None]:
adaboost_clf_untuned.fit(X_train,y_train)

In [None]:
predicted = adaboost_clf_untuned.predict(X_test)
np.mean(predicted==y_test)

In [None]:
cross_val_score(adaboost_clf_untuned, X_train, y_train, cv=3, n_jobs=-1, scoring=balanced_accuracy)


### SVM

In [None]:
from sklearn import svm
clf = svm.SVC(class_weight='balanced')

In [None]:
clf.fit(X_train,y_train)

In [None]:
predicted = clf.predict(X_test)
np.mean(predicted==y_test)

## AdaBoost

In [None]:
#base_classifier = RandomForestClassifier(n_jobs=-1, class_weight='balanced')

In [None]:

defualt_adaboost_clf_untuned = AdaBoostClassifier(n_estimators=100, random_state=0)

In [None]:
defualt_adaboost_clf_untuned.fit(X_train,y_train)

In [None]:
predicted = defualt_adaboost_clf_untuned.predict(X_test)
np.mean(predicted==y_test)

In [None]:
defualt_adaboost_clf_untuned_cv_score = cross_val_score(defualt_adaboost_clf_untuned, X, y, cv=3, n_jobs=-1, scoring=balanced_accuracy)

In [None]:
defualt_adaboost_clf_untuned_cv_score

In [None]:
# Tune on full data set
defualt_adaboost_clf_untuned.fit(X,y)

In [None]:
# convert to pure python estimator
clf_pure_predict = convert_estimator(defualt_adaboost_clf_untuned)
with open("./data/dumps/adaboost_classifier.pickle","wb") as f:
    pickle.dump(clf_pure_predict, f)

## Hyperparameter-tuned AdaBoost

In [None]:
from skopt.space import Real, Integer
from skopt.utils import use_named_args

clf = AdaBoostClassifier(random_state=0)

# The list of hyper-parameters we want to optimize. For each one we define the
# bounds, the corresponding scikit-learn parameter name, as well as how to
# sample values from that dimension (`'log-uniform'` for the learning rate)
space = [
    Real(10**-1, 10**1, "log-uniform", name='learning_rate'),
    Integer(50, 500, name='n_estimators'),
]


# This decorator lets the objective function receive the parameters as keyword
# arguments, which is convenient for setting scikit-learn estimator parameters.
@use_named_args(space)
def objective(**params):
    """Loss to minimise for one AdaBoost hyper-parameter setting.

    Args:
        **params: values for the dimensions named in `space`
            (`learning_rate`, `n_estimators`); they are applied to `clf`.

    Returns:
        1 minus the mean 3-fold cross-validated `balanced_accuracy` score on
        (X, y), so that a better model gives a smaller value.
    """
    clf.set_params(**params)

    # `y` (the string labels) is used directly: the `y_transformed` name that
    # used to be passed here is never assigned in this notebook, and the
    # balanced-accuracy scorer accepts string labels.
    fold_scores = cross_val_score(clf, X, y, cv=3, n_jobs=-1,
                                  scoring=balanced_accuracy)
    return 1 - np.mean(fold_scores)


In [None]:
from skopt import gp_minimize

# Minimise `objective` over `space` with 15 evaluations.
res_gp = gp_minimize(
    objective,
    space,
    n_calls=15,
    random_state=0,
    n_jobs=4,
    verbose=True,
)

best_loss = res_gp.fun
"Best score=%.4f" % best_loss

## Gradient boosting 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
clf = GradientBoostingClassifier(random_state=0)

In [None]:
clf.fit(X_train,y_train)

In [None]:
predicted = clf.predict(X_test)
np.mean(predicted==y_test)

In [None]:
# convert to pure python estimator
clf_pure_predict = convert_estimator(clf)
with open("./data/dumps/GradientBoostingClassifier.pickle","wb") as f:
    pickle.dump(clf_pure_predict, f)

In [None]:
doc = nlp("Kass is kaas.")
validation_data = [(token.text, token.vector) for token in doc if token.pos_ == "NOUN"]
vector_list = [v[1] for v in validation_data]
word_list   = [v[0] for v in validation_data]


In [None]:
# load pickled model
with open("./data/dumps/GradientBoostingClassifier.pickle", "rb") as f:
    clf = pickle.load(f)
    
# make prediction with pure-predict object
predictions = clf.predict_proba(vector_list)

json_result_list = []
for prediction, word in zip(predictions,word_list):
    json_result = {}
    json_result['woord'] = word
    json_result['probability'] = {'de' : prediction[0] , 'het' :   prediction[1]}
    json_result_list.append(json_result)

In [None]:
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from sklearn.model_selection import cross_val_score

clf = GradientBoostingClassifier(random_state=0)

# The "f1" scorer used below needs a positive label of 1, so the article
# strings are encoded as integers. LabelEncoder numbers classes in sorted
# order, which makes 'het' the positive class (1).
y_transformed = le.fit_transform(y)

# Upper bound for `max_features`: at most 100, but never more than the number
# of features the vectors actually have.
n_features = min(100, len(X[0]))

# The list of hyper-parameters we want to optimize. For each one we define the
# bounds, the corresponding scikit-learn parameter name, as well as how to
# sample values from that dimension (`'log-uniform'` for the learning rate)
space = [
    Integer(1, 5, name='max_depth'),
    Real(10**-5, 10**0, "log-uniform", name='learning_rate'),
    Integer(1, n_features, name='max_features'),
    Integer(2, 100, name='min_samples_split'),
    Integer(1, 100, name='min_samples_leaf'),
]


# This decorator lets the objective function receive the parameters as keyword
# arguments, which is convenient for setting scikit-learn estimator parameters.
@use_named_args(space)
def objective(**params):
    """Loss to minimise for one gradient-boosting hyper-parameter setting.

    Args:
        **params: values for the dimensions named in `space`; they are
            applied to `clf`.

    Returns:
        1 minus the mean 3-fold cross-validated F1 score on
        (X, y_transformed), so that a better model gives a smaller value.
    """
    clf.set_params(**params)

    fold_scores = cross_val_score(clf, X, y_transformed, cv=3, n_jobs=-1,
                                  scoring="f1")
    return 1 - np.mean(fold_scores)


In [None]:
from skopt import gp_minimize

# Minimise `objective` over `space` with 50 evaluations.
res_gp = gp_minimize(
    objective,
    space,
    n_calls=50,
    random_state=0,
    n_jobs=4,
    verbose=True,
)

best_loss = res_gp.fun
"Best score=%.4f" % best_loss

### KNN 

In [None]:
neigh = KNeighborsClassifier(n_neighbors=3,n_jobs=-1)



In [None]:
neigh.fit(X_train, y_train)

In [None]:
predicted = neigh.predict(X_test)

In [None]:
np.mean(predicted==y_test)

## XGB

In [None]:
import xgboost as xgb
# data_dmatrix = xgb.DMatrix(data=X,label=y_transformed)

In [None]:
# Baseline XGBoost classifier with the positive class weighted by 100.
xgb_settings = {"scale_pos_weight": 100}
clf_xgb = xgb.XGBClassifier(**xgb_settings)
# param_dist = {'n_estimators': stats.randint(150, 500),
#               'learning_rate': stats.uniform(0.01, 0.07),
#               'subsample': stats.uniform(0.3, 0.7),
#               'max_depth': [3, 4, 5, 6, 7, 8, 9],
#               'colsample_bytree': stats.uniform(0.5, 0.45),
#               'min_child_weight': [1, 2, 3]
#              }
# Booster parameters of the classifier, kept for the (disabled) xgb.cv call below.
xgb_param = clf_xgb.get_xgb_params()
# vresult = xgb.cv(xgb_param, data_dmatrix, nfold=3, metrics=['auc'],stratified=True, maximize = True, seed=1301)

In [None]:
# vresult.head()

In [None]:
# Small 3x4 scratch matrix, shown as a NumPy array.
tmp = [
    [11, 22, 33, 44],
    [55, 66, 77, 88],
    [99, 100, 101, 102],
]
np.array(tmp)

In [None]:
X_ = data.woord_vec
y_ = data.det

X_train_, X_test_, y_train_, y_test_ = train_test_split(X_, y_, test_size=0.33, stratify=y_ , random_state=42)

In [None]:
a = data.woord_vec.tolist()

In [None]:
clf_xgb.set_params()

In [None]:
from scipy import stats
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import f1_score

# Randomised hyper-parameter search for XGBoost, scored with F1.
clf_xgb = XGBClassifier(objective='binary:logistic')
param_dist = {
    'n_estimators': stats.randint(150, 500),
    'learning_rate': stats.uniform(0.01, 0.07),
    'subsample': stats.uniform(0.3, 0.7),
    'max_depth': [3, 4, 5, 6, 7, 8, 9],
    'colsample_bytree': stats.uniform(0.5, 0.45),
    'min_child_weight': [1, 2, 3],
}
# NOTE(review): error_score=0 turns a failed fit into a score of 0 instead of an error.
clf = RandomizedSearchCV(
    clf_xgb,
    param_distributions=param_dist,
    n_iter=25,
    scoring='f1',
    error_score=0,
    verbose=3,
    n_jobs=-1,
)

# `X` and `y` are plain Python lists (built with .tolist()), so they have no
# `.iloc`. Index NumPy arrays instead, and encode the article strings as
# integers, which the 'f1' scorer and f1_score need (positive label 1).
X_array = np.asarray(X)
y_array = le.fit_transform(y)

numFolds = 5
folds = KFold(n_splits=numFolds, shuffle=True, random_state=42)  # seeded for reproducibility

# Outer loop: run the search on each training fold and score its best model on
# the matching test fold.
# Note: as before, this rebinds X_train / X_test / y_train / y_test to the last
# fold, and the labels are now integer-encoded.
estimators = []
results = np.zeros(len(X_array))
score = 0.0
for train_index, test_index in folds.split(X_array):
    X_train, X_test = X_array[train_index], X_array[test_index]
    y_train, y_test = y_array[train_index], y_array[test_index]
    clf.fit(X_train, y_train)

    estimators.append(clf.best_estimator_)
    results[test_index] = clf.predict(X_test)
    score += f1_score(y_test, results[test_index])
# Mean F1 over the outer folds.
score /= numFolds

In [None]:
cv_results.head()

In [None]:
predicted = clf.predict(X_test)

In [None]:
np.mean(predicted==y_test)

In [None]:

# # convert to pure python estimator
# clf_pure_predict = convert_estimator(clf)
# with open("./data/dumps/random_forest_clf.pkl", "wb") as f:
#     pickle.dump(clf_pure_predict, f)

In [None]:
GradientBoostingClassifier.pickle

In [None]:
json_result_list

## MLP 

In [None]:
clf = MLPClassifier(random_state=1, learning_rate='adaptive',
                    max_iter=200).fit(X_train, y_train)

In [None]:
import pickle
# convert to pure python estimator
clf_pure_predict = convert_estimator(clf)
with open("./data/dumps/mlp_classifier.pickle","wb") as f:
    pickle.dump(clf_pure_predict, f)

In [None]:
predicted = clf.predict(X_test)

In [None]:
np.mean(predicted==y_test)

## SGD

In [None]:
%%timeit
clf = linear_model.SGDClassifier().fit(X_train, y_train)

In [None]:
%%timeit
predicted = clf.predict(X_test)

In [None]:
%%timeit
np.mean(predicted==y_test)

In [None]:
import string

# Translation table that deletes every ASCII punctuation character.
table = str.maketrans(dict.fromkeys(string.punctuation))  # OR {key: None for key in string.punctuation}
text = "Kass is kass. Dit is kass. Waarom?"
## Strip and lower
text = text.strip().lower()
## Remove all punctuation
text = text.translate(table)
## Only keep unique words. dict.fromkeys keeps first-seen order; a set() has no
## defined order, so the result could change between runs.
text = ' '.join(dict.fromkeys(text.split()))


In [None]:
text

In [None]:
' '.join( list(.split())) ) 