In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from collections import Counter
%matplotlib inline

In [45]:
combined_df = pd.read_csv('combined_df.csv', index_col='date', infer_datetime_format=True, parse_dates=True)

In [46]:
X = combined_df.drop(columns = 'class')
y = combined_df['class']

In [47]:
# check class balance
y.value_counts()

 0    1391
 1      36
-1      23
Name: class, dtype: int64

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [50]:
# standardize X values
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [51]:
# resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN
X_resampled, y_resampled = SMOTEENN(random_state=1).fit_resample(X_train, y_train)
# view the count of target classes with Counter
Counter(y_resampled)

Counter({-1: 1110, 0: 847, 1: 1100})

In [52]:
X_resampled

array([[-1.31758137,  0.64633172, -3.2136348 , ..., -0.54841146,
        -3.03746369,  0.35899676],
       [-0.40243347,  0.24265921,  0.31523476, ...,  0.62397447,
         0.65873401,  0.2072905 ],
       [ 2.1433701 , -0.01924736,  0.42381536, ...,  1.52050488,
         0.3786822 ,  0.31978455],
       ...,
       [-1.23225976, -1.71754032,  0.21675949, ..., -0.93652152,
         0.52632034, -0.03167695],
       [-0.94277964,  0.97365259, -3.2136348 , ...,  0.05431306,
        -3.03746369, -0.91992313],
       [-0.10788386, -0.09923109, -2.42838457, ...,  2.55055268,
        -2.20544442, -0.13572614]])

In [53]:
model = KNeighborsClassifier()

In [54]:
# list hyperparameters to tune.
search_params = {'leaf_size': list(range(1,50))
                'n_neighbors': list(range(1,30))
                'p':[1,2]
               }

In [55]:
# use gridsearch
clf = GridSearchCV(model, search_params, cv=10)

In [56]:
clf.fit(X_resampled, y_resampled)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'leaf_size': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29],
                         'p': [1, 2]})

In [57]:
clf.best_estimator_

KNeighborsClassifier(leaf_size=1, n_neighbors=1)

In [58]:
score = clf.best_estimator_.score(X_test, y_test)
print("%0.2f accuracy" % (score))

0.80 accuracy


In [60]:
predicted = clf.best_estimator_.predict(X_test)

In [61]:
classification_report(y_test, predicted)

'              precision    recall  f1-score   support\n\n          -1       0.08      0.40      0.13         5\n           0       0.97      0.82      0.89       277\n           1       0.07      0.25      0.11         8\n\n    accuracy                           0.80       290\n   macro avg       0.37      0.49      0.38       290\nweighted avg       0.93      0.80      0.86       290\n'