In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [2]:
df = pd.read_csv("creditcard.csv")
df.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


In [3]:
df = df.dropna()

In [6]:
df_1 = df[df.Class==1]
df_0 = df[df.Class==0].sample(n=df_1.shape[0])
df_ds = pd.concat([df_1, df_0])
df_ds.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
541,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276,0.0,1
623,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.0,1
4920,4462.0,-2.30335,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.56232,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.08733,-0.156114,-0.542628,0.039566,-0.153029,239.93,1


In [12]:
df_ds = df_ds.sample(frac=1.0)
X = df_ds.drop(df_ds.columns[-1], axis=1)
X = (X - X.mean())/X.std()
y = df_ds.iloc[:,-1]
grid = {"n_neighbors": np.arange(1,10)}

In [13]:
knn = KNeighborsClassifier()
knnCV = GridSearchCV(knn, param_grid=grid)

In [14]:
knnCV.fit(X,y)

In [15]:
knnCV.best_params_

{'n_neighbors': 7}

In [17]:
knn_best = knnCV.best_estimator_
y_h = knn_best.predict(X)

In [19]:
print('Train Accuracy:', knn_best.score(X, y))
print('Confusion Matrix:')
print(confusion_matrix(y, y_h))
print('Accuracy Score:', accuracy_score(y, y_h))
print('Precision Score:', precision_score(y, y_h))
print('Recall Score:', recall_score(y, y_h))
print('F1 Score:', f1_score(y, y_h))

Train Accuracy: 0.931910569105691
Confusion Matrix:
[[483   9]
 [ 58 434]]
Accuracy Score: 0.931910569105691
Precision Score: 0.9796839729119639
Recall Score: 0.8821138211382114
F1 Score: 0.9283422459893048


In [20]:
X_t = df.drop(df.columns[-1], axis=1)
X_t = (X_t - X_t.mean())/X_t.std()
y_t = df.iloc[:,-1]

In [21]:
y_th = knn_best.predict(X_t)



In [23]:
print('Test Accuracy:', knn_best.score(X_t, y_t))
print('Confusion Matrix:')
print(confusion_matrix(y_t, y_th))
print('Accuracy Score:', accuracy_score(y_t, y_th))
print('Precision Score:', precision_score(y_t, y_th))
print('Recall Score:', recall_score(y_t, y_th))
print('F1 Score:', f1_score(y_t, y_th))

Test Accuracy: 0.4501750308103382
Confusion Matrix:
[[127725 156590]
 [     4    488]]
Accuracy Score: 0.4501750308103382
Precision Score: 0.0031067367804530235
Recall Score: 0.991869918699187
F1 Score: 0.006194072475725075
