In [1]:
import pandas as pd
import plotly.express as pe
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, precision_score, recall_score, classification_report

from hyperopt import hp,tpe,fmin,Trials,STATUS_OK,space_eval
from sklearn.model_selection import StratifiedKFold,cross_val_score
from statistics import mean


from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


In [2]:
df = pd.read_csv("./datasets/creditcard.csv")
df

Unnamed: 0,PCA_1,PCA_2,PCA_3,Class
0,94407.854507,-93.620565,2.296309,1
1,94341.884035,435.393668,1.699793,1
2,90351.867921,146.538218,1.879951,1
3,87827.857829,-34.248732,4.186702,1
4,87294.854502,-92.228955,-0.746728,1
...,...,...,...,...
979,-72822.137080,61.185174,1.055759,0
980,-29162.144380,-70.173145,0.317346,0
981,-58601.141137,-11.078538,-1.843060,0
982,-27034.144838,-78.904244,1.118717,0


## Exploration of data!

In [3]:
df.describe().round(2).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PCA_1,984.0,5709.81,48816.34,-77526.15,-41383.87,10609.87,49322.36,94407.85
PCA_2,984.0,13.73,219.4,-93.62,-84.64,-69.03,7.98,2190.01
PCA_3,984.0,2.52,5.87,-2.32,-1.21,0.79,2.96,32.41
Class,984.0,0.5,0.5,0.0,0.0,0.5,1.0,1.0


In [4]:
label = df.pop(  'Class'  )
label

0      1
1      1
2      1
3      1
4      1
      ..
979    0
980    0
981    0
982    0
983    0
Name: Class, Length: 984, dtype: int64

In [5]:
sc = StandardScaler()
for col in df.columns:
    df[[col]] = sc.fit_transform( df[[col]]  )


df

Unnamed: 0,PCA_1,PCA_2,PCA_3
0,1.817898,-0.489533,-0.037348
1,1.816546,1.922843,-0.138967
2,1.734770,0.605623,-0.108277
3,1.683039,-0.218790,0.284689
4,1.672115,-0.483187,-0.555744
...,...,...,...
979,-1.609540,0.216402,-0.248682
980,-0.714713,-0.382610,-0.374474
981,-1.318076,-0.113131,-0.742509
982,-0.671099,-0.422425,-0.237957


# Class balance
### Notice the ratio of fraudulent to non-fraudulent transaction

In [6]:
label.value_counts()

1    492
0    492
Name: Class, dtype: int64

In [7]:
label.value_counts(normalize=True)

1    0.5
0    0.5
Name: Class, dtype: float64

In [8]:
#Separate features and labels
X = df.values
y = label.values



x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.1,stratify=y)


#  
#  Build a Gradient Boosting Model
###  

In this section we'll do k-fold cross validation and use gradient boosting for the prediction algorithm.We'll build our own cross validation loop for gradient boosting.  

In [9]:
skf = StratifiedKFold(n_splits=10) #use StratifiedKFold package to define data splits for fold
skf.get_n_splits(x_train, y_train)

10

In [10]:
params = {'max_depth': 6,
         'n_estimators': 500,
         'learning_rate': 0.07,
         'objective': 'binary:logistic',
         'eval_metric': 'error',
         'alpha': 5,
         'nthread': 5,
         'verbosity': 1}

         
results_list = []
for train_index, val_index in skf.split(x_train, y_train):

    x_tr, x_val = x_train[train_index, :], x_train[val_index, :]
    y_tr, y_val = y_train[train_index], y_train[val_index]

    model = xgb.XGBClassifier(**params, use_label_encoder=False)
    model.fit(x_tr, y_tr, eval_set=[(x_tr, y_tr), (x_val, y_val)], verbose=False)
    
    results = model.evals_result()
    results_list.append(results)



In [11]:
# tr_avg = [sum([results_list[j]['validation_0']['error'][i] for j in range(10)])/10 for 
#           i in range(len(results_list[0]['validation_0']['error']))]

# val_avg = [sum([results_list[j]['validation_1']['error'][i] for j in range(10)])/10 for 
#           i in range(len(results_list[0]['validation_1']['error']))]

# pe.line(tr_avg)
# pe.line(val_avg)


# # pe.xlabel('number of trees')
# # pe.ylabel('error')
# # pe.show()

In [12]:
te_probs_gbm = model.predict_proba(x_test)
prob1 = [x[1] for x in te_probs_gbm] #linear model returns prob of both binary outcomes

#threshold at 0.5
y_te_05 = [int(x + 0.5) for x in prob1]
y_te_75 = [int(x + 0.25) for x in prob1]
y_te_25 = [int(x + 0.75) for x in prob1]

conf_mat05 = confusion_matrix(y_test, y_te_05)
conf_mat75 = confusion_matrix(y_test, y_te_75)
conf_mat25 = confusion_matrix(y_test, y_te_25)

print('confusion matrix with 0.5 threshold \n', conf_mat05)
print('confusion matrix with 0.75 threshold \n', conf_mat75)
print('confusion matrix with 0.25 threshold \n', conf_mat25)

confusion matrix with 0.5 threshold 
 [[40  9]
 [ 5 45]]
confusion matrix with 0.75 threshold 
 [[47  2]
 [13 37]]
confusion matrix with 0.25 threshold 
 [[31 18]
 [ 2 48]]


In [13]:
predicted_y = model.predict(x_test)

print(classification_report(predicted_y, y_test))



              precision    recall  f1-score   support

           0       0.82      0.89      0.85        45
           1       0.90      0.83      0.87        54

    accuracy                           0.86        99
   macro avg       0.86      0.86      0.86        99
weighted avg       0.86      0.86      0.86        99



##  Feature importance


In [14]:
columns = df.columns

feature_importance_df = pd.DataFrame.from_dict(dict(zip(columns, model.feature_importances_)), orient='index')
display(feature_importance_df)

Unnamed: 0,0
PCA_1,0.065525
PCA_2,0.218384
PCA_3,0.716091
