In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

from sklearn.utils import class_weight
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

from IPython.display import display, HTML
import warnings
warnings.filterwarnings('ignore')

# Getting the sense of data

In [2]:
#Load the data
df = pd.read_csv("creditcard.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

Use RobustScaler which is less prone to outliers to normalized Amount and Time.

In [3]:
df['Scaled_Amount'] = RobustScaler().fit_transform(df['Amount'].values.reshape(-1,1))
df['Scaled_Time'] = RobustScaler().fit_transform(df['Time'].values.reshape(-1,1))
df.drop(['Time','Amount'], axis=1,  inplace = True)
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,Scaled_Amount,Scaled_Time
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,1.783274,-0.994983
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-0.269825,-0.994983
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,4.983721,-0.994972
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,1.418291,-0.994972
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,0.670579,-0.99496


In [4]:
df = df.dropna()
X = df.drop("Class",axis = 1)
y = df["Class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1,stratify = y)

print("X:", X.shape, y.shape )
print("Train:", X_train.shape, y_train.shape )
print("Test:", X_test.shape, y_test.shape )

print("{:0.3f} % of fraud case in dataframe.".format(((y==1).astype(int).sum())/((y==0).astype(int).sum())*100) )
print("{:0.3f} % of fraud case in Train split.".format(((y_train==1).astype(int).sum())/((y_train==0).astype(int).sum())*100) )
print("{:0.3f} % of fraud case in Test split.".format(((y_test==1).astype(int).sum())/((y_test==0).astype(int).sum())*100) )

X: (284807, 30) (284807,)
Train: (256326, 30) (256326,)
Test: (28481, 30) (28481,)
0.173 % of fraud case in dataframe.
0.173 % of fraud case in Train split.
0.172 % of fraud case in Test split.


# Hyperparamaters

In [5]:
#for log Reg
Penalty_list = [ 'l1', 'l2', 'elasticnet']
C_list = [0.01, 0.1, 1, 10]
logReg_param_grid = {'penalty': Penalty_list, 'C': C_list}

#Decision Tree 
Criterion_list = ['gini', 'entropy']
max_depth_list = [1,2,3,4,5]
min_samples_split_list = [ 2, 4, 6]
decisionTree_param_grid = {'max_depth': max_depth_list, 
                           'criterion': Criterion_list, 
                           'min_samples_split': min_samples_split_list}


#Random Forest 
numberOfTree_list = [50,100,200,500]
max_depth_list = [ 4, 5, 6, 7, 8]
randomForest_param_grid = {'n_estimators': numberOfTree_list, 'max_depth': max_depth_list}

# Prepare models

In [6]:
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
    
models = []
models.append(('Logistic Regression', 
               LogisticRegression(class_weight = class_weights, solver = 'saga'),
               logReg_param_grid))

models.append(('Decision Tree', 
               DecisionTreeClassifier(),
               decisionTree_param_grid))

models.append(('Random Forest', 
               RandomForestClassifier(),
               randomForest_param_grid))

In [7]:
def train_model(model,hyperparam):

    #find optimal combination of hyperaramaters
    grid_search = GridSearchCV(estimator=model,param_grid=hyperparam, n_jobs = -1, cv=5)
    grid_search.fit(X_train,y_train)
    
    #Find optimal hyper-paramater
    best_hyperparam = grid_search.best_params_

    #trainng f1 score
    y_prediction_train = grid_search.best_estimator_.predict(X_train)
    train_f1 = f1_score(y_train,y_prediction_train)

    #testing f1 score
    y_prediction_test = grid_search.best_estimator_.predict(X_test)
    test_f1 = f1_score(y_test,y_prediction_test)
    
    return train_f1, test_f1, best_hyperparam

In [8]:
results = []
for name, model, hyperparam in models:
    print("now training " , name)
    start = time.time()
    results.append((name, train_model(model,hyperparam)) )
    end = time.time()
    print(name, " done in ", end-start)

now training  Logistic Regression
Logistic Regression  done in  69.8926055431366
now training  Decision Tree
Decision Tree  done in  54.56714916229248
now training  Random Forest
Random Forest  done in  1893.6917078495026


In [9]:
print(results)

[('Logistic Regression', (0.7154255319148936, 0.735632183908046, {'C': 0.1, 'penalty': 'l2'})), ('Decision Tree', (0.865525672371638, 0.8387096774193548, {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 6})), ('Random Forest', (0.8968944099378882, 0.8089887640449438, {'max_depth': 8, 'n_estimators': 500}))]


In [10]:
def pretty_print(df):
    return display(HTML(df.to_html().replace("\\n","<br>")))

#strip result into multiple based on what it refeers to

modelNameResult = []
bestHyperparamaterResult = []
trainingErrorResult = []
testingErrorResult = []

for row in results:
    modelNameResult.append(row[0])
    trainingErrorResult.append(row[1][0])
    testingErrorResult.append(row[1][1])
    bestHyperparamaterResult.append(row[1][2])

tempDataframe = pd.DataFrame({'Classifier':modelNameResult,
                           'Training F1 SCore':trainingErrorResult,
                           'Testing F1 Score':testingErrorResult,
                           'Optimal Hyper-paramater':bestHyperparamaterResult })

pretty_print(tempDataframe)

Unnamed: 0,Classifier,Training F1 SCore,Testing F1 Score,Optimal Hyper-paramater
0,Logistic Regression,0.715426,0.735632,"{'C': 0.1, 'penalty': 'l2'}"
1,Decision Tree,0.865526,0.83871,"{'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 6}"
2,Random Forest,0.896894,0.808989,"{'max_depth': 8, 'n_estimators': 500}"
