In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,recall_score,f1_score,roc_curve, auc
from sklearn.metrics import confusion_matrix
from pandas_ml import ConfusionMatrix
# Load the raw transactions table; later cells read its 'Time', 'Amount' and 'Class' columns.
# NOTE(review): 'xxxxx' looks like a redacted placeholder path - point it at the real CSV before running.
creditcard = pd.read_csv('xxxxx')

In [10]:
# Per the original author's note, columns V1..V28 have already been through PCA,
# so only 'Time' and 'Amount' are standardised before modelling.
# NOTE(review): both scalers are fitted on the full table, i.e. before any train/test split.
from sklearn.preprocessing import StandardScaler

# StandardScaler expects a 2-D input, hence the reshape to a single column.
scaler = StandardScaler()
scaled_amount = scaler.fit_transform(creditcard['Amount'].values.reshape(-1, 1))

# 'Time' gets its own, separately fitted scaler.
scaler1 = StandardScaler()
scaled_time = scaler1.fit_transform(creditcard['Time'].values.reshape(-1, 1))

In [11]:
# Wrap the standardised arrays as one-column frames, append them column-wise to the
# raw table, then discard the unscaled 'Time' and 'Amount' originals.
df = pd.DataFrame(scaled_amount, columns=['scaled_amount'])
df2 = pd.DataFrame(scaled_time, columns=['scaled_time'])
cols_to_drop = ['Time', 'Amount']
credit_card_ss = pd.concat([creditcard, df, df2], axis=1).drop(cols_to_drop, axis=1)

In [12]:
from sklearn.model_selection import train_test_split

# Labels are the 'Class' column; features are every other column.
y_credit_card_ss = credit_card_ss['Class']
X_credit_card_ss = credit_card_ss.drop('Class', axis=1)

# Seeded 70/30 split of the full (imbalanced) scaled table.
(X_credit_card_ss_train,
 X_credit_card_ss_test,
 y_credit_card_ss_train,
 y_credit_card_ss_test) = train_test_split(
    X_credit_card_ss,
    y_credit_card_ss,
    test_size=0.3,
    random_state=55,
)

In [13]:
from sklearn.utils import resample

# Split the scaled table by label.
creditcard_nonfraud = credit_card_ss[credit_card_ss.Class == 0]
creditcard_fraud = credit_card_ss[credit_card_ss.Class == 1]

# Construction of the balanced (oversampled) training pool:
#   1. draw 300 fraud rows without replacement,
#   2. replicate those 300 rows (sampling with replacement) up to 10000 rows,
#   3. draw 10000 non-fraud rows without replacement,
#   4. stack the results of steps 2 and 3.
# Only 300 fraud rows are used so that the remaining fraud rows are never seen by the
# models and can serve for validation afterwards.

# Step 1: subset of fraud rows (around 61% of the fraud cases, per the original note).
creditcard_undersampled = resample(
    creditcard_fraud,
    replace=False,
    n_samples=300,
    random_state=75,   # reproducible draw
)

# Step 2: replicate the 300 fraud rows up to 10000.
creditcard_oversampled_fraud = resample(
    creditcard_undersampled,
    replace=True,
    n_samples=10000,
    random_state=75,   # reproducible draw
)

# Step 3: the same number of non-fraud rows, each taken at most once.
creditcard_nonfraud_undersampled = resample(
    creditcard_nonfraud,
    replace=False,
    n_samples=10000,
    random_state=70,   # reproducible draw
)

# Step 4: non-fraud sample followed by the replicated fraud rows.
creditcard_oversampled = pd.concat([creditcard_nonfraud_undersampled, creditcard_oversampled_fraud])

# Class balance of the new frame.
print(creditcard_oversampled.Class.value_counts())


# Rows of the scaled table that do not occur in creditcard_oversampled.
# Matching is done on full row content (all columns turned into an index), not on
# index labels, so any row whose values equal a sampled row is excluded as well.
creditcard_original_leftover = credit_card_ss.loc[
    ~credit_card_ss.set_index(list(credit_card_ss.columns)).index.isin(
        creditcard_oversampled.set_index(list(creditcard_oversampled.columns)).index
    )
]

# Shapes of the three frames.
print ('Dimension of creditcard_original_leftover:'+str(creditcard_original_leftover.shape))
print('Dimension of creditcard_oversampled:',str(creditcard_oversampled.shape))
print('Dimension of creditcard:',str(credit_card_ss.shape))

1    10000
0    10000
Name: Class, dtype: int64
Dimension of creditcard_original_leftover:(274405, 31)
Dimension of creditcard_oversampled: (20000, 31)
Dimension of creditcard: (284807, 31)


In [14]:
# Class counts among the rows that were NOT used to build the oversampled dataset.
pd.Series(creditcard_original_leftover['Class'].values).value_counts(sort=False)

0    274222
1       183
dtype: int64

In [15]:
from sklearn.model_selection import train_test_split

# Seeded 70/30 split of the balanced (oversampled) frame into features and labels.
# NOTE(review): the fraud rows in creditcard_oversampled are copies drawn with
# replacement from 300 originals, so copies of the same row can fall on both sides
# of this split - scores on X_over_test are therefore likely optimistic.
y_over = creditcard_oversampled['Class']
X_over = creditcard_oversampled.drop('Class', axis=1)
X_over_train, X_over_test, y_over_train, y_over_test = train_test_split(
    X_over,
    y_over,
    test_size=0.3,
    random_state=105,
)

In [None]:
# Logistic regression on the oversampled split, tuned with GridSearchCV.
# The search is repeated for every fold count in cv_range, and the test-set metrics,
# confusion matrix and winning hyper-parameters are reported for EACH run (the
# reporting used to sit outside the loop, so only the last run was ever shown).
# GridSearchCV is left on its default scoring (the estimator's own score, i.e.
# accuracy); recall and F1 are computed afterwards on the held-out test split.
from sklearn.model_selection import GridSearchCV

parameter_candidates = {'penalty': ['l2', 'l1'], 'C': [0.001, 1, 10, 100, 1000]}

cv_range = [10, 20]
for i in cv_range:
    # liblinear supports both 'l1' and 'l2'; the 'lbfgs' default of newer
    # scikit-learn releases rejects 'l1', which would make half the grid fail.
    lr_over = GridSearchCV(
        estimator=LogisticRegression(solver='liblinear', random_state=90),
        param_grid=parameter_candidates,
        n_jobs=-1,
        cv=i,
    )

    # Fit on the oversampled training split.
    lr_over.fit(X_over_train, y_over_train)

    # Predict on the oversampled test split with the refitted best estimator.
    pred_y_lr_over = lr_over.predict(X_over_test)

    # Accuracy, recall and F1 on the oversampled test split.
    print ('Accuracy score of logistic regression classifier on test set:{:.3f}'.format(accuracy_score(y_over_test,pred_y_lr_over)))
    print ('Recall score of logistic regression classifier on test set:{:.3f}'.format(recall_score(y_over_test,pred_y_lr_over)))
    print ('F1 score of logistic regression classifier on test set:{:.3f}'.format(f1_score(y_over_test,pred_y_lr_over)))
    print()
    cm_lr_over = confusion_matrix(y_over_test,pred_y_lr_over)
    print('Confusion matrix with logistic regression classifier with oversampled test set:\n%s' % cm_lr_over)
    print()
    print('Best C for logistic regression:',lr_over.best_estimator_.C)
    print('Best penalty:',lr_over.best_estimator_.penalty)
    print('The best paramaters for the logistic regression classifier according to GridSearch and CV = %r:'% (i),lr_over.best_params_)
    print()  # blank line between the reports of successive cv settings


In [None]:
# Decision tree on the oversampled split, tuned with GridSearchCV.
# The search is repeated for every fold count in cv_range, and the test-set metrics,
# confusion matrix and winning hyper-parameters are reported for EACH run (the
# reporting used to sit outside the loop, so only the last run was ever shown).
# GridSearchCV (imported in the logistic-regression cell) is left on its default
# scoring (accuracy); recall and F1 are computed afterwards on the test split.

# 'auto' was dropped from max_features: for classifiers it is an alias of 'sqrt'
# (so it only duplicated grid points) and recent scikit-learn releases reject it.
parameter_candidates = {'max_features': ['log2', 'sqrt'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8]
             }
cv_range = [5, 10, 20]
for i in cv_range:
    dt_over = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=75),
        param_grid=parameter_candidates,
        n_jobs=-1,
        cv=i,
    )

    # Fit on the oversampled training split.
    dt_over.fit(X_over_train, y_over_train)

    # Predict on the oversampled test split with the refitted best estimator.
    pred_y_dt_over = dt_over.predict(X_over_test)

    # Accuracy, recall and F1 on the oversampled test split.
    print ('Accuracy score of decision tree classifier on test set:{:.3f}'.format(accuracy_score(y_over_test,pred_y_dt_over)))
    print ('Recall score of decision tree classifier on test set:{:.3f}'.format(recall_score(y_over_test,pred_y_dt_over)))
    print ('F1 score of decision tree classifier on test set:{:.3f}'.format(f1_score(y_over_test,pred_y_dt_over)))
    print()
    cm_dt_over = confusion_matrix(y_over_test,pred_y_dt_over)
    print('Confusion matrix with decision tree classifier with on oversampled test set:\n%s' % cm_dt_over)
    print()
    print('The best paramaters for the decision tree classifier according to GridSearch and CV = %r:'% (i),dt_over.best_params_)
    print()  # blank line between the reports of successive cv settings

In [None]:
# Baseline for comparison: logistic regression with default hyper-parameters,
# fitted on the oversampled training split and scored by accuracy on the test split.
lr = LogisticRegression()
lr.fit(X_over_train, y_over_train)
model = lr  # fit() returns the estimator itself, so both names refer to one object
y_pred = lr.predict(X_over_test)
print ('Accuracy score of logistic regression classifier on test set:{:.3f}'.format(
    accuracy_score(y_true=y_over_test, y_pred=y_pred)))

In [None]:
# SVM on the oversampled split, tuned with GridSearchCV.
# This cell previously referenced X_under_train / y_under_train / X_under_test /
# y_under_test, which are never defined in this notebook (NameError on a fresh
# kernel); it now uses the X_over_* / y_over_* split like the other model cells,
# and the result names carry the matching "_over" suffix.
# The search is repeated for every fold count in cv_range, and the test-set metrics,
# confusion matrix and winning hyper-parameters are reported for EACH run.
# GridSearchCV (imported in the logistic-regression cell) is left on its default
# scoring (accuracy); recall and F1 are computed afterwards on the test split.
from sklearn import svm
parameter_candidates = {'C': [0.001, 0.01,10, 100, 1000],
              'kernel': ['rbf', 'linear','poly'],
              'gamma': [0.001, 0.01, 10, 100, 1000]
             }
cv_range = [10, 20]
for i in cv_range:
    svc_over = GridSearchCV(
        estimator=svm.SVC(random_state=60),
        param_grid=parameter_candidates,
        n_jobs=-1,
        cv=i,
    )

    # Fit on the oversampled training split.
    svc_over.fit(X_over_train, y_over_train)

    # Predict on the oversampled test split with the refitted best estimator.
    pred_y_svc_over = svc_over.predict(X_over_test)

    # Accuracy, recall and F1 on the oversampled test split.
    print ('Accuracy score of SVM classifier on test set:{:.3f}'.format(accuracy_score(y_over_test,pred_y_svc_over)))
    print ('Recall score of SVM classifier on test set:{:.3f}'.format(recall_score(y_over_test,pred_y_svc_over)))
    print ('F1 score of SVM classifier on test set:{:.3f}'.format(f1_score(y_over_test,pred_y_svc_over)))
    print()
    cm_svc_over = confusion_matrix(y_over_test,pred_y_svc_over)
    print('Confusion matrix with SVM classifier with on oversampled test set:\n%s' % cm_svc_over)
    print()
    print('The best paramaters for the SVM classifier according to GridSearch and CV = %r:'% (i),svc_over.best_params_)
    print()  # blank line between the reports of successive cv settings