In [None]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.svm import OneClassSVM
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, plot_roc_curve
from sklearn.model_selection import train_test_split


In [None]:
# Read the credit-card transactions table from the Colab sample-data folder.
data = pd.read_csv('/content/sample_data/creditcard.csv')

# Report the table dimensions and the column labels.
print("Dataset shape:- \n{}".format(data.shape))
print("Data features :- \n{}".format(data.columns))


Imbalance in data

In [None]:


# Split the rows by label: Class == 1 is kept as `fraud`, Class == 0 as `valid`.
fraud = data.loc[data['Class'] == 1]
valid = data.loc[data['Class'] == 0]

# Print the size of each group to expose the class imbalance.
for template, subset in (('Fraud Transactions: {}', fraud),
                         ('Non-fraud Transactions: {}', valid)):
    print(template.format(len(subset)))


Removing irrelevant columns

In [None]:

# Discard the 'Time' column; every other column is kept.
data = data.drop(columns=['Time'])
print("List of feature names after removing Time column:- \n{}".format(data.columns))


Checking for null / NaN values

In [None]:

print("Dataset info:-")
# DataFrame.info() writes its report (dtypes and non-null counts per column)
# to stdout itself and returns None, so it must not be wrapped in print():
# doing so emits a stray "None" line after the report.
data.info()


Data transformation using Standard Scaler

In [None]:

# Show the first four raw amounts for comparison.
print("Few values of Amount column before Scaling :- \n{}".format(data['Amount'][0:4]))

# StandardScaler expects a 2-D input, hence the reshape to a single column.
scaled_amount = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data['Norm_Amount'] = scaled_amount

# The raw column is replaced by its standardised counterpart.
data = data.drop(columns=['Amount'])
print("Few values of Amount column after applying StandardScaler:- \n{}".format(data['Norm_Amount'][0:4]))


Function to calculate the performance metrics

In [None]:

def performanceMetrics(y_test,y_predict):
  """Plot the confusion matrix and print summary scores for a set of predictions.

  Parameters:
    y_test: true class labels.
    y_predict: predicted class labels, aligned with ``y_test``.

  Displays a heatmap of the confusion matrix, then prints accuracy,
  precision, recall and F1 on one line, followed by the ROC-AUC value
  computed from the predicted labels. Returns nothing.
  """
  # --- confusion matrix heatmap ---
  class_names = ['Normal', 'Fraud']
  matrix = confusion_matrix(y_test, y_predict)
  ax = sns.heatmap(matrix, annot = True, fmt = 'd',
                   xticklabels = class_names, yticklabels = class_names)
  ax.set_title('Confusion matrix')
  ax.set_xlabel('Predicted class')
  ax.set_ylabel('True class')
  plt.show()

  # --- headline classification scores ---
  print("Classification Report :- \n")
  score_table = (
    ('accuracy: %0.4f', accuracy_score),
    ('\tprecision: %0.4f', precision_score),
    ('\trecall: %0.4f', recall_score),
    ('\tF1-score: %0.4f', f1_score),
  )
  print(*[template % scorer(y_test, y_predict) for template, scorer in score_table])

  # --- area under the ROC curve (from hard labels, not probabilities) ---
  auc_value = roc_auc_score(y_test, y_predict)
  print(f"AROC score :- \n {auc_value}")


Train and Test Models

In [None]:

def trainAndTestModels():
  """Train and evaluate RandomForest, KNN, MLP and a hard-voting ensemble.

  The function takes no arguments: it reads the notebook-level variables
  ``X_train``, ``y_train``, ``X_test`` and ``y_test``, so the calling cell
  must (re)assign all four before invoking it. Each model is fitted on the
  training split, used to predict the test split, and scored through
  ``performanceMetrics``. Returns nothing.
  """
  # The calling cells pass y_train as a one-column DataFrame; estimators
  # expect a 1-D target and otherwise warn about a column vector on every
  # fit, so flatten it once here.
  y_fit = np.ravel(y_train)

  print('\n======== RandomForest ==========')
  rf_classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
  rf_classifier.fit(X_train, y_fit)
  y_pred1 = rf_classifier.predict(X_test)
  performanceMetrics(y_test,y_pred1)

  print('\n======== KNN ==========')
  # Minkowski distance with p = 2 is the Euclidean distance.
  knn_classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
  knn_classifier.fit(X_train, y_fit)
  y_pred2 = knn_classifier.predict(X_test)
  performanceMetrics(y_test,y_pred2)

  print('\n======== ANN ==========')
  # random_state pins the weight initialisation so repeated runs give the
  # same scores, consistent with the seeded random forest above.
  mlp_classifier = MLPClassifier(hidden_layer_sizes=(29,29,29), activation='relu', solver='adam',
                                 max_iter=500, random_state=0)
  mlp_classifier.fit(X_train, y_fit)
  y_pred3 = mlp_classifier.predict(X_test)
  performanceMetrics(y_test,y_pred3)

  print('\n======== Majority Voting ==========')
  estimator = [
    ('RF', rf_classifier),
    ('KNN', knn_classifier),
    ('ANN', mlp_classifier),
  ]

  # Hard voting: the ensemble predicts the class chosen by most members.
  vot_hard = VotingClassifier(estimators = estimator, voting ='hard')
  vot_hard.fit(X_train, y_fit)
  y_pred = vot_hard.predict(X_test)
  performanceMetrics(y_test,y_pred)

Anomaly Detection using OneClassSVM

In [None]:

X_train, X_test = train_test_split(data, test_size=0.2, random_state=66)

# One-class training: keep only the non-fraud rows (Class == 0) of the
# training split; the test split keeps both classes.
X_train = X_train[X_train.Class == 0]
y_train = X_train.Class  # all zeros by construction; not used by the fit below
X_train = X_train.drop(['Class'], axis=1).values
y_test = X_test.Class
X_test = X_test.drop(['Class'], axis=1).values

# Training and testing the model.
# NOTE: the model is fitted on every non-fraud training row, which can take a
# long time; subsample X_train here if a faster run is needed.
model = OneClassSVM(gamma='auto', nu=0.05)
model.fit(X_train)

# OneClassSVM.predict labels outliers as -1; relabel them as fraud (1) and
# everything else as normal (0) so they line up with the 'Class' column.
y_pred = np.where(model.predict(X_test) == -1, 1, 0)

performanceMetrics(y_test, y_pred)

Without sampling

In [None]:

# Features are every column except the label; the label is kept as a
# one-column DataFrame.
X = data.drop(columns='Class')
y = data[['Class']]

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

trainAndTestModels()

Undersampling 

In [None]:


# Seeded generator so the undersampled subset is the same on every run.
rng = np.random.RandomState(0)

fraud_indices = np.array(data[data.Class == 1].index)
fraudTransactions = len(fraud_indices)
nonFraud_indices = data[data.Class == 0].index

# Randomly pick as many non-fraud rows as there are fraud rows (no repeats).
random_normal_indices = rng.choice(nonFraud_indices, fraudTransactions, replace=False)
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# These are index *labels*, so select with .loc; positional .iloc would only
# be correct while the index happens to be a default 0..n-1 RangeIndex.
under_sample_data = data.loc[under_sample_indices, :]
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# NOTE(review): the test split is drawn from the balanced subset, so its
# class ratio differs from the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X_undersample, y_undersample, test_size=0.2, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
trainAndTestModels()

Oversampling using SMOTE

In [None]:


# Split BEFORE oversampling. SMOTE synthesises minority points by
# interpolating between existing ones, so resampling the full dataset first
# would let information from test rows leak into training and would put
# synthetic rows in the test set. The split arguments match the
# "Without sampling" cell so both runs are scored on the same test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Oversample the training part only; the test part keeps its real class ratio.
X_resample, y_resample = SMOTE(random_state=0).fit_resample(X_train, y_train.values.ravel())
print('Number of training transactions before SMOTE sampling: ', len(y_train),
      '...after SMOTE upsampling: ', len(y_resample))
print('Number of fraudulent training transactions before SMOTE sampling: ',
      int((y_train.Class == 1).sum()),
      '...after SMOTE upsampling: ', int((y_resample == 1).sum()))

y_resample = pd.DataFrame(y_resample, columns=['Class'])
X_resample = pd.DataFrame(X_resample)
X_train, y_train = X_resample, y_resample
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
trainAndTestModels()