In [59]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [60]:
# Load the credit-card transactions CSV from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')
# Show (rows, columns) as the cell output.
df.shape

In [61]:
# Preview the first rows of the raw frame.
df.head()

In [62]:
# The `Class` column is the label: 1 marks a fraudulent transaction, 0 a genuine one.
# v1 to v28 are PCA-derived variables, transformed to protect customer data.

# Summary statistics for every numeric column.
df.describe()

In [63]:
# Print the NaN count for each column that has at least one missing value.
na_counts = df.isna().sum()
for col_name, n_missing in na_counts[na_counts > 0].items():
    print(col_name, ': ', n_missing)

In [64]:
# Hold out 10% of the rows: sample 90% for training (seeded), the rest is the test set.
train = df.sample(frac=0.9, random_state=25)
test = df.drop(index=train.index)

for part in (train, test):
    print(part.shape)

In [65]:
from sklearn.feature_selection import f_classif

fs = f_classif


def f_score(f):
    """Run the ANOVA F-test of a single feature against the label.

    Parameters
    ----------
    f : str
        Name of a column in the module-level ``train`` frame.

    Returns
    -------
    The result of ``f_classif`` for that one column against ``train['Class']``;
    callers index it as ``[0][0]`` (F statistic) and ``[1][0]`` (p-value).
    """
    return fs(train[[f]], train['Class'])

In [66]:
# Feature frame (everything except the label); only its column names are used here.
X = df.drop(columns=['Class'])

# Collect the F statistic and p-value of every feature against the label.
f_static = []
p_values = []
for column in X.columns:
    # Run the F-test once per feature and unpack both results; calling
    # f_score twice per column doubled the work for identical numbers.
    f_values, p_vals = f_score(column)
    f_static.append(f_values[0])
    p_values.append(p_vals[0])

In [67]:
# Start the feature-ranking table with one row per feature and its F statistic.
score = pd.DataFrame(f_static,columns=['f_static'])

In [68]:
# Attach the p-values and the feature names alongside the F statistics.
score = score.assign(p_values=p_values, column=X.columns)

score.head()

In [69]:
# Print dtypes and non-null counts of the ranking table.
score.info()

In [70]:
# Rank features from highest to lowest F statistic.
score = score.sort_values('f_static', ascending=False)
score.head()

In [71]:
from matplotlib import pyplot as plt

# Bar chart of the F statistic per feature, in ranked order.
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(score['column'], score['f_static'])
ax.set_xlabel('columns')
ax.set_ylabel('f_static')
plt.show()

In [72]:
# Bar chart of the p-value per feature, in the same order as the ranking table.
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(score['column'], score['p_values'])
ax.set_xlabel('columns')
ax.set_ylabel('p_values')
plt.show()

In [73]:
# Renumber rows 0..n-1 so that integer labels follow the sorted order;
# the feature selection below looks rows up by these labels.
score = score.reset_index(drop=True)

In [74]:
# Keep the label plus the ten highest-ranked features.
top_features = [score['column'][rank] for rank in range(10)]
x_list = ['Class'] + top_features

# Restrict both the training and the held-out frame to those columns.
new_train = train[x_list]
new_test = test[x_list]
new_train.head()

In [75]:
# Drop the original row labels left over from sampling.
new_train = new_train.reset_index(drop=True)
# NOTE(review): this resets the full-feature `test` frame, not `new_test`,
# which is the frame used for evaluation later on — confirm which was intended.
test = test.reset_index(drop=True)

In [76]:
# Count rows per label value to show how imbalanced the classes are.
new_train['Class'].value_counts()

In [77]:
from sklearn.model_selection import train_test_split

# Split 80/20 into training and validation data. Stratify on the label so
# both parts keep the same class ratio as `new_train`; an unstratified split
# can leave the rare class unevenly represented between the two parts.
training_data, validation_data = train_test_split(
    new_train,
    test_size=0.2,
    random_state=25,
    stratify=new_train['Class'],
)

print(training_data.shape)
print(validation_data.shape)

Trying different methods to counter the imbalanced dataset:

1. Standard Bagging Classifier

In [78]:
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_roc_curve

# Feature matrix and label vector used by every model below.
X = training_data.drop(columns=['Class'])
# Use a 1-D Series rather than a one-column DataFrame: scikit-learn estimators
# expect a 1-D target and warn about a column-vector y otherwise.
y = training_data['Class']


def bc(X,y):
    """Evaluate a default BaggingClassifier.

    Parameters
    ----------
    X : feature matrix used for cross-validation and for the final fit.
    y : labels aligned with ``X``.

    Returns
    -------
    (m, auprc)
        ``m`` is the mean ROC AUC over 10-fold stratified cross-validation
        repeated 3 times; ``auprc`` is the average precision on the
        module-level ``new_test`` frame.
    """
    model = BaggingClassifier()

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring = 'roc_auc', cv=cv, n_jobs=-1)
    m = mean(scores)

    # cross_val_score only fits clones, so `model` itself must be fitted
    # before it can predict.
    model.fit(X, y)

    # Evaluate on `new_test`, which has the same selected feature columns as
    # the training data (the full `test` frame has a different column set).
    X_test = new_test.drop(columns=['Class'])
    # Average precision needs continuous scores, not thresholded labels:
    # use the predicted probability of the positive class (second column).
    fraud_proba = model.predict_proba(X_test)[:, 1]
    auprc = average_precision_score(new_test['Class'], fraud_proba)
    return m,auprc

def bbc(X,y):
    """Return the mean cross-validated ROC AUC of a default BalancedBaggingClassifier.

    Scoring uses 10-fold stratified cross-validation repeated 3 times
    (seeded with ``random_state=1``) on the given ``X`` and ``y``.
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    fold_aucs = cross_val_score(
        BalancedBaggingClassifier(), X, y, scoring='roc_auc', cv=folds, n_jobs=-1
    )
    return mean(fold_aucs)

def rf(X,y):
    """Return the mean cross-validated ROC AUC of a 10-tree RandomForestClassifier.

    Scoring uses 10-fold stratified cross-validation repeated 3 times
    (seeded with ``random_state=1``) on the given ``X`` and ``y``.
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    forest = RandomForestClassifier(n_estimators=10)
    fold_aucs = cross_val_score(forest, X, y, scoring='roc_auc', cv=folds, n_jobs=-1)
    return mean(fold_aucs)
    
def rfcw(X,y):
    """Return the mean cross-validated ROC AUC of a class-weighted random forest.

    The forest has 10 trees and ``class_weight='balanced'``. Scoring uses
    10-fold stratified cross-validation repeated 3 times (seeded with
    ``random_state=1``) on the given ``X`` and ``y``.
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    forest = RandomForestClassifier(n_estimators=10, class_weight='balanced')
    fold_aucs = cross_val_score(forest, X, y, scoring='roc_auc', cv=folds, n_jobs=-1)
    return mean(fold_aucs)

def rfcwss(X,y):
    """Return the mean cross-validated ROC AUC of a subsample-weighted random forest.

    The forest has 10 trees and ``class_weight='balanced_subsample'``. Scoring
    uses 10-fold stratified cross-validation repeated 3 times (seeded with
    ``random_state=1``) on the given ``X`` and ``y``.
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    forest = RandomForestClassifier(
        n_estimators=10, class_weight='balanced_subsample'
    )
    fold_aucs = cross_val_score(forest, X, y, scoring='roc_auc', cv=folds, n_jobs=-1)
    return mean(fold_aucs)

def brf(X,y):
    """Return the mean cross-validated ROC AUC of a 10-tree BalancedRandomForestClassifier.

    Scoring uses 10-fold stratified cross-validation repeated 3 times
    (seeded with ``random_state=1``) on the given ``X`` and ``y``.
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    forest = BalancedRandomForestClassifier(n_estimators=10)
    fold_aucs = cross_val_score(forest, X, y, scoring='roc_auc', cv=folds, n_jobs=-1)
    return mean(fold_aucs)

def eec(X,y):
    """Cross-validate, fit and test a default EasyEnsembleClassifier.

    Parameters
    ----------
    X : feature matrix used for cross-validation and for the final fit.
    y : labels aligned with ``X``.

    Returns
    -------
    (m, auprc, accuracy, model)
        ``m`` is the mean ROC AUC over 10-fold stratified cross-validation
        repeated 3 times; ``auprc`` and ``accuracy`` are measured on the
        module-level ``new_test`` frame; ``model`` is the classifier fitted
        on all of ``X``/``y``.
    """
    model = EasyEnsembleClassifier()

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    m = mean(scores)

    # cross_val_score only fits clones; fit the returned model on all the data.
    model.fit(X,y)

    X_test = new_test.drop(columns=['Class'])
    y_test = new_test['Class']

    # Average precision is a ranking metric: score it with the predicted
    # probability of the positive class (second column), not with hard labels,
    # which would collapse the precision-recall curve to a single point.
    fraud_proba = model.predict_proba(X_test)[:, 1]
    auprc = average_precision_score(y_test, fraud_proba)

    # Accuracy is computed from the thresholded predictions.
    pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    return m,auprc,accuracy, model



In [79]:
#bbc = bbc(X,y)
#print('Balanced Bagging Classifier:', ' ', bbc)

In [80]:
#bc = bc(X,y)
#print('Bagging Classifier:', ' ', bc)

In [81]:
#srf = rf(X,y)
#print('Standard Random Forest Classifier:', ' ', srf)

In [82]:
#rfcw = rfcw(X,y)
#print('Random Forest Classifier - class weight: balanced:', ' ', rfcw)

In [83]:
#rfcwss = rfcwss(X,y)
#print('Random Forest Class Weight Sub Sample:', ' ', rfcwss)

In [84]:
#brf = brf(X,y)
#print('Balanced Random Forest:', ' ', brf)

# Store the score under its own name: assigning the result back to `eec`
# would replace the function with a float and make this cell fail on re-run.
eec_auc, auprc_eec, accuracy_eec, model = eec(X,y)
print('Easy Esnsemble Classifier roc_auc:', ' ', eec_auc)
print('auprc:', ' ', auprc_eec)
print('accuracy:', ' ',accuracy_eec)
# ROC curve of the fitted ensemble on the held-out test frame.
plot_roc_curve(model, new_test.drop(columns=['Class']), new_test[['Class']])

In [85]:

model = BalancedRandomForestClassifier(n_estimators=10)

from sklearn.model_selection import GridSearchCV

# Candidate forest sizes and number of features considered per split.
param_grid = [{'n_estimators':[3,10,30], 'max_features':[2,4,6,8,10]}]

#model_eec = EasyEnsembleClassifier()
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', return_train_score=True)

grid_search.fit(X,y)

# Evaluate the tuned model rather than the untuned base estimator: with the
# default refit=True, best_estimator_ has already been refitted on all of X/y
# using the best parameter combination.
model = grid_search.best_estimator_

X_test = new_test.drop(columns=['Class'])
pred = model.predict(X_test)
# Average precision is scored from positive-class probabilities (second
# column), not from thresholded labels.
auprc = average_precision_score(new_test['Class'], model.predict_proba(X_test)[:, 1])
accuracy = accuracy_score(new_test['Class'], pred)


In [86]:
# Parameter combination selected by the grid search.
grid_search.best_params_

In [87]:
# Estimator selected by the grid search.
grid_search.best_estimator_

In [88]:
# List the mean cross-validated score of every parameter combination tried.
cvres = grid_search.cv_results_
scored_candidates = zip(cvres['mean_test_score'], cvres['params'])
for auc, candidate in scored_candidates:
    print(auc, candidate)

In [89]:
# ROC curve of `model` on the held-out test frame.
plot_roc_curve(model, new_test.drop(columns=['Class']), new_test[['Class']]) 

In [90]:
# Report the accuracy computed on `new_test` in the grid-search cell.
print('Accuracy on test set:', ' ', accuracy)