# IMPORTING LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import imblearn
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA


# READING CSV FILE

In [2]:
df = pd.read_csv("./creditcard.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# UNDERSAMPLING THE DATA

In [3]:
no_frauds = len(df[df['Class'] == 1])
non_fraud_indices = df[df.Class == 0].index
non_fraud_indices = df[df.Class == 0].index
random_indices = np.random.choice(non_fraud_indices, no_frauds, replace=False)
fraud_indices = df[df.Class == 1].index
under_sample_indices = np.concatenate([fraud_indices,random_indices])
under_sample = df.loc[under_sample_indices]

# SPLITTING THE DATASET INTO TRAIN AND TEST

In [4]:
X_under = under_sample[['V1','V2','V3','V4','V5','V6','V7','V8','V9','V10','V11','V12','V13','V14','V15','V16','V17','V18','V19',
                        'V20','V21','V22','V23','V24','V25','V26','V27','V28','Amount']]
y_under = under_sample['Class']
X_under_train, X_under_test, y_under_train, y_under_test = train_test_split(X_under,y_under,test_size = 0.3, random_state = 0)

# RFE ON NAIVE BAYES

In [5]:
model = BernoulliNB()
rfe = RFE(model)
fit = rfe.fit(X_under_train,y_under_train)
print("Number of Features: %d"% fit.n_features_)
print("Selected features: %s"% fit.support_)
print("Ranking of features: %s"% fit.ranking_)  

Number of Features: 14
Selected features: [ True False  True False  True  True  True False  True  True False  True
 False  True False  True  True  True False False False False  True  True
 False False False False False]
Ranking of features: [ 1 13  1 16  1  1  1  9  1  1 14  1  4  1  2  1  1  1 10  7 12  5  1  1
  6  3 11  8 15]


# MODEL FITTING ON TRAIN DATA

In [6]:
gnb = GaussianNB()
X = X_under_train[['V1','V3','V5', 'V6', 'V7','V9','V10','V12','V14','V15','V16','V17','V18','V23','V24']]
gnb.fit(X,y_under_train)
y_pred = gnb.predict(X)
print(confusion_matrix(y_under_train,y_pred))  
print(classification_report(y_under_train,y_pred))
print(accuracy_score(y_under_train, y_pred))

[[335   8]
 [ 49 296]]
             precision    recall  f1-score   support

          0       0.87      0.98      0.92       343
          1       0.97      0.86      0.91       345

avg / total       0.92      0.92      0.92       688

0.9171511627906976


# MODEL FITTING ON TEST DATA

In [7]:
X2 = X_under_test[['V1','V3','V5', 'V6', 'V7','V9','V10','V12','V14','V15','V16','V17','V18','V23','V24']]
y_pred = gnb.predict(X2)
print(confusion_matrix(y_under_test,y_pred))  
print(classification_report(y_under_test,y_pred))
print(accuracy_score(y_under_test, y_pred))

[[149   0]
 [ 22 125]]
             precision    recall  f1-score   support

          0       0.87      1.00      0.93       149
          1       1.00      0.85      0.92       147

avg / total       0.94      0.93      0.93       296

0.9256756756756757


# GRIDSEARCH ON TRAIN DATA

In [8]:
# Logistic regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

logregpipe = Pipeline([('scale', StandardScaler()),
                   ('logreg',LogisticRegression(multi_class="multinomial",solver="lbfgs"))])

# Gridsearch to determine the value of C
param_grid = {'logreg__C':np.arange(0.01,100,10)}
logreg_cv = GridSearchCV(logregpipe,param_grid,cv=5,return_train_score=True)
logreg_cv.fit(X,y_under_train)
print(logreg_cv.best_params_)

bestlogreg = logreg_cv.best_estimator_
bestlogreg.fit(X,y_under_train)
bestlogreg.coef_ = bestlogreg.named_steps['logreg'].coef_
bestlogreg.score(X,y_under_train)

{'logreg__C': 10.01}


0.936046511627907

# GRIDSEARCH ON TEST DATA

In [9]:
bestlogreg = logreg_cv.best_estimator_
bestlogreg.fit(X2,y_under_test)
bestlogreg.coef_ = bestlogreg.named_steps['logreg'].coef_
bestlogreg.score(X2,y_under_test)

0.9324324324324325

# CROSS VALIDATION ON TRAIN DATA

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import cross_validation

predicted = cross_validation.cross_val_predict(LogisticRegression(),X,y_under_train, cv=10)
print (metrics.accuracy_score(y_under_train, predicted))

0.9287790697674418




# CROSS VALIDATION ON TEST DATA

In [11]:
predicted1 = cross_validation.cross_val_predict(LogisticRegression(),X2, y_under_test, cv=10)

print (metrics.classification_report(y_under_test, predicted1))
print (metrics.accuracy_score(y_under_test, predicted1))

             precision    recall  f1-score   support

          0       0.89      0.95      0.92       149
          1       0.95      0.88      0.91       147

avg / total       0.92      0.92      0.92       296

0.9155405405405406


# K BEST ON NAIVE BAYES

In [12]:
array = under_sample.values
test = SelectKBest(score_func=f_classif,k=10)
fit = test.fit(X_under_train, y_under_train)
print("scores_:",test.scores_)
print("pvalues_:",test.pvalues_)
print("selected index:",test.get_support(True))
print("after transform:",test.transform(X_under_train)) 

scores_: [1.65592843e+02 2.20551955e+02 3.43212801e+02 6.72683958e+02
 1.19196109e+02 1.42737958e+02 2.25752047e+02 6.13275884e+00
 2.79631571e+02 4.88209223e+02 5.96432541e+02 6.58021075e+02
 2.14225592e-01 8.94099051e+02 4.44519587e+00 3.55217940e+02
 3.20498725e+02 1.87876037e+02 6.41033839e+01 1.75695938e+01
 2.05682869e+01 2.89655486e-01 1.64784613e+00 7.69941297e+00
 9.43310591e-01 1.31281455e+00 1.00674815e+01 5.98782855e+00
 7.68106861e+00]
pvalues_: [4.22895241e-034 1.82849968e-043 1.95313906e-062 6.84189115e-104
 1.07097431e-025 5.06567685e-030 2.54854459e-044 1.35100597e-002
 6.60753346e-053 4.07170712e-082 2.83440166e-095 2.84402820e-102
 6.43622181e-001 2.07605403e-126 3.53615968e-002 3.61676986e-064
 4.21702811e-059 5.71496554e-038 5.05654966e-015 3.13237129e-005
 6.79095430e-006 5.90615958e-001 1.99685763e-001 5.67423525e-003
 3.31770215e-001 2.52285232e-001 1.57644943e-003 1.46547547e-002
 5.73154676e-003]
selected index: [ 2  3  6  8  9 10 11 13 15 16]
after transform:

# MODEL FITTING ON TRAIN DATA

In [13]:
gnb = GaussianNB()
X = X_under_train[['V2','V3','V4','V9','V10','V11','V12','V14','V16','V17']]
gnb.fit(X,y_under_train)
y_pred = gnb.predict(X)
print(confusion_matrix(y_under_train,y_pred))  
print(classification_report(y_under_train,y_pred))
print(accuracy_score(y_under_train, y_pred))

[[337   6]
 [ 47 298]]
             precision    recall  f1-score   support

          0       0.88      0.98      0.93       343
          1       0.98      0.86      0.92       345

avg / total       0.93      0.92      0.92       688

0.9229651162790697


# MODEL FITTING ON TEST DATA

In [14]:
X2 = X_under_test[['V2','V3','V4','V9','V10','V11','V12','V14','V16','V17']]
y_pred = gnb.predict(X2)
print(confusion_matrix(y_under_test,y_pred))  
print(classification_report(y_under_test,y_pred))
print(accuracy_score(y_under_test, y_pred))

[[148   1]
 [ 23 124]]
             precision    recall  f1-score   support

          0       0.87      0.99      0.92       149
          1       0.99      0.84      0.91       147

avg / total       0.93      0.92      0.92       296

0.918918918918919


# GRIDSEARCH ON TRAIN DATA

In [15]:
# Logistic regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

logregpipe = Pipeline([('scale', StandardScaler()),
                   ('logreg',LogisticRegression(multi_class="multinomial",solver="lbfgs"))])

# Gridsearch to determine the value of C
param_grid = {'logreg__C':np.arange(0.01,100,10)}
logreg_cv = GridSearchCV(logregpipe,param_grid,cv=5,return_train_score=True)
logreg_cv.fit(X,y_under_train)
print(logreg_cv.best_params_)

bestlogreg = logreg_cv.best_estimator_
bestlogreg.fit(X,y_under_train)
bestlogreg.coef_ = bestlogreg.named_steps['logreg'].coef_
bestlogreg.score(X,y_under_train)

{'logreg__C': 10.01}


0.9433139534883721

# GRIDSEARCH ON TEST DATA

In [16]:
bestlogreg = logreg_cv.best_estimator_
bestlogreg.fit(X2,y_under_test)
bestlogreg.coef_ = bestlogreg.named_steps['logreg'].coef_
bestlogreg.score(X2,y_under_test)

0.956081081081081

# CROSS VALIDATION ON TRAIN DATA

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import cross_validation

predicted = cross_validation.cross_val_predict(LogisticRegression(),X,y_under_train, cv=10)
print (metrics.accuracy_score(y_under_train, predicted))

0.9375


# CROSS VALIDATION ON TEST DATA

In [18]:
predicted1 = cross_validation.cross_val_predict(LogisticRegression(),X2, y_under_test, cv=10)

print (metrics.classification_report(y_under_test, predicted1))
print (metrics.accuracy_score(y_under_test, predicted1))

             precision    recall  f1-score   support

          0       0.91      0.98      0.94       149
          1       0.98      0.90      0.94       147

avg / total       0.94      0.94      0.94       296

0.9391891891891891


# PCA ON NAIVE BAYES

In [19]:
pca = PCA(n_components=5)
X_under_train = pca.fit_transform(X_under_train)
X_under_test = pca.transform(X_under_test)
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[9.93521210e-01 4.44591830e-03 5.59561879e-04 5.26170173e-04
 2.42045617e-04]


# MODEL FITTING ON TRAIN & TEST DATA

In [20]:
gnb = GaussianNB()
gnb.fit(X_under_train,y_under_train)
y_pred = gnb.predict(X_under_test)
print(confusion_matrix(y_under_test,y_pred))  
print(classification_report(y_under_test,y_pred))
print(accuracy_score(y_under_test, y_pred))

[[144   5]
 [ 20 127]]
             precision    recall  f1-score   support

          0       0.88      0.97      0.92       149
          1       0.96      0.86      0.91       147

avg / total       0.92      0.92      0.92       296

0.9155405405405406


# GRIDSEARCH ON TEST & TRAIN DATA

In [21]:
# Logistic regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

logregpipe = Pipeline([('scale', StandardScaler()),
                   ('logreg',LogisticRegression(multi_class="multinomial",solver="lbfgs"))])

# Gridsearch to determine the value of C
param_grid = {'logreg__C':np.arange(0.01,100,10)}
logreg_cv = GridSearchCV(logregpipe,param_grid,cv=5,return_train_score=True)
logreg_cv.fit(X_under_train,y_under_train)
print(logreg_cv.best_params_)

bestlogreg = logreg_cv.best_estimator_
bestlogreg.fit(X_under_train,y_under_train)
bestlogreg.coef_ = bestlogreg.named_steps['logreg'].coef_
bestlogreg.score(X_under_train,y_under_train)

{'logreg__C': 10.01}


0.9375

# CROSS VALIDATION ON TEST & TRAIN DATA

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import cross_validation

predicted = cross_validation.cross_val_predict(LogisticRegression(),X_under_train,y_under_train, cv=10)
print (metrics.accuracy_score(y_under_train, predicted))

0.934593023255814


In [23]:
predicted1 = cross_validation.cross_val_predict(LogisticRegression(),X_under_test, y_under_test, cv=10)

print (metrics.classification_report(y_under_test, predicted1))
print (metrics.accuracy_score(y_under_test, predicted1))

             precision    recall  f1-score   support

          0       0.89      0.98      0.93       149
          1       0.98      0.88      0.92       147

avg / total       0.93      0.93      0.93       296

0.9290540540540541
