# IMPORTING LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import imblearn
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA


# READING CSV FILE

In [2]:
df = pd.read_csv("./creditcard.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# UNDERSAMPLING THE DATA

In [3]:
no_frauds = len(df[df['Class'] == 1])
non_fraud_indices = df[df.Class == 0].index
non_fraud_indices = df[df.Class == 0].index
random_indices = np.random.choice(non_fraud_indices, no_frauds, replace=False)
fraud_indices = df[df.Class == 1].index
under_sample_indices = np.concatenate([fraud_indices,random_indices])
under_sample = df.loc[under_sample_indices]

# SPLITTING THE DATASET INTO TRAIN AND TEST

In [4]:
X_under = under_sample.loc[:,under_sample.columns != 'Class']
y_under = under_sample.loc[:,under_sample.columns == 'Class']
X_under_train, X_under_test, y_under_train, y_under_test = train_test_split(X_under,y_under,test_size = 0.3, random_state = 0)


# RFE ON NAIVE BAYES

In [5]:
model = BernoulliNB()
rfe = RFE(model)
fit = rfe.fit(X_under_train,y_under_train)
print("Number of Features: %d"% fit.n_features_)
print("Selected features: %s"% fit.support_)
print("Ranking of features: %s"% fit.ranking_)  

Number of Features: 15
Selected features: [False  True False  True False  True  True  True False  True  True False
  True False  True  True  True  True  True False False False False  True
  True False False False False False]
Ranking of features: [16  1 12  1 15  1  1  1  8  1  1 13  1  3  1  1  1  1  1  9  6 11  4  1
  1  5  2 10  7 14]


  y = column_or_1d(y, warn=True)


# MODEL FITTING ON TRAIN DATA

In [6]:
gnb = GaussianNB()
X = X_under_train[['V1','V3','V5', 'V6', 'V7','V9','V10','V12','V14','V15','V16','V17','V18','V23','V24']]
gnb.fit(X,y_under_train)
y_pred = gnb.predict(X)
print(confusion_matrix(y_under_train,y_pred))  
print(classification_report(y_under_train,y_pred))
print(accuracy_score(y_under_train, y_pred))

[[339   4]
 [ 46 299]]
             precision    recall  f1-score   support

          0       0.88      0.99      0.93       343
          1       0.99      0.87      0.92       345

avg / total       0.93      0.93      0.93       688

0.9273255813953488


  y = column_or_1d(y, warn=True)


# MODEL FITTING ON TEST DATA

In [7]:
X2 = X_under_test[['V1','V3','V5', 'V6', 'V7','V9','V10','V12','V14','V15','V16','V17','V18','V23','V24']]
y_pred = gnb.predict(X2)
print(confusion_matrix(y_under_test,y_pred))  
print(classification_report(y_under_test,y_pred))
print(accuracy_score(y_under_test, y_pred))

[[139  10]
 [ 21 126]]
             precision    recall  f1-score   support

          0       0.87      0.93      0.90       149
          1       0.93      0.86      0.89       147

avg / total       0.90      0.90      0.90       296

0.8952702702702703


# GRIDSEARCH ON TRAIN DATA

In [8]:
# Logistic regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

logregpipe = Pipeline([('scale', StandardScaler()),
                   ('logreg',LogisticRegression(multi_class="multinomial",solver="lbfgs"))])

# Gridsearch to determine the value of C
param_grid = {'logreg__C':np.arange(0.01,100,10)}
logreg_cv = GridSearchCV(logregpipe,param_grid,cv=5,return_train_score=True)
logreg_cv.fit(X,y_under_train)
print(logreg_cv.best_params_)

bestlogreg = logreg_cv.best_estimator_
bestlogreg.fit(X,y_under_train)
bestlogreg.coef_ = bestlogreg.named_steps['logreg'].coef_
bestlogreg.score(X,y_under_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


{'logreg__C': 10.01}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.9331395348837209

# GRIDSEARCH ON TEST DATA

In [9]:
bestlogreg = logreg_cv.best_estimator_
bestlogreg.fit(X2,y_under_test)
bestlogreg.coef_ = bestlogreg.named_steps['logreg'].coef_
bestlogreg.score(X2,y_under_test)

  y = column_or_1d(y, warn=True)


0.9155405405405406

# CROSS VALIDATION ON TRAIN DATA

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import cross_validation

predicted = cross_validation.cross_val_predict(LogisticRegression(),X,y_under_train, cv=10)
print (metrics.accuracy_score(y_under_train, predicted))



IndexError: too many indices for array

# CROSS VALIDATION ON TEST DATA

In [11]:
predicted1 = cross_validation.cross_val_predict(LogisticRegression(),X2, y_under_test, cv=10)

print (metrics.classification_report(y_under_test, predicted1))
print (metrics.accuracy_score(y_under_test, predicted1))

IndexError: too many indices for array

# K BEST ON NAIVE BAYES

In [48]:
array = under_sample.values
test = SelectKBest(score_func=f_classif,k=10)
fit = test.fit(X_under_train, y_under_train)
print("scores_:",test.scores_)
print("pvalues_:",test.pvalues_)
print("selected index:",test.get_support(True))
print("after transform:",test.transform(X_under_train)) 

scores_: [2.37229381e+01 1.79182582e+02 2.43698789e+02 3.40187567e+02
 7.51002797e+02 1.11607521e+02 1.63011205e+02 2.33969542e+02
 3.28688887e+00 2.99150074e+02 4.72964524e+02 5.74960523e+02
 6.47205878e+02 1.65921392e+00 9.28325585e+02 2.28230220e-02
 3.71149045e+02 3.17713518e+02 1.99185191e+02 6.14325691e+01
 1.16058167e+01 2.09957005e+01 1.36068457e+00 2.86732624e+00
 3.92521641e+00 2.46980507e+00 2.21180060e+00 9.30910309e+00
 4.27507054e+00 3.82773094e-01]
pvalues_: [1.38238441e-006 1.79617701e-036 3.09299895e-047 5.37659097e-062
 2.98650548e-112 2.83356201e-024 1.20564260e-033 1.15870772e-045
 7.02721697e-002 6.74281337e-056 3.63448217e-080 9.37480096e-093
 4.56358196e-101 1.98143802e-001 1.32286060e-129 8.79962620e-001
 1.95075268e-066 1.09414223e-058 6.78926138e-040 1.75142518e-014
 6.95826192e-004 5.46845659e-006 2.43824228e-001 9.08489200e-002
 4.79659407e-002 1.16513404e-001 1.37418206e-001 2.36813854e-003
 3.90497659e-002 5.36328223e-001]
selected index: [ 2  3  4  9 10 1

  y = column_or_1d(y, warn=True)


# MODEL FITTING ON TRAIN DATA

In [49]:
gnb = GaussianNB()
X = X_under_train[['V2','V3','V4','V9','V10','V11','V12','V14','V16','V17']]
gnb.fit(X,y_under_train)
y_pred = gnb.predict(X)
print(confusion_matrix(y_under_train,y_pred))  
print(classification_report(y_under_train,y_pred))
print(accuracy_score(y_under_train, y_pred))

[[337   6]
 [ 45 300]]
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       343
           1       0.98      0.87      0.92       345

   micro avg       0.93      0.93      0.93       688
   macro avg       0.93      0.93      0.93       688
weighted avg       0.93      0.93      0.93       688

0.9258720930232558


  y = column_or_1d(y, warn=True)


# MODEL FITTING ON TEST DATA

In [50]:
X2 = X_under_test[['V2','V3','V4','V9','V10','V11','V12','V14','V16','V17']]
y_pred = gnb.predict(X2)
print(confusion_matrix(y_under_test,y_pred))  
print(classification_report(y_under_test,y_pred))
print(accuracy_score(y_under_test, y_pred))

[[147   2]
 [ 22 125]]
              precision    recall  f1-score   support

           0       0.87      0.99      0.92       149
           1       0.98      0.85      0.91       147

   micro avg       0.92      0.92      0.92       296
   macro avg       0.93      0.92      0.92       296
weighted avg       0.93      0.92      0.92       296

0.918918918918919


# GRIDSEARCH ON TRAIN DATA

In [12]:
# Logistic regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

logregpipe = Pipeline([('scale', StandardScaler()),
                   ('logreg',LogisticRegression(multi_class="multinomial",solver="lbfgs"))])

# Gridsearch to determine the value of C
param_grid = {'logreg__C':np.arange(0.01,100,10)}
logreg_cv = GridSearchCV(logregpipe,param_grid,cv=5,return_train_score=True)
logreg_cv.fit(X,y_under_train)
print(logreg_cv.best_params_)

bestlogreg = logreg_cv.best_estimator_
bestlogreg.fit(X,y_under_train)
bestlogreg.coef_ = bestlogreg.named_steps['logreg'].coef_
bestlogreg.score(X,y_under_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


{'logreg__C': 10.01}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.9331395348837209

# GRIDSEARCH ON TEST DATA

In [13]:
bestlogreg = logreg_cv.best_estimator_
bestlogreg.fit(X2,y_under_test)
bestlogreg.coef_ = bestlogreg.named_steps['logreg'].coef_
bestlogreg.score(X2,y_under_test)

  y = column_or_1d(y, warn=True)


0.9155405405405406

# CROSS VALIDATION ON TRAIN DATA

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import cross_validation

predicted = cross_validation.cross_val_predict(LogisticRegression(),X,y_under_train, cv=10)
print (metrics.accuracy_score(y_under_train, predicted))

IndexError: too many indices for array

# CROSS VALIDATION ON TEST DATA

In [15]:
predicted1 = cross_validation.cross_val_predict(LogisticRegression(),X2, y_under_test, cv=10)

print (metrics.classification_report(y_under_test, predicted1))
print (metrics.accuracy_score(y_under_test, predicted1))

IndexError: too many indices for array

# PCA ON NAIVE BAYES

In [51]:
pca = PCA(n_components=5)
X_under_train = pca.fit_transform(X_under_train)
X_under_test = pca.transform(X_under_test)
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[9.99970246e-01 2.96341777e-05 8.05741247e-08 1.06952040e-08
 1.04275952e-08]


# MODEL FITTING ON TRAIN & TEST DATA

In [52]:
gnb = GaussianNB()
gnb.fit(X_under_train,y_under_train)
y_pred = gnb.predict(X_under_test)
print(confusion_matrix(y_under_test,y_pred))  
print(classification_report(y_under_test,y_pred))
print(accuracy_score(y_under_test, y_pred))

[[147   2]
 [ 42 105]]
              precision    recall  f1-score   support

           0       0.78      0.99      0.87       149
           1       0.98      0.71      0.83       147

   micro avg       0.85      0.85      0.85       296
   macro avg       0.88      0.85      0.85       296
weighted avg       0.88      0.85      0.85       296

0.8513513513513513


  y = column_or_1d(y, warn=True)


# GRIDSEARCH ON TEST & TRAIN DATA

In [16]:
# Logistic regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

logregpipe = Pipeline([('scale', StandardScaler()),
                   ('logreg',LogisticRegression(multi_class="multinomial",solver="lbfgs"))])

# Gridsearch to determine the value of C
param_grid = {'logreg__C':np.arange(0.01,100,10)}
logreg_cv = GridSearchCV(logregpipe,param_grid,cv=5,return_train_score=True)
logreg_cv.fit(X_under_train,y_under_train)
print(logreg_cv.best_params_)

bestlogreg = logreg_cv.best_estimator_
bestlogreg.fit(X_under_train,y_under_train)
bestlogreg.coef_ = bestlogreg.named_steps['logreg'].coef_
bestlogreg.score(X_under_train,y_under_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


{'logreg__C': 30.01}


0.9578488372093024

# CROSS VALIDATION ON TEST & TRAIN DATA

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import cross_validation

predicted = cross_validation.cross_val_predict(LogisticRegression(),X_under_train,y_under_train, cv=10)
print (metrics.accuracy_score(y_under_train, predicted))

IndexError: too many indices for array

In [18]:
predicted1 = cross_validation.cross_val_predict(LogisticRegression(),X_under_test, y_under_test, cv=10)

print (metrics.classification_report(y_under_test, predicted1))
print (metrics.accuracy_score(y_under_test, predicted1))

IndexError: too many indices for array