# IMPORTING LIBRARIES

In [78]:
import pandas as pd
import numpy as np
import imblearn
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA


# READING CSV FILE

In [79]:
df = pd.read_csv("./creditcard.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# UNDERSAMPLING THE DATA

In [80]:
no_frauds = len(df[df['Class'] == 1])
non_fraud_indices = df[df.Class == 0].index
non_fraud_indices = df[df.Class == 0].index
random_indices = np.random.choice(non_fraud_indices, no_frauds, replace=False)
fraud_indices = df[df.Class == 1].index
under_sample_indices = np.concatenate([fraud_indices,random_indices])
under_sample = df.loc[under_sample_indices]

# SPLITTING THE DATASET INTO TRAIN AND TEST

In [81]:
X_under = under_sample[['V1','V2','V3','V4','V5','V6','V7','V8','V9','V10','V11','V12','V13','V14','V15','V16','V17','V18','V19',
                        'V20','V21','V22','V23','V24','V25','V26','V27','V28','Amount']]
y_under = under_sample['Class']
X_under_train, X_under_test, y_under_train, y_under_test = train_test_split(X_under,y_under,test_size = 0.3, random_state = 0)

# RFE ON RANDOM FOREST

In [82]:
model = RandomForestClassifier()
rfe = RFE(model)
fit = rfe.fit(X_under_train,y_under_train)
print("Number of Features: %d"% fit.n_features_)
print("Selected features: %s"% fit.support_)
print("Ranking of features: %s"% fit.ranking_)  

Number of Features: 14
Selected features: [False False  True  True False False  True  True False  True  True  True
 False  True False False  True False  True  True  True  True False False
 False  True False False False]
Ranking of features: [ 5  3  1  1 12  2  1  1 10  1  1  1 11  1  6 16  1  9  1  1  1  1  7 14
  8  1  4 15 13]


# MODEL FITTING ON TRAIN DATA

In [83]:
regressor = RandomForestClassifier(n_estimators=30, random_state=0)  
X = X_under_train[['V3','V4','V6','V7','V10','V11','V12','V14','V16','V17','V19','V20','V21','V26','Amount']]
regressor.fit(X, y_under_train)  
y_pred = regressor.predict(X)  
print(confusion_matrix(y_under_train,y_pred))  
print(classification_report(y_under_train,y_pred)) 
print(accuracy_score(y_under_train, y_pred))

[[343   0]
 [  1 344]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       343
          1       1.00      1.00      1.00       345

avg / total       1.00      1.00      1.00       688

0.998546511627907


# MODEL FITTING ON TEST DATA

In [84]:
# regressor = RandomForestClassifier(n_estimators=20, random_state=0)  
X2 = X_under_test[['V3','V4','V6','V7','V10','V11','V12','V14','V16','V17','V19','V20','V21','V26','Amount']]
# regressor.fit(X2, y_under_test)  
y_pred = regressor.predict(X2)  
print(confusion_matrix(y_under_test,y_pred))  
print(classification_report(y_under_test,y_pred))  
print(accuracy_score(y_under_test, y_pred))  

[[144   5]
 [ 12 135]]
             precision    recall  f1-score   support

          0       0.92      0.97      0.94       149
          1       0.96      0.92      0.94       147

avg / total       0.94      0.94      0.94       296

0.9425675675675675


# GRIDSEARCH ON TRAIN DATA

In [85]:
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

rfc=RandomForestClassifier(random_state=42)
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X, y_under_train)
CV_rfc.best_params_


{'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 500}

In [86]:
rfc1=RandomForestClassifier(random_state=42, max_features='auto', n_estimators= 200, max_depth=8, criterion='gini')
rfc1.fit(X, y_under_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [87]:
pred=rfc1.predict(X2)

In [88]:
print("Accuracy for Random Forest on CV data: ",accuracy_score(y_under_test,pred))

Accuracy for Random Forest on CV data:  0.9493243243243243


# GRIDSEARCH ON TEST DATA

In [89]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X2,y_under_test)
print (CV_rfc.best_params_)
print(CV_rfc.score(X2,y_under_test))

{'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 200}
0.9763513513513513


# CROSS VALIDATION ON TRAIN DATA

In [108]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score 
classifier = RandomForestClassifier(n_estimators=300, random_state=0)
all_accuracies = cross_val_score(estimator=classifier,X=X_under_train, y=y_under_train, cv=5)  
print(all_accuracies)  

[0.92753623 0.9057971  0.96376812 0.94890511 0.94160584]


In [109]:
print(all_accuracies.mean())

0.9375224796360943


# CROSS VALIDATION ON TEST DATA

In [110]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score 
classifier = RandomForestClassifier(n_estimators=300, random_state=0)
all_accuracies = cross_val_score(estimator=classifier,X=X2, y=y_under_test, cv=5)  
print(all_accuracies)  

[0.93333333 0.95       0.91525424 0.89830508 0.9137931 ]


In [111]:
print(all_accuracies.mean())

0.9221371517631015


# KBEST ON RANDOM FOREST

In [112]:
array = under_sample.values
test = SelectKBest(score_func=f_classif, k=10)
fit = test.fit(X_under_train,y_under_train)

In [113]:
print("scores_:",test.scores_)
print("pvalues_:",test.pvalues_)
print("selected index:",test.get_support(True))
print("after transform:",test.transform(X_under_train)) 

scores_: [1.67957458e+02 2.33497159e+02 3.58349993e+02 6.63681396e+02
 1.16681318e+02 1.63885331e+02 2.28570696e+02 6.00357599e+00
 2.96291407e+02 4.87187114e+02 5.93939572e+02 6.52267112e+02
 1.17330578e+00 8.98706649e+02 4.05450722e-01 3.79352463e+02
 3.11741499e+02 1.82517746e+02 6.66556884e+01 9.38501029e+00
 2.22482344e+01 2.40689006e-01 1.10681621e+00 1.71911374e+00
 8.83391561e-02 6.77260954e-01 1.75688469e+01 7.05653405e+00
 9.52265843e-01]
pvalues_: [1.62457541e-034 1.38294905e-045 1.28723911e-064 6.71358385e-103
 3.15954654e-025 8.45286332e-034 8.80010357e-045 1.45256743e-002
 1.83300757e-055 5.49242693e-082 5.53097913e-095 1.24167100e-101
 2.79103602e-001 7.63854271e-127 5.24500033e-001 1.36670773e-067
 8.52782998e-058 4.76534289e-037 1.55007600e-015 2.27333898e-003
 2.90369677e-006 6.23865807e-001 2.93145519e-001 1.90245868e-001
 7.66389265e-001 4.10817481e-001 3.13357100e-005 8.08126243e-003
 3.29487199e-001]
selected index: [ 1  2  3  8  9 10 11 13 15 16]
after transform:

# MODEL FITTING ON TRAIN DATA

In [114]:
model = RandomForestClassifier()
X = X_under_train[['V3','V4','V9','V10','V11','V12','V14','V16','V17']]
model.fit(X, y_under_train)
y_pred = model.predict(X)
print(confusion_matrix(y_under_train,y_pred))  
print(classification_report(y_under_train,y_pred))  
print(accuracy_score(y_under_train, y_pred))


[[342   1]
 [  3 342]]
             precision    recall  f1-score   support

          0       0.99      1.00      0.99       343
          1       1.00      0.99      0.99       345

avg / total       0.99      0.99      0.99       688

0.9941860465116279


# MODEL FITTING ON TEST DATA

In [115]:
X2 = X_under_test[['V3','V4','V9','V10','V11','V12','V14','V16','V17']]
y_pred = model.predict(X2)
print(confusion_matrix(y_under_test,y_pred))  
print(classification_report(y_under_test,y_pred)) 
print(accuracy_score(y_under_test, y_pred))  

[[141   8]
 [ 15 132]]
             precision    recall  f1-score   support

          0       0.90      0.95      0.92       149
          1       0.94      0.90      0.92       147

avg / total       0.92      0.92      0.92       296

0.9222972972972973


# GRIDSEARCH ON TRAIN DATA

In [116]:
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

rfc=RandomForestClassifier(random_state=42)
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X, y_under_train)
CV_rfc.best_params_


{'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'auto',
 'n_estimators': 200}

In [117]:
rfc1=RandomForestClassifier(random_state=42, max_features='auto', n_estimators= 200, max_depth=8, criterion='gini')
rfc1.fit(X, y_under_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [118]:
pred=rfc1.predict(X2)

In [119]:
print("Accuracy for Random Forest on CV data: ",accuracy_score(y_under_test,pred))

Accuracy for Random Forest on CV data:  0.9425675675675675


# GRIDSEARCH ON TEST DATA

In [120]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X2,y_under_test)
print (CV_rfc.best_params_)
print(CV_rfc.score(X2,y_under_test))

{'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 500}
0.9695945945945946


# CROSS VALIDATION ON TRAIN DATA

In [121]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score 
classifier = RandomForestClassifier(n_estimators=300, random_state=0)
all_accuracies = cross_val_score(estimator=classifier,X=X_under_train, y=y_under_train, cv=5)  
print(all_accuracies) 

[0.92753623 0.9057971  0.96376812 0.94890511 0.94160584]


In [122]:
print(all_accuracies.mean())

0.9375224796360943


# CROSS VALIDATION ON TEST DATA

In [123]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score 
classifier = RandomForestClassifier(n_estimators=300, random_state=0)
all_accuracies = cross_val_score(estimator=classifier,X=X2, y=y_under_test, cv=5)  
print(all_accuracies)  

[0.93333333 0.95       0.91525424 0.89830508 0.9137931 ]


In [125]:
print(all_accuracies.mean())

0.9221371517631015


# PCA ON RANDOM FOREST

In [20]:
pca = PCA(n_components=5)
X_under_train = pca.fit_transform(X_under_train)
X_under_test = pca.transform(X_under_test)
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[9.99979255e-01 2.06255290e-05 7.93282936e-08 1.09963661e-08
 1.05116248e-08]


# MODEL FITTING ON TRAIN & TEST DATA

In [21]:
model = RandomForestClassifier()
model.fit(X_under_train, y_under_train)
y_pred = model.predict(X_under_test)
print(confusion_matrix(y_under_test,y_pred))  
print(classification_report(y_under_test,y_pred)) 
print(accuracy_score(y_under_test, y_pred))  

[[140   9]
 [ 15 132]]
             precision    recall  f1-score   support

          0       0.90      0.94      0.92       149
          1       0.94      0.90      0.92       147

avg / total       0.92      0.92      0.92       296

0.918918918918919


  


# GRIDSEARCH ON TEST & TRAIN DATA

In [126]:
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

rfc=RandomForestClassifier(random_state=42)
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X, y_under_train)
CV_rfc.best_params_

{'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'auto',
 'n_estimators': 200}

In [127]:
rfc1=RandomForestClassifier(random_state=42, max_features='auto', n_estimators= 200, max_depth=8, criterion='gini')
rfc1.fit(X, y_under_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [128]:
pred=rfc1.predict(X2)

In [129]:
print("Accuracy for Random Forest on CV data: ",accuracy_score(y_under_test,pred))

Accuracy for Random Forest on CV data:  0.9425675675675675


In [130]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X2,y_under_test)
print (CV_rfc.best_params_)
print(CV_rfc.score(X2,y_under_test))

{'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 500}
0.9695945945945946


# CROSS VALIDATION ON TEST & TRAIN DATA

In [131]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score 
classifier = RandomForestClassifier(n_estimators=300, random_state=0)
all_accuracies = cross_val_score(estimator=classifier,X=X_under_train, y=y_under_train, cv=5)  
print(all_accuracies) 

[0.92753623 0.9057971  0.96376812 0.94890511 0.94160584]


In [132]:
print(all_accuracies.mean())

0.9375224796360943


In [133]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score 
classifier = RandomForestClassifier(n_estimators=300, random_state=0)
all_accuracies = cross_val_score(estimator=classifier,X=X2, y=y_under_test, cv=5)  
print(all_accuracies) 

[0.93333333 0.95       0.91525424 0.89830508 0.9137931 ]


In [134]:
print(all_accuracies.mean())

0.9221371517631015
