In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
df.isnull().sum().sum()

0

In [4]:
df.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [5]:
# creating independent and dependent variable
X = df.drop(['Class'], axis=1)
y = df['Class']

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=0)

#### Cross Validation use KFold,  
#### Hyperparameter tunning use GridsearchCV

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [8]:
lr = LogisticRegression()
# to variable grid for hyperparameter 
grid={'C':10.0 **np.arange(-2,3),'penalty':['l1','l2']}
# to variable cv for kfold
cv=KFold(n_splits=5,random_state=None,shuffle=False)

In [9]:
lregression = GridSearchCV(lr, grid, cv=cv,n_jobs=-1,scoring='f1_macro')

In [10]:
lregression.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_macro')

In [11]:
y_pred = lregression.predict(X_test)

In [12]:
print(confusion_matrix(y_pred, y_test))
print(accuracy_score(y_pred,  y_test))
print(classification_report(y_pred, y_test))

[[85263    50]
 [   33    97]]
0.9990285921608558
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85313
           1       0.66      0.75      0.70       130

    accuracy                           1.00     85443
   macro avg       0.83      0.87      0.85     85443
weighted avg       1.00      1.00      1.00     85443



#### Ensemble techniques with class_weight parameter

In [13]:
from sklearn.ensemble import RandomForestClassifier
# basically class weight is balanced & balanced subsample,assign more importance for 1 bcoz its imbalance dataset
Rfc = RandomForestClassifier(class_weight={0: 1, 1:100})  
Rfc.fit(X_train, y_train)

RandomForestClassifier(class_weight={0: 1, 1: 100})

In [14]:
y_pred = Rfc.predict(X_test)

In [15]:
print(confusion_matrix(y_pred, y_test))
print(accuracy_score(y_pred,  y_test))
print(classification_report(y_pred, y_test))

[[85291    36]
 [    5   111]]
0.9995201479348805
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85327
           1       0.76      0.96      0.84       116

    accuracy                           1.00     85443
   macro avg       0.88      0.98      0.92     85443
weighted avg       1.00      1.00      1.00     85443



#### Without using any parameter, use it as defalt

In [16]:
from sklearn.ensemble import RandomForestClassifier
Rfc = RandomForestClassifier()
Rfc.fit(X_train, y_train)

RandomForestClassifier()

In [17]:
y_pred = Rfc.predict(X_test)

In [18]:
print(confusion_matrix(y_pred, y_test))
print(accuracy_score(y_pred,  y_test))
print(classification_report(y_pred, y_test))

[[85290    35]
 [    6   112]]
0.9995201479348805
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85325
           1       0.76      0.95      0.85       118

    accuracy                           1.00     85443
   macro avg       0.88      0.97      0.92     85443
weighted avg       1.00      1.00      1.00     85443



#### Under sampling method:
     This technique samples down or reduces the samples of the class containing more data equivalent to the class containing the least samples. Suppose class A has 900 samples and class B has 100 samples, then the imbalance ratio is 9:1. Using the undersampling technique we keep class B as 100 samples and from class A we randomly select 100 samples out of 900. Then the ratio becomes 1:1 and we can say it’s balanced

In [19]:
from collections import Counter
Counter(y_train)

Counter({0: 199019, 1: 345})

In [20]:
from imblearn.under_sampling import NearMiss
ns=NearMiss(0.8)

In [21]:
X_train, y_train= ns.fit_resample(X_train, y_train)

In [22]:
from sklearn.ensemble import RandomForestClassifier
Rfc = RandomForestClassifier()  
Rfc.fit(X_train, y_train)

RandomForestClassifier()

In [23]:
y_pred = Rfc.predict(X_test)
print(confusion_matrix(y_pred, y_test))
print(accuracy_score(y_pred,  y_test))
print(classification_report(y_pred, y_test))

[[61406     8]
 [23890   139]]
0.7203047645798953
              precision    recall  f1-score   support

           0       0.72      1.00      0.84     61414
           1       0.95      0.01      0.01     24029

    accuracy                           0.72     85443
   macro avg       0.83      0.50      0.42     85443
weighted avg       0.78      0.72      0.60     85443



#### Over sampling methods:
     Oversampling is just the opposite of undersampling. Here the class containing less data is made equivalent to the class containing more data. This is done by adding more data to the least sample containing class. Let’s take the same example of undersampling, then, in this case, class A will remain 900 and class B will also be 900 (which was previously 100). Hence the ratio will be 1:1 and it’ll be balanced

In [24]:
from imblearn.over_sampling import RandomOverSampler

In [25]:
OS = RandomOverSampler(1)

In [26]:
x_train_os, y_train_os = OS.fit_resample(X_train, y_train)

In [27]:
from sklearn.ensemble import RandomForestClassifier
Rfc = RandomForestClassifier()  
Rfc.fit(x_train_os, y_train_os)

RandomForestClassifier()

In [28]:
y_pred = Rfc.predict(X_test)
print(confusion_matrix(y_pred, y_test))
print(accuracy_score(y_pred,  y_test))
print(classification_report(y_pred, y_test))

[[63744     8]
 [21552   139]]
0.7476680360006086
              precision    recall  f1-score   support

           0       0.75      1.00      0.86     63752
           1       0.95      0.01      0.01     21691

    accuracy                           0.75     85443
   macro avg       0.85      0.50      0.43     85443
weighted avg       0.80      0.75      0.64     85443



#### SMOTETomek sampling technique:
     SMOTETomek is somewhere upsampling and downsampling. SMOTETomek is a hybrid method which is a mixture of the above two methods, it uses an under-sampling method (Tomek) with an oversampling method (SMOTE).

In [29]:
from imblearn.combine import SMOTETomek

In [30]:
st=SMOTETomek(0.85)
X_train_st,y_train_st=st.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_st)))

The number of classes before fit Counter({0: 431, 1: 345})
The number of classes after fit Counter({0: 430, 1: 365})


In [31]:
from sklearn.ensemble import RandomForestClassifier
Rfc = RandomForestClassifier()  
Rfc.fit(X_train_st, y_train_st)

RandomForestClassifier()

In [32]:
y_pred = Rfc.predict(X_test)
print(confusion_matrix(y_pred, y_test))
print(accuracy_score(y_pred,  y_test))
print(classification_report(y_pred, y_test))

[[61441     9]
 [23855   138]]
0.7207026906826773
              precision    recall  f1-score   support

           0       0.72      1.00      0.84     61450
           1       0.94      0.01      0.01     23993

    accuracy                           0.72     85443
   macro avg       0.83      0.50      0.42     85443
weighted avg       0.78      0.72      0.61     85443



#### EasyEnsembleClassifier Method:

In [33]:
from imblearn.ensemble import EasyEnsembleClassifier

In [34]:
easy=EasyEnsembleClassifier()
easy.fit(X_train,y_train)

EasyEnsembleClassifier()

In [35]:
y_pred=easy.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[45001 40295]
 [    5   142]]
0.5283405311143101
              precision    recall  f1-score   support

           0       1.00      0.53      0.69     85296
           1       0.00      0.97      0.01       147

    accuracy                           0.53     85443
   macro avg       0.50      0.75      0.35     85443
weighted avg       1.00      0.53      0.69     85443



#### BalancedRandomForestClassifier Method:

In [36]:
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier()
brf.fit(X_train,y_train)

BalancedRandomForestClassifier()

In [37]:
y_pred=brf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[62162 23134]
 [    9   138]]
0.7291410648034362
              precision    recall  f1-score   support

           0       1.00      0.73      0.84     85296
           1       0.01      0.94      0.01       147

    accuracy                           0.73     85443
   macro avg       0.50      0.83      0.43     85443
weighted avg       1.00      0.73      0.84     85443

