In [1]:
# !pip install imbalanced-learn

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('archive/creditcard.csv')

In [4]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
df.shape

(284807, 31)

In [6]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [7]:
df['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [8]:
x = df.drop('Class', axis = 1)
y = df['Class']

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=80,random_state=2)

In [10]:
10.0**np.arange(-2, 3)

array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])

In [11]:
log_class = LogisticRegression()
grid = {'C': 10.0**np.arange(-2, 3), 'penalty': ['l2']}
cv = KFold(n_splits=5, random_state=None, shuffle=True)

In [12]:
clf = GridSearchCV(log_class, grid, cv = cv, n_jobs=-1, scoring='f1_macro')

In [13]:
clf.fit(x_train, y_train)

5 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/onstak/miniconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/onstak/miniconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/onstak/miniconda3/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1252, in fit
    raise ValueError(
ValueError: This solver needs samples of at least 2 classes in the dat

In [14]:
y_pred = clf.predict(x_test)

In [15]:
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[284176     60]
 [   452     39]]
0.9982017862724645
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    284236
           1       0.39      0.08      0.13       491

    accuracy                           1.00    284727
   macro avg       0.70      0.54      0.57    284727
weighted avg       1.00      1.00      1.00    284727



In [16]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(x_train, y_train)

In [17]:
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[284236      0]
 [   488      3]]
0.9982860775409427
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    284236
           1       1.00      0.01      0.01       491

    accuracy                           1.00    284727
   macro avg       1.00      0.50      0.51    284727
weighted avg       1.00      1.00      1.00    284727



In [18]:
class_weights = dict({0:1, 1:100})

In [19]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(class_weight=class_weights)
classifier.fit(x_train, y_train)

In [20]:
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[284236      0]
 [   488      3]]
0.9982860775409427
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    284236
           1       1.00      0.01      0.01       491

    accuracy                           1.00    284727
   macro avg       1.00      0.50      0.51    284727
weighted avg       1.00      1.00      1.00    284727



## Under Sampling

In [21]:
from collections import Counter
from imblearn.under_sampling import NearMiss
ns=NearMiss()
X_train_ns,y_train_ns=ns.fit_resample(x,y)
print("The number of classes before fit {}".format(Counter(y)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

The number of classes before fit Counter({0: 284315, 1: 492})
The number of classes after fit Counter({0: 492, 1: 492})


In [22]:
x_train, x_test, y_train, y_test = train_test_split(X_train_ns, y_train_ns, train_size=80,random_state=2)

In [23]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(class_weight=class_weights)
classifier.fit(x_train, y_train)

In [24]:
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[441  11]
 [ 65 387]]
0.915929203539823
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       452
           1       0.97      0.86      0.91       452

    accuracy                           0.92       904
   macro avg       0.92      0.92      0.92       904
weighted avg       0.92      0.92      0.92       904



### Over Sampling

In [25]:
from imblearn.over_sampling import RandomOverSampler
os=RandomOverSampler()
X_train_ns,y_train_ns=os.fit_resample(x,y)
print("The number of classes before fit {}".format(Counter(y)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))


The number of classes before fit Counter({0: 284315, 1: 492})
The number of classes after fit Counter({0: 284315, 1: 284315})


In [26]:
x_train, x_test, y_train, y_test = train_test_split(X_train_ns, y_train_ns, train_size=80,random_state=2)

In [27]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(x_train,y_train)

In [28]:
y_pred=classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[277058   7223]
 [ 23682 260587]]
0.9456424237094363
              precision    recall  f1-score   support

           0       0.92      0.97      0.95    284281
           1       0.97      0.92      0.94    284269

    accuracy                           0.95    568550
   macro avg       0.95      0.95      0.95    568550
weighted avg       0.95      0.95      0.95    568550



### SMOTETomek

In [29]:
from imblearn.combine import SMOTETomek

In [30]:
sm=SMOTETomek()
X_train_ns,y_train_ns = sm.fit_resample(x,y)
print("The number of classes before fit {}".format(Counter(y)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

The number of classes before fit Counter({0: 284315, 1: 492})
The number of classes after fit Counter({0: 283776, 1: 283776})


In [31]:
x_train, x_test, y_train, y_test = train_test_split(X_train_ns, y_train_ns, train_size=80,random_state=2)

In [32]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(x_train,y_train)

In [33]:
y_pred=classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[280137   3605]
 [ 18454 265276]]
0.961127597484986
              precision    recall  f1-score   support

           0       0.94      0.99      0.96    283742
           1       0.99      0.93      0.96    283730

    accuracy                           0.96    567472
   macro avg       0.96      0.96      0.96    567472
weighted avg       0.96      0.96      0.96    567472



### EasyEnsembleClassifier

In [34]:
from imblearn.ensemble import EasyEnsembleClassifier

In [35]:
easy=EasyEnsembleClassifier()
easy.(X_train,y_train)

SyntaxError: invalid syntax (3944193540.py, line 2)

In [None]:
y_pred=easy.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))