In [1]:
import warnings
warnings.filterwarnings('ignore')

In [15]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, f1_score, plot_confusion_matrix, average_precision_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
import xgboost as xgb


In [3]:
df = pd.read_csv('../data/creditcard.csv')
X = df.drop('Class', axis=1)
y = df.Class
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

### Logit

In [9]:
logit = LogisticRegression()
params = {
    'penalty':['l1', 'l2', 'elasticnet'],
    'tol':[0.001, 0.0001, 0.00001],
    'C':[1.0, 0.9, 0.8]
}
gs = GridSearchCV(logit, params, scoring='f1', verbose=1)
gs.fit(X, y)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed:  1.6min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1.0, 0.9, 0.8],
                         'penalty': ['l1', 'l2', 'elasticnet'],
                         'tol': [0.001, 0.0001, 1e-05]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1', verbose=1)

In [12]:
gs.best_params_

{'C': 0.9, 'penalty': 'l2', 'tol': 0.001}

In [13]:
logit = LogisticRegression(C=0.9, penalty='l2', tol=0.001)
logit.fit(X_train, y_train)

LogisticRegression(C=0.9, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.001, verbose=0,
                   warm_start=False)

In [14]:
y_pred = logit.predict(X_test)
f1_score(y_test,y_pred)

0.7172995780590716

### XGBRFC

In [13]:
model = xgb.XGBRFClassifier(random_state=1)
model.fit(X_train, y_train)

XGBRFClassifier(base_score=0.5, colsample_bylevel=1, colsample_bynode=0.8,
                colsample_bytree=1, gamma=0, learning_rate=1, max_delta_step=0,
                max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
                n_jobs=1, nthread=None, objective='binary:logistic',
                random_state=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                seed=None, silent=None, subsample=0.8, verbosity=1)

In [15]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71079
           1       0.80      0.75      0.77       123

    accuracy                           1.00     71202
   macro avg       0.90      0.87      0.89     71202
weighted avg       1.00      1.00      1.00     71202

0.773109243697479


In [19]:
cvs = cross_val_score(model, X, y, scoring='f1')

In [20]:
print(cvs)

[0.75113122 0.87431694 0.71186441 0.83040936 0.74213836]
