In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, average_precision_score
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, PowerTransformer
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
import seaborn as sns
from sklearn import preprocessing

In [2]:
# Load the dataset
data = pd.read_csv('creditcard.csv')

# Preprocessing
# 'Class' is the target; 'Time' is dropped from the feature set.
X = data.drop(['Class', 'Time'], axis=1)
y = data['Class']

# Split BEFORE fitting any transformer so that test-set statistics never
# influence the training features (fitting the scaler on the full column
# leaks the test distribution into training).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Work on explicit copies so the column assignments below do not trigger
# pandas chained-assignment warnings.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training 'Amount' only, then reuse the same
# mean/variance for the test split.
scaler = StandardScaler()
X_train[['Amount']] = scaler.fit_transform(X_train[['Amount']])
X_test[['Amount']] = scaler.transform(X_test[['Amount']])

# Keep X's 'Amount' standardised as before (now with train-only statistics)
# for any code that still reads the full feature frame.
X[['Amount']] = scaler.transform(X[['Amount']])

# 3-fold stratified CV without shuffling (folds follow row order).
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=False, random_state=None)

In [3]:
# Skewness correction
# Fit a Yeo-Johnson power transform on the training features only, then apply
# the fitted transform to both splits.
# NOTE(review): X_train_pt / X_test_pt are not referenced by the later cells
# visible in this notebook — confirm whether they were meant to feed the model.
pt = PowerTransformer(method='yeo-johnson', copy=True).fit(X_train)
X_train_pt, X_test_pt = pt.transform(X_train), pt.transform(X_test)


In [4]:
# Resampling techniques
# Maps a method name to the already-resampled (X, y) pair produced from the
# training split; the training loop unpacks each value as a 2-tuple.
smote_sampler = SMOTE(random_state=0)
resampling_methods = {'SMOTE': smote_sampler.fit_resample(X_train, y_train)}

In [5]:
# Classifiers
# Each entry maps a display name to (estimator, parameter grid). The grid below
# holds a single value per hyper-parameter, i.e. exactly one candidate.
xgb_param_grid = {
    'max_depth': [10],
    'learning_rate': [0.1],
    'subsample': [0.9],
    'n_estimators': [300],
}
xgb_estimator = XGBClassifier(objective='binary:logistic', n_jobs=-1)

classifiers = {'XGBoost': (xgb_estimator, xgb_param_grid)}

In [6]:
# DataFrame to store the results
# One row per (classifier, resampling) combination; starts empty.
result_columns = [
    'Classifier',
    'Resampling',
    'AUPRC',
    'F1',
    'Recall',
    'Precision',
    'Best Parameters',
    'Time',
]
results_df = pd.DataFrame(columns=result_columns)


In [7]:
# Train and evaluate classifiers
# For every (resampling, classifier) pair: grid-search on the resampled
# training data, score the refit model on the untouched test split, and append
# one row to results_df. `grid_search` is left bound to the last fitted search.
row_index = 0
for resampling_name, resampled_pair in resampling_methods.items():
    X_train_resampled, y_train_resampled = resampled_pair
    for classifier_name, (classifier, params) in classifiers.items():
        # The timer spans fitting AND the test-set scoring below.
        start_time = time.time()
        print(f"Training {classifier_name} classifier using {resampling_name} resampling...")

        grid_search = GridSearchCV(
            classifier,
            params,
            scoring='average_precision',
            cv=stratified_kfold,
            verbose=1,
            n_jobs=-1,
        )
        grid_search.fit(X_train_resampled, y_train_resampled)

        # Positive-class probabilities drive AUPRC; hard labels drive the rest.
        y_pred_proba = grid_search.predict_proba(X_test)[:, 1]
        y_pred = grid_search.predict(X_test)
        auprc = average_precision_score(y_true=y_test, y_score=y_pred_proba)
        f1, recall, precision = (
            metric(y_test, y_pred)
            for metric in (f1_score, recall_score, precision_score)
        )

        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Finished training {classifier_name} classifier using {resampling_name} resampling. Elapsed time: {elapsed_time:.2f} seconds.")

        results_df.loc[row_index] = [
            classifier_name,
            resampling_name,
            auprc,
            f1,
            recall,
            precision,
            grid_search.best_params_,
            elapsed_time,
        ]
        row_index += 1

Training XGBoost classifier using SMOTE resampling...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Finished training XGBoost classifier using SMOTE resampling. Elapsed time: 901.75 seconds.


In [10]:
# Print the results
# Show one row per (resampling, classifier) combination that was trained.
print("Classifier performance (AUPRC, F1 Score, Recall, Precision), best parameters, and time:")
results_df.head(n=len(classifiers) * len(resampling_methods))

Classifier performance (AUPRC, F1 Score, Recall, Precision), best parameters, and time:


Unnamed: 0,Classifier,Resampling,AUPRC,F1,Recall,Precision,Best Parameters,Time
0,XGBoost,SMOTE,0.882779,0.835821,0.857143,0.815534,"{'learning_rate': 0.1, 'max_depth': 10, 'n_est...",901.745067


In [None]:
# Take the first two test rows (by position) as a small sample to predict on.
x_test_predict = X_test.iloc[:2]
x_test_predict.head()

In [None]:
# Ground-truth labels for the same first two test rows (by position).
y_test_ans = y_test.iloc[:2]
y_test_ans

In [32]:
# Predict labels for the two-row sample with the best estimator of `grid_search`
# (the search object left bound by the last iteration of the training loop).
ans  = grid_search.best_estimator_.predict(x_test_predict)

In [None]:
# Display the predictions made for the two-row sample.
ans


In [34]:
# Predict labels for the entire test split with the best estimator of `grid_search`.
ans_1  = grid_search.best_estimator_.predict(X_test)


In [None]:
# Display the predictions made for the full test split.
ans_1


In [None]:
# Ground-truth label of the test row at position 1 (the second row).
y_test.iloc[1]

In [None]:
# Report the first test row (by position) whose predicted label differs from
# its ground-truth label, then stop.
# `ans_1` holds the model predictions for X_test and `y_test` the true labels,
# so they are walked in lockstep; the previous version printed the two values
# under each other's label.
index = 0
for index, (actual, predicted) in enumerate(zip(y_test, ans_1)):
    if actual != predicted:
        print('found one that is not working ')
        print(index)
        print('actual is: ', actual)
        print('the predicted is: ', predicted)
        print('the features for this one was :')
        print(X_test.iloc[index])
        break