In [None]:
! pip install -q kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d mlg-ulb/creditcardfraud


In [None]:
! unzip creditcardfraud.zip

In [None]:
import pandas as pd
import numpy as np


from scipy.stats import reciprocal
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier


In [None]:
def load_data(csv_path: str = 'creditcard.csv',
              random_state: int = 37) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Read the data set from CSV and split it into train and validation parts.

    The 'Class' column is used as the target; every other column is a feature.
    The split is 80/20, shuffled, and stratified on the target so both parts
    keep the same class ratio.

    Args:
        csv_path: Path of the CSV file to read. Defaults to 'creditcard.csv'.
        random_state: Seed passed to train_test_split so the split is reproducible.

    Returns:
        A tuple (X_train, X_validate, y_train, y_validate).

    Raises:
        FileNotFoundError: If csv_path does not exist (raised by pandas).
        KeyError: If the file has no 'Class' column.
    """
    creditcard_fraud_df = pd.read_csv(csv_path)

    y = creditcard_fraud_df['Class']
    # drop() returns a new frame, so the frame that was read is not mutated.
    X = creditcard_fraud_df.drop(columns='Class')

    X_train, X_validate, y_train, y_validate = train_test_split(
        X, y, train_size=0.8, test_size=0.2, shuffle=True,
        random_state=random_state, stratify=y)
    return X_train, X_validate, y_train, y_validate


In [None]:
# Keep only the training portion returned by load_data(); the two validation
# objects are bound to the throwaway name `_` and discarded here.
train_validate_split = load_data()
X_train, _, y_train, _ = train_validate_split

In [None]:
# Local train and test set, carved out of the training data (80/20).
# random_state pins the shuffle so the split, and every metric computed from
# it, is reproducible; stratify keeps the class ratio of y_train in both parts,
# matching how load_data() splits.
# NOTE: this cell overwrites X_train / y_train, so re-running it without first
# re-running the load cell splits the already-split data again.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, train_size=0.8, test_size=0.2,
    random_state=37, stratify=y_train,
)

# Feature Engineering Time Stamp

In [None]:
# Replace the raw "Time" column with "Time-Mod": its value modulo one day,
# scaled to [0, 1) (assumes "Time" is measured in seconds - TODO confirm).
# Building a new frame instead of mutating the train_test_split slice avoids
# pandas' SettingWithCopyWarning, and the guard lets the cell be re-run safely.
SECONDS_PER_DAY = 60 * 60 * 24
if "Time" in X_train.columns:
    X_train = X_train.assign(
        **{"Time-Mod": (X_train["Time"] % SECONDS_PER_DAY) / SECONDS_PER_DAY}
    ).drop(columns="Time")

In [None]:
# Replace the raw "Time" column with "Time-Mod": its value modulo one day,
# scaled to [0, 1) (assumes "Time" is measured in seconds - TODO confirm).
# Building a new frame instead of mutating the train_test_split slice avoids
# pandas' SettingWithCopyWarning, and the guard lets the cell be re-run safely.
SECONDS_PER_DAY = 60 * 60 * 24
if "Time" in X_test.columns:
    X_test = X_test.assign(
        **{"Time-Mod": (X_test["Time"] % SECONDS_PER_DAY) / SECONDS_PER_DAY}
    ).drop(columns="Time")

In [None]:
# drop all except for max 10
# X_train1= X_train
# X_train1.pop('Time-Mod')
# X_train1.pop('V21')
# X_train1.pop('V9')
# X_train1.pop('V8')
# X_train1.pop('V20')
# X_train1.pop('V1')
# X_train1.pop('V18')
# X_train1.pop('V15')
# X_train1.pop('V28')
# X_train1.pop('V23')
# X_train1.pop('Amount')
# X_train1.pop('V25')
# X_train1.pop('V26')
# X_train1.pop('V6')
# X_train1.pop('V22')
# X_train1.pop('V5')
# X_train1.pop('V13')
# X_train1.pop('V27')
# X_train1.pop('V24')



In [None]:
# drop all except for max 10
# X_test1= X_test
# X_test1.pop('Time-Mod')
# X_test1.pop('V21')
# X_test1.pop('V9')
# X_test1.pop('V8')
# X_test1.pop('V20')
# X_test1.pop('V1')
# X_test1.pop('V18')
# X_test1.pop('V15')
# X_test1.pop('V28')
# X_test1.pop('V23')
# X_test1.pop('Amount')
# X_test1.pop('V25')
# X_test1.pop('V26')
# X_test1.pop('V6')
# X_test1.pop('V22')
# X_test1.pop('V5')
# X_test1.pop('V13')
# X_test1.pop('V27')
# X_test1.pop('V24')


# Random Under-Sampling

In [None]:
# print(f'Original dataset distribution: (Fraud: {sum(y_train)}, !Fraud: {len(y_train)-sum(y_train)})')

# rus = RandomUnderSampler(sampling_strategy=0.01)
# X_train_bal, y_train_bal = rus.fit_resample(X_train, y_train)

# print(f'Balanced dataset distribution: (Fraud: {sum(y_train_bal)}, !Fraud: {len(y_train_bal)-sum(y_train_bal)})')


# Random Forest Classifier


In [None]:
# random forest model creation
# random_state fixes the bootstrap samples and per-split feature draws so the
# fitted model and its metrics are reproducible across runs.
# n_jobs=-1 uses every available core instead of a hard-coded worker count.
rfc = RandomForestClassifier(
    n_estimators=35,
    criterion="entropy",
    max_features="sqrt",
    n_jobs=-1,
    random_state=37,
)
rfc.fit(X_train, y_train)
# predictions
y_pred = rfc.predict(X_test)

# Evaluate

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,matthews_corrcoef, cohen_kappa_score

def get_stats_of_results(y_gt, y_pred):
    """Compute classification metrics for predictions against ground truth.

    Args:
        y_gt: Ground-truth labels.
        y_pred: Predicted labels, aligned with y_gt.

    Returns:
        A tuple (accuracy, precision, recall, f1, mcc, kappa), where mcc is the
        Matthews correlation coefficient and kappa is Cohen's kappa.
    """
    # Every metric is computed from the y_gt argument; the previous version
    # ignored it and silently read the global y_test instead.
    acc = accuracy_score(y_gt, y_pred)
    prec = precision_score(y_gt, y_pred)
    recall = recall_score(y_gt, y_pred)
    f1 = f1_score(y_gt, y_pred)
    mcc = matthews_corrcoef(y_gt, y_pred)
    kappa = cohen_kappa_score(y_gt, y_pred)

    return acc, prec, recall, f1, mcc, kappa

In [None]:
from matplotlib.cbook import print_cycles

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,matthews_corrcoef, cohen_kappa_score
from sklearn.metrics import confusion_matrix


# Score the held-out test predictions.
acc, prec, recall, f1, mcc, kappa = get_stats_of_results(y_test, y_pred)

# pretty print
print(f"The accuracy is  {acc}")
print(f"The precision is {prec}")
print(f"The recall is {recall}")
print(f"The F1-Score is {f1}")
print(f"The Matthews correlation coefficient is {mcc}")
# The apostrophe was previously mis-encoded (mojibake) in this message.
print(f"The Cohen's kappa is {kappa}")


# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)
# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf_matrix, annot=True, fmt="d", ax=ax)
ax.set_title("Confusion matrix")
ax.set_ylabel('True class')
ax.set_xlabel('Predicted class')
plt.show()



# Save and Load

In [None]:
# No installation needed: the `pickle` module used below ships with the Python
# standard library. The former `pip install cpickle` pulled in an unrelated
# third-party package.

In [None]:
import pickle


# Serialize the fitted classifier to disk; the context manager closes the
# file even if pickling fails.
with open('rf_v1_original_bezt', 'wb') as model_file:
    pickle.dump(rfc, model_file)




In [None]:
# Restore a previously pickled classifier into `rfc`, replacing the model
# trained above.
# NOTE(review): this file name is not the one the save cell writes
# ('rf_v1_original_bezt') - confirm that loading a different model is intended
# and that the file exists on a fresh run.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a
# trusted source.
with open('rf_v1_original_data_modifitime', 'rb') as model_file:
    rfc = pickle.load(model_file)
