In [None]:
import pandas as pd
import numpy as np
# Machine learning components
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    average_precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve
)
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [83]:
data = pd.read_csv('./fraudTrain.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [84]:
X = data.drop(columns=['is_fraud'])
y = data['is_fraud']

In [85]:
X.drop(columns=['Unnamed: 0','unix_time','cc_num','first', 'last', 'street', 'city', 'state', 'zip', 'city_pop', 'trans_num'],inplace=True)

In [None]:
#We drop categorical features which have too many classes
X.drop(columns=['job','merchant'],inplace=True)

In [87]:
X['dob'] = pd.to_datetime(X['dob'])
X['trans_date_trans_time'] = pd.to_datetime(X['trans_date_trans_time'])

In [88]:
#Feature engineering 1

X['age'] = (X['trans_date_trans_time'] - X['dob']).dt.days / 365.25
X['trans_month']=pd.to_datetime(X['trans_date_trans_time']).dt.month
X['trans_day']=pd.to_datetime(X['trans_date_trans_time']).dt.day
X['trans_hour']=pd.to_datetime(X['trans_date_trans_time']).dt.hour
X.drop(columns=['trans_date_trans_time','dob'],inplace=True)

In [89]:
#Feature engineering 2. We use sin, cos transformations for hour, day, month
X["trans_month_sin"] = np.sin(2 * np.pi * X["trans_month"] / 12)
X["trans_month_cos"] = np.cos(2 * np.pi * X["trans_month"] / 12)
X["trans_day_sin"] = np.sin(2 * np.pi * X["trans_day"] / 31)
X["trans_day_cos"] = np.cos(2 * np.pi * X["trans_day"] / 31)
X["trans_hour_sin"] = np.sin(2 * np.pi * X["trans_hour"] / 24)
X["trans_hour_cos"] = np.cos(2 * np.pi * X["trans_hour"] / 24)
X.drop(columns=['trans_month','trans_day','trans_hour'],inplace=True)

In [90]:
# Train vs (Val + Test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# Validation vs Test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp
)

print("Train:", X_train.shape)
print("Val:  ", X_val.shape)
print("Test: ", X_test.shape)

Train: (907672, 14)
Val:   (194501, 14)
Test:  (194502, 14)


In [96]:
#Part 2 (Scaling) We apply it to only two columns here. Fit and transform on training set. Transform on test set. See how to use Ashenafi functions.
# amt_scaler = RobustScaler().set_output(transform="pandas")
# age_scaler = RobustScaler().set_output(transform="pandas")

# # robust_scaler = RobustScaler().set_output(transform='pandas')
# X_train['amt'] = amt_scaler.fit_transform(X_train[['amt']])
# X_val['amt']   = amt_scaler.transform(X_val[['amt']])
# X_test['amt']  = amt_scaler.transform(X_test[['amt']])

# X_train['age'] = age_scaler.fit_transform(X_train[['age']])
# X_val['age']   = age_scaler.transform(X_val[['age']])
# X_test['age']  = age_scaler.transform(X_test[['age']])

numeric_features = X_train.select_dtypes(include="number").columns.tolist()
categorical_features = X_train.select_dtypes(include="object").columns.tolist()

neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
scale_pos_weight = neg / pos

preprocessor = ColumnTransformer(
    transformers=[
        ("num", RobustScaler(), numeric_features),
        ("cat", OneHotEncoder(
            handle_unknown="ignore",
            drop="if_binary"
        ), categorical_features)
    ],
    remainder="drop"
)

pipeline = Pipeline([
    ("prep", preprocessor),
    ("model", xgb.XGBClassifier(
        n_estimators=400,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=scale_pos_weight,
        eval_metric="logloss",
        tree_method="hist",
        random_state=42,
        n_jobs=-1,
        use_label_encoder=False
    ))
])

In [97]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    scoring="average_precision",
    cv=cv,
    n_jobs=-1
)

print("CV PR-AUC scores:", cv_scores)
print("Mean CV PR-AUC:", cv_scores.mean())

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



CV PR-AUC scores: [0.94020333 0.94219339 0.93385265 0.94392247 0.94171295]
Mean CV PR-AUC: 0.9403769592137442


In [98]:
pipeline.fit(X_train, y_train)

In [99]:
y_val_proba = pipeline.predict_proba(X_val)[:, 1]

thresholds = np.linspace(0.01, 0.3, 100)
f1_scores = [f1_score(y_val, y_val_proba >= t) for t in thresholds]

best_threshold = thresholds[np.argmax(f1_scores)]

print("Best validation threshold:", best_threshold)

Best validation threshold: 0.3


In [100]:
y_test_proba = pipeline.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= best_threshold).astype(int)

print("TEST PR-AUC:", average_precision_score(y_test, y_test_proba))
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))

TEST PR-AUC: 0.9546613897294928
Accuracy: 0.9935681895301848

Confusion Matrix:
 [[192158   1218]
 [    33   1093]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00    193376
           1       0.47      0.97      0.64      1126

    accuracy                           0.99    194502
   macro avg       0.74      0.98      0.82    194502
weighted avg       1.00      0.99      0.99    194502



In [68]:
# X_train_scaled.dtypes[X_train_scaled.dtypes != "float64"] # Series([], dtype: object)

In [69]:
# X_train_scaled.columns.duplicated().sum() # 15

#### fit the model

In [103]:
y_proba = pipeline.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.03).astype(int)

print(classification_report(y_test, y_pred))

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))# 0.001 is best for recall

              precision    recall  f1-score   support

           0       1.00      0.97      0.99    193376
           1       0.18      0.99      0.30      1126

    accuracy                           0.97    194502
   macro avg       0.59      0.98      0.64    194502
weighted avg       1.00      0.97      0.98    194502

Accuracy: 0.9734707098127525

Confusion Matrix:
[[188225   5151]
 [     9   1117]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99    193376
           1       0.18      0.99      0.30      1126

    accuracy                           0.97    194502
   macro avg       0.59      0.98      0.64    194502
weighted avg       1.00      0.97      0.98    194502

