In [None]:
#Imports
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder

In [62]:
# Load dataset
# The CSV location can be overridden through the FRAUD_TRAIN_CSV environment
# variable; the default is the original local path, so existing setups keep working.
DATA_PATH = os.environ.get(
    "FRAUD_TRAIN_CSV",
    "/Users/titus/Downloads/archive (1)/fraudTrain.csv",
)
data = pd.read_csv(DATA_PATH)

In [63]:
# Define target: the 'is_fraud' column is what the model predicts
target_col = 'is_fraud'
y = data[target_col]

# Define features: every column except the target
X = data.drop(target_col, axis=1)

In [64]:
# Drop the columns that are not used as model features
unused_columns = [
    'Unnamed: 0', 'unix_time', 'cc_num', 'first', 'last', 'street',
    'city', 'state', 'zip', 'city_pop', 'trans_num',
]
X = X.drop(columns=unused_columns)

In [65]:
# Convert the date-of-birth and transaction-timestamp columns to datetime
for date_col in ('dob', 'trans_date_trans_time'):
    X[date_col] = pd.to_datetime(X[date_col])

In [66]:
# Select the categorical (object-dtype) features to one-hot encode in the next step
X_categorical = X.select_dtypes(include='object')

In [67]:
# Leave 'job' and 'merchant' out of the columns that get one-hot encoded
X_categorical = X_categorical.drop(columns=['job', 'merchant'])

In [68]:
# Encode categorical features.
# Binary categories collapse to a single column (drop="if_binary") and the
# transformed output is a dense pandas DataFrame.
# NOTE(review): the encoder is fit on the full dataset, i.e. before the
# train/test split performed further down.
ohe = OneHotEncoder(drop="if_binary", sparse_output=False)
ohe.set_output(transform='pandas')

# Fit on the categorical features, then store their encoded version
encoded_features = ohe.fit(X_categorical).transform(X_categorical)

In [69]:
# Keep every column that is not object-dtype (this retains the datetime columns too)
X_numerical = X.select_dtypes(exclude='object')

In [70]:
# Derive the card holder's age (in years) at transaction time and the calendar
# parts of the transaction timestamp. The timestamp column is already datetime
# (the age subtraction below relies on it), so it is used directly.
trans_ts = X_numerical['trans_date_trans_time']
X_numerical['age'] = (trans_ts - X_numerical['dob']).dt.days / 365.25
X_numerical['trans_month'] = trans_ts.dt.month
X_numerical['trans_day'] = trans_ts.dt.day
X_numerical['trans_hour'] = trans_ts.dt.hour

In [71]:
# The raw datetime columns are no longer needed once age/month/day/hour exist
X_numerical = X_numerical.drop(columns=['trans_date_trans_time', 'dob'])

In [72]:
# Use cyclic (sin/cos) encoding for month, day and hour.
# Each pair is (source column, period used to map the value onto the circle).
for part, period in (("trans_month", 12), ("trans_day", 31), ("trans_hour", 24)):
    angle = 2 * np.pi * X_numerical[part] / period
    X_numerical[f"{part}_sin"] = np.sin(angle)
    X_numerical[f"{part}_cos"] = np.cos(angle)

In [73]:
# Drop the integer month/day/hour columns now that their sin/cos encodings exist
raw_date_parts = ['trans_month', 'trans_day', 'trans_hour']
X_numerical = X_numerical.drop(columns=raw_date_parts)

In [74]:
# Feature scaling is intentionally not applied at this stage: a RobustScaler is
# the first step of the modeling pipeline defined further down, where it is fit
# on the training data passed to the pipeline rather than on the full dataset.

"robust_scaler = RobustScaler().set_output(transform='pandas')\nX_numerical['amt']= robust_scaler.fit_transform(X_numerical[['amt']])\nX_numerical['age']= robust_scaler.fit_transform(X_numerical[['age']])"

In [75]:
# Put the numeric features and the one-hot encoded features side by side
feature_blocks = (X_numerical, encoded_features)
X_preprocessed = pd.concat(feature_blocks, axis="columns")

In [None]:
# Compute distance in locations

# Earth radius in kilometers
r = 6371

def haversine_distance(lat1, lon1, lat2, lon2, radius=r):
    """Return the great-circle (haversine) distance between two points.

    Parameters
    ----------
    lat1, lon1 : scalar or array-like
        Latitude and longitude of the first point(s), in decimal degrees.
    lat2, lon2 : scalar or array-like
        Latitude and longitude of the second point(s), in decimal degrees.
    radius : float, optional
        Sphere radius; the result is expressed in the same unit. Defaults to
        the module-level Earth radius ``r`` (kilometers).

    Returns
    -------
    scalar or array-like
        Distance(s) along the sphere's surface. Array-like inputs are handled
        element-wise.
    """
    # Convert to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make arcsin return NaN; cap it at its mathematical maximum.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return radius * c

# Compute the card-holder-to-merchant distance for each row of the dataframe.
# Column order matches the function's (lat1, lon1, lat2, lon2) arguments.
coordinate_columns = ["lat", "long", "merch_lat", "merch_long"]
X_preprocessed["distance_km"] = haversine_distance(
    *(X_preprocessed[name] for name in coordinate_columns)
)

# The raw coordinates are dropped once the distance feature exists
X_preprocessed = X_preprocessed.drop(columns=coordinate_columns)

In [77]:
# Preview the assembled feature matrix (the rendered output shows only the first and last rows)
X_preprocessed

Unnamed: 0,amt,age,trans_month_sin,trans_month_cos,trans_day_sin,trans_day_cos,trans_hour_sin,trans_hour_cos,category_entertainment,category_food_dining,...,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M,distance_km
0,4.97,30.814511,5.000000e-01,0.866025,0.201299,0.979530,0.000000e+00,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,78.597568
1,107.23,40.531143,5.000000e-01,0.866025,0.201299,0.979530,0.000000e+00,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.212176
2,220.11,56.950034,5.000000e-01,0.866025,0.201299,0.979530,0.000000e+00,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,108.206083
3,45.00,51.969884,5.000000e-01,0.866025,0.201299,0.979530,0.000000e+00,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,95.673231
4,41.96,32.763860,5.000000e-01,0.866025,0.201299,0.979530,0.000000e+00,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,77.556744
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,15.56,58.573580,1.224647e-16,-1.000000,-0.897805,-0.440394,1.224647e-16,-1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,119.752136
1296671,51.70,40.528405,1.224647e-16,-1.000000,-0.897805,-0.440394,1.224647e-16,-1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,75.104085
1296672,105.93,52.810404,1.224647e-16,-1.000000,-0.897805,-0.440394,1.224647e-16,-1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,99.047734
1296673,74.90,39.841205,1.224647e-16,-1.000000,-0.897805,-0.440394,1.224647e-16,-1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,84.627652


### MODEL DESIGN STARTS HERE

In [78]:
# Hold out 30% of the rows as a test set; stratify on the target so both
# splits are drawn with the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [79]:
# Modeling pipeline: scale -> SMOTE oversampling -> logistic regression.
# Pipeline here is the imblearn one (see imports), which accepts a sampler step.
scaler_step = ("scaler", RobustScaler())
smote_step = ("smote", SMOTE(random_state=42))
clf_step = (
    "clf",
    LogisticRegression(class_weight="balanced", max_iter=2000, n_jobs=-1),
)

pipe = Pipeline(steps=[scaler_step, smote_step, clf_step])

In [80]:
# 5-fold stratified cross-validation of the whole pipeline on the training split
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Reported metric name -> sklearn scorer string (the two coincide here)
scoring = {name: name for name in ("accuracy", "precision", "recall", "f1")}

cv_results = cross_validate(
    pipe, X_train, y_train,
    cv=cv,
    scoring=scoring,
    return_train_score=True,
    n_jobs=-1,
)

# Mean and standard deviation of each metric across the validation folds
for metric_name in scoring:
    fold_scores = cv_results["test_" + metric_name]
    print(f"{metric_name:12}: {fold_scores.mean():.4f} ± {fold_scores.std():.4f}")

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + 

accuracy    : 0.8754 ± 0.0015
precision   : 0.0384 ± 0.0004
recall      : 0.8536 ± 0.0087
f1          : 0.0735 ± 0.0007


In [81]:
# Final fit on the unscaled training split (the pipeline does its own scaling),
# then predict hard labels for the unscaled test split
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Predicted probability of the second class (column 1 of predict_proba)
y_proba = pipe.predict_proba(X_test)[:, 1]

# Classification report as a table, plus the confusion matrix
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report)
cm = confusion_matrix(y_test, y_pred)

print(report_df.T)
print("Confusion Matrix:\n", cm)


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


              precision    recall  f1-score        support
0              0.999038  0.875344  0.933109  386751.000000
1              0.038415  0.855240  0.073527    2252.000000
accuracy       0.875227  0.875227  0.875227       0.875227
macro avg      0.518726  0.865292  0.503318  389003.000000
weighted avg   0.993477  0.875227  0.928133  389003.000000
Confusion Matrix:
 [[338540  48211]
 [   326   1926]]


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
