In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw training transactions; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='fraudTrain.csv')
# Bare trailing expression so the notebook renders the loaded frame.
df

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.2620,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,1296670,2020-06-21 12:12:08,30263540414123,fraud_Reichel Inc,entertainment,15.56,Erik,Patterson,M,162 Jessica Row Apt. 072,...,37.7175,-112.4777,258,Geoscientist,1961-11-24,440b587732da4dc1a6395aba5fb41669,1371816728,36.841266,-111.690765,0
1296671,1296671,2020-06-21 12:12:19,6011149206456997,fraud_Abernathy and Sons,food_dining,51.70,Jeffrey,White,M,8617 Holmes Terrace Suite 651,...,39.2667,-77.5101,100,"Production assistant, television",1979-12-11,278000d2e0d2277d1de2f890067dcc0a,1371816739,38.906881,-78.246528,0
1296672,1296672,2020-06-21 12:12:32,3514865930894695,fraud_Stiedemann Ltd,food_dining,105.93,Christopher,Castaneda,M,1632 Cohen Drive Suite 639,...,32.9396,-105.8189,899,Naval architect,1967-08-30,483f52fe67fabef353d552c1e662974c,1371816752,33.619513,-105.130529,0
1296673,1296673,2020-06-21 12:13:36,2720012583106919,"fraud_Reinger, Weissnat and Strosin",food_dining,74.90,Joseph,Murray,M,42933 Ryan Underpass,...,43.3526,-102.5411,1126,Volunteer coordinator,1980-08-18,d667cdcbadaaed3da3f4020e83591c83,1371816816,42.788940,-103.241160,0


In [None]:
from datetime import datetime
current_year = datetime.now().year
# --- Timestamps -------------------------------------------------------------
# Parse both date columns up front. errors='coerce' does NOT raise on a bad
# value: unparseable dob strings become NaT, which then shows up as a NaN age.
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

# --- Age --------------------------------------------------------------------
# Age in whole calendar years at the time of the transaction. Measuring against
# the transaction year (rather than the wall-clock `current_year` defined
# above) keeps the feature identical no matter when the notebook is re-run.
df['age'] = df['trans_date_trans_time'].dt.year - df['dob'].dt.year

# --- Categorical encodings ----------------------------------------------------
# Transform states to numerical values (integer category codes).
df['state_code'] = df['state'].astype('category').cat.codes
# job and city are not encoded; both are dropped from the feature matrix below.

# Create age groups. include_lowest=True puts an age of exactly 18 into the
# '18-25' bucket (pd.cut intervals are otherwise open on the left). Ages outside
# 18-100 get no bucket, i.e. every age_group_* dummy is False for those rows.
df['age_group'] = pd.cut(
    df['age'],
    bins=[18, 25, 35, 45, 55, 65, 100],
    labels=['18-25', '26-35', '36-45', '46-55', '56-65', '65+'],
    include_lowest=True,
)

# --- Date features ------------------------------------------------------------
df['hour'] = df['trans_date_trans_time'].dt.hour
# Weekday index, Monday=0 ... Sunday=6 (previously this held the day of month).
df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
df['is_weekend'] = df['day_of_week'] >= 5
# Is the transaction occurring during regular business hours (hours 9 through
# 17, both inclusive)? Vectorised instead of a per-row lambda.
df['is_business_hours'] = df['hour'].between(9, 17)

# --- One-hot encoding ---------------------------------------------------------
# category drops its first level; gender and age_group keep every level.
df = pd.get_dummies(df, columns=['category'], drop_first=True)
df = pd.get_dummies(df, columns=['gender'])
df = pd.get_dummies(df, columns=['age_group'])

# --- Separate features from target -------------------------------------------
# Identifiers, free-text fields, raw columns that have been re-encoded above,
# and the label itself are removed from the feature matrix.
cols_drop = [
    'Unnamed: 0', 'first', 'last', 'is_fraud', 'cc_num', 'trans_num',
    'street', 'merchant', 'dob', 'job', 'unix_time', 'state', 'city',
]
features = df.drop(columns=cols_drop)

target = df['is_fraud']

In [None]:
import numpy as np

# Define the haversine formula
def haversine(lat1, lon1, lat2, lon2, radius=3958.8):
    """Great-circle distance between two points given in decimal degrees.

    All four coordinate arguments may be scalars, NumPy arrays or pandas
    Series; the computation is element-wise, so array-likes are processed
    in a single vectorised call.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : latitude / longitude of the second point(s), in degrees.
    radius : sphere radius; the result is expressed in the same unit.
        Defaults to 3958.8 (Earth radius in miles). Pass 6371 for kilometres.

    Returns
    -------
    Distance(s) along the sphere's surface, same shape as the inputs.
    """
    # Convert degrees to radians
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can nudge `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius * c

# Cardholder-to-merchant distance for every row, computed column-wise in a
# single haversine call (no Python-level loop).
features['distance'] = haversine(
    lat1=features['lat'],
    lon1=features['long'],
    lat2=features['merch_lat'],
    lon2=features['merch_long'],
)



In [None]:
# The raw coordinates and the raw timestamp are no longer needed once the
# distance and the hour/day features have been derived from them.
cols_drop = ['lat', 'long', 'merch_lat', 'merch_long', 'trans_date_trans_time']
features = features.drop(columns=cols_drop)

# The scaler is only instantiated here; it is fitted after the train/test split.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Lift the column cap so wide frames render every column.
pd.set_option('display.max_columns', None)

features

Unnamed: 0,amt,zip,city_pop,age,state_code,hour,day_of_week,is_weekend,is_business_hours,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_F,gender_M,age_group_18-25,age_group_26-35,age_group_36-45,age_group_46-55,age_group_56-65,age_group_65+,distance
0,4.97,28654,3495,36,27,0,1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,48.838809
1,107.23,99160,149,46,47,0,1,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,18.773185
2,220.11,83252,4154,62,13,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,67.236892
3,45.00,59632,1939,57,26,0,1,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,59.449252
4,41.96,24433,99,38,45,0,1,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,48.192064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,15.56,84735,258,63,44,12,21,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,74.411357
1296671,51.70,21790,100,45,20,12,21,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,46.668035
1296672,105.93,88325,899,57,32,12,21,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,61.546094
1296673,74.90,57756,1126,44,41,12,21,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,52.585771


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)
# Preview the (still unscaled) training features.
X_train

Unnamed: 0,amt,zip,city_pop,age,state_code,hour,day_of_week,is_weekend,is_business_hours,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_F,gender_M,age_group_18-25,age_group_26-35,age_group_36-45,age_group_46-55,age_group_56-65,age_group_65+,distance
330201,6.51,70732,1178,73,18,14,6,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,True,38.584398
798518,20.17,98238,85,40,47,19,3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,69.215279
1260375,35.85,75662,24536,41,43,12,8,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,40.185671
412511,42.21,49621,3096,49,22,1,6,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,47.329459
344644,57.24,60432,128354,32,14,23,10,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,False,False,75.393914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,72.38,54896,1478,59,48,2,4,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,66.416472
259178,2.33,6441,5438,60,6,6,9,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,70.369571
131932,118.27,87117,310,46,32,21,13,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,22.147680
671155,5.60,56729,140,82,23,9,13,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,72.426204


In [None]:
from sklearn.model_selection import train_test_split


# Re-create the split with the same seed so this cell always starts from the
# unscaled frames, even though X_train / X_test are overwritten just below.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply the already-fitted
# scaler to the test rows.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Load all models required
import pickle
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [None]:
# Define a function to evaluate and save models
from math import pi
def eval_and_save_model(model, X_train, X_test, y_train, y_test, filename):
    """Fit a model, print its test metrics, and pickle it to disk.

    The model is fitted in place on (X_train, y_train), so the object passed
    in is left in its fitted state. Accuracy and the classification report
    are computed on (X_test, y_test) and printed. The fitted model is then
    written to `filename` with pickle. Nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Headline accuracy first, then the per-class breakdown.
    score = accuracy_score(y_test, predictions)
    print(f"{model.__class__.__name__} Accuracy: {score}")
    report = classification_report(y_test, predictions)
    print(f"Classification report: \n{report}")

    # Persist the fitted estimator.
    with open(filename, 'wb') as out_file:
        pickle.dump(model, out_file)
    print(f"Model saved as {filename}")
    print("----------")

In [None]:
# Setup models
# One estimator per model family evaluated below; all share the same seed.
xgboost_model = xgb.XGBClassifier(random_state=25, learning_rate=0.4)
# lr_model is used by the Borderline-SMOTE and ADASYN evaluation cells but was
# never defined, so a fresh Restart & Run All stopped there with a NameError.
# NOTE(review): the hyperparameters of the original logistic regression run are
# not recorded in this notebook; this uses the defaults plus a fixed seed and a
# higher iteration cap -- confirm against the intended configuration.
lr_model = LogisticRegression(random_state=25, max_iter=1000)
dt_model = DecisionTreeClassifier(random_state=25)
rf_model = RandomForestClassifier(class_weight='balanced', random_state=25)


In [None]:
# Baseline: XGBoost fitted on the training split without any resampling.
eval_and_save_model(
    model=xgboost_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    filename='xgboost_model.pkl',
)
#eval_and_save_model(dt_model, X_train, X_test, y_train, y_test, 'dt_model.pkl')
#eval_and_save_model(rf_model, X_train, X_test, y_train, y_test, 'rf_model.pkl')


XGBClassifier Accuracy: 0.9989897237164286
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.96      0.86      0.91      1520

    accuracy                           1.00    259335
   macro avg       0.98      0.93      0.95    259335
weighted avg       1.00      1.00      1.00    259335

Model saved as xgboost_model.pkl
----------


# Now let's explore sampling techniques

In [None]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN

# Plain SMOTE. Every sampler in this cell is fitted on the training split only;
# X_test / y_test are not passed to any of them.
smote = SMOTE(random_state=25)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Apply Borderline-SMOTE
# NOTE(review): this rebinds `smote`, so after this cell the name refers to the
# BorderlineSMOTE sampler rather than the plain SMOTE instance created above.
smote = BorderlineSMOTE(random_state=25)
border_X_resampled, border_y_resampled = smote.fit_resample(X_train, y_train)

# Or use ADASYN
adasyn = ADASYN(random_state=25)
adasyn_X_resampled, adasyn_y_resampled = adasyn.fit_resample(X_train, y_train)


In [None]:
# SMOTE MODELS
# Fitted on the SMOTE-resampled training data, scored on the untouched test split.
eval_and_save_model(
    model=xgboost_model,
    X_train=X_train_resampled,
    X_test=X_test,
    y_train=y_train_resampled,
    y_test=y_test,
    filename='xgboost_model-SMOTE.pkl',
)
#eval_and_save_model(lr_model, X_train_resampled, X_test, y_train_resampled, y_test, 'lr_model-SMOTE.pkl')
#eval_and_save_model(dt_model, X_train_resampled, X_test, y_train_resampled, y_test, 'dt_model-SMOTE.pkl')

XGBClassifier Accuracy: 0.9980295756454007
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.79      0.91      0.84      1520

    accuracy                           1.00    259335
   macro avg       0.89      0.95      0.92    259335
weighted avg       1.00      1.00      1.00    259335

Model saved as xgboost_model-SMOTE.pkl
----------


In [None]:
# Borderline MODELS
# Each estimator is refitted on the Borderline-SMOTE training data and scored
# on the untouched test split.
eval_and_save_model(
    xgboost_model, border_X_resampled, X_test, border_y_resampled, y_test,
    filename='xgboost_model-BORDER.pkl',
)
eval_and_save_model(
    lr_model, border_X_resampled, X_test, border_y_resampled, y_test,
    filename='lr_model-BORDER.pkl',
)
eval_and_save_model(
    dt_model, border_X_resampled, X_test, border_y_resampled, y_test,
    filename='dt_model-BORDER.pkl',
)

XGBClassifier Accuracy: 0.9984421693947982
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.85      0.89      0.87      1520

    accuracy                           1.00    259335
   macro avg       0.93      0.94      0.93    259335
weighted avg       1.00      1.00      1.00    259335

Model saved as xgboost_model-BORDER.pkl
----------
LogisticRegression Accuracy: 0.8796036015192704
Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.88      0.94    257815
           1       0.04      0.79      0.07      1520

    accuracy                           0.88    259335
   macro avg       0.52      0.83      0.50    259335
weighted avg       0.99      0.88      0.93    259335

Model saved as lr_model-BORDER.pkl
----------
DecisionTreeClassifier Accuracy: 0.9974550292093238
Classification report: 
              precision    recall 

In [None]:
# ADASYN MODELS
# Each estimator is refitted on the ADASYN training data and scored on the
# untouched test split.
eval_and_save_model(
    xgboost_model, adasyn_X_resampled, X_test, adasyn_y_resampled, y_test,
    filename='xgboost_model-ADASYN.pkl',
)
eval_and_save_model(
    lr_model, adasyn_X_resampled, X_test, adasyn_y_resampled, y_test,
    filename='lr_model-ADASYN.pkl',
)
eval_and_save_model(
    dt_model, adasyn_X_resampled, X_test, adasyn_y_resampled, y_test,
    filename='dt_model-ADASYN.pkl',
)

XGBClassifier Accuracy: 0.9977210943374399
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.75      0.91      0.82      1520

    accuracy                           1.00    259335
   macro avg       0.88      0.96      0.91    259335
weighted avg       1.00      1.00      1.00    259335

Model saved as xgboost_model-ADASYN.pkl
----------
LogisticRegression Accuracy: 0.7281122871960977
Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.73      0.84    257815
           1       0.02      0.95      0.04      1520

    accuracy                           0.73    259335
   macro avg       0.51      0.84      0.44    259335
weighted avg       0.99      0.73      0.84    259335

Model saved as lr_model-ADASYN.pkl
----------
DecisionTreeClassifier Accuracy: 0.9969306109857906
Classification report: 
              precision    recall 

In [None]:
# xgboost_model is refitted by every eval_and_save_model call, so these
# importances describe whichever fit ran last.
feature_importance = xgboost_model.feature_importances_
# Column labels of the feature frame, paired positionally with the importances.
feature_names = features.columns

# Pair name with importance and rank from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)
feature_importance_df

Unnamed: 0,Feature,Importance
12,category_grocery_pos,0.46127
10,category_gas_transport,0.179728
5,hour,0.047582
0,amt,0.04634
21,category_travel,0.040874
14,category_home,0.02403
11,category_grocery_net,0.023799
22,gender_F,0.023156
17,category_misc_pos,0.020168
16,category_misc_net,0.016421
