In [1]:
import warnings
# NOTE(review): this silences every warning for the whole session, so any
# diagnostics emitted by the libraries used below are hidden as well.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from random import seed,sample
import datetime as dt
from datetime import datetime
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler # Undersampling for Unbalanced Data
from imblearn.over_sampling import SMOTE # Oversampling for Unbalanced Data
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler # Data Encoders
from sklearn.model_selection import train_test_split
import scipy.stats as stats
from sklearn.ensemble import RandomForestClassifier

# Allow up to 500 columns to be displayed so wide frames are not truncated.
pd.set_option('display.max_columns', 500)

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from imblearn.over_sampling import SMOTE
%matplotlib inline

In [2]:
# Read the pre-split feature and label files from the parent directory.
X_train, X_test = pd.read_csv('../X_train.csv'), pd.read_csv('../X_test.csv')
y_train, y_test = pd.read_csv('../y_train.csv'), pd.read_csv('../y_test.csv')

In [3]:
# Summarise the training labels: column names, dtypes and non-null counts.
y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 1 columns):
 #   Column    Non-Null Count    Dtype
---  ------    --------------    -----
 0   is_fraud  1296675 non-null  int64
dtypes: int64(1)
memory usage: 9.9 MB


In [4]:
# Preview the first rows of the raw training features.
X_train.head()

Unnamed: 0,category,amount(usd),gender,state,lat,long,job,merch_lat,merch_long,hour_of_day,day_of_week,age
0,gas_transport,62.29,F,TX,33.3749,-99.8473,Warden/ranger,33.982824,-99.626096,4,Saturday,37
1,misc_pos,5.56,M,AL,33.9778,-86.5598,Aid worker,33.875221,-86.806611,18,Friday,42
2,shopping_pos,9.9,F,KS,37.6223,-97.3136,Textile designer,37.682608,-97.540664,10,Wednesday,90
3,personal_care,6.29,M,NY,42.958,-77.3083,Cytogeneticist,42.039191,-77.065052,17,Sunday,67
4,entertainment,20.71,F,WV,39.4125,-80.6352,Chief Executive Officer,40.259233,-81.28511,14,Tuesday,49


In [5]:
# Preview the first rows of the training labels.
y_train.head()

Unnamed: 0,is_fraud
0,0
1,0
2,0
3,0
4,0


#### Local Preprocessing - Feature Scaling

Scale only the features listed below. We don't scale hour_of_day, since its values already lie within a small, bounded range and have a clear numerical interpretation (hours of the day).

In [None]:
from sklearn.preprocessing import StandardScaler

# Continuous columns to standardise (hour_of_day is deliberately left as-is).
features_to_scale = ["amount(usd)", "lat", "long", "merch_lat", "merch_long", "age"]

# Learn the scaling statistics from the training split only, so that the
# test split is transformed with parameters it did not contribute to.
scaler = StandardScaler().fit(X_train[features_to_scale])

# Work on copies so the raw frames stay untouched.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Apply the fitted scaler to both splits.
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

X_test_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   category     555719 non-null  object 
 1   amount(usd)  555719 non-null  float64
 2   gender       555719 non-null  object 
 3   state        555719 non-null  object 
 4   lat          555719 non-null  float64
 5   long         555719 non-null  float64
 6   job          555719 non-null  object 
 7   merch_lat    555719 non-null  float64
 8   merch_long   555719 non-null  float64
 9   hour_of_day  555719 non-null  int64  
 10  day_of_week  555719 non-null  object 
 11  age          555719 non-null  float64
dtypes: float64(6), int64(1), object(5)
memory usage: 50.9+ MB


In [None]:
# Preview the test features after scaling the selected numeric columns.
X_test_scaled.head()

Unnamed: 0,category,amount(usd),gender,state,lat,long,job,merch_lat,merch_long,hour_of_day,day_of_week,age
0,health_fitness,0.272269,M,AL,-0.899089,0.267408,Aid worker,-0.849623,0.212847,14,Tuesday,-0.184877
1,misc_pos,-0.414741,F,CA,-0.870119,-2.009787,Civil Service fast streamer,-0.984834,-1.957631,8,Monday,-0.587264
2,shopping_pos,-0.397939,F,SC,-0.888289,0.583966,Research scientist (physical sciences),-0.842206,0.640575,18,Saturday,-0.644748
3,shopping_net,-0.399281,F,MN,1.773167,-0.418025,Applications developer,1.686301,-0.489843,7,Monday,0.447446
4,grocery_pos,0.182959,M,OH,0.445917,0.644181,Building control surveyor,0.264207,0.612831,3,Friday,0.619897


In [None]:
# Count the distinct values of each candidate categorical column in one pass.
cardinalities = X_test_scaled[['state', 'category', 'job']].nunique()
unqiue_state, unique_category, unique_job = cardinalities.tolist()

print("no of unqiue sates: ", unqiue_state)
print("no of unqiue categories: ", unique_category)
print("no of unqiue jobs: ", unique_job)

no of unqiue sates:  51
no of unqiue categories:  14
no of unqiue jobs:  495


#### Local Preprocessing - One-Hot Encoding

Next, we one-hot encode the lower-cardinality categorical variables (category, gender and day_of_week) so that they can be used as inputs to a logistic regression model. The state and job columns are not encoded.

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to expand into dummy variables.
categorical_cols = ['category', 'gender', 'day_of_week']

# drop='first' removes one level per feature so the dummy columns are not
# perfectly collinear. The encoder is left at its default (sparse) output and
# densified with .toarray() below: the keyword that requests dense output was
# renamed from `sparse` to `sparse_output` across scikit-learn releases, so
# not passing it keeps this cell working on both older and newer versions.
encoder = OneHotEncoder(drop='first')

# Fit on the training data only, then apply the same mapping to the test data.
X_train_encoded = encoder.fit_transform(X_train_scaled[categorical_cols]).toarray()
X_test_encoded = encoder.transform(X_test_scaled[categorical_cols]).toarray()

# Wrap the arrays in DataFrames with readable column names. The source index
# is reused so that a later column-wise concat with the scaled frames aligns
# row by row even if those frames do not carry a default RangeIndex.
encoded_columns = encoder.get_feature_names_out(categorical_cols)
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_columns, index=X_train_scaled.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_columns, index=X_test_scaled.index)





In [None]:
# Scaled numeric columns that are carried into the model alongside the dummies.
numeric_cols = ['age', 'lat', 'long', 'amount(usd)']

# Place the one-hot columns first, followed by the selected numeric columns.
X_train_final = pd.concat([X_train_encoded_df, X_train_scaled[numeric_cols]], axis=1)
X_test_final = pd.concat([X_test_encoded_df, X_test_scaled[numeric_cols]], axis=1)


In [None]:
# Preview the assembled design matrix (one-hot columns plus scaled numerics).
X_train_final.head()

Unnamed: 0,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,age,lat,long,amount(usd)
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.52978,-1.017905,-0.697973,-0.049452
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.242361,-0.899089,0.267408,-0.411866
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.516865,-0.180855,-0.513891,-0.38414
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.194736,0.87067,0.93956,-0.407203
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.160026,0.171946,0.69785,-0.315082


In [None]:
# For each split, build a per-column "has any missing value" mask and keep
# the names of the columns where it is True.
train_has_nan = X_train_final.isnull().any(axis=0)
test_has_nan = X_test_final.isnull().any(axis=0)

nan_columns_train = X_train_final.columns[train_has_nan]
nan_columns_test = X_test_final.columns[test_has_nan]

# Report the affected columns (an empty Index means no NaNs were found).
print("Columns with NaN values in X_train_final:", nan_columns_train)
print("Columns with NaN values in X_test_final:", nan_columns_test)

Columns with NaN values in X_train_final: Index([], dtype='object')
Columns with NaN values in X_test_final: Index([], dtype='object')


In [None]:
# Features and labels must have the same number of rows before fitting;
# print both counts side by side for a visual check.
n_rows_X = X_train_final.shape[0]
n_rows_y = y_train.shape[0]

print("Number of rows in X:", n_rows_X)
print("Number of rows in y:", n_rows_y)


Number of rows in X: 1296675
Number of rows in y: 1296675


Run logistic regression model to predict on the test data

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Baseline: logistic regression fitted on the original (imbalanced) training data.
logistic_reg = LogisticRegression(random_state=42)

# y_train was loaded with pd.read_csv and is a one-column DataFrame, i.e. a
# column vector of shape (n_samples, 1). scikit-learn expects a 1-D target of
# shape (n_samples,), so flatten it explicitly instead of relying on an
# implicit conversion.
logistic_reg.fit(X_train_final, y_train.values.ravel())

# Predict on the held-out test data.
y_pred = logistic_reg.predict(X_test_final)

# Evaluate the model. zero_division=0 makes precision/F1 an explicit 0 for a
# class that is never predicted, rather than leaving that case to a warning.
print(classification_report(y_test.values.ravel(), y_pred, zero_division=0))

Precision for class 1 (fraud) is 0.00, meaning that none of the transactions the model flags as fraud (if it flags any at all) are actually fraudulent. Recall for class 1 (fraud) is also 0.00, indicating that the model fails to capture the actual fraudulent transactions: essentially all of them are predicted as non-fraudulent, i.e. they are false negatives. The F1-score for class 1 is 0.00, which is the harmonic mean of precision and recall. It is zero because both precision and recall are zero.

These results are likely a consequence of the class imbalance issue. The model is biased toward the majority class (non-fraudulent transactions) because there are significantly more instances of that class in the dataset. As a result, the model tends to predict the majority class and perform poorly on the minority class.

Now, we will apply SMOTE to oversample the minority (fraud) class in the training data, and then refit the logistic regression model on the resampled data.

In [None]:
# sampling_strategy=1.0 asks for a 1:1 minority-to-majority ratio after
# resampling; random_state fixes the synthetic samples for reproducibility.
smote = SMOTE(random_state=42, sampling_strategy=1.0)

# Oversample the training split only.
resampled = smote.fit_resample(X_train_final, y_train)
X_train_resampled, y_train_resampled = resampled


In [None]:
def _count_label(labels, value=1):
    """Return how many entries of `labels` equal `value`.

    `labels` may be a Series, a one-column DataFrame or an array; it is
    flattened first. Counting with a boolean mask avoids an integer-key lookup
    on a `value_counts()` result and yields 0 (instead of raising KeyError)
    when `value` does not occur.
    """
    return int((np.ravel(labels) == value).sum())


# Class counts in the original training labels.
fraudulent_before = _count_label(y_train)
non_fraudulent_before = len(y_train) - fraudulent_before

# Class counts after SMOTE oversampling.
fraudulent_after = _count_label(y_train_resampled)
non_fraudulent_after = len(y_train_resampled) - fraudulent_after

print("Number of non fraudulent counts before SMOTE:", non_fraudulent_before)
print("Number of fraudulent counts before SMOTE:", fraudulent_before)
print("Number of non fraudulent counts after SMOTE:", non_fraudulent_after)
print("Number of fraudulent counts after SMOTE:", fraudulent_after)

Number of non fraudulent counts before SMOTE: 1289919
Number of fraudulent counts before SMOTE: 6756
Number of non fraudulent counts after SMOTE: 1289919
Number of fraudulent counts after SMOTE: 1289919


In [None]:
# Refit a fresh logistic regression model, this time on the SMOTE-resampled data.
logistic_reg = LogisticRegression(random_state=42)

# Flatten the target to 1-D (n_samples,) as scikit-learn expects; np.ravel
# handles both a one-column DataFrame and a Series.
logistic_reg.fit(X_train_resampled, np.ravel(y_train_resampled))

# Predict on the untouched (not resampled) test data.
y_pred = logistic_reg.predict(X_test_final)

# Evaluate the model; zero_division=0 reports an explicit 0 for a class with
# no predicted samples rather than leaving that case to a warning.
print(classification_report(np.ravel(y_test), y_pred, zero_division=0))


              precision    recall  f1-score   support

           0       1.00      0.88      0.94    552824
           1       0.03      0.76      0.06      2895

    accuracy                           0.88    555719
   macro avg       0.52      0.82      0.50    555719
weighted avg       0.99      0.88      0.93    555719



It looks like SMOTE has helped improve the recall for the positive class (fraudulent transactions), but there is still room for improvement in precision. Here's a summary of the classification report:

For class 0 (non-fraudulent transactions), the precision is high (1.00), meaning that when the model predicts a transaction as non-fraudulent, it's usually correct. However, the recall is lower (0.88), which means that about 12% of the genuinely non-fraudulent transactions are incorrectly flagged as fraud (false positives).

For class 1 (fraudulent transactions), the precision is very low (0.03), meaning that when the model predicts a transaction as fraudulent, it's often incorrect. The recall is higher (0.76), indicating that the model is better at identifying fraudulent transactions.

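The overall accuracy is 0.88, which looks relatively high but is not a meaningful measure here because of the class imbalance: the majority of transactions are non-fraudulent, so a model that labelled every test transaction as non-fraudulent would reach roughly 99.5% accuracy (552,824 of the 555,719 test transactions are non-fraudulent).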