In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score


In [2]:
# Load the dataset
df = pd.read_csv(r'C:\Users\Sujal\OneDrive\Desktop\ML Intern\Fraud detection\fraudTrain.csv')

In [3]:
# Display the first few rows of the dataset
df.head()


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [4]:

# Drop the 'Unnamed: 0' column as it's likely just an index column
df = df.drop(columns=['Unnamed: 0'])

In [5]:
# Check for missing values
print(df.isnull().sum())

trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


In [6]:
# Select relevant features and drop irrelevant columns
relevant_features = ['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'gender', 'city', 
                     'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'unix_time', 
                     'merch_lat', 'merch_long']
df = df[relevant_features + ['is_fraud']]

In [7]:
# Encode categorical variables
categorical_features = ['merchant', 'category', 'gender', 'city', 'state', 'job']
numerical_features = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

In [8]:
# Separate features and target variable
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

In [9]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Create a pipeline for Logistic Regression
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear'))
])

In [11]:
# Train the Logistic Regression model
lr_pipeline.fit(X_train, y_train)

In [12]:
# Make predictions
y_pred_lr = lr_pipeline.predict(X_test)
print("Logistic Regression Report")
print(classification_report(y_test, y_pred_lr))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_lr)}")

Logistic Regression Report
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    257815
           1       0.42      0.06      0.10      1520

    accuracy                           0.99    259335
   macro avg       0.71      0.53      0.55    259335
weighted avg       0.99      0.99      0.99    259335

ROC-AUC Score: 0.5283895747868984


In [13]:
# Create a pipeline for Decision Tree
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

In [14]:
# Train the Decision Tree model
dt_pipeline.fit(X_train, y_train)

In [15]:
# Make predictions
y_pred_dt = dt_pipeline.predict(X_test)
print("Decision Tree Report")
print(classification_report(y_test, y_pred_dt))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_dt)}")

Decision Tree Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.72      0.69      0.70      1520

    accuracy                           1.00    259335
   macro avg       0.86      0.84      0.85    259335
weighted avg       1.00      1.00      1.00    259335

ROC-AUC Score: 0.8419757715906039


In [16]:
# Create a pipeline for Random Forest
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [17]:
# Train the Random Forest model
rf_pipeline.fit(X_train, y_train)

In [18]:
# Make predictions
y_pred_rf = rf_pipeline.predict(X_test)
print("Random Forest Report")
print(classification_report(y_test, y_pred_rf))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_rf)}")

Random Forest Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.91      0.62      0.73      1520

    accuracy                           1.00    259335
   macro avg       0.95      0.81      0.87    259335
weighted avg       1.00      1.00      1.00    259335

ROC-AUC Score: 0.8077104962044387


Check Accuracy by using test dataset

In [19]:

# Load the new test dataset
new_test_df = pd.read_csv(r'C:\Users\Sujal\OneDrive\Desktop\ML Intern\Fraud detection\fraudTest.csv')

In [20]:
# Display the first few rows of the new test dataset
new_test_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [21]:
# Drop the 'Unnamed: 0' column if it exists
if 'Unnamed: 0' in new_test_df.columns:
    new_test_df = new_test_df.drop(columns=['Unnamed: 0'])

In [22]:
# Select relevant features and drop irrelevant columns
new_test_df = new_test_df[relevant_features + ['is_fraud']]

In [23]:

# Separate features and target variable
X_new_test = new_test_df.drop(columns=['is_fraud'])
y_new_test = new_test_df['is_fraud']


In [24]:
# Make predictions with Logistic Regression
y_pred_lr_new = lr_pipeline.predict(X_new_test)

In [25]:
# Make predictions with Decision Tree
y_pred_dt_new = dt_pipeline.predict(X_new_test)

In [26]:
# Make predictions with Random Forest
y_pred_rf_new = rf_pipeline.predict(X_new_test)

In [27]:
# Logistic Regression Evaluation
print("Logistic Regression Report (New Test Data)")
print(classification_report(y_new_test, y_pred_lr_new))
print(f"ROC-AUC Score: {roc_auc_score(y_new_test, y_pred_lr_new)}")


Logistic Regression Report (New Test Data)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719

ROC-AUC Score: 0.4999829078468971


In [28]:
# Decision Tree Evaluation
print("Decision Tree Report (New Test Data)")
print(classification_report(y_new_test, y_pred_dt_new))
print(f"ROC-AUC Score: {roc_auc_score(y_new_test, y_pred_dt_new)}")

Decision Tree Report (New Test Data)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.53      0.56      0.54      2145

    accuracy                           1.00    555719
   macro avg       0.76      0.78      0.77    555719
weighted avg       1.00      1.00      1.00    555719

ROC-AUC Score: 0.778063564113487


In [29]:
# Random Forest Evaluation
print("Random Forest Report (New Test Data)")
print(classification_report(y_new_test, y_pred_rf_new))
print(f"ROC-AUC Score: {roc_auc_score(y_new_test, y_pred_rf_new)}")

Random Forest Report (New Test Data)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.81      0.42      0.55      2145

    accuracy                           1.00    555719
   macro avg       0.90      0.71      0.78    555719
weighted avg       1.00      1.00      1.00    555719

ROC-AUC Score: 0.7093683362404437
