In [1]:
# prompt: write a code which loads the zip file from drive and the extracts it after that uses pandas to read that csv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import zipfile
# from google.colab import drive

# warnings.filterwarnings('ignore')

# # drive.mount('/content/drive')

# # Replace 'your_zip_file.zip' with the actual name of your zip file
# zip_file_path = '/content/drive/MyDrive/fraudTrain.csv.zip' # Update with your zip file path
# extract_path = '/content/extracted_files'

# try:
#     with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#         zip_ref.extractall(extract_path)
#     print(f"Successfully extracted '{zip_file_path}' to '{extract_path}'")

#     # Replace 'your_csv_file.csv' with the actual name of your CSV file inside the zip
#     csv_file_path = '/content/extracted_files/fraudTrain.csv' # Update with your CSV file path inside the zip

#     # Read the CSV file into a pandas DataFrame
#     df = pd.read_csv(csv_file_path)
#     df.head() # Display the first few rows to verify

# except FileNotFoundError:
#     print(f"Error: Zip file '{zip_file_path}' not found.")
# except KeyError:
#     print(f"Error: CSV file 'your_csv_file.csv' not found inside the zip archive.")
# except Exception as e:
#     print(f"An unexpected error occurred: {e}")


In [9]:
# Read the raw training and test splits from the local data/ directory.
df, df_test = map(pd.read_csv, ('data/fraudTrain.csv', 'data/fraudTest.csv'))

In [10]:
# Preview the first rows of the raw training frame (all columns still present).
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [11]:
# List every column of the raw training frame before feature selection.
df.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

# Fraud Detection Model - Column Explanation & Feature Selection

## Column Explanation

| **Column**                | **Description** | **Keep/Drop** |
|---------------------------|---------------|--------------|
| `Unnamed: 0`              | Index column, redundant. | ❌ Drop |
| `trans_date_trans_time`   | Timestamp of transaction. Helps with fraud patterns (e.g., time-based anomalies). | ✅ Keep |
| `cc_num`                  | Credit card number. Not useful directly, but could be hashed for fraud patterns. | ✅ Keep (Sensitive Info) |
| `merchant`                | Merchant where transaction occurred. Might help detect fraud patterns by merchant. | ✅ Keep |
| `category`                | Type of transaction (e.g., groceries, electronics). Important for fraud detection. | ✅ Keep |
| `amt`                     | Transaction amount. Fraudsters may perform small or large unusual transactions. | ✅ Keep |
| `first`, `last`           | Customer’s first and last name. Irrelevant for fraud detection. | ❌ Drop |
| `gender`                  | Customer’s gender. Unlikely to contribute to fraud detection. | ❌ Drop |
| `street`, `city`, `state`, `zip` | Address details. May not provide useful fraud patterns directly. | ❌ Drop |
| `lat`, `long`             | User's latitude & longitude. Might help if geolocation fraud detection is considered. | ❌ Drop |
| `city_pop`                | Population of the city. Might help detect fraud in highly populated vs. less populated areas. | ❌ Drop |
| `job`                     | Customer's job title. Unlikely to be relevant. | ❌ Drop |
| `dob`                     | Date of birth. Not useful directly for fraud detection. | ❌ Drop |
| `trans_num`               | Unique transaction ID. Not predictive for fraud detection. | ❌ Drop |
| `unix_time`               | Transaction time in Unix format. Useful for time-based fraud detection. | ✅ Keep |
| `merch_lat`, `merch_long` | Merchant's latitude & longitude. Useful for geolocation-based fraud detection. | ✅ Keep |
| `is_fraud`                | Target variable (1 = fraud, 0 = not fraud). Required for model training. | ✅ Keep |

---

## **Final Recommended Features for Model**
### ✅ **Keep:**
- `trans_date_trans_time`
- `cc_num`
- `merchant`
- `category`
- `amt`
- `unix_time`
- `merch_lat`, `merch_long`
- `is_fraud` (Target)

### ❌ **Drop:**
- `Unnamed: 0`, `first`, `last`, `gender`, `street`, `city`, `state`, `zip`, `lat`, `long`, `city_pop`, `job`, `dob`, `trans_num`

---

## **Next Steps**
### **1. Feature Engineering**
- Extract **hour, day, weekday, weekend flags** from `trans_date_trans_time`.

### **2. Encoding**
- Convert categorical features (`merchant`, `category`) into numerical form (e.g., One-Hot Encoding or Label Encoding).

### **3. Scaling**
- Normalize `amt`, `lat/long`, `merch_lat/merch_long`.

### **4. Time-based Features**
- Check if fraudulent transactions occur at unusual hours (e.g., night transactions).




In [12]:
# Columns excluded from modelling: the leftover CSV index, customer identity and
# address fields, customer coordinates, city population, job, date of birth and
# the transaction id.
drop_cols = [
    'Unnamed: 0',
    'lat', 'long', 'city_pop',
    'first', 'last', 'gender',
    'street', 'city', 'state', 'zip',
    'job', 'dob', 'trans_num',
]

# Remove the same columns from both splits so their schemas stay aligned.
df = df.drop(columns=drop_cols)
df_test = df_test.drop(columns=drop_cols)
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,1325376186,38.674999,-78.632459,0


## EDA

In [13]:
# Text summary of the training frame: shape, dtypes, missing values,
# descriptive statistics and per-column cardinality.
print("\n🔹 DataFrame Summary 🔹")
print("=" * 50)

print(f"Shape: {df.shape[0]} rows, {df.shape[1]} columns\n")
print("Data Types:\n", df.dtypes, "\n")

# Report only the columns that actually contain nulls; otherwise a fixed message.
missing_values = df.isnull().sum()
if missing_values.sum() > 0:
    missing_report = missing_values[missing_values > 0]
else:
    missing_report = "No missing values."
print("Missing Values:\n", missing_report, "\n")

print("Statistics:\n", df.describe().T, "\n")
print("Unique Values:\n", df.nunique(), "\n")

print("=" * 50)


🔹 DataFrame Summary 🔹
Shape: 1296675 rows, 9 columns

Data Types:
 trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object 

Missing Values:
 No missing values. 

Statistics:
                 count          mean           std           min           25%  \
cc_num      1296675.0  4.171920e+17  1.308806e+18  6.041621e+10  1.800429e+14   
amt         1296675.0  7.035104e+01  1.603160e+02  1.000000e+00  9.650000e+00   
unix_time   1296675.0  1.349244e+09  1.284128e+07  1.325376e+09  1.338751e+09   
merch_lat   1296675.0  3.853734e+01  5.109788e+00  1.902779e+01  3.473357e+01   
merch_long  1296675.0 -9.022646e+01  1.377109e+01 -1.666712e+02 -9.689728e+01   
is_fraud    1296675.0  5.788652e-03  7.586269e-02  0.000000e+00  0.000000e+00   



1. Some fraud patterns increase on specific days (e.g., weekends, paydays, or holidays).
2. Fraudulent activity often spikes on weekends, as banks have reduced monitoring.
3. An `is_weekend` flag converts the day of the week into a binary feature (1 = Weekend, 0 = Weekday), which is useful for ML models.

In [14]:
# Re-check the training frame now that the unused columns have been dropped.
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,1325376186,38.674999,-78.632459,0


In [15]:
# Columns remaining in the training frame after the drop.
df.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'unix_time', 'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [16]:
# Columns remaining in the test frame; these should match the training frame.
df_test.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'unix_time', 'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [17]:
# Derive hour-of-day and day-of-week (Monday=0 ... Sunday=6) from the transaction
# timestamp of the training frame, then discard the raw timestamp column.
# A weekend flag could be added later as (day_of_week >= 5).astype(int).
trans_ts = pd.to_datetime(df['trans_date_trans_time'])
df = (
    df.assign(hour=trans_ts.dt.hour, day_of_week=trans_ts.dt.dayofweek)
      .drop(columns=['trans_date_trans_time'])
)
df.head()

Unnamed: 0,cc_num,merchant,category,amt,unix_time,merch_lat,merch_long,is_fraud,hour,day_of_week
0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,1325376018,36.011293,-82.048315,0,0,1
1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,1325376044,49.159047,-118.186462,0,0,1
2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,1325376051,43.150704,-112.154481,0,0,1
3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,1325376076,47.034331,-112.561071,0,0,1
4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,1325376186,38.674999,-78.632459,0,0,1


In [20]:
# Same time-feature extraction as for the training frame, applied to the test
# frame: hour-of-day and day-of-week (Monday=0 ... Sunday=6), after which the raw
# timestamp column is discarded.
# A weekend flag could be added later as (day_of_week >= 5).astype(int).
test_ts = pd.to_datetime(df_test['trans_date_trans_time'])
df_test = (
    df_test.assign(hour=test_ts.dt.hour, day_of_week=test_ts.dt.dayofweek)
           .drop(columns=['trans_date_trans_time'])
)
df_test.head()

Unnamed: 0,cc_num,merchant,category,amt,unix_time,merch_lat,merch_long,is_fraud,hour,day_of_week
0,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,1371816865,33.986391,-81.200714,0,12,6
1,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,1371816873,39.450498,-109.960431,0,12,6
2,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,1371816893,40.49581,-74.196111,0,12,6
3,3591919803438423,fraud_Haley Group,misc_pos,60.05,1371816915,28.812398,-80.883061,0,12,6
4,3526826139003047,fraud_Johnston-Casper,travel,3.19,1371816917,44.959148,-85.884734,0,12,6


In [21]:
# Persist both feature-engineered splits (row index omitted).
for frame, out_path in (
    (df, 'data/fraudTrain_Feature_engineered.csv'),
    (df_test, 'data/fraudTest_Feature_engineered.csv'),
):
    frame.to_csv(out_path, index=False)

In [57]:
# Keep only the rows labelled as fraud (is_fraud == 1) for inspection.
is_fraud_row = df['is_fraud'].eq(1)
fraud_transactions = df.loc[is_fraud_row]
fraud_transactions

Unnamed: 0,cc_num,merchant,category,amt,unix_time,merch_lat,merch_long,is_fraud,hour,day_of_week
2449,4613314721966,fraud_Rutherford-Mertz,grocery_pos,281.06,1325466397,36.430124,-81.179483,1,1,2
2472,340187018810220,"fraud_Jenkins, Hauck and Friesen",gas_transport,11.52,1325468849,29.819364,-99.142791,1,1,2
2523,340187018810220,fraud_Goodwin-Nitzsche,grocery_pos,276.31,1325473523,29.273085,-98.836360,1,3,2
2546,4613314721966,fraud_Erdman-Kertzmann,gas_transport,7.03,1325475483,35.909292,-82.091010,1,3,2
2553,340187018810220,fraud_Koepp-Parker,grocery_pos,275.73,1325476547,29.786426,-98.683410,1,3,2
...,...,...,...,...,...,...,...,...,...,...
1295399,3524574586339330,fraud_Kassulke PLC,shopping_net,977.01,1371776408,26.888686,-80.834389,1,1,6
1295491,3524574586339330,fraud_Schumm PLC,shopping_net,1210.91,1371779615,28.216707,-79.855648,1,1,6
1295532,4005676619255478,"fraud_Tillman, Dickinson and Labadie",gas_transport,10.24,1371781016,29.700456,-91.361632,1,2,6
1295666,3560725013359375,fraud_Corwin-Collins,gas_transport,21.69,1371785180,32.675272,-103.484949,1,3,6


In [9]:
# Number of transactions per merchant, most frequent first.
df['merchant'].value_counts()

Unnamed: 0_level_0,count
merchant,Unnamed: 1_level_1
fraud_Kilback LLC,4403
fraud_Cormier LLC,3649
fraud_Schumm PLC,3634
fraud_Kuhn LLC,3510
fraud_Boyer PLC,3493
...,...
"fraud_Douglas, DuBuque and McKenzie",775
fraud_Treutel-King,775
"fraud_Medhurst, Labadie and Gottlieb",759
fraud_Reichert-Weissnat,753


In [10]:
# Number of transactions per spending category, most frequent first.
df['category'].value_counts()



Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
gas_transport,131659
grocery_pos,123638
home,123115
shopping_pos,116672
kids_pets,113035
shopping_net,97543
entertainment,94014
food_dining,91461
personal_care,90758
health_fitness,85879


In [11]:
# Count rows that are exact duplicates of an earlier row across all columns.
df.duplicated().sum()

0

In [12]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

In [13]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [14]:
# # prompt: Apply SMOTE + Tomek Links to oversample fraud and remove noisy legitimate transactions.

# from imblearn.combine import SMOTETomek
# from sklearn.preprocessing import LabelEncoder # Import LabelEncoder

# # Assuming 'X' contains your features and 'y' contains your target variable ('is_fraud')
# X = df.drop('is_fraud', axis=1)
# y = df['is_fraud']

# # Initialize LabelEncoder to convert 'merchant' and 'category' to numerical
# encoder = LabelEncoder()
# X['merchant'] = encoder.fit_transform(X['merchant']) # Encode 'merchant' column
# X['category'] = encoder.fit_transform(X['category']) # Encode 'category' column


# # Initialize SMOTETomek
# smote_tomek = SMOTETomek(random_state=42)

# # Apply SMOTE + Tomek Links
# X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

# # Create a new DataFrame with the resampled data
# df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
# df_resampled['is_fraud'] = y_resampled

# # Now df_resampled contains the oversampled minority class ('is_fraud' == 1) and cleaned majority class
# print(df_resampled['is_fraud'].value_counts())

In [15]:
import pickle
import pandas as pd
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import LabelEncoder

# Split the frame into predictors and the fraud label.
X = df.drop(columns='is_fraud')
y = df['is_fraud']

# One LabelEncoder per categorical column, so each mapping can be reloaded
# independently at inference time.
merchant_encoder = LabelEncoder()
category_encoder = LabelEncoder()

# Integer-encode each column and pickle its fitted encoder right away.
for column, encoder, pkl_path in (
    ('merchant', merchant_encoder, 'merchant_encoder.pkl'),
    ('category', category_encoder, 'category_encoder.pkl'),
):
    X[column] = encoder.fit_transform(X[column])
    with open(pkl_path, 'wb') as file:
        pickle.dump(encoder, file)

# Balance the classes with SMOTE oversampling followed by Tomek-link cleaning.
# The resampler is only needed here and is therefore not persisted.
# NOTE(review): this resamples the complete dataset; the hold-out split is created
# later from the resampled frame, so synthetic rows can end up in the evaluation
# set. Consider splitting first and resampling only the training portion.
# NOTE(review): the integer codes for `merchant`/`category` and the raw `cc_num`
# are treated as continuous values by the resampler -- confirm this is intended.
smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

# Reassemble predictors and label into a single frame.
df_resampled = pd.DataFrame(X_resampled, columns=X.columns).assign(is_fraud=y_resampled)

print(df_resampled['is_fraud'].value_counts())


is_fraud
0    1289169
1    1289169
Name: count, dtype: int64


In [None]:
# Preview the resampled frame; `merchant` and `category` now hold integer codes.
df_resampled.head()


Unnamed: 0,cc_num,merchant,category,amt,unix_time,merch_lat,merch_long,hour,day_of_week,is_fraud
0,2703186189652095,514,8,4.97,1325376018,36.011293,-82.048315,0,1,0
1,630423337322,241,4,107.23,1325376044,49.159047,-118.186462,0,1,0
2,38859492057661,390,0,220.11,1325376051,43.150704,-112.154481,0,1,0
3,3534093764340240,360,2,45.0,1325376076,47.034331,-112.561071,0,1,0
4,375534208663984,297,9,41.96,1325376186,38.674999,-78.632459,0,1,0


In [19]:
# From here on `df` refers to a copy of the resampled, label-encoded frame, so
# later in-place edits do not touch `df_resampled`.
# NOTE(review): this rebinds `df`; cells above that expect the earlier frame
# cannot be re-run after this point without reloading the data.
df = df_resampled.copy()


In [20]:

# # Scale numerical features
# scaler = MinMaxScaler()
# num_cols = ['amt', 'cc_num', 'unix_time', 'merch_lat', 'merch_long', 'hour']
# df[num_cols] = scaler.fit_transform(df[num_cols])

# # Final dataset preview
# df.head()

In [21]:
# Columns rescaled to the [0, 1] range.
# NOTE(review): `cc_num` is an identifier, not a quantity -- confirm it should be
# scaled and fed to the models as a number.
num_cols = ['amt', 'cc_num', 'unix_time', 'merch_lat', 'merch_long', 'hour']

# Learn each column's min/max, then rescale those columns in `df`.
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split made later, so held-out rows influence the learned ranges.
scaler = MinMaxScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

# Persist the fitted scaler so inference can apply the identical transformation.
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# Final dataset preview
df.head()

Unnamed: 0,cc_num,merchant,category,amt,unix_time,merch_lat,merch_long,hour,day_of_week,is_fraud
0,0.000541454,514,8,0.000137,0.0,0.350302,0.848602,0.0,1,0
1,1.141762e-07,241,4,0.00367,5.598526e-07,0.621488,0.486208,0.0,1,0
2,7.771712e-06,390,0,0.007569,7.105821e-07,0.497559,0.546697,0.0,1,0
3,0.0007078903,360,2,0.00152,1.248902e-06,0.577663,0.542619,0.0,1,0
4,7.520988e-05,297,9,0.001415,3.617509e-06,0.405244,0.882857,0.0,1,0


In [33]:
# Preview the frame again after scaling.
df.head()

Unnamed: 0,cc_num,merchant,category,amt,unix_time,merch_lat,merch_long,hour,day_of_week,is_fraud
0,0.000541454,514,8,0.000137,0.0,0.350302,0.848602,0.0,1,0
1,1.141762e-07,241,4,0.00367,5.598526e-07,0.621488,0.486208,0.0,1,0
2,7.771712e-06,390,0,0.007569,7.105821e-07,0.497559,0.546697,0.0,1,0
3,0.0007078903,360,2,0.00152,1.248902e-06,0.577663,0.542619,0.0,1,0
4,7.520988e-05,297,9,0.001415,3.617509e-06,0.405244,0.882857,0.0,1,0


In [34]:
# prompt: check the corresponding values of column is_fraud == 1

# Select the fraud rows of the current `df` (resampled and scaled at this point).
is_fraud_row = df['is_fraud'].eq(1)
fraud_transactions = df.loc[is_fraud_row]
fraud_transactions


Unnamed: 0,cc_num,merchant,category,amt,unix_time,merch_lat,merch_long,hour,day_of_week,is_fraud
2449,9.119757e-07,543,4,0.009675,0.001946,0.358941,0.857315,0.043478,2,1
2472,6.812961e-05,285,2,0.000363,0.001999,0.222587,0.677178,0.043478,2,1
2523,6.812961e-05,196,4,0.009511,0.002100,0.211320,0.680251,0.130435,2,1
2546,9.119757e-07,162,2,0.000208,0.002142,0.348198,0.848174,0.130435,2,1
2553,6.812961e-05,328,4,0.009490,0.002165,0.221908,0.681785,0.130435,2,1
...,...,...,...,...,...,...,...,...,...,...
2578333,3.605274e-05,270,4,0.012146,0.739701,0.446736,0.922739,0.086957,5,1
2578334,4.469094e-04,195,10,0.021911,0.824018,0.273034,0.835871,0.913043,2,1
2578335,8.211865e-04,88,4,0.009951,0.753552,0.398452,0.888099,0.000000,6,1
2578336,6.933570e-05,400,11,0.031738,0.513130,0.442072,0.922400,1.000000,1,1


In [35]:
# Row count per day_of_week value in the current (resampled) frame.
df["day_of_week"].value_counts()

Unnamed: 0_level_0,count
day_of_week,Unnamed: 1_level_1
0,468968
5,416982
6,405198
4,348144
3,333682
1,323453
2,281911


In [None]:
# Save the processed data (optional): writes the current `df` -- resampled,
# label-encoded and scaled -- to CSV without the row index.
df.to_csv("processed_dataset.csv", index=False)

In [36]:
!pip install xgboost
!pip install lightgbm



In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle

In [38]:
from sklearn.utils.class_weight import compute_class_weight

def compute_class_weights(y):
    """
    Map every distinct label in ``y`` to its 'balanced' class weight.

    Args:
        y (array-like): Target variable.

    Returns:
        dict: ``{label: weight}`` for each unique value found in ``y``.
    """
    labels = np.unique(y)
    balanced = compute_class_weight(class_weight='balanced', classes=labels, y=y)
    return dict(zip(labels, balanced))

In [39]:
# Separate the label column from the predictors.
y = df['is_fraud']
X = df.drop(columns=['is_fraud'])

# Stratified 80/20 hold-out split with a fixed seed.
# NOTE(review): `df` is the resampled and scaled frame at this point, so the test
# portion contains synthetic rows and was seen by the scaler; scores computed on
# it are likely optimistic compared with untouched data.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [40]:
# 'Balanced' weights for the classes present in the training labels, in sorted
# label order (index 0 -> label 0, index 1 -> label 1).
train_labels = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=train_labels, y=y_train)

# Ratio of positive to negative class weight, intended for the boosters'
# `scale_pos_weight`. It is 1.0 here because the classes were balanced by resampling.
neg_weight, pos_weight = class_weights[0], class_weights[1]
scale_pos_weight_value = pos_weight / neg_weight
scale_pos_weight_value

1.0

In [41]:
# from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
# from sklearn.neighbors import KNeighborsClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier

# from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

In [42]:
# Candidate models to compare. Disabled candidates, kept for reference:
#   "Random Forest"             : RandomForestClassifier(random_state=42)
#   "LogisticRegression"        : LogisticRegression(random_state=42)
#   "Gradient Boosting"         : GradientBoostingClassifier(random_state=42)
#   "Support vector classifier" : SVC(random_state=42)
# `scale_pos_weight=scale_pos_weight_value` is deliberately not passed to the boosters.
classifiers = {
    "Naive Bayes": GaussianNB(),
    "XGboost": XGBClassifier(random_state=42),
    "LGBM": LGBMClassifier(random_state=42),
}

# Store metrics: one list per report column, filled in by the evaluation loop.
metrics = {
    column: []
    for column in ("Model", "Accuracy", "Precision", "Recall", "F1 Score")
}



In [43]:
# Train and evaluate models
# `metrics` is created in an earlier cell; empty its lists first so that re-running
# this cell replaces the results instead of appending a second copy of every row.
for collected in metrics.values():
    collected.clear()

for model_name, classifier in classifiers.items():
    # Fit on the training split, predict on the held-out split.
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # Compute evaluation metrics (label 1, fraud, is the positive class)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Store results
    metrics["Model"].append(model_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1 Score"].append(f1)

# Convert to DataFrame
metrics_df = pd.DataFrame(metrics)
print(metrics_df)


[LightGBM] [Info] Number of positive: 1031335, number of negative: 1031335
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063086 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1575
[LightGBM] [Info] Number of data points in the train set: 2062670, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
         Model  Accuracy  Precision    Recall  F1 Score
0  Naive Bayes  0.833767   0.973220  0.686422  0.805041
1      XGboost  0.973064   0.981920  0.963876  0.972814
2         LGBM  0.950338   0.969432  0.930001  0.949307


In [44]:
# Rich display of the per-model evaluation table.
metrics_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Naive Bayes,0.833767,0.97322,0.686422,0.805041
1,XGboost,0.973064,0.98192,0.963876,0.972814
2,LGBM,0.950338,0.969432,0.930001,0.949307


In [45]:
# prompt: save the lgbm model

import pickle

# Persist the fitted LightGBM classifier from the `classifiers` dict.
# The `with` block guarantees the file handle is flushed and closed, which the
# previous bare `open(...)` call passed straight to `pickle.dump` did not.
filename = 'lgbm_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(classifiers["LGBM"], model_file)


In [64]:

import pickle
from lightgbm import LGBMClassifier

# Reference rows from the training data
# (cc_num, merchant, category, amt, unix_time, merch_lat, merch_long, is_fraud, hour, day_of_week):
# 2703186189652095 	fraud_Rippin, Kub and Mann 	misc_net 	4.97 	1325376018 	36.011293 	-82.048315 	0 	0 	1 (Non Fraud)
# 4613314721966 	fraud_Rutherford-Mertz 	grocery_pos 	281.06 	1325466397 	36.430124 	-81.179483 	1 	1 	2 (Fraud)

# Load the model from the same relative path it was saved to, matching how the
# encoders and scaler are loaded below (an absolute '/content/...' path only
# resolves when the working directory happens to be /content).
# Only unpickle files produced by this notebook: pickle can execute arbitrary code.
with open('lgbm_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Column names and order must match the frame the model was trained on.
new_data = pd.DataFrame({
    'cc_num': [4613314721966],
    'merchant': ['fraud_Rutherford-Mertz'],
    'category': ['grocery_pos'],
    'amt': [281.06],
    'unix_time': [1325466397],
    'merch_lat': [36.430124],
    'merch_long': [-81.179483],
    'hour': [1],
    'day_of_week': [2]
})

# Preprocess the new data using the same encoders and scaler used for training
with open('merchant_encoder.pkl', 'rb') as file:
    merchant_encoder = pickle.load(file)
with open('category_encoder.pkl', 'rb') as file:
    category_encoder = pickle.load(file)
with open('scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

# `transform` (not `fit_transform`) reuses the mappings learned during training.
new_data['merchant'] = merchant_encoder.transform(new_data['merchant'])
new_data['category'] = category_encoder.transform(new_data['category'])

# Same column list and order as when the scaler was fitted.
num_cols = ['amt', 'cc_num', 'unix_time', 'merch_lat', 'merch_long', 'hour']
new_data[num_cols] = scaler.transform(new_data[num_cols])

# Make predictions
predictions = loaded_model.predict(new_data)
predictions


array([1])

### Performing Hyper Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Define hyperparameter grids
# XGBoost: 3 values for each of 5 parameters -> 243 candidates.
xgb_param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# LightGBM: 3 values for each of 6 tuned parameters -> 729 candidates.
# LightGBM ignores `subsample` (bagging_fraction) unless bagging is enabled via
# `subsample_freq` (bagging_freq) > 0, and the estimator default is 0. The
# single-valued `subsample_freq` entry turns bagging on so the three `subsample`
# settings actually produce different models; it does not add candidates.
lgbm_param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [-1, 5, 10],  # -1 means no limit
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [20, 31, 40],
    'subsample': [0.6, 0.8, 1.0],
    'subsample_freq': [1],
    'colsample_bytree': [0.6, 0.8, 1.0]
}


In [None]:

# Initialize models
# (`scale_pos_weight=scale_pos_weight_value` is deliberately not passed.)
xgb_model = XGBClassifier(random_state=42)
lgbm_model = LGBMClassifier(random_state=42)

# Perform GridSearchCV: 3-fold cross-validation, F1 on the positive class, all cores.
# `lgbm_grid` must be constructed here because it is fitted and queried below;
# with its definition commented out the cell stopped with a NameError.
xgb_grid = GridSearchCV(xgb_model, xgb_param_grid, scoring='f1', cv=3, verbose=1, n_jobs=-1)
lgbm_grid = GridSearchCV(lgbm_model, lgbm_param_grid, scoring='f1', cv=3, verbose=1, n_jobs=-1)

# Fit models
# NOTE: each search fits every grid candidate three times on the full training
# split, so expect a long run time.
xgb_grid.fit(X_train, y_train)
lgbm_grid.fit(X_train, y_train)

# Get best parameters
best_xgb_params = xgb_grid.best_params_
best_lgbm_params = lgbm_grid.best_params_

print("Best XGBoost Parameters:", best_xgb_params)
print("Best LGBM Parameters:", best_lgbm_params)


Fitting 3 folds for each of 243 candidates, totalling 729 fits


In [None]:
# Train XGBoost with best parameters
# (`scale_pos_weight=scale_pos_weight_value` is deliberately not passed.)
best_xgb = XGBClassifier(**best_xgb_params,
                         random_state=42)
best_xgb.fit(X_train, y_train)
y_pred_xgb = best_xgb.predict(X_test)

# Train LGBM with best parameters
best_lgbm = LGBMClassifier(**best_lgbm_params,
                           random_state=42)
best_lgbm.fit(X_train, y_train)
y_pred_lgbm = best_lgbm.predict(X_test)

# Evaluate Models
# Every list needs one entry per name in "Model"; with the LGBM scores commented
# out the lists had unequal lengths and pd.DataFrame(metrics) raised a ValueError.
metrics = {
    "Model": ["XGBoost (Tuned)", "LGBM (Tuned)"],
    "Accuracy": [
        accuracy_score(y_test, y_pred_xgb),
        accuracy_score(y_test, y_pred_lgbm)
    ],
    "Precision": [
        precision_score(y_test, y_pred_xgb),
        precision_score(y_test, y_pred_lgbm)
    ],
    "Recall": [
        recall_score(y_test, y_pred_xgb),
        recall_score(y_test, y_pred_lgbm)
    ],
    "F1 Score": [
        f1_score(y_test, y_pred_xgb),
        f1_score(y_test, y_pred_lgbm)
    ]
}

# Convert to DataFrame
metrics_df = pd.DataFrame(metrics)
metrics_df
