# Fraud Detection Model Training
## Overview
This notebook trains a machine learning model to predict fraudulent transactions based on features such as transaction time, amount, location, etc.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the fraud-detection training CSV from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/fraud-detection/fraudTrain.csv')

In [5]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
# NOTE(review): the recorded output below reports 15012 rows with a
# non-contiguous index, and the execution counts skip from In [3] to In [5] —
# confirm whether a sampling/filtering cell was removed; if so, Restart & Run
# All will not reproduce these outputs.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15012 entries, 2449 to 566921
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             15012 non-null  int64  
 1   trans_date_trans_time  15012 non-null  object 
 2   cc_num                 15012 non-null  int64  
 3   merchant               15012 non-null  object 
 4   category               15012 non-null  object 
 5   amt                    15012 non-null  float64
 6   first                  15012 non-null  object 
 7   last                   15012 non-null  object 
 8   gender                 15012 non-null  object 
 9   street                 15012 non-null  object 
 10  city                   15012 non-null  object 
 11  state                  15012 non-null  object 
 12  zip                    15012 non-null  int64  
 13  lat                    15012 non-null  float64
 14  long                   15012 non-null  float64
 15  cit

## Dataset Description
The dataset consists of transaction records with the following features:

| Feature Name           | Description                          |
|------------------------|--------------------------------------|
| `trans_date_trans_time` | Timestamp of the transaction       |
| `merchant`            | Merchant name                       |
| `category`            | Category of transaction             |
| `gender`              | Gender of the customer              |
| `city`, `state`       | Location of the transaction         |
| `is_weekend`          | Indicates if transaction is on a weekend |
| `amt`                 | Transaction amount                  |
| `lat`, `long`         | Customer's location coordinates     |
| `city_pop`            | City population                     |
| `merch_lat`, `merch_long` | Merchant's location coordinates |

The target variable is:
- `is_fraud`: 1 for fraudulent transactions, 0 for legitimate transactions.


In [8]:
# Keep a handle on the frame as it is at this point. `df` is rebound below, so
# `df2` keeps pointing at the untrimmed object (it is an alias, not a copy).
df2 = df
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV) by
# name instead of by position: positional slicing with iloc[:, 1:] strips one
# more real column every time the cell is re-run. errors='ignore' makes a
# re-run a no-op.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [10]:
# Columns excluded from the modelling frame.
cols_to_drop = [
    'cc_num', 'first', 'last', 'street', 'zip',
    'job', 'trans_num', 'unix_time', 'dob',
]
df = df.drop(columns=cols_to_drop)

In [11]:
# Parse the timestamp column to datetime so the `.dt` accessor works downstream.
df['trans_date_trans_time'] = df['trans_date_trans_time'].pipe(pd.to_datetime)


In [12]:
# Derive calendar features from the parsed transaction timestamp.
trans_dt = df['trans_date_trans_time'].dt
calendar_features = {
    'year': trans_dt.year,
    'month': trans_dt.month,
    'day': trans_dt.day,
    'hour': trans_dt.hour,
    'weekday': trans_dt.weekday,                               # Monday=0, Sunday=6
    'is_weekend': trans_dt.weekday.isin([5, 6]).astype(int),   # 1 for Sat/Sun, 0 otherwise
    'day_of_year': trans_dt.dayofyear,
    'week_of_year': trans_dt.isocalendar().week,
    'quarter': trans_dt.quarter,
}
# Dict insertion order fixes the order in which the columns are appended.
for feature_name, feature_values in calendar_features.items():
    df[feature_name] = feature_values


In [13]:
# Columns treated as categorical (one-hot encoded later).
cat_cols = [
    'merchant',
    'category',
    'gender',
    'city',
    'state',
    'is_weekend',
]

# Columns treated as numeric: raw transaction/location fields first,
# then the calendar features derived from the timestamp.
num_cols = [
    'amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long',
    'hour', 'day', 'weekday', 'day_of_year', 'week_of_year',
    'month', 'quarter', 'year',
]

In [14]:
from scipy.stats import chi2_contingency

def check_chi2(df, target, cat_columns, alpha=0.05):
    """Chi-square test of independence between each categorical column and the target.

    For every column in ``cat_columns`` a contingency table against
    ``df[target]`` is built with ``pd.crosstab`` and passed to
    ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target and the categorical columns.
    target : str
        Name of the target column.
    cat_columns : iterable of str
        Names of the categorical columns to test.
    alpha : float, default 0.05
        Significance level; a column is flagged when ``p_value <= alpha``.

    Returns
    -------
    dict
        Maps each column name to ``"Columns correlated"`` (p-value at or
        below ``alpha``) or ``"Columns not correlated"``. Note that the test
        establishes statistical association, not its strength.
    """
    results = {}

    for cat_column in cat_columns:
        # Observed counts of every (category, target value) pair.
        contingency_table = pd.crosstab(df[cat_column], df[target])

        # Only the p-value drives the verdict; the statistic, degrees of
        # freedom and expected counts are not used.
        _, p_value, _, _ = chi2_contingency(contingency_table)

        results[cat_column] = (
            "Columns correlated" if p_value <= alpha else "Columns not correlated"
        )

    return results

# Test every categorical feature against the fraud label and display the verdicts.
target = 'is_fraud'
cat_results = check_chi2(df=df, target=target, cat_columns=cat_cols)
cat_results

{'merchant': 'Columns correlated',
 'category': 'Columns correlated',
 'gender': 'Columns correlated',
 'city': 'Columns correlated',
 'state': 'Columns correlated',
 'is_weekend': 'Columns correlated'}

In [15]:
from scipy.stats import f_oneway

def check_anova(df, target, num_columns, alpha=0.05):
    """One-way ANOVA of each numeric column across the groups defined by the target.

    For every column in ``num_columns`` the values are split by
    ``df[target]`` and the per-group samples are passed to
    ``scipy.stats.f_oneway``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target and the numeric columns.
    target : str
        Name of the grouping (target) column.
    num_columns : iterable of str
        Names of the numeric columns to test.
    alpha : float, default 0.05
        Significance level; a column is flagged when ``p_value <= alpha``.

    Returns
    -------
    dict
        Maps each column name to ``"Correlated"`` (p-value at or below
        ``alpha``) or ``"Not correlated"``. A NaN p-value (e.g. from a
        degenerate sample) compares False and yields ``"Not correlated"``.
    """
    results = {}

    for num_column in num_columns:
        # Missing values are dropped per group: a single NaN would otherwise
        # propagate to a NaN p-value and silently produce "Not correlated".
        # Casting to float also normalises pandas extension integer dtypes
        # into plain NumPy arrays.
        samples = [
            group.dropna().astype(float).to_numpy()
            for _, group in df.groupby(target)[num_column]
        ]
        _, p_value = f_oneway(*samples)

        results[num_column] = "Correlated" if p_value <= alpha else "Not correlated"

    return results

# Test every numeric feature against the fraud label.
anova_results = check_anova(df=df, target='is_fraud', num_columns=num_cols)


In [16]:
# Print one "<column>  :  <verdict>" line per numeric feature.
for column_name, verdict in anova_results.items():
    print(column_name, ' : ', verdict)

amt  :  Correlated
lat  :  Not correlated
long  :  Correlated
city_pop  :  Correlated
merch_lat  :  Not correlated
merch_long  :  Correlated
hour  :  Correlated
day  :  Correlated
weekday  :  Not correlated
day_of_year  :  Correlated
week_of_year  :  Correlated
month  :  Correlated
quarter  :  Correlated
year  :  Correlated


In [17]:
# The raw timestamp is no longer needed once the calendar features exist.
df = df.drop(columns=['trans_date_trans_time'])

In [18]:
# Sanity check: columns that are neither numeric nor categorical features
# (only the target is expected to remain).
[column for column in df.columns if column not in (*num_cols, *cat_cols)]

['is_fraud']

## Data Preprocessing  
- Encoding categorical features  
- Scaling numerical features  
- Splitting into train and test sets  


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd

In [20]:
# Standardise numeric columns; one-hot encode categorical columns, ignoring
# categories that were not seen during fit (handle_unknown='ignore').
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols),
    ]
)

In [21]:
# Feature matrix (numeric columns followed by categorical ones) and target.
X = df.loc[:, num_cols + cat_cols]
y = df.loc[:, 'is_fraud']

# 80/20 split, stratified on the target, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [22]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformer to the test split (no refitting on test data).
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)


def _to_dense(matrix):
    """Return `matrix` as a dense ndarray whether it is sparse or already dense."""
    return matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)


# ColumnTransformer returns a sparse matrix only when the stacked output is
# sparse enough (see its `sparse_threshold` parameter); an unconditional
# `.toarray()` raises AttributeError on dense output, so handle both cases.
feature_names = preprocessor.get_feature_names_out()

# Convert transformed data into DataFrames (optional, keeps feature names).
X_train_df = pd.DataFrame(_to_dense(X_train_transformed), columns=feature_names)
X_test_df = pd.DataFrame(_to_dense(X_test_transformed), columns=feature_names)


## Model Selection and Training
- Selected model: Random Forest
- Performance metrics: Accuracy, Precision, Recall, Confusion Matrix, Classification Report


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fit a 100-tree random forest on the transformed training features
# (fixed seed for reproducibility).
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train_df, y_train)

# Predict on the held-out split.
y_pred = rf_model.predict(X_test_df)

# Compute each metric right before reporting it; the printed order is
# accuracy, confusion matrix, classification report.
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Accuracy: {accuracy:.4f}\n")

conf_matrix = confusion_matrix(y_test, y_pred)
print("📌 Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print("\n📌 Classification Report:")
print(class_report)


✅ Accuracy: 0.9677

📌 Confusion Matrix:
[[1465   37]
 [  60 1441]]

📌 Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1502
           1       0.97      0.96      0.97      1501

    accuracy                           0.97      3003
   macro avg       0.97      0.97      0.97      3003
weighted avg       0.97      0.97      0.97      3003



## Model Deployment
- Save model as a pickle (`.pkl`) file
- Save preprocessor as a pickle (`.pkl`) file

In [24]:
import pickle

# Persist the fitted preprocessor and the trained model as two separate pickle
# files in the current working directory.
for artifact_path, artifact in (
    ("preprocessor.pkl", preprocessor),
    ("rf_model.pkl", rf_model),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("Preprocessor and RandomForest model saved successfully!")


Preprocessor and RandomForest model saved successfully!
