<a href="https://colab.research.google.com/github/sanhiitaa/fastag-fraud-detection-classification/blob/main/fastag_detection_hyperparameter_tuning_and_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

np.random.seed(10101)

In [24]:
df=pd.read_csv('/content/drive/MyDrive/datasets/FastagFraudDetection_cleaned.csv')
df.head()

Unnamed: 0,Vehicle_Type,TollBoothID,Lane_Type,Vehicle_Dimensions,Geographical_Location,Vehicle_Speed,Fraud_indicator,state,month,time-of-day
0,Bus,A-101,Express,Large,"13.059816123454882, 77.77068662374292",65,Fraud,Karnataka,1,11
1,Car,B-102,Regular,Small,"13.059816123454882, 77.77068662374292",78,Fraud,Karnataka,1,14
2,Motorcycle,D-104,Regular,Small,"13.059816123454882, 77.77068662374292",53,Not Fraud,Karnataka,1,18
3,Truck,C-103,Regular,Large,"13.059816123454882, 77.77068662374292",92,Fraud,Karnataka,1,2
4,Van,B-102,Express,Medium,"13.059816123454882, 77.77068662374292",60,Fraud,Karnataka,1,6


In [25]:
# splitting data into dependent and independent variables
x= df.drop('Fraud_indicator', axis=1)
y= df['Fraud_indicator']

In [26]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [11]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((4000, 9), (1000, 9), (4000,), (1000,))

In [10]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 4824 to 3328
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Vehicle_Type           4000 non-null   object
 1   TollBoothID            4000 non-null   object
 2   Lane_Type              4000 non-null   object
 3   Vehicle_Dimensions     4000 non-null   object
 4   Geographical_Location  4000 non-null   object
 5   Vehicle_Speed          4000 non-null   int64 
 6   state                  4000 non-null   object
 7   month                  4000 non-null   int64 
 8   time-of-day            4000 non-null   int64 
dtypes: int64(3), object(6)
memory usage: 312.5+ KB


In [31]:
le= LabelEncoder()
y_train= le.fit_transform(y_train)
y_test= le.transform(y_test)

# Creating a pipeline

In [41]:
objcols=[cols for cols in x_train.columns if df[cols].dtypes==object]
numcols=[cols for cols in x_train.columns if df[cols].dtypes!=object]

In [43]:

# defining preprocessing steps
encode = OneHotEncoder(handle_unknown='ignore')
scale = StandardScaler()


# bundling up preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('encode', encode, objcols),
        ('scale', scale, numcols)
    ]
)

# defining model
model = KNeighborsClassifier()

# defining the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Hyperparameter tuning

In [44]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the hyperparameter space
param_distributions = {
    'model__n_neighbors': randint(1, 20),
    'model__weights': ['uniform', 'distance'],
    'model__metric': ['euclidean', 'manhattan']
}

# setting up RandomizedSearchCV
random_search = RandomizedSearchCV(pipeline, param_distributions, n_iter=50, cv=3, n_jobs=-1)

# fitting the RandomizedSearchCV
random_search.fit(x_train, y_train)

# Get the best parameters and the best model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Predict with the best model
pred = best_model.predict(x_test)

print(f"Best parameters: {best_params}")
print(f"Predictions: {pred}")
print(f"Best model score: {random_search.best_score_}")


Best parameters: {'model__metric': 'euclidean', 'model__n_neighbors': 19, 'model__weights': 'uniform'}
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [45]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.20      0.01      0.03       210
           1       0.79      0.98      0.88       790

    accuracy                           0.78      1000
   macro avg       0.49      0.50      0.45      1000
weighted avg       0.67      0.78      0.70      1000



In [67]:
accuracy_score(y_test, pred)

0.781


* The model shows high recall (98%) for detecting fraud cases, meaning it is very good at identifying actual fraud.

* The precision (79%) for fraud cases suggests that when the model predicts fraud, it is correct most of the time.

* The overall accuracy of 78% indicates that the model performs reasonably well across both classes, considering the class imbalance (more instances of class 1 than class 0).

* The weighted average F1-score (0.70) indicates a reasonable overall performance, taking into account precision and recall across both classes.



# Exporting Pipeline

In [49]:
import pickle

# exporting the pipeline using pickle
with open('best_model_pipeline.pkl', 'wb') as file:
    pickle.dump(best_model, file)

# reloading the pipeline
with open('best_model_pipeline.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Use the loaded model for predictions
pred = loaded_model.predict(x_test)


In [50]:
pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

# Predicting Values using `loaded_model`

In [64]:
a=x_train.iloc[356, :]
sample= pd.DataFrame({x : [value] for x, value in a.items()})

In [65]:
sample

Unnamed: 0,Vehicle_Type,TollBoothID,Lane_Type,Vehicle_Dimensions,Geographical_Location,Vehicle_Speed,state,month,time-of-day
0,Sedan,A-101,Regular,Medium,"12.936687032945434, 77.53113977439017",43,Karnataka,9,21


In [66]:
loaded_model.predict(sample)

array([1])