In [41]:
 import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix, precision_score, f1_score, recall_score

## Data Exploration

Dataset Description:

Transaction_ID: Unique identifier for each transaction.

Timestamp: Date and time of the transaction.

Vehicle_Type: Type of vehicle involved in the transaction.

FastagID: Unique identifier for Fastag.

TollBoothID: Identifier for the toll booth.

Lane_Type: Type of lane used for the transaction.

Vehicle_Dimensions: Dimensions of the vehicle.

Transaction_Amount: Amount associated with the transaction.

Amount_paid: Amount paid for the transaction.

Geographical_Location: Location details of the transaction.

Vehicle_Speed: Speed of the vehicle during the transaction.

Vehicle_Plate_Number: License plate number of the vehicle.

Fraud_indicator: Binary indicator of fraudulent activity (target variable)

In [42]:
data = pd.read_csv('FastagFraudDetection.csv')
data.head()

Unnamed: 0,Transaction_ID,Timestamp,Vehicle_Type,FastagID,TollBoothID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Geographical_Location,Vehicle_Speed,Vehicle_Plate_Number,Fraud_indicator
0,1,1/6/2023 11:20,Bus,FTG-001-ABC-121,A-101,Express,Large,350,120,"13.059816123454882, 77.77068662374292",65,KA11AB1234,Fraud
1,2,1/7/2023 14:55,Car,FTG-002-XYZ-451,B-102,Regular,Small,120,100,"13.059816123454882, 77.77068662374292",78,KA66CD5678,Fraud
2,3,1/8/2023 18:25,Motorcycle,,D-104,Regular,Small,0,0,"13.059816123454882, 77.77068662374292",53,KA88EF9012,Not Fraud
3,4,1/9/2023 2:05,Truck,FTG-044-LMN-322,C-103,Regular,Large,350,120,"13.059816123454882, 77.77068662374292",92,KA11GH3456,Fraud
4,5,1/10/2023 6:35,Van,FTG-505-DEF-652,B-102,Express,Medium,140,100,"13.059816123454882, 77.77068662374292",60,KA44IJ6789,Fraud


In [43]:
data.shape

(5000, 13)

In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Transaction_ID         5000 non-null   int64 
 1   Timestamp              5000 non-null   object
 2   Vehicle_Type           5000 non-null   object
 3   FastagID               4451 non-null   object
 4   TollBoothID            5000 non-null   object
 5   Lane_Type              5000 non-null   object
 6   Vehicle_Dimensions     5000 non-null   object
 7   Transaction_Amount     5000 non-null   int64 
 8   Amount_paid            5000 non-null   int64 
 9   Geographical_Location  5000 non-null   object
 10  Vehicle_Speed          5000 non-null   int64 
 11  Vehicle_Plate_Number   5000 non-null   object
 12  Fraud_indicator        5000 non-null   object
dtypes: int64(4), object(9)
memory usage: 507.9+ KB


### OneHot Encoding

In [45]:
data['Vehicle_Type'].unique()

array(['Bus ', 'Car', 'Motorcycle', 'Truck', 'Van', 'Sedan', 'SUV'],
      dtype=object)

In [46]:
Lane_order=['Express', 'Regular']
Vehicle_Dimensions_order=['Large', 'Small', 'Medium']
Fraud_indicator_order=['Not Fraud','Fraud']

In [47]:
ohe = OneHotEncoder()
encode0 = ohe.fit_transform(data[['Vehicle_Type']]).toarray()

In [48]:
feature_labels = ohe.categories_
np.array(feature_labels).ravel()

array(['Bus ', 'Car', 'Motorcycle', 'SUV', 'Sedan', 'Truck', 'Van'],
      dtype=object)

In [49]:
feature_labels = np.array(feature_labels).ravel()
print(feature_labels)

['Bus ' 'Car' 'Motorcycle' 'SUV' 'Sedan' 'Truck' 'Van']


In [50]:
features = pd.DataFrame(encode0, columns = feature_labels)

In [51]:
df_new = pd.concat([data, features], axis=1)

In [52]:
new_dataset=df_new.drop(['Timestamp','FastagID','Vehicle_Type','TollBoothID','Geographical_Location','Vehicle_Plate_Number'], axis=1)

In [53]:
new_dataset

Unnamed: 0,Transaction_ID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Vehicle_Speed,Fraud_indicator,Bus,Car,Motorcycle,SUV,Sedan,Truck,Van
0,1,Express,Large,350,120,65,Fraud,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Regular,Small,120,100,78,Fraud,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,3,Regular,Small,0,0,53,Not Fraud,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4,Regular,Large,350,120,92,Fraud,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5,Express,Medium,140,100,60,Fraud,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,Regular,Large,330,330,81,Not Fraud,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4996,4997,Express,Medium,125,125,64,Not Fraud,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4997,4998,Regular,Medium,115,115,93,Not Fraud,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4998,4999,Express,Large,145,145,57,Not Fraud,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Ordinal Encoding

In [54]:
encode1 = OrdinalEncoder(categories=[Lane_order])
encode2 = OrdinalEncoder(categories=[Vehicle_Dimensions_order])
encode3 = OrdinalEncoder(categories=[Fraud_indicator_order])

In [55]:
encode1.fit(new_dataset[['Lane_Type']])
encode2.fit(new_dataset[['Vehicle_Dimensions']])
encode3.fit(new_dataset[['Fraud_indicator']])

In [56]:
new_lane=pd.DataFrame(encode1.transform(new_dataset[['Lane_Type']]))
new_dimensuions=pd.DataFrame(encode2.transform(new_dataset[['Vehicle_Dimensions']]))
new_fraud_indicator=pd.DataFrame(encode3.transform(new_dataset[['Fraud_indicator']]))

In [57]:
new_dataset['Lane_Type']= new_lane
new_dataset['Vehicle_Dimensions']= new_dimensuions
new_dataset['Fraud_indicator']=new_fraud_indicator

In [58]:
# dataset information
new_dataset

Unnamed: 0,Transaction_ID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Vehicle_Speed,Fraud_indicator,Bus,Car,Motorcycle,SUV,Sedan,Truck,Van
0,1,0.0,0.0,350,120,65,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1.0,1.0,120,100,78,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,3,1.0,1.0,0,0,53,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4,1.0,0.0,350,120,92,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5,0.0,2.0,140,100,60,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,1.0,0.0,330,330,81,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4996,4997,0.0,2.0,125,125,64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4997,4998,1.0,2.0,115,115,93,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4998,4999,0.0,0.0,145,145,57,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [59]:
## Handlling missing values

# handlling missing values in dataset with dropping method
data = new_dataset.dropna(how='any')

In [60]:
# checking the missing values of each column
data.isnull().sum()

Transaction_ID        0
Lane_Type             0
Vehicle_Dimensions    0
Transaction_Amount    0
Amount_paid           0
Vehicle_Speed         0
Fraud_indicator       0
Bus                   0
Car                   0
Motorcycle            0
SUV                   0
Sedan                 0
Truck                 0
Van                   0
dtype: int64

In [61]:
#dristibution of legit and fraud transactions
data['Fraud_indicator'].value_counts()

Fraud_indicator
0.0    4017
1.0     983
Name: count, dtype: int64

### Highly Unblanced dataset

### 0-> normal transaction

### 1-> Fraud transaction

In [62]:
# separating data for analysis
legit = data[data.Fraud_indicator == 0]
fraud = data[data.Fraud_indicator == 1]

In [63]:
print(legit.shape)
print(fraud.shape)

(4017, 14)
(983, 14)


In [64]:
#statistical method of the data
legit.Transaction_Amount.describe()

count    4017.000000
mean      153.110530
std       114.435986
min         0.000000
25%        90.000000
50%       125.000000
75%       290.000000
max       350.000000
Name: Transaction_Amount, dtype: float64

In [65]:
fraud.Transaction_Amount.describe()

count    983.000000
mean     193.555443
std       97.465586
min       60.000000
25%      120.000000
50%      145.000000
75%      300.000000
max      350.000000
Name: Transaction_Amount, dtype: float64

In [66]:
#compare the values for both transactions
data.groupby('Fraud_indicator').mean()

Unnamed: 0_level_0,Transaction_ID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Vehicle_Speed,Bus,Car,Motorcycle,SUV,Sedan,Truck,Van
Fraud_indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0.0,2618.24695,0.588748,0.86582,153.11053,153.11053,67.731392,0.13418,0.147374,0.177745,0.131939,0.137665,0.138412,0.132686
1.0,2019.330621,0.501526,0.819939,193.555443,92.83825,68.340793,0.180061,0.12411,0.0,0.187182,0.163784,0.160732,0.18413


### Under Sampling

In [67]:
legit_sample = legit.sample(n=983)

### Concatenating two DataFrames

In [68]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [69]:
new_dataset.head()

Unnamed: 0,Transaction_ID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Vehicle_Speed,Fraud_indicator,Bus,Car,Motorcycle,SUV,Sedan,Truck,Van
1600,1601,0.0,2.0,120,120,70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4531,4532,1.0,1.0,90,90,95,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
855,856,1.0,1.0,60,60,25,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3788,3789,0.0,0.0,340,340,61,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2759,2760,1.0,0.0,350,350,92,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
#dristibution of legit and fraud transactions in new dataset
new_dataset['Fraud_indicator'].value_counts()

Fraud_indicator
0.0    983
1.0    983
Name: count, dtype: int64

In [71]:
new_dataset.groupby('Fraud_indicator').mean()

Unnamed: 0_level_0,Transaction_ID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Vehicle_Speed,Bus,Car,Motorcycle,SUV,Sedan,Truck,Van
Fraud_indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0.0,2570.890132,0.589013,0.843337,156.012208,156.012208,68.089522,0.138352,0.16175,0.158698,0.14649,0.141404,0.133266,0.120041
1.0,2019.330621,0.501526,0.819939,193.555443,92.83825,68.340793,0.180061,0.12411,0.0,0.187182,0.163784,0.160732,0.18413


## Splitting the data into features and targets

In [72]:
X = new_dataset.drop(columns='Fraud_indicator', axis=1)
Y = new_dataset['Fraud_indicator']

## Split data into training and testing

In [73]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, train_size=0.8, stratify=Y, random_state=2)

In [74]:
print(X.shape, X_train.shape, X_test.shape)

(1966, 13) (1572, 13) (394, 13)


In [75]:
pd.DataFrame(X_train)

Unnamed: 0,Transaction_ID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Vehicle_Speed,Bus,Car,Motorcycle,SUV,Sedan,Truck,Van
4685,4686,1.0,1.0,90,90,87,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3758,3759,0.0,2.0,120,100,57,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2181,2182,1.0,2.0,125,90,95,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2881,2882,1.0,0.0,350,350,84,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1079,1080,1.0,1.0,100,100,84,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,120,1.0,0.0,350,350,78,1.0,0.0,0.0,0.0,0.0,0.0,0.0
241,242,1.0,0.0,350,350,75,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4403,4404,1.0,0.0,140,140,51,0.0,0.0,0.0,1.0,0.0,0.0,0.0
899,900,1.0,0.0,290,290,45,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [76]:
pd.DataFrame(Y_train)

Unnamed: 0,Fraud_indicator
4685,0.0
3758,1.0
2181,1.0
2881,0.0
1079,0.0
...,...
119,0.0
241,0.0
4403,0.0
899,0.0


## Model Development

In [77]:
Models = {
    "Decision Tree":DecisionTreeClassifier(),
    "Random Forest":RandomForestClassifier(),
    "Logistic Regression":LogisticRegression(),
    "SVM Classification": SVC()
}

for i in range (len(list(Models))):
    Model=list(Models.values())[i]

    #train Model
    Model.fit(X_train, Y_train)

    #Make predictions
    Y_train_pred = Model.predict(X_train)
    Y_test_pred = Model.predict(X_test)

    #Training Performance
    model_train_Accuracy = accuracy_score(Y_train, Y_train_pred)
    model_train_Precision = precision_score(Y_train, Y_train_pred)
    model_train_recall = recall_score(Y_train, Y_train_pred)
    model_train_F1 = f1_score(Y_train, Y_train_pred, average='weighted')

    #Testing Performance
    model_test_Accuracy = accuracy_score(Y_test, Y_test_pred)
    model_test_Precision = precision_score(Y_test, Y_test_pred)
    model_test_recall = recall_score(Y_test, Y_test_pred)
    model_test_F1 = f1_score(Y_test, Y_test_pred, average='weighted')

    print(list(Models.keys())[i])

    print("Models Performance for Training Set")
    print("- Accuracy: {:.4f}".format(model_train_Accuracy))
    print("- Precision: {:.4f}".format(model_train_Precision))
    print("- Recall: {:.4f}".format(model_train_recall))
    print("- F1 Score: {:.4f}".format(model_train_F1))


    print("--------------------------")

    print("Models Performance for Testing Set")
    print("- Accuracy: {:.4f}".format(model_test_Accuracy))
    print("- Precision: {:.4f}".format(model_test_Precision))
    print("- Recall: {:.4f}".format(model_test_recall))
    print("- F1 Score: {:.4f}".format(model_test_F1))

    print('='*35)
    print('\n')

Decision Tree
Models Performance for Training Set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- F1 Score: 1.0000
--------------------------
Models Performance for Testing Set
- Accuracy: 0.9949
- Precision: 1.0000
- Recall: 0.9898
- F1 Score: 0.9949


Random Forest
Models Performance for Training Set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- F1 Score: 1.0000
--------------------------
Models Performance for Testing Set
- Accuracy: 0.9797
- Precision: 0.9948
- Recall: 0.9645
- F1 Score: 0.9797


Logistic Regression
Models Performance for Training Set
- Accuracy: 0.9777
- Precision: 1.0000
- Recall: 0.9555
- F1 Score: 0.9777
--------------------------
Models Performance for Testing Set
- Accuracy: 0.9848
- Precision: 1.0000
- Recall: 0.9695
- F1 Score: 0.9848




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


SVM Classification
Models Performance for Training Set
- Accuracy: 0.6966
- Precision: 0.7373
- Recall: 0.6107
- F1 Score: 0.6943
--------------------------
Models Performance for Testing Set
- Accuracy: 0.6904
- Precision: 0.7219
- Recall: 0.6193
- F1 Score: 0.6888




In [78]:
import pickle

# Create and train the model
model = RandomForestClassifier()
model.fit(X_train, Y_train)

# Save the model to a pickle file
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(model, file)

print("Model saved to random_forest_model.pkl")


Model saved to random_forest_model.pkl
