## SVM_Binary_Classification 

In [1]:
from sklearn import svm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt # for plotting data and creating different charts.
import numpy as np # for math and arrays
import pandas as pd # data from for the data.
import seaborn as sns # for plotting.
from sklearn import metrics
import plotly.graph_objects as go #for plotting interactive 3d plots

In [2]:
# Load the labelled samples into a DataFrame and display it.
labelled_csv = "My Labelled Data.csv"
df = pd.read_csv(labelled_csv)
df

Unnamed: 0,Speed,Acceleration,Heading,Label
0,19.959,0.3106,18.1086,0
1,0.862,2.2246,22.9378,0
2,7.188,0.8857,1.1095,0
3,23.404,0.0130,2.0083,0
4,14.895,-1.9161,7.9068,0
...,...,...,...,...
93615,28.542,1.3338,32.6381,0
93616,30.939,0.0198,13.0109,0
93617,25.095,2.2639,16.3054,0
93618,18.596,1.3457,6.6983,0


In [36]:
# List the column names of the labelled dataset.
print(df.columns)

Index(['Speed', 'Acceleration', 'Heading', 'Label'], dtype='object')


In [4]:
# df.shape is a (rows, columns) tuple.
print(f'Number of rows/examples and columns in the dataset: {df.shape}')

Number of rows/examples and columns in the dataset: (93620, 4)


In [5]:
# Summary of the dataset: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93620 entries, 0 to 93619
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Speed         93620 non-null  float64
 1   Acceleration  93620 non-null  float64
 2   Heading       93620 non-null  float64
 3   Label         93620 non-null  int64  
dtypes: float64(3), int64(1)
memory usage: 2.9 MB


In [6]:
# Pairwise correlation matrix of the three input features.
feature_cols = ["Acceleration", "Speed", "Heading"]
df[feature_cols].corr()

Unnamed: 0,Acceleration,Speed,Heading
Acceleration,1.0,-0.050897,-0.100277
Speed,-0.050897,1.0,0.291908
Heading,-0.100277,0.291908,1.0


### Cleaning the data

In [7]:
# Drop every row that contains a missing value.
df = df.dropna()

# NOTE: outlier filtering is currently disabled for the labelled data.
# The thresholds that were tried earlier kept Speed in (0.5, 50] and
# Acceleration in (-5, 4].

In [8]:
#array = np.array(df)
#array.shape

### Split data into train and test

In [9]:
# Hold out 30% of the rows for testing (70% train); the fixed random_state
# makes the split reproducible.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.3)

for split in (train_df, test_df):
    print(split.shape)

(65534, 4)
(28086, 4)


In [10]:
# Detach the target column from both splits. pop() removes 'Label' from the
# frames in place, so re-running this cell without re-running the split cell
# raises KeyError.
train_labels, test_labels = train_df.pop('Label'), test_df.pop('Label')

for target in (train_labels, test_labels):
    print(target.shape)


(65534,)
(28086,)


### Data Normalization

In [11]:
# Summary statistics of the training features, one row per feature. The
# 'mean' and 'std' columns are what norm() uses to standardize the data.
train_stats = train_df.describe().T
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Speed,65534.0,34.366966,22.632251,0.001,15.899,32.0445,45.757,86.988
Acceleration,65534.0,0.290177,1.911935,-4.7989,-0.969075,0.6767,1.8633,2.9
Heading,65534.0,31.548163,23.516633,0.0002,13.541625,27.0694,38.879975,92.9826


In [12]:
# Define a function to normalize (standardize) a dataset.
def norm(x, stats=None):
    """Standardize the columns of ``x`` as ``(x - mean) / std``.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame whose column names match the index of ``stats``.
    stats : pandas.DataFrame, optional
        Frame indexed by feature name with ``'mean'`` and ``'std'`` columns
        (e.g. a transposed ``describe()`` result). When omitted, the
        notebook-level ``train_stats`` is used, so every dataset is scaled
        with the training split's statistics.

    Returns
    -------
    pandas.DataFrame
        The standardized copy of ``x``; ``x`` itself is not modified.
    """
    if stats is None:
        # Looked up at call time so the cell defining train_stats only has to
        # run before the first call, not before this definition.
        stats = train_stats
    # NOTE: a feature whose std is 0 produces inf/NaN here.
    return (x - stats['mean']) / stats['std']
# Scale both splits with the same function so they share one scale.
normed_train_df, normed_test_df = norm(train_df), norm(test_df)

In [13]:
# Show the first 10 rows of the normalized training features.
normed_train_df.head(10)

Unnamed: 0,Speed,Acceleration,Heading
44365,-1.158743,1.305495,-0.26007
19196,0.990535,1.084567,2.285992
7570,-1.300002,-0.332165,-1.188948
48678,-0.159594,-2.312514,0.09456
20812,1.804241,1.056271,2.503485
42767,0.197728,1.027505,-0.915572
17092,1.825715,1.217888,1.954508
90493,-0.660914,0.268745,-0.898044
48752,-1.238452,-1.167444,-1.223005
38470,1.631302,-0.096487,-0.442558


### Training the model (poly)

# Build an SVM classifier with a polynomial kernel.
# kernel may be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' (default 'rbf').
model = svm.SVC(kernel='poly', C=100)  # C is the regularization parameter

# Fit on the raw (un-normalized) training features.
model.fit(train_df, train_labels)

# Predict labels for the held-out test rows.
y_pred_poly = model.predict(test_df)

# Report the test-set accuracy of the prediction.
poly_accuracy = metrics.accuracy_score(test_labels, y_pred_poly)
print("Accuracy:", poly_accuracy)

### Training the model (rbf) 

# Create an SVM classifier with an RBF kernel.
model = svm.SVC(C = 100, # regularization parameter
                kernel='rbf', # one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'; default 'rbf'
               )

# Train the model on the raw (un-normalized) training features.
model.fit(train_df, train_labels)

# Predict the response for the test dataset.
y_pred_rbf = model.predict(test_df)

# Test-set accuracy.
print("Accuracy:",metrics.accuracy_score(test_labels, y_pred_rbf))

# Training-set accuracy, for comparison with the test accuracy above.
# The previous version predicted on normed_train_df (a different scale from
# the raw data this model was fitted on) and scored those predictions against
# test_labels, whose length differs, so accuracy_score raised ValueError.
y_pred_rbf_train = model.predict(train_df)
print("Accuracy:",metrics.accuracy_score(train_labels, y_pred_rbf_train))

### Training the model (rbf, normalized features)

In [14]:
# Build an SVM classifier with an RBF kernel.
# kernel may be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' (default 'rbf').
model = svm.SVC(kernel='rbf', C=100)  # C is the regularization parameter

# Fit on the normalized training features.
model.fit(normed_train_df, train_labels)

# Predict labels for the normalized test rows.
y_pred = model.predict(normed_test_df)

In [37]:
# Test-set accuracy of the RBF model trained on normalized features.
print("Accuracy:",metrics.accuracy_score(test_labels, y_pred))

Accuracy: 0.9423200170903653


# Driver Data

In [73]:
# Load the driver's trip log. The read was previously commented out, which
# left df1 undefined on a fresh kernel (NameError on Restart & Run All).
# NOTE(review): assumes Mahinda.csv already contains the derived
# Speed / Acceleration / Heading columns used below — confirm.
df1 = pd.read_csv("Mahinda.csv")
df1

Unnamed: 0,LOCAL DATE,LOCAL TIME,LATITUDE,N/S,LONGITUDE,E/W,SPEED,HEADING,DISTANCE,Speed,Acceleration,Heading
0,2023/03/24,14:37:45,5.962556,N,80.665809,E,0.307 km/h,103.758041,125188.79 M,0.307,0.000000,0.000000
1,2023/03/24,14:37:46,5.962575,N,80.665828,E,0.537 km/h,103.758041,3.73 M,0.537,0.063889,0.000000
2,2023/03/24,14:37:47,5.962564,N,80.665809,E,0.244 km/h,103.758041,3.84 M,0.244,-0.081389,0.000000
3,2023/03/24,14:37:48,5.962558,N,80.665801,E,0.105 km/h,103.758041,1.27 M,0.105,-0.038611,0.000000
4,2023/03/24,14:37:49,5.962550,N,80.665782,E,0.072 km/h,103.758041,3.39 M,0.072,-0.009167,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
44393,2023/03/24,17:46:51,5.962545,N,80.665886,E,1.315 km/h,144.077164,2.13 M,1.315,-0.007222,0.035126
44394,2023/03/24,17:46:52,5.962544,N,80.665889,E,0.258 km/h,144.077164,0.37 M,0.258,-0.293611,0.000000
44395,2023/03/24,17:46:53,5.962545,N,80.665889,E,1.442 km/h,144.077164,0.07 M,1.442,0.328889,0.000000
44396,2023/03/24,17:46:54,5.962546,N,80.665886,E,1.174 km/h,144.077164,0.40 M,1.174,-0.074444,0.000000


## Cleaning Data

In [74]:
# Remove the rows with NA values.
df1 = df1.dropna()

# Keep only rows whose features fall inside the accepted ranges:
#   Speed        in (0.5, 90]
#   Acceleration in (-5, 4]
#   Heading      <= 95
# With NA rows already dropped, this keeps exactly the rows the previous
# sequence of `~(...)` filters kept.
in_range = (
    (df1['Speed'] > 0.5) & (df1['Speed'] <= 90)
    & (df1['Acceleration'] > -5) & (df1['Acceleration'] <= 4)
    & (df1['Heading'] <= 95)
)

# .copy() detaches the result from the unfiltered frame, so the later cells
# that add columns to df1 do not trigger pandas' SettingWithCopyWarning.
df1 = df1[in_range].copy()

df1.shape

(31438, 12)

In [75]:
# Keep just the three feature columns that the classifier was trained on.
columns = ['Speed', 'Acceleration', 'Heading']
df1_selected = df1.loc[:, columns]
df1_selected.shape

(31438, 3)

### Data Normalization

In [76]:
# Summary statistics of the driver features, one row per feature.
df1_stats = df1_selected.describe().T
df1_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Speed,31438.0,24.718789,14.36457,0.501,13.574,28.752,36.228,58.888
Acceleration,31438.0,-0.001019,0.522739,-4.248333,-0.196111,0.007778,0.212778,3.930833
Heading,31438.0,3.283211,6.655603,0.0,0.403564,1.281185,3.308433,94.017212


In [77]:
# Normalize the driver features with norm(), which scales by train_stats
# (the labelled training split's mean/std) rather than by df1_stats.
normed_df1 = norm(df1_selected)

In [78]:
# Predict a label for every cleaned driver row. `model` is whichever
# classifier was fitted last.
# NOTE(review): going by the numbered cells, that is the RBF SVC trained on
# normed_train_df — confirm before running cells out of order.
y_pred_driver = model.predict(normed_df1)

In [79]:
# Attach the predicted labels to the driver data as a new 'Labels' column.
df1['Labels'] = y_pred_driver
df1.shape

(31438, 13)

In [80]:
# Add a 1-based running row number as an 'Index' column.
indexs = np.arange(len(y_pred_driver)) + 1
df1['Index'] = indexs
df1

Unnamed: 0,LOCAL DATE,LOCAL TIME,LATITUDE,N/S,LONGITUDE,E/W,SPEED,HEADING,DISTANCE,Speed,Acceleration,Heading,Labels,Index
1,2023/03/24,14:37:46,5.962575,N,80.665828,E,0.537 km/h,103.758041,3.73 M,0.537,0.063889,0.000000,0,1
11,2023/03/24,14:37:56,5.962504,N,80.665729,E,0.504 km/h,103.758041,0.95 M,0.504,0.091944,0.000000,0,2
38,2023/03/24,14:38:23,5.962506,N,80.665723,E,0.706 km/h,103.758041,0.01 M,0.706,0.189722,0.000000,0,3
88,2023/03/24,14:39:13,5.962502,N,80.665728,E,0.553 km/h,103.758072,0.35 M,0.553,0.103611,0.000000,0,4
116,2023/03/24,14:39:41,5.962504,N,80.665721,E,4.742 km/h,352.405487,1.40 M,4.742,0.508889,1.528442,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44392,2023/03/24,17:46:50,5.962543,N,80.665867,E,1.341 km/h,144.042038,0.98 M,1.341,-0.114722,0.683670,0,31434
44393,2023/03/24,17:46:51,5.962545,N,80.665886,E,1.315 km/h,144.077164,2.13 M,1.315,-0.007222,0.035126,0,31435
44395,2023/03/24,17:46:53,5.962545,N,80.665889,E,1.442 km/h,144.077164,0.07 M,1.442,0.328889,0.000000,0,31436
44396,2023/03/24,17:46:54,5.962546,N,80.665886,E,1.174 km/h,144.077164,0.40 M,1.174,-0.074444,0.000000,0,31437


In [81]:
#df1.to_csv('Mahinda_output.csv', index=False)

In [82]:
# Count the predicted labels: 0 = not aggressive, 1 = aggressive.
# Vectorized comparisons replace the Python-level loop over the Series.
labels = df1['Labels']
countN = int((labels == 0).sum())
countA = int((labels == 1).sum())
total = countN + countA

print("Not aggressive count:",countN)
print("Aggressive count:",countA)
print("Totoal count:",total)

# Guard against an empty result (e.g. every row filtered out during cleaning),
# which previously raised ZeroDivisionError.
if total:
    print("Aggressive percentage:",countA/total*100)
else:
    print("Aggressive percentage:",float('nan'))

Not aggressive count: 30690
Aggressive count: 748
Totoal count: 31438
Aggressive percentage: 2.3792862141357594
