In [125]:
import pandas as pd
import numpy as np

# Load the CSV files
file_paths = ['./data/opel_corsa_01.csv', './data/opel_corsa_02.csv',
              './data/peugeot_207_01.csv', './data/peugeot_207_02.csv']

In [126]:
# Load the datasets, specifying semicolon as the delimiter
data_frames = [pd.read_csv(file_path, delimiter=';') for file_path in file_paths]

In [127]:
# list the columns in the dataset
print(data_frames[0].columns)

Index(['Unnamed: 0', 'AltitudeVariation', 'VehicleSpeedInstantaneous',
       'VehicleSpeedAverage', 'VehicleSpeedVariance', 'VehicleSpeedVariation',
       'LongitudinalAcceleration', 'EngineLoad', 'EngineCoolantTemperature',
       'ManifoldAbsolutePressure', 'EngineRPM', 'MassAirFlow',
       'IntakeAirTemperature', 'VerticalAcceleration',
       'FuelConsumptionAverage', 'roadSurface', 'traffic', 'drivingStyle'],
      dtype='object')


In [128]:
print(data_frames[0].head())

   Unnamed: 0  AltitudeVariation  VehicleSpeedInstantaneous  \
0          59          -2.299988                  25.670519   
1          60          -2.099976                  24.094259   
2          61          -1.500000                  22.743179   
3          62           0.100037                  22.292820   
4          63           0.099976                  23.643900   

   VehicleSpeedAverage  VehicleSpeedVariance  VehicleSpeedVariation  \
0            13.223501            121.592690              -2.476980   
1            13.638919            120.422571              -1.576260   
2            14.031043            118.456769              -1.351080   
3            14.171073            117.571308              -0.450359   
4            14.328954            117.074149               1.351080   

   LongitudinalAcceleration  EngineLoad  EngineCoolantTemperature  \
0                    0.3555    4.705883                        68   
1                    0.4492   10.588236                 

In [129]:
# Combine the datasets
data = pd.concat(data_frames)

# list the columns in the dataset
print(data.columns)

Index(['Unnamed: 0', 'AltitudeVariation', 'VehicleSpeedInstantaneous',
       'VehicleSpeedAverage', 'VehicleSpeedVariance', 'VehicleSpeedVariation',
       'LongitudinalAcceleration', 'EngineLoad', 'EngineCoolantTemperature',
       'ManifoldAbsolutePressure', 'EngineRPM', 'MassAirFlow',
       'IntakeAirTemperature', 'VerticalAcceleration',
       'FuelConsumptionAverage', 'roadSurface', 'traffic', 'drivingStyle'],
      dtype='object')


In [130]:
# remove the first column of data
data = data.drop('Unnamed: 0', axis=1)
    

In [131]:
print(data.head())

   AltitudeVariation  VehicleSpeedInstantaneous  VehicleSpeedAverage  \
0          -2.299988                  25.670519            13.223501   
1          -2.099976                  24.094259            13.638919   
2          -1.500000                  22.743179            14.031043   
3           0.100037                  22.292820            14.171073   
4           0.099976                  23.643900            14.328954   

   VehicleSpeedVariance  VehicleSpeedVariation  LongitudinalAcceleration  \
0            121.592690              -2.476980                    0.3555   
1            120.422571              -1.576260                    0.4492   
2            118.456769              -1.351080                    0.4258   
3            117.571308              -0.450359                    0.4140   
4            117.074149               1.351080                    0.3945   

   EngineLoad  EngineCoolantTemperature  ManifoldAbsolutePressure  EngineRPM  \
0    4.705883                 

In [132]:
# print the shape of the data
print(data.shape)

(23775, 17)


In [133]:
# tail of data
print(data.tail())

      AltitudeVariation  VehicleSpeedInstantaneous  VehicleSpeedAverage  \
4441           1.000000                  28.799999            28.559999   
4442           1.699997                  30.599998            28.529999   
4443           1.800003                  29.699999            28.499999   
4444           2.100006                  29.699999            28.409999   
4445           1.500000                  33.299999            28.349999   

      VehicleSpeedVariance  VehicleSpeedVariation  LongitudinalAcceleration  \
4441             57.190571               3.600000                   -0.0292   
4442             57.010266               1.799999                   -0.0304   
4443             56.883045              -0.900000                   -0.1684   
4444             56.160910               0.000000                   -0.0644   
4445             55.340843               3.600000                   -0.1817   

      EngineLoad  EngineCoolantTemperature  ManifoldAbsolutePressure  \
44

In [134]:
# info of data
print(data.info())


<class 'pandas.core.frame.DataFrame'>
Index: 23775 entries, 0 to 4445
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   AltitudeVariation          23775 non-null  float64
 1   VehicleSpeedInstantaneous  23766 non-null  float64
 2   VehicleSpeedAverage        23775 non-null  float64
 3   VehicleSpeedVariance       23775 non-null  float64
 4   VehicleSpeedVariation      23775 non-null  float64
 5   LongitudinalAcceleration   23775 non-null  float64
 6   EngineLoad                 23770 non-null  float64
 7   EngineCoolantTemperature   23770 non-null  float64
 8   ManifoldAbsolutePressure   23770 non-null  float64
 9   EngineRPM                  23770 non-null  float64
 10  MassAirFlow                23770 non-null  float64
 11  IntakeAirTemperature       23770 non-null  float64
 12  VerticalAcceleration       23775 non-null  float64
 13  FuelConsumptionAverage     23770 non-null  float64
 

In [135]:
road_surf_unique = list(data["roadSurface"].unique())
traffic_unique = list(data["traffic"].unique())
drivingStyle_unique = list(data["drivingStyle"].unique())

print(road_surf_unique)
print(traffic_unique)
print(drivingStyle_unique)

['SmoothCondition', 'UnevenCondition', 'FullOfHolesCondition']
['LowCongestionCondition', 'NormalCongestionCondition', 'HighCongestionCondition']
['EvenPaceStyle', 'AggressiveStyle']


In [136]:
final_list = []
final_list.extend(road_surf_unique)
final_list.extend(road_surf_unique)
final_list.extend(road_surf_unique)
print(final_list)

['SmoothCondition', 'UnevenCondition', 'FullOfHolesCondition', 'SmoothCondition', 'UnevenCondition', 'FullOfHolesCondition', 'SmoothCondition', 'UnevenCondition', 'FullOfHolesCondition']


In [137]:
one_hot_encoding = pd.get_dummies(data[['roadSurface', "traffic", "drivingStyle"]]) # one hot encoding of the categorical variables in the dataset 
one_hot_encoding = one_hot_encoding.astype(int) # convert the one hot encoding to integer type
one_hot_encoding


Unnamed: 0,roadSurface_FullOfHolesCondition,roadSurface_SmoothCondition,roadSurface_UnevenCondition,traffic_HighCongestionCondition,traffic_LowCongestionCondition,traffic_NormalCongestionCondition,drivingStyle_AggressiveStyle,drivingStyle_EvenPaceStyle
0,0,1,0,0,1,0,0,1
1,0,1,0,0,1,0,0,1
2,0,1,0,0,1,0,0,1
3,0,1,0,0,1,0,0,1
4,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...
4441,0,1,0,0,1,0,0,1
4442,0,1,0,0,1,0,0,1
4443,0,1,0,0,1,0,0,1
4444,0,1,0,0,1,0,0,1


In [138]:
one_hot_encoding['Target']= one_hot_encoding.values.tolist() # convert the one hot encoding to list and add it to the dataframe
one_hot_encoding 

Unnamed: 0,roadSurface_FullOfHolesCondition,roadSurface_SmoothCondition,roadSurface_UnevenCondition,traffic_HighCongestionCondition,traffic_LowCongestionCondition,traffic_NormalCongestionCondition,drivingStyle_AggressiveStyle,drivingStyle_EvenPaceStyle,Target
0,0,1,0,0,1,0,0,1,"[0, 1, 0, 0, 1, 0, 0, 1]"
1,0,1,0,0,1,0,0,1,"[0, 1, 0, 0, 1, 0, 0, 1]"
2,0,1,0,0,1,0,0,1,"[0, 1, 0, 0, 1, 0, 0, 1]"
3,0,1,0,0,1,0,0,1,"[0, 1, 0, 0, 1, 0, 0, 1]"
4,0,1,0,0,1,0,0,1,"[0, 1, 0, 0, 1, 0, 0, 1]"
...,...,...,...,...,...,...,...,...,...
4441,0,1,0,0,1,0,0,1,"[0, 1, 0, 0, 1, 0, 0, 1]"
4442,0,1,0,0,1,0,0,1,"[0, 1, 0, 0, 1, 0, 0, 1]"
4443,0,1,0,0,1,0,0,1,"[0, 1, 0, 0, 1, 0, 0, 1]"
4444,0,1,0,0,1,0,0,1,"[0, 1, 0, 0, 1, 0, 0, 1]"


In [139]:
df_processed = data.drop(["roadSurface", "traffic", "drivingStyle"], axis = 1) # drop the categorical variables from the dataset 
df_processed['Target'] = one_hot_encoding['Target'] # add the target variable to the dataset 
df_processed

Unnamed: 0,AltitudeVariation,VehicleSpeedInstantaneous,VehicleSpeedAverage,VehicleSpeedVariance,VehicleSpeedVariation,LongitudinalAcceleration,EngineLoad,EngineCoolantTemperature,ManifoldAbsolutePressure,EngineRPM,MassAirFlow,IntakeAirTemperature,VerticalAcceleration,FuelConsumptionAverage,Target
0,-2.299988,25.670519,13.223501,121.592690,-2.476980,0.3555,4.705883,68.0,106.0,1796.0,15.810000,24.0,-0.1133,19.497335,"[0, 1, 0, 0, 1, 0, 0, 1]"
1,-2.099976,24.094259,13.638919,120.422571,-1.576260,0.4492,10.588236,68.0,103.0,1689.0,14.650000,22.0,-0.1289,19.515722,"[0, 1, 0, 0, 1, 0, 0, 1]"
2,-1.500000,22.743179,14.031043,118.456769,-1.351080,0.4258,27.450981,68.0,103.0,1599.0,11.850000,21.0,-0.1328,19.441765,"[0, 1, 0, 0, 1, 0, 0, 1]"
3,0.100037,22.292820,14.171073,117.571308,-0.450359,0.4140,24.313726,69.0,104.0,1620.0,12.210000,20.0,-0.0859,19.388769,"[0, 1, 0, 0, 1, 0, 0, 1]"
4,0.099976,23.643900,14.328954,117.074149,1.351080,0.3945,20.000000,69.0,104.0,1708.0,11.910000,21.0,-0.0664,19.301638,"[0, 1, 0, 0, 1, 0, 0, 1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4441,1.000000,28.799999,28.559999,57.190571,3.600000,-0.0292,25.882353,81.0,115.0,1755.5,20.469999,25.0,-0.1661,14.578003,"[0, 1, 0, 0, 1, 0, 0, 1]"
4442,1.699997,30.599998,28.529999,57.010266,1.799999,-0.0304,11.764706,81.0,106.0,736.5,17.740000,25.0,-0.1987,14.585642,"[0, 1, 0, 0, 1, 0, 0, 1]"
4443,1.800003,29.699999,28.499999,56.883045,-0.900000,-0.1684,98.039215,81.0,106.0,1254.0,9.520000,24.0,-0.1156,14.547294,"[0, 1, 0, 0, 1, 0, 0, 1]"
4444,2.100006,29.699999,28.409999,56.160910,0.000000,-0.0644,79.607841,80.0,112.0,1254.0,14.910000,23.0,-0.0760,14.546828,"[0, 1, 0, 0, 1, 0, 0, 1]"


In [140]:
from sklearn.utils import shuffle
df_processed = shuffle(df_processed)

In [141]:
x_train = df_processed.drop(['Target'], axis=1)
y_train = np.array(df_processed['Target'].values.tolist())
print(x_train.shape)
print(y_train.shape)

(23775, 14)
(23775, 8)


In [142]:
print(data['drivingStyle'].value_counts()) # count the number of each driving style in the dataset  

drivingStyle
EvenPaceStyle      21016
AggressiveStyle     2759
Name: count, dtype: int64


In [143]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=1) # split the dataset into training and testing sets
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape) # print the shape of the training and testing sets

(19020, 14) (4755, 14) (19020, 8) (4755, 8)


In [145]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() # create an instance of the standard scaler
scaler.fit(x_train) # fit the scaler to the training data
x_train_transformed = scaler.transform(x_train) # transform the training data
x_test_transformed = scaler.transform(x_test) # transform the testing data

print(x_train_transformed.shape, x_test_transformed.shape) # print the shape of the transformed training and testing sets

(19020, 14) (4755, 14)


In [146]:
print(y_train.shape, y_test.shape) # print the shape of the training and testing sets

(19020, 8) (4755, 8)
