In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found beneath it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the dataset

In [None]:
import pandas as pd

# Each recording is a ';'-delimited CSV; two files per vehicle.
df_peugot_1 = pd.read_csv("/kaggle/input/traffic-driving-style-road-surface-condition/peugeot_207_01.csv", delimiter=';')
df_peugot_2 = pd.read_csv("/kaggle/input/traffic-driving-style-road-surface-condition/peugeot_207_02.csv", delimiter=';')

df_corsa_1 = pd.read_csv("/kaggle/input/traffic-driving-style-road-surface-condition/opel_corsa_01.csv", delimiter=';')
df_corsa_2 = pd.read_csv("/kaggle/input/traffic-driving-style-road-surface-condition/opel_corsa_02.csv", delimiter=';')

# Stack the four recordings row-wise into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every file's
# row labels are kept, so the combined index contains duplicates and any later
# label-aligned operation on it becomes ambiguous.
df_new = pd.concat(
    [df_peugot_1, df_peugot_2, df_corsa_1, df_corsa_2],
    axis=0,
    ignore_index=True,
)
print(df_new.shape)

In [None]:
# Row/column counts of each individual recording, Peugeot files first, then Corsa files.
for recording in (df_peugot_1, df_peugot_2, df_corsa_1, df_corsa_2):
    print(recording.shape)

In [None]:
# Preview the last five rows of the combined frame.
df_new.tail(5)

In [None]:
# Print the column summary of the combined frame before any cleaning is applied.
df_new.info()

In [None]:
# Summary statistics of the data before imputation. A notebook only displays a bare
# expression when it is the last statement of the cell, so this is printed explicitly.
print(df_new.describe())

# Replace every missing value with 0.
# NOTE(review): this applies to all columns, including any non-numeric ones - confirm
# that 0 is an acceptable placeholder there as well.
df_new = df_new.fillna(0)

In [None]:
def process_col_values(df_new, col_name):
    """Convert one column of ``df_new`` to numbers, in place.

    Each entry is handled as follows:
      * anything equal to 0 becomes the integer 0;
      * a ``str`` has its commas replaced by dots and is parsed as a float;
      * everything else is passed through ``float()``.

    Args:
        df_new: DataFrame whose column is overwritten.
        col_name: Name of the column to convert.

    Returns:
        None. The converted values are written back to ``df_new[col_name]``.
    """
    def _to_number(raw):
        if raw == 0:
            return 0
        if type(raw) == str:
            # Decimal comma -> decimal point before parsing.
            return float(raw.replace(",", "."))
        return float(raw)

    df_new[col_name] = [_to_number(raw) for raw in list(df_new[col_name])]

In [None]:
# Columns that are run through process_col_values to become numeric.
cols_to_change = [
    "AltitudeVariation",
    "VehicleSpeedInstantaneous",
    "VehicleSpeedAverage",
    "VehicleSpeedVariance",
    "VehicleSpeedVariation",
    "LongitudinalAcceleration",
    "EngineLoad",
    "EngineRPM",
    "MassAirFlow",
    "VerticalAcceleration",
    "FuelConsumptionAverage",
]

# Convert one column at a time, printing its name first so a failing column is easy to spot.
for column_name in cols_to_change:
    print(column_name)
    process_col_values(df_new, column_name)

In [None]:
# Print the column summary again to check the result of the numeric conversion above.
df_new.info()

In [None]:
# Preview the last rows of the frame after the numeric conversion.
df_new.tail()

In [None]:
# Distinct values of each of the three label columns, in order of first appearance.
road_surf_unique, traffic_unique, drivingStyle_unique = (
    list(df_new[label_column].unique())
    for label_column in ("roadSurface", "traffic", "drivingStyle")
)

for distinct_labels in (road_surf_unique, traffic_unique, drivingStyle_unique):
    print(distinct_labels)

In [None]:
# Flat list of every distinct label: road surface, then traffic, then driving style.
# (Previously the road-surface labels were added three times and the other two
# label sets were never included.)
final_list = []
final_list.extend(road_surf_unique)
final_list.extend(traffic_unique)
final_list.extend(drivingStyle_unique)
print(final_list)

In [None]:
# One indicator column per distinct value of each label column.
# dtype=int pins the indicators to 0/1 integers; the default dtype differs between
# pandas versions, which would otherwise change the type of the training targets.
one_hot_encoding = pd.get_dummies(df_new[['roadSurface', "traffic", "drivingStyle"]], dtype=int)
one_hot_encoding

In [None]:
# Pack each row's indicator values into a single list stored in a 'Target' column.
# Any existing 'Target' column is excluded first so that re-running this cell does not
# nest the previous target list inside the new one.
indicator_columns = one_hot_encoding.drop(columns=['Target'], errors='ignore')
one_hot_encoding['Target'] = indicator_columns.values.tolist()
one_hot_encoding

In [None]:
# Feature frame: the original data without the three raw label columns.
df_processed = df_new.drop(["roadSurface", "traffic", "drivingStyle"], axis = 1)

# Attach the packed target by position. one_hot_encoding was derived from df_new row for
# row, so positions match; assigning the raw values avoids label-based alignment, which
# is ambiguous when the index contains duplicate labels.
df_processed['Target'] = one_hot_encoding['Target'].values
df_processed

In [None]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed so the row order - and therefore the later
# train/test split and reported scores - is the same on every fresh run.
df_processed = shuffle(df_processed, random_state=1)

In [None]:
# Features are every column except 'Target'; labels are the per-row target lists
# stacked into a NumPy array.
x_train = df_processed.drop(columns=['Target'])
y_train = np.array(df_processed['Target'].tolist())

for part in (x_train, y_train):
    print(part.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Rebuild the full label array from df_processed instead of reading `y_train`: this cell
# rebinds `y_train` to the 80% training labels, so feeding `y_train` back in would fail
# with mismatched lengths if the cell were run a second time.
y_all = np.array(df_processed['Target'].values.tolist())

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(x_train, y_all, test_size=0.2, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
from sklearn import preprocessing

# Fit the min-max scaler on the training split only, then apply the same fitted
# scaling to both splits.
min_max_scaler = preprocessing.MinMaxScaler()
# NOTE: despite its name, `x_scaled` holds whatever fit() returns, not the scaled data.
x_scaled = min_max_scaler.fit(X_train)

x_train_transformed, x_test_transformed = (
    min_max_scaler.transform(split) for split in (X_train, X_test)
)

for scaled_split in (x_train_transformed, x_test_transformed):
    print(scaled_split.shape)

In [None]:
# Shapes of the training and test label arrays.
for labels in (y_train, y_test):
    print(labels.shape)

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest used as the base estimator of a multi-output wrapper,
# trained on the scaled training features and the packed indicator targets.
forest = RandomForestClassifier(random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
multi_target_forest.fit(x_train_transformed, y_train)

In [None]:
# Predict the indicator targets for the scaled test features and display them.
y_test_predicted = multi_target_forest.predict(x_test_transformed)
y_test_predicted

In [None]:
# True and predicted test labels should have matching shapes.
for label_array in (y_test, y_test_predicted):
    print(label_array.shape)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the predictions against the held-out test labels.
# NOTE(review): with multi-column indicator targets this is presumably exact-match
# accuracy over the whole row - confirm against the scikit-learn docs.
score = accuracy_score(y_test, y_test_predicted)
score

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Same multi-output setup as before, but with a seeded decision tree as base estimator.
# NOTE(review): this rebinds `forest`, `multi_target_forest`, `y_test_predicted` and
# `score`, so from here on those names refer to the decision-tree model and its results.
forest = DecisionTreeClassifier(random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
multi_target_forest.fit(x_train_transformed, y_train)

# Evaluate on the held-out test split.
y_test_predicted = multi_target_forest.predict(x_test_transformed)
score = accuracy_score(y_test, y_test_predicted)
print(score)

In [None]:
# Indicator column names, excluding the trailing 'Target' column.
cols = list(one_hot_encoding.columns)[:-1]

# Strip the "<source column>_" prefix to recover the label value. Split only at the
# first underscore so a label value that itself contains "_" is kept whole.
ordered_values = [column_name.split("_", 1)[1] for column_name in cols]

print(ordered_values)
print(len(ordered_values))

In [None]:
# Shape of the most recent prediction array.
print(y_test_predicted.shape)

In [None]:
def convert_results(y_test_predicted, ordered_values):
    """Translate indicator rows back into lists of label names.

    Args:
        y_test_predicted: Iterable of rows, each an iterable of indicator values.
        ordered_values: Label names, where ``ordered_values[k]`` names position ``k``
            of every row.

    Returns:
        A list with one entry per row: the names at the positions whose value
        equals 1, in positional order. A row with no such position yields ``[]``.
    """
    return [
        [ordered_values[position] for position, flag in enumerate(list(row)) if flag == 1]
        for row in y_test_predicted
    ]


# Decode the predicted indicator rows into label names and show the first ten.
final_prediction = convert_results(y_test_predicted, ordered_values)
final_prediction[:10]

# Modelling

1. Standardize the data.
2. Split the data into train, cross-validation, and test sets.
3. Try out multiple models.
4. Plot accuracies for multiple models.

Try out - 

1. Different imputation values for NaN.
2. Try to balance the data if there is imbalance.
3. Do exploratory data analysis.