# 1. Introduction: Business Goal & Problem Definition

This project's goal is to create a model that identifies a vehicle type from the 18 silhouette features listed below. The dataset, "Features extracted from the silhouette of vehicles", is available on Kaggle. The possible labels for the dependent variable are "bus", "car" and "van".

IF YOU LIKE IT OR IF IT HELPS YOU SOMEHOW, COULD YOU PLEASE UPVOTE? THANK YOU VERY MUCH!!!

* compactness
* circularity
* distance_circularity
* radius_ratio
* pr.axis_aspect_ratio
* max.length_aspect_ratio
* scatter_ratio
* elongatedness
* pr.axis_rectangularity
* max.length_rectangularity
* scaled_variance
* scaled_variance.1
* scaled_radius_of_gyration
* scaled_radius_of_gyration.1
* skewness_about
* skewness_about.1
* skewness_about.2
* hollows_ratio

# 2. Importing Basic Libraries

In [None]:
import io
import openpyxl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 3. Data Collection

In [None]:
# Read the comma-separated vehicle silhouette dataset into a DataFrame.
dataset_path = "../input/vehicle/vehicle.csv"
vehicles_ds = pd.read_csv(dataset_path, sep=",")

# Notebook display of the loaded table.
vehicles_ds

# 4. Data Preliminary Exploration

In [None]:
#Previewing a reproducible random sample of the dataset

# Show up to 100 rows/columns and format floats with thousands separators and two decimals.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 100)
pd.options.display.float_format = "{:,.2f}".format

# random_state is fixed, so the same 10 rows are drawn on every run.
vehicles_ds.sample(n=10, random_state=0)

In [None]:
#Checking dataset info by feature (dtype and non-null count per column)

# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0.
vehicles_ds.info(verbose=True, show_counts=True)

In [None]:
#Counting the zero-valued entries in each column

# Compute the per-column count once, export it, then display it in the notebook.
zeros_per_feature = vehicles_ds.eq(0).sum(axis=0)
zeros_per_feature.to_excel("zeros_per_feature.xlsx")
zeros_per_feature

In [None]:
#Counting rows that are exact duplicates of an earlier row

duplicate_mask = vehicles_ds.duplicated()
duplicate_mask.sum()

In [None]:
#Checking how the rows are distributed across the classes (for classification)

# Absolute count per class, plus the same count as a percentage of all rows.
class_counts = vehicles_ds["class"].value_counts()
total_rows = vehicles_ds.shape[0]

data_balancing = pd.DataFrame()
data_balancing["Count"] = class_counts
data_balancing["Count%"] = class_counts/total_rows*100

data_balancing

In [None]:
#Summarising every column (numeric and non-numeric) with basic statistics

summary_statistics = vehicles_ds.describe(include="all")
summary_statistics

# 5. Data Cleaning

    We'll perform the following:

    1. Encode the dependent variable numerically (label encoding for the ML model, one-hot encoding for the DL model)

    2. Replace missing values (NA) with the mean of their column, so we are able to keep relevant info in all columns


    * no outliers found

In [None]:
#1 Encoding the dependent variable

#for the ML model (label encoding: bus=1, car=2, van=3)
class_labels = ["bus", "car", "van"]
vehicles_ds["class_encoding"] = vehicles_ds["class"].apply(class_labels.index) + 1

#for the DL model (one-hot encoding)
#dtype is pinned so the dummy columns are numeric on every pandas version
#(pandas 2.x would otherwise create bool columns)
class_dummies = pd.get_dummies(vehicles_ds["class"], prefix="class", dtype="uint8")
vehicles_ds = pd.concat([vehicles_ds, class_dummies], axis=1)
#response variable for the DL model: the three dummy columns as a 2-D array
class_encoding_dl = np.asarray(vehicles_ds[["class_bus", "class_car", "class_van"]])

#2 Replacing missing values with the mean of their column

#numeric_only=True skips the text column "class"; without it pandas 2.x raises
#a TypeError when it tries to average strings
vehicles_ds.fillna(vehicles_ds.mean(numeric_only=True), inplace=True)

vehicles_ds.to_excel("vehicles_ds_clean.xlsx")

# 6. Data Exploration

In [None]:
#Plotting Categorical Variables

# Class frequency shown twice: bar chart on the left, pie chart on the right.
class_counts = vehicles_ds["class"].value_counts()
fig, (bar_axis, pie_axis) = plt.subplots(1, 2)
class_counts.plot.bar(color="purple", ax=bar_axis)
class_counts.plot.pie(autopct='%1.1f%%', shadow=True, textprops={"fontsize": 10}, ax=pie_axis)
fig.suptitle("Class Frequency", fontsize=15)
# Tick rotation is set through pyplot, i.e. on whichever axes is current.
plt.xticks(rotation=90)
plt.yticks(rotation=45)


#Plotting Numerical Variables

# One figure per feature with three panels: distribution, box plot, violin plot.
# The list covers all 18 features, including "skewness_about.2".
numerical_features = [
    "compactness",
    "circularity",
    "distance_circularity",
    "radius_ratio",
    "pr.axis_aspect_ratio",
    "max.length_aspect_ratio",
    "scatter_ratio",
    "elongatedness",
    "pr.axis_rectangularity",
    "max.length_rectangularity",
    "scaled_variance",
    "scaled_variance.1",
    "scaled_radius_of_gyration",
    "scaled_radius_of_gyration.1",
    "skewness_about",
    "skewness_about.1",
    "skewness_about.2",
    "hollows_ratio",
]

for feature in numerical_features:
    fig, ax = plt.subplots(1, 3)
    fig.suptitle(f"{feature.capitalize()} Distribution", fontsize=15)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider sns.histplot(..., kde=True) when upgrading.
    sns.distplot(vehicles_ds[feature], ax=ax[0])
    # The data is passed as x= so the plots stay horizontal regardless of how
    # the installed seaborn version interprets a positional Series.
    sns.boxplot(x=vehicles_ds[feature], ax=ax[1])
    sns.violinplot(x=vehicles_ds[feature], ax=ax[2])

In [None]:
#Alternatively using Profile Report to see variables statistics and correlations

# from pandas_profiling import ProfileReport
# profile = ProfileReport(vehicles_ds, title="Vehicles Silhouettes Classification with PCA & DL")
# profile.to_file(output_file="Vehicles Silhouettes Classification with PCA & DL.html")

# 7. Correlations Analysis & Features Selection

We'll study the correlations, but for the purpose of this exercise we'll first use all features and then apply PCA to see how it affects the model accuracy.

In [None]:
#Excluding the original categorical column from the correlation analysis

# DataFrame.drop returns a new frame, so the result must be kept. vehicles_ds
# itself still holds "class" for the later Excel exports.
numeric_ds = vehicles_ds.drop(["class"], axis=1)

#Plotting a Heatmap

fig, ax = plt.subplots(1, figsize=(25,25))
# Correlating only the numeric frame avoids the failure pandas 2.x raises when
# corr() meets a text column.
sns.heatmap(numeric_ds.corr(), annot=True, fmt=",.2f", ax=ax)
plt.title("Heatmap Correlation", fontsize=20)
plt.tick_params(labelsize=12)
plt.xticks(rotation=90)
plt.yticks(rotation=45)

#Plotting a Pairplot

# sns.pairplot(vehicles_ds)

In [None]:
#Plotting a Feature Importance

from xgboost import XGBClassifier
from matplotlib import pyplot
#Defining Xs and y
X = vehicles_ds[["compactness", "circularity", "distance_circularity", "radius_ratio", "pr.axis_aspect_ratio", "max.length_aspect_ratio", "scatter_ratio", "elongatedness", "pr.axis_rectangularity", "max.length_rectangularity", "scaled_variance", "scaled_variance.1", "scaled_radius_of_gyration", "scaled_radius_of_gyration.1", "skewness_about", "skewness_about.1", "skewness_about.2", "hollows_ratio"]]
y = vehicles_ds["class_encoding"]
#Defining the model
#class_encoding is 1-based, while recent XGBoost releases require class labels
#0..n_classes-1, so the labels are shifted for this fit only (y is unchanged)
model = XGBClassifier().fit(X, y - 1)
#Getting importance
importance = model.feature_importances_
#Summarizing feature importance
for feature_name, score in zip(X.columns, importance):
    print("Feature:{0:}, Score:{1:,.4f}".format(feature_name, score))
#Plotting feature importance (order reversed so the first feature ends up at the top of the horizontal bars)
pd.Series(importance[::-1], index=X.columns[::-1]).plot(kind="barh", figsize=(25,25))

# 8. Data Modelling

In [None]:
#Selecting the 18 feature columns and the two target encodings

feature_columns = [
    "compactness", "circularity", "distance_circularity", "radius_ratio",
    "pr.axis_aspect_ratio", "max.length_aspect_ratio", "scatter_ratio",
    "elongatedness", "pr.axis_rectangularity", "max.length_rectangularity",
    "scaled_variance", "scaled_variance.1", "scaled_radius_of_gyration",
    "scaled_radius_of_gyration.1", "skewness_about", "skewness_about.1",
    "skewness_about.2", "hollows_ratio",
]
X = vehicles_ds[feature_columns]
y = vehicles_ds["class_encoding"]
y_dl = class_encoding_dl #one-hot target for the DL model

#Rescaling every feature with a MinMaxScaler fitted on X

from sklearn.preprocessing import MinMaxScaler
sc_X = MinMaxScaler()
X_scaled = pd.DataFrame(sc_X.fit_transform(X))

#Splitting into train and test sets
#Both calls use random_state=0 on inputs of the same length, once per target encoding

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=0)
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_scaled, y_dl, random_state=0) #for the DL model

# 9. Machine Learning Algorithms Implementation & Assessment

# 9.1 Logistic Regression

In [None]:
#Creating a Logistic Regression model and checking its Metrics

from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score


def _weighted_metrics(y_true, y_pred):
    """Return (accuracy, precision, recall, F1); the last three use weighted averaging."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average="weighted"),
        recall_score(y_true, y_pred, average="weighted"),
        f1_score(y_true, y_pred, average="weighted"),
    )


#Trying different polynomial degrees
degrees = [1, 2, 3, 4, 5]
print("Testing degrees:")
for a in degrees:
    poly = PolynomialFeatures(degree=a)
    X_train_degree = poly.fit_transform(X_train)
    #The expansion is fitted on the training data and only applied to the test data
    X_test_degree = poly.transform(X_test)
    model_lr = linear_model.LogisticRegression(max_iter=1000000).fit(X_train_degree, y_train)
    y_preds_train = model_lr.predict(X_train_degree)
    y_preds_test = model_lr.predict(X_test_degree)
    print("Train: Degree:{0:,.0f}, Accuracy:{1:,.3f}, Precision:{2:,.3f}, Recall:{3:,.3f}, F1:{4:,.3f}".format(a, *_weighted_metrics(y_train, y_preds_train)))
    print("Test : Degree:{0:,.0f}, Accuracy:{1:,.3f}, Precision:{2:,.3f}, Recall:{3:,.3f}, F1:{4:,.3f}".format(a, *_weighted_metrics(y_test, y_preds_test)))
print("")

#Choosing the best polynomial degree
chosen_degree = 4
poly = PolynomialFeatures(degree=chosen_degree)

#Working on X_train & X_test in the polynomial chosen degree
X_train_degree = poly.fit_transform(X_train)
X_test_degree = poly.transform(X_test)

#Fitting to the model
#Same max_iter as in the degree search, so the final model matches the one that was compared above
model_lr = linear_model.LogisticRegression(max_iter=1000000).fit(X_train_degree, y_train)
print(f"Logistic Regression Intercept: {model_lr.intercept_}")
print(f"Logistic Regression Coefficients: {model_lr.coef_}, \n")

#Getting the predictions & Metrics
y_preds_train = model_lr.predict(X_train_degree)
y_preds_test = model_lr.predict(X_test_degree)
accuracy_train, precision_train, recall_train, f1_train = _weighted_metrics(y_train, y_preds_train)
accuracy_test, precision_test, recall_test, f1_test = _weighted_metrics(y_test, y_preds_test)
print("Chosen degree:")
print("Train: Degree:{0:,.0f}, Accuracy:{1:,.3f}, Precision:{2:,.3f}, Recall:{3:,.3f}, F1:{4:,.3f}".format(chosen_degree, accuracy_train, precision_train, recall_train, f1_train))
print("Test : Degree:{0:,.0f}, Accuracy:{1:,.3f}, Precision:{2:,.3f}, Recall:{3:,.3f}, F1:{4:,.3f}".format(chosen_degree, accuracy_test, precision_test, recall_test, f1_test))
print("\nConfusion matrix:")
confusion_matrix = pd.crosstab(y_test, y_preds_test, rownames=["Actual"], colnames=["Predicted"])
print(f"{confusion_matrix}, \n")
#'.0f' annotates the cells as whole numbers ('0f' would print six decimals)
sns.heatmap(confusion_matrix, annot=True, fmt='.0f')

#Visualizing y_pred in the dataset
X_degree = poly.transform(X_scaled)
y_preds_all = model_lr.predict(X_degree)
vehicles_ds["class_predicted"] = y_preds_all
vehicles_ds.to_excel("model_lr.xlsx")

# 9.2 Deep Learning

In [None]:
#Creating a Deep Learning model and checking its Metrics

from keras import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

#Creating a model
model_dl = Sequential()

#Input and First Hidden Layer
model_dl.add(Dense(units=256, activation="relu", input_dim=X_train_dl.shape[1]))

#Output Layer (one softmax unit per class)
model_dl.add(Dense(units=3, activation="softmax"))

#Compiling the neural network
model_dl.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["accuracy"])

#Fitting to the model
#EarlyStopping monitors val_loss, so a validation share of the training data is
#held out with validation_split; patience is an integer number of epochs.
#Inputs are converted to float arrays so the split and the loss work regardless
#of the container/dtype produced by the earlier cells.
model_dl.fit(
    np.asarray(X_train_dl, dtype="float32"),
    np.asarray(y_train_dl, dtype="float32"),
    epochs=100,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor="val_loss", patience=10)],
)


def _predicted_class_index(features):
    """Return, for each row, the index of the class with the highest softmax probability."""
    probabilities = model_dl.predict(np.asarray(features, dtype="float32"))
    return np.argmax(probabilities, axis=1)


#Getting the predictions & Metrics
#Classes are compared as indices (0=bus, 1=car, 2=van, the column order of the one-hot target).
#argmax always assigns exactly one class per row, unlike a 0.5 threshold on the softmax output.
y_true_train = np.argmax(y_train_dl, axis=1)
y_true_test = np.argmax(y_test_dl, axis=1)
y_preds_train = _predicted_class_index(X_train_dl)
y_preds_test = _predicted_class_index(X_test_dl)
accuracy_train = accuracy_score(y_true_train, y_preds_train)
accuracy_test = accuracy_score(y_true_test, y_preds_test)
precision_train = precision_score(y_true_train, y_preds_train, average="weighted")
precision_test = precision_score(y_true_test, y_preds_test, average="weighted")
recall_train = recall_score(y_true_train, y_preds_train, average="weighted")
recall_test = recall_score(y_true_test, y_preds_test, average="weighted")
f1_train = f1_score(y_true_train, y_preds_train, average="weighted")
f1_test = f1_score(y_true_test, y_preds_test, average="weighted")
print("Train: Accuracy:{0:,.3f}, Precision:{1:,.3f}, Recall:{2:,.3f}, F1:{3:,.3f}".format(accuracy_train, precision_train, recall_train, f1_train))
print("Test : Accuracy:{0:,.3f}, Precision:{1:,.3f}, Recall:{2:,.3f}, F1:{3:,.3f}".format(accuracy_test, precision_test, recall_test, f1_test))
print("\nConfusion matrix:")
confusion_matrix = pd.crosstab(y_true_test, y_preds_test, rownames=["Actual"], colnames=["Predicted"])
print(f"{confusion_matrix}, \n")
sns.heatmap(confusion_matrix, annot=True, fmt='.0f')

#Visualizing y_pred in the dataset
y_preds_all = _predicted_class_index(X_scaled)
#One-hot flags of the predicted class; the "_predicted_dl" suffix keeps them
#distinct from the true class_bus/class_car/class_van columns
predicted_flags = pd.DataFrame(
    np.eye(3, dtype=bool)[y_preds_all],
    columns=["class_bus_predicted_dl", "class_car_predicted_dl", "class_van_predicted_dl"],
    index=vehicles_ds.index,
)
#Dropping any earlier copy first keeps the cell safe to re-run
vehicles_ds = pd.concat(
    [vehicles_ds.drop(columns=list(predicted_flags.columns), errors="ignore"), predicted_flags],
    axis=1,
)
vehicles_ds.to_excel("model_dl.xlsx")

# 10. Applying PCA

In [None]:
#Applying PCA

from sklearn.decomposition import PCA

#Creating a model that keeps as many components as there are features
n_features = X_scaled.shape[1]
pca = PCA(n_components=n_features, random_state=0)

#Fitting to the model
pca.fit(X_scaled)

#Generating all components in an array
X_pca = pca.transform(X_scaled)
# X_pca_output = pd.DataFrame(X_pca)
# X_pca_output.to_excel("X_pca_file.xlsx",index=False)

#Displaying the explained variance by number of components
#The cumulative sum is computed once instead of on every loop iteration
cumulative_variance_pct = np.cumsum(pca.explained_variance_ratio_ *100)
for n_components, explained_pct in enumerate(cumulative_variance_pct, start=1):
    print(f"Variance explained by the first {n_components} principal components = {explained_pct:.1f}%")
#The x values start at 1 so the axis really shows the number of components
plt.plot(range(1, n_features + 1), np.cumsum(pca.explained_variance_ratio_))
plt.xlabel("Number of components")
plt.ylabel("Explained variance")

#Creating a model with the chosen number of components (#96% explainability = 6 components)
pca_selected = PCA(n_components=6, random_state=0)
pca_selected.fit(X_scaled)
X_pca_selected = pca_selected.transform(X_scaled)
# X_pca_selected_output = pd.DataFrame(X_pca_selected)
# X_pca_selected_output.to_excel("X_pca_selected_file.xlsx",index=False)

#Replacing the train/test sets with their PCA-reduced versions (same random_state as before)
X_train, X_test, y_train, y_test = train_test_split(X_pca_selected, y, random_state=0)
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_pca_selected, y_dl, random_state=0) #for the DL model

# 11. Machine Learning Algorithms Implementation & Assessment with PCA

# 11.1 Logistic Regression

In [None]:
#Creating a Logistic Regression model on the PCA components and checking its Metrics

from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score


def _weighted_metrics(y_true, y_pred):
    """Return (accuracy, precision, recall, F1); the last three use weighted averaging."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average="weighted"),
        recall_score(y_true, y_pred, average="weighted"),
        f1_score(y_true, y_pred, average="weighted"),
    )


#Trying different polynomial degrees
degrees = [1, 2, 3, 4, 5]
print("Testing degrees:")
for a in degrees:
    poly = PolynomialFeatures(degree=a)
    X_train_degree = poly.fit_transform(X_train)
    #The expansion is fitted on the training data and only applied to the test data
    X_test_degree = poly.transform(X_test)
    model_lr = linear_model.LogisticRegression(max_iter=100000).fit(X_train_degree, y_train)
    y_preds_train = model_lr.predict(X_train_degree)
    y_preds_test = model_lr.predict(X_test_degree)
    print("Train: Degree:{0:,.0f}, Accuracy:{1:,.3f}, Precision:{2:,.3f}, Recall:{3:,.3f}, F1:{4:,.3f}".format(a, *_weighted_metrics(y_train, y_preds_train)))
    print("Test : Degree:{0:,.0f}, Accuracy:{1:,.3f}, Precision:{2:,.3f}, Recall:{3:,.3f}, F1:{4:,.3f}".format(a, *_weighted_metrics(y_test, y_preds_test)))
print("")

#Choosing the best polynomial degree
chosen_degree = 5
poly = PolynomialFeatures(degree=chosen_degree)

#Working on X_train & X_test in the polynomial chosen degree
X_train_degree = poly.fit_transform(X_train)
X_test_degree = poly.transform(X_test)

#Fitting to the model
#Same max_iter as in the degree search, so the final model matches the one that was compared above
model_lr = linear_model.LogisticRegression(max_iter=100000).fit(X_train_degree, y_train)
print(f"Logistic Regression Intercept: {model_lr.intercept_}")
print(f"Logistic Regression Coefficients: {model_lr.coef_}, \n")

#Getting the predictions & Metrics
y_preds_train = model_lr.predict(X_train_degree)
y_preds_test = model_lr.predict(X_test_degree)
accuracy_train, precision_train, recall_train, f1_train = _weighted_metrics(y_train, y_preds_train)
accuracy_test, precision_test, recall_test, f1_test = _weighted_metrics(y_test, y_preds_test)
print("Chosen degree:")
print("Train: Degree:{0:,.0f}, Accuracy:{1:,.3f}, Precision:{2:,.3f}, Recall:{3:,.3f}, F1:{4:,.3f}".format(chosen_degree, accuracy_train, precision_train, recall_train, f1_train))
print("Test : Degree:{0:,.0f}, Accuracy:{1:,.3f}, Precision:{2:,.3f}, Recall:{3:,.3f}, F1:{4:,.3f}".format(chosen_degree, accuracy_test, precision_test, recall_test, f1_test))
print("\nConfusion matrix:")
confusion_matrix = pd.crosstab(y_test, y_preds_test, rownames=["Actual"], colnames=["Predicted"])
print(f"{confusion_matrix}, \n")
#'.0f' annotates the cells as whole numbers ('0f' would print six decimals)
sns.heatmap(confusion_matrix, annot=True, fmt='.0f')

#Visualizing y_pred in the dataset
X_degree = poly.transform(X_pca_selected)
y_preds_all = model_lr.predict(X_degree)
vehicles_ds["class_predicted"] = y_preds_all
vehicles_ds.to_excel("model_lr_pca.xlsx")

# 11.2 Deep Learning

In [None]:
#Creating a Deep Learning model on the PCA components and checking its Metrics

from keras import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

#Creating a model
model_dl = Sequential()

#Input and First Hidden Layer
model_dl.add(Dense(units=256, activation="relu", input_dim=X_train_dl.shape[1]))

#Output Layer (one softmax unit per class)
model_dl.add(Dense(units=3, activation="softmax"))

#Compiling the neural network
model_dl.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["accuracy"])

#Fitting to the model
#EarlyStopping monitors val_loss, so a validation share of the training data is
#held out with validation_split; patience is an integer number of epochs.
#Inputs are converted to float arrays so the split and the loss work regardless
#of the container/dtype produced by the earlier cells.
model_dl.fit(
    np.asarray(X_train_dl, dtype="float32"),
    np.asarray(y_train_dl, dtype="float32"),
    epochs=100,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor="val_loss", patience=10)],
)


def _predicted_class_index(features):
    """Return, for each row, the index of the class with the highest softmax probability."""
    probabilities = model_dl.predict(np.asarray(features, dtype="float32"))
    return np.argmax(probabilities, axis=1)


#Getting the predictions & Metrics
#Classes are compared as indices (0=bus, 1=car, 2=van, the column order of the one-hot target).
#argmax always assigns exactly one class per row, unlike a 0.5 threshold on the softmax output.
y_true_train = np.argmax(y_train_dl, axis=1)
y_true_test = np.argmax(y_test_dl, axis=1)
y_preds_train = _predicted_class_index(X_train_dl)
y_preds_test = _predicted_class_index(X_test_dl)
accuracy_train = accuracy_score(y_true_train, y_preds_train)
accuracy_test = accuracy_score(y_true_test, y_preds_test)
precision_train = precision_score(y_true_train, y_preds_train, average="weighted")
precision_test = precision_score(y_true_test, y_preds_test, average="weighted")
recall_train = recall_score(y_true_train, y_preds_train, average="weighted")
recall_test = recall_score(y_true_test, y_preds_test, average="weighted")
f1_train = f1_score(y_true_train, y_preds_train, average="weighted")
f1_test = f1_score(y_true_test, y_preds_test, average="weighted")
print("Train: Accuracy:{0:,.3f}, Precision:{1:,.3f}, Recall:{2:,.3f}, F1:{3:,.3f}".format(accuracy_train, precision_train, recall_train, f1_train))
print("Test : Accuracy:{0:,.3f}, Precision:{1:,.3f}, Recall:{2:,.3f}, F1:{3:,.3f}".format(accuracy_test, precision_test, recall_test, f1_test))
print("\nConfusion matrix:")
confusion_matrix = pd.crosstab(y_true_test, y_preds_test, rownames=["Actual"], colnames=["Predicted"])
print(f"{confusion_matrix}, \n")
sns.heatmap(confusion_matrix, annot=True, fmt='.0f')

#Visualizing y_pred in the dataset
y_preds_all = _predicted_class_index(X_pca_selected)
#One-hot flags of the predicted class; the "_predicted_dl_pca" suffix keeps them
#distinct from the true class_bus/class_car/class_van columns
predicted_flags = pd.DataFrame(
    np.eye(3, dtype=bool)[y_preds_all],
    columns=["class_bus_predicted_dl_pca", "class_car_predicted_dl_pca", "class_van_predicted_dl_pca"],
    index=vehicles_ds.index,
)
#Dropping any earlier copy first keeps the cell safe to re-run
vehicles_ds = pd.concat(
    [vehicles_ds.drop(columns=list(predicted_flags.columns), errors="ignore"), predicted_flags],
    axis=1,
)
vehicles_ds.to_excel("model_dl_pca.xlsx")

# 12. Conclusions

IF YOU LIKE IT OR IF IT HELPS YOU SOMEHOW, COULD YOU PLEASE UPVOTE? THANK YOU VERY MUCH!!!

We were able to implement highly accurate models, testing Logistic Regression and Neural Networks, with and without PCA. Neural Networks without PCA are the most accurate of all (98%), but it's interesting to note that applying PCA still keeps a high accuracy (88%), even though only one third of the columns are used to train the model. Reducing the columns with PCA (from 18 to 6) allows the model to train faster, but it loses explainability, so if the problem requires business clarity on the model I'd choose Logistic Regression without PCA (97% accuracy).