<a href="https://colab.research.google.com/github/sarahajbane/notebooks/blob/main/car_shapes_supervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import tree
from sklearn.pipeline import make_pipeline

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

## Import and clean data

In [None]:
data = !wget -O 'data.csv' 'https://drive.google.com/uc?id=1MnXNtmjQcFjAtM4IUrGlPY3_P820cRD3'
df = pd.read_csv("/content/data.csv")
df.head()

# OR load from local current directory or drive content folder if already present & connected
# df = pd.read_csv("./vehicle.csv")

Unnamed: 0,compactness,circularity,distance_circularity,radius_ratio,pr.axis_aspect_ratio,max.length_aspect_ratio,scatter_ratio,elongatedness,pr.axis_rectangularity,max.length_rectangularity,scaled_variance,scaled_variance.1,scaled_radius_of_gyration,scaled_radius_of_gyration.1,skewness_about,skewness_about.1,skewness_about.2,hollows_ratio,class
0,95,48.0,83.0,178.0,72.0,10,162.0,42.0,20.0,159,176.0,379.0,184.0,70.0,6.0,16.0,187.0,197,van
1,91,41.0,84.0,141.0,57.0,9,149.0,45.0,19.0,143,170.0,330.0,158.0,72.0,9.0,14.0,189.0,199,van
2,104,50.0,106.0,209.0,66.0,10,207.0,32.0,23.0,158,223.0,635.0,220.0,73.0,14.0,9.0,188.0,196,car
3,93,41.0,82.0,159.0,63.0,9,144.0,46.0,19.0,143,160.0,309.0,127.0,63.0,6.0,10.0,199.0,207,van
4,85,44.0,70.0,205.0,103.0,52,149.0,45.0,19.0,144,241.0,325.0,188.0,127.0,9.0,11.0,180.0,183,bus


In [None]:
# Column dtypes and non-null counts, to spot columns with missing values
df.info()

In [None]:
# Number of missing values per column
df.isna().sum()

In [None]:
# Number of fully duplicated rows
df.duplicated().sum()

All columns containing NaNs are float64, yet every float value is a whole number (.0 only). With few NaNs overall and no fractional values, it is simpler to drop the affected rows than to impute non-integer values or to convert all numerical data to integers before imputation.

In [None]:
# Keep only complete rows (rebinding instead of mutating in place), then
# persist the cleaned frame without the index column.
df = df.dropna()
df.to_csv("./clean_vehicle.csv", index=False)

## EDA

In [None]:
# Reload the cleaned dataframe if necessary
# df = pd.read_csv("./clean_vehicle.csv")

In [None]:
# Distinct labels present in the target column
df['class'].unique()

In [None]:
# Class balance of the target.
# The frame and column are passed by keyword: in current seaborn the first
# positional argument of countplot is `data`, so passing the Series
# positionally no longer means "count the values of this column".
plt.figure(figsize=(4, 2))
sns.countplot(data=df, x='class')
plt.show()

Data seems a little unbalanced off the bat with twice as many cars as buses and vans, due to combining two car types in the same class.

In [None]:
# One histogram per numeric column (15 bins each) on a shared 15x10 canvas;
# tight_layout keeps the per-panel titles from overlapping.
df.hist(bins=15, figsize=(15, 10))
plt.tight_layout()

pr.axis_rectangularity might delineate the different classes, but this is unclear from this figure alone, and the counts seem off.

In [None]:
# Replot the histograms coloured by class, with a KDE overlay, to compare the
# distribution of every feature across the vehicle classes.
feature_columns = df.drop(columns=['class']).columns
n_cols = 3
# Derive the number of rows from the feature count instead of hard-coding 6x3.
n_rows = int(np.ceil(len(feature_columns) / n_cols))

plt.figure(figsize=(10, 20))
for i, column in enumerate(feature_columns):
    ax = plt.subplot(n_rows, n_cols, i + 1)
    # Only one panel carries the legend. It is drawn by seaborn itself so that
    # labels and colours are bound to the hue levels; rebuilding it with
    # plt.legend(labels=...) assigns labels by artist order and can mislabel
    # the classes.
    show_legend = column == 'distance_circularity'
    sns.histplot(data=df, x=column, hue='class', bins=15, kde=True,
                 palette='Set2', legend=show_legend, ax=ax)
    ax.set_title(f'{column} by class')
    ax.set_xlabel('')
    ax.set_ylabel('')
    if show_legend:
        sns.move_legend(ax, 'upper left', title='vehicle class')

# Lay out once, after all panels exist, rather than on every iteration.
plt.tight_layout()
plt.show()

There is no clear separation in distribution between the classes. The higher peaks for cars are just an artefact of that class being twice as numerous.
We will need a model that uses all features to predict the class, as none stands out in particular for feature selection, and 18 features are manageable.

In [None]:
# Pairwise correlations of the features (target column excluded), computed once.
feature_corr = df.drop(columns=['class']).corr()

# Hide the upper triangle, diagonal included, so every pair is shown only once.
mask = np.triu(np.ones_like(feature_corr, dtype=bool))

plt.figure(figsize=(15, 10))
sns.heatmap(feature_corr, mask=mask, annot=True, cmap='coolwarm')
plt.xticks(rotation=35, ha='right')
plt.show()

Ignoring the labelled targets, the circularity, rectangularity, and elongatedness features correlate strongly with each other and with the skewness, variance, and scatter/hollows ratio features, but the meaning of the latter, or how they are derived, is unknown.


In [None]:
# Box plots per vehicle class, with the raw points overlaid, to round off the EDA
selected_features = ['compactness', 'circularity', 'elongatedness', 'max.length_rectangularity',
                     'skewness_about', 'scaled_variance', 'scatter_ratio', 'hollows_ratio']

num_rows, num_columns = 2, 4
fig, axes = plt.subplots(num_rows, num_columns, figsize=(20, 15))

# One panel per selected feature: distribution summary plus jittered observations.
for ax, feature in zip(axes.ravel(), selected_features):
    sns.boxplot(data=df, x='class', y=feature, ax=ax)
    sns.stripplot(data=df, x='class', y=feature, ax=ax,
                  jitter=True, marker='o', alpha=0.8, color="red")
    ax.set_title(feature)

fig.tight_layout()
plt.show()

Nothing obviously different here between the 3 target classes, but scatter_ratio and scaled_variance show some more significant clustered patterns for cars and bus types.

## Split and pre-process data for model training and testing

In [None]:
# Separate the features from the target
X = df.drop(columns=['class'])  # Features
y = df['class']  # Target

# Min-max scaled copy of the features.
# The original row index is kept: rows were removed by dropna(), so a fresh
# RangeIndex would no longer line up with `y`.
# NOTE: X_scaled is not consumed by the train/test split below, which works on
# the unscaled `X`. The scaler here is fitted on the full dataset, so X_scaled
# should not be used for training (it would leak test-set statistics).
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X_scaled.head()

In [None]:
# Hold out 20% of the rows for testing; stratify on the target so the class
# proportions are preserved, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Learn the label -> integer mapping from the training targets, then apply it
enc = LabelEncoder().fit(y_train)
y_train_encoded = enc.transform(y_train)
y_train_encoded[:10]

In [None]:
# Encode the test targets with the mapping learned on the training targets.
# Only transform() is used here: refitting the encoder on the test labels could
# silently produce a different label -> integer mapping than the one used for
# training.
y_test_encoded = enc.transform(y_test)
y_test_encoded[:10]

## Try different classification models to find the best results

In [None]:
# Candidate classifiers, all seeded where the estimator supports it.
# One-vs-rest logistic regression is expressed with OneVsRestClassifier: the
# `multi_class` argument of LogisticRegression is deprecated in recent
# scikit-learn releases, and wrapping the estimator is the documented
# replacement for multi_class='ovr'.
lr = OneVsRestClassifier(LogisticRegression(max_iter=10000, random_state=42))
knn = KNeighborsClassifier()
svm = SVC(random_state=42)
dt = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42)
ada = AdaBoostClassifier(random_state=42)

In [None]:
# Wrap every estimator in a pipeline that min-max scales the features first.
# Putting the scaler inside the pipeline means it is fitted on the training
# rows only (no leakage from the test set) and is applied automatically at
# predict/score time. Each pipeline gets its own scaler instance.
lr_model = make_pipeline(MinMaxScaler(), lr)
knn_model = make_pipeline(MinMaxScaler(), knn)
svm_model = make_pipeline(MinMaxScaler(), svm)
dtc_model = make_pipeline(MinMaxScaler(), dt)
rfc_model = make_pipeline(MinMaxScaler(), rfc)
abc_model = make_pipeline(MinMaxScaler(), ada)

In [None]:
# Fit every candidate pipeline on the training features and encoded targets
candidate_models = [lr_model, knn_model, svm_model, dtc_model, rfc_model, abc_model]
for candidate in candidate_models:
    candidate.fit(X_train, y_train_encoded)

# Show the last fitted pipeline (AdaBoost), as the cell did before
candidate

In [None]:
# quick look at feature importances - highest are scaled_variance.1 and elongatedness
# The forest fitted in the previous cell is reused. Refitting it here on the
# string labels would make the following score(..., y_train_encoded) cell
# compare string predictions with integer targets.
rf_importances = pd.Series(
    rfc_model.named_steps['randomforestclassifier'].feature_importances_,
    index=X_train.columns,
).sort_values(ascending=False)  # label by feature name, most important first
rf_importances

In [None]:
# Training-set accuracy of every fitted pipeline, printed in a fixed order
fitted_by_label = {
    "Logistic Regression": lr_model,
    "KNN": knn_model,
    "SVM": svm_model,
    "Decision Tree": dtc_model,
    "Random Forest": rfc_model,
    "AdaBoost": abc_model,
}
for label, fitted in fitted_by_label.items():
    print(f"{label}: {fitted.score(X_train, y_train_encoded)}")

DecisionTreeClassifier and RandomForestClassifier have the highest accuracy scores, both with 100%. Danger of overfitting, so we will test all the models for evaluation. Logistic Regression also performs very well.

## Evaluate the models

In [None]:
names = ['Logistic Regression', 'KNN', 'SVM', 'Decision Tree', 'Random Forest', 'AdaBoost']
classifiers = [lr_model, knn_model, svm_model, dtc_model, rfc_model, abc_model]

# Refit each pipeline on the original string labels and record its test
# accuracy. The string labels are used on purpose: the cells further down
# compare model predictions directly against y_test.
scores = []
for clf in classifiers:
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))

# Build the results table in one go and shade it by score
df_models = pd.DataFrame({'name': names, 'score': scores})
cm = sns.light_palette("brown", as_cmap=True)
s = df_models.style.background_gradient(cmap=cm)

# Display the styled table (previously it was built but never shown)
s

Both tree based models still performed well on the test data, but Logistic Regression showed the highest accuracy and smallest drop for the test set.
Random Forest had the second highest accuracy score of 0.95, while Decision Tree dropped to 0.9.
The other models had scores between 0.68 and 0.87.

In [None]:
# Per-class precision / recall / F1 on the test set for the three best models
report_targets = (
    ("Logistic Regression", lr_model),
    ("Random Forrest Classification", rfc_model),
    ("Decision Tree Classification", dtc_model),
)
for heading, fitted in report_targets:
    report = classification_report(y_test, fitted.predict(X_test))
    print(f"{heading}:\n{report}")

Logistic Regression has the highest accuracy overall, including the average precision, recall, and F1 scores across the vehicle classes, whereas Random Forest performs a bit worse on classifying vans, and Decision Tree struggles comparatively more with cars.

In [None]:
# Test accuracy of the random forest.
# accuracy_score expects (y_true, y_pred) in that order. The value is the same
# either way for plain accuracy, but the conventional order avoids mistakes
# if this is ever swapped for an asymmetric metric.
y_pred = rfc_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Predict on both splits with an already-fitted model and print confusion matrices.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_train, y_train : training features and true targets
    X_test, y_test : test features and true targets

    Returns
    -------
    tuple
        ``(train_pred, test_pred, train_conf_matrix, test_conf_matrix)``.
        The targets must use the same label type as the model's predictions;
        no label encoding is applied here.
    """
    splits = (("Train", X_train, y_train), ("Test", X_test, y_test))

    predictions = {}
    matrices = {}
    for split_name, features, targets in splits:
        predictions[split_name] = model.predict(features)
        matrices[split_name] = confusion_matrix(targets, predictions[split_name])
        print(f"\n{split_name} Confusion Matrix:")
        print(matrices[split_name])

    return predictions["Train"], predictions["Test"], matrices["Train"], matrices["Test"]


In [None]:
# Predictions and confusion matrices of the logistic-regression pipeline
lr_evaluation = evaluate_model(lr_model, X_train, y_train, X_test, y_test)
train_pred_lr, test_pred_lr, train_conf_matrix_lr, test_conf_matrix_lr = lr_evaluation

In [None]:
# Normalise each confusion matrix row-wise, so every row (true class) sums to 1
# and the diagonal reads as per-class recall. Global min-max scaling maps the
# smallest cell to 0 and the largest to 1, which has no direct interpretation
# and divides by zero when all cells are equal.
train_row_totals = train_conf_matrix_lr.sum(axis=1, keepdims=True)
test_row_totals = test_conf_matrix_lr.sum(axis=1, keepdims=True)

# Guard against classes with no samples (row total 0) to avoid NaNs
train_conf_matrix_scaled = train_conf_matrix_lr / np.where(train_row_totals == 0, 1, train_row_totals)
test_conf_matrix_scaled = test_conf_matrix_lr / np.where(test_row_totals == 0, 1, test_row_totals)


In [None]:
# Plot the heatmaps of the confusion matrices

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Tick labels come from the fitted model so the axes show class names instead
# of bare indices. This assumes every class occurs in both splits, so that the
# matrix rows/columns follow the same sorted order as lr_model.classes_.
class_labels = lr_model.classes_

panels = (
    ("Train Confusion Matrix (Scaled)", train_conf_matrix_scaled),
    ("Test Confusion Matrix (Scaled)", test_conf_matrix_scaled),
)
for ax, (panel_title, matrix) in zip(axes, panels):
    sns.heatmap(matrix, annot=True, fmt=".2f", cmap="Blues", ax=ax,
                xticklabels=class_labels, yticklabels=class_labels)
    ax.set_title(panel_title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

plt.tight_layout()
plt.show()

Something seems wrong here.

In [None]:
# Class-probability estimates of the logistic-regression pipeline on both splits
train_pred_proba = lr_model.predict_proba(X_train)
test_pred_proba = lr_model.predict_proba(X_test)

# One-vs-rest ROC AUC, scored against the integer-encoded targets
train_roc_auc = roc_auc_score(y_train_encoded, train_pred_proba, multi_class='ovr')
test_roc_auc = roc_auc_score(y_test_encoded, test_pred_proba, multi_class='ovr')

print("\nTrain ROC AUC Score:", train_roc_auc)
print("Test ROC AUC Score:", test_roc_auc)

In [None]:
# One ROC curve per class, each treated as "this class vs. the rest"
n_classes = test_pred_proba.shape[1]
fpr, tpr, thresholds = {}, {}, {}

for class_idx in range(n_classes):
    # Binary ground truth for the current class, scored with that class's probability column
    is_current_class = (y_test_encoded == class_idx).astype(int)
    class_scores = test_pred_proba[:, class_idx]
    fpr[class_idx], tpr[class_idx], thresholds[class_idx] = roc_curve(is_current_class, class_scores)

fig, axes = plt.subplots(1, n_classes, figsize=(12, 4))

for class_idx, ax in enumerate(axes):
    ax.plot(fpr[class_idx], tpr[class_idx], label=f"Class {class_idx} ROC curve")
    ax.plot([0, 1], [0, 1], 'k--', label='Random guess')  # chance diagonal
    ax.set(
        xlabel="False Positive Rate",
        ylabel="True Positive Rate",
        title=f"ROC Curve for Class {class_idx} - {enc.classes_[class_idx]}",
    )
    ax.legend(loc="best")

plt.tight_layout()
plt.show()


## Decision made:

We are happy with this result: Logistic Regression should be a very good and simple model for the customer to use for car classification.

## Some hyperparameter tuning for Random Forest Classifier & other visualisations

In [None]:
# 5-fold grid search over forest size and maximum tree depth, scored on accuracy
param_grid = {'n_estimators': [50, 100, 150, 200],
              'max_depth': [10, 20, 30, 40, 50]}
grid = GridSearchCV(estimator=rfc,
                    param_grid=param_grid,
                    scoring='accuracy',
                    cv=5)

grid.fit(X_train, y_train)  # Fit the model

# Only the last expression of a cell is displayed, so the best parameters and
# score are printed explicitly instead of being evaluated and discarded.
print(f"Best parameters: {grid.best_params_}")
print(f"Best cross-validated accuracy: {grid.best_score_:.3f}")
grid.best_estimator_  # Get the best estimator


In [None]:
# Second search: number of features tried per split vs. number of trees
max_features_range = np.arange(1, 11, 1)
n_estimators_range = np.arange(10, 100, 10)
param_grid = dict(max_features=max_features_range, n_estimators=n_estimators_range)

# Seed the forest like the other estimators in this notebook; without a seed
# the scores (and therefore the contour plots built from them) change on every
# run.
rf = RandomForestClassifier(random_state=42)

# Accuracy is the default scorer for classifiers; it is stated explicitly to
# match the previous search.
grid = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=5)

grid.fit(X_train, y_train_encoded)

In [None]:
# Report the winning parameter combination and its mean cross-validated score
best_params, best_score = grid.best_params_, grid.best_score_
print(f"The best parameters are {best_params}\nwith a score of {best_score:.2f}")


Did not improve results a whole lot with the data currently at hand.

### Contour Plots for GridSearchCV

In [None]:
# plot contour plots for the grid search output anyway
# One row per parameter combination, with its mean cross-validated score
# placed alongside under the column name "Accuracy".
tried_params = pd.DataFrame(grid.cv_results_["params"])
mean_scores = pd.DataFrame(grid.cv_results_["mean_test_score"], columns=["Accuracy"])
grid_results = pd.concat([tried_params, mean_scores], axis=1)
grid_results.head()

In [None]:
# Average the scores for every (max_features, n_estimators) pair
grouping_keys = ['max_features', 'n_estimators']
grid_contour = grid_results.groupby(grouping_keys).agg('mean')
grid_contour

In [None]:
# Turn the group keys back into ordinary columns so the frame can be pivoted
grid_reset = grid_contour.reset_index()
grid_reset

In [None]:
# Reshape to a 2-D table: rows are max_features, columns are n_estimators,
# cells hold the Accuracy value
grid_pivot = grid_reset.pivot(index='max_features', columns='n_estimators', values='Accuracy')
grid_pivot

In [None]:
# Axis values and score surface for the plots below:
# x = n_estimators (pivot columns), y = max_features (pivot index), z = score table.
# NOTE: this rebinds `y`, which earlier held the target Series; rerun the
# feature/target cell before rerunning the train/test split.
x, y, z = grid_pivot.columns.values, grid_pivot.index.values, grid_pivot.values

In [None]:
# 2D contour plot of accuracy
# Axis titles are declared with plain dicts; `layout` is reused by the 3D plot.
layout = go.Layout(
    xaxis=dict(title=dict(text='n_estimators')),
    yaxis=dict(title=dict(text='max_features')),
)

contour_trace = go.Contour(x=x, y=y, z=z)
fig = go.Figure(data=[contour_trace], layout=layout)

# Fixed-size square figure; returning the figure as the last expression renders it
fig.update_layout(
    title='Hyperparameter tuning',
    autosize=False,
    width=500,
    height=500,
    margin=dict(l=65, r=50, b=65, t=90),
)

In [None]:
# 3D contour plot of accuracy
# Same grid as the 2D plot, drawn as a surface with the score on the z axis.
surface_trace = go.Surface(x=x, y=y, z=z)
scene_titles = dict(
    xaxis_title='n_estimators',
    yaxis_title='max_features',
    zaxis_title='Accuracy',
)

fig = go.Figure(data=[surface_trace], layout=layout)
fig.update_layout(
    title='Hyperparameter tuning',
    scene=scene_titles,
    autosize=False,
    width=800,
    height=800,
    margin=dict(l=65, r=50, b=65, t=90),
)
fig.show()

### Visualise Decision Tree

In [None]:
# Visualize the decision tree
# plot_tree expects class_names in ascending order of the classes.
# y_train.unique() returns labels in order of first appearance, which can
# attach the wrong name to each node, so the labels are sorted first. Feature
# names are passed as a plain list for compatibility across scikit-learn
# versions.
plt.figure(figsize=(10, 6))
tree.plot_tree(
    dtc_model.named_steps['decisiontreeclassifier'],
    feature_names=list(X_train.columns),
    class_names=[str(el) for el in sorted(y_train.unique())],
    filled=True,
)
plt.show()