# Tire Degradation Prediction Model
This notebook loads a simulated motorsport tire wear dataset, performs exploratory data analysis (EDA), engineers features, and compares multiple regression models to predict tire degradation.

## 1. Imports & Configuration

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

# Plotting style: seaborn's whitegrid theme with the muted colour palette,
# applied globally to every figure created below.
sns.set_theme(style='whitegrid', palette='muted')
# Seed passed to the train/test split and to the seeded estimators so that
# runs are reproducible.
RANDOM_STATE = 42
# Name of the regression target column (after the rename in the load step).
TARGET = 'Tire_Degradation'

## 2. Load Data

In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Download (or reuse the cached copy of) the dataset and load the CSV
# straight into a pandas DataFrame.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "samwelnjehia/simple-tire-wear-and-degradation-simulated-dataset",
    "simulated_dataset.csv",
)

# Standardise the column name (typo in original dataset)
df = df.rename(columns={'Tire degreadation': TARGET})

# rename() silently ignores keys that are not present, so if the upstream
# column name ever changes the failure would only surface much later as an
# opaque KeyError. Fail fast here with an actionable message instead.
if TARGET not in df.columns:
    raise KeyError(
        f"Target column {TARGET!r} not found after renaming; "
        f"available columns: {list(df.columns)}"
    )

print(f"Shape: {df.shape}")
df.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Structural overview: dtypes, non-null counts and memory usage.
print("=== Dataset Info ===")
df.info()

print("\n=== Missing Values ===")
# Scan the frame for nulls once and reuse the per-column counts, rather than
# recomputing the full null mask for both the values and the filter.
missing_counts = df.isnull().sum()
missing_counts = missing_counts[missing_counts > 0]
if missing_counts.empty:
    # An empty Series prints as "Series([], dtype: int64)", which is easy to
    # misread; state the result explicitly.
    print("No missing values")
else:
    print(missing_counts)

# Distinct levels of each categorical column.
print("\n=== Categorical Columns ===")
for col in ['Motorsport_Type', 'Team', 'Event', 'Driving_Style']:
    print(f"  {col}: {df[col].unique()}")

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap on a fixed [-1, 1] colour scale.
numeric_df = df.select_dtypes(include='number')
corr = numeric_df.corr()

heatmap_options = dict(annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1)

fig, ax = plt.subplots(figsize=(15, 13))
sns.heatmap(corr, ax=ax, **heatmap_options)
ax.set_title('Correlation Matrix', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Key scatter plots against target
numeric_features = ['Tire_wear', 'Humidity', 'Ambient_Temperature', 'Lap', 'Surface_Roughness']

# Size the grid from the feature list so that editing the list cannot
# overflow the axes array or hide the wrong panel.
n_cols = 3
n_rows = max(1, -(-len(numeric_features) // n_cols))  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4.5 * n_rows))
axes = axes.flatten()

for ax, feat in zip(axes, numeric_features):
    ax.scatter(df[feat], df[TARGET], alpha=0.3, s=10)
    ax.set_xlabel(feat)
    ax.set_ylabel(TARGET)
    ax.set_title(f'{feat} vs {TARGET}')

# Hide every subplot that did not receive a feature (not just the last one).
for spare_ax in axes[len(numeric_features):]:
    spare_ax.set_visible(False)

plt.suptitle('Feature vs Target Scatter Plots', fontsize=16, y=1.01)
plt.tight_layout()
plt.show()

In [None]:
# Mean target value per category level, one bar chart per categorical
# feature, with bars ordered from lowest to highest mean.
cat_features = ['Event', 'Driving_Style', 'Track']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, feat in zip(axes, cat_features):
    level_means = df.groupby(feat)[TARGET].mean()
    ordered_means = level_means.sort_values()
    ordered_means.plot(kind='bar', ax=ax)

    ax.set_title(f'Mean {TARGET} by {feat}')
    ax.set_xlabel(feat)
    ax.set_ylabel(f'Mean {TARGET}')
    ax.tick_params(axis='x', rotation=30)

plt.suptitle('Categorical Feature Impact on Tire Degradation', fontsize=16)
plt.tight_layout()
plt.show()

## 4. Feature Engineering & Preprocessing

In [None]:
# Encode categorical features and select model inputs.
# FEATURES names the inputs by their raw dataset columns; 'Event' is swapped
# for its integer-encoded counterpart when building feature_cols below.
FEATURES = ['Tire_wear', 'Humidity', 'Ambient_Temperature', 'Event']

# Fill missing events BEFORE casting to str: astype(str) can turn NaN into the
# literal string 'nan', after which fillna() has nothing left to fill and
# 'nan' would be encoded as if it were a real event.
df['Event'] = df['Event'].fillna('Unknown').astype(str)
df['Event_enc'], event_categories = pd.factorize(df['Event'], sort=True)
print("Event encoding:", dict(enumerate(event_categories)))

# NOTE(review): integer codes impose an arbitrary ordering on a nominal
# category; tree models cope, but for the linear model and the MLP a one-hot
# encoding may be more appropriate — confirm before relying on those scores.
feature_cols = ['Event_enc' if col == 'Event' else col for col in FEATURES]

X = df[feature_cols]
y = df[TARGET]

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Scale features (important for MLP). The scaler is fitted on the training
# split only so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc  = scaler.transform(X_test)

print(f"Train size: {len(X_train):,} | Test size: {len(X_test):,}")

## 5. Model Training & Evaluation

In [None]:
def evaluate_model(name, model, X_tr, X_te, y_tr, y_te):
    """Train ``model`` and score it on the held-out split.

    The model is fitted in place on ``(X_tr, y_tr)`` and then used to predict
    ``X_te``. A one-line summary is printed.

    Returns:
        A ``(metrics, preds)`` tuple. ``metrics`` is a dict with the keys
        'Model', 'R²', 'RMSE' and 'MAE' (scores rounded to 4 decimals);
        ``preds`` is whatever ``model.predict(X_te)`` returned.
    """
    model.fit(X_tr, y_tr)
    y_hat = model.predict(X_te)

    r2 = round(r2_score(y_te, y_hat), 4)
    # RMSE is the square root of the mean squared error.
    rmse = round(np.sqrt(mean_squared_error(y_te, y_hat)), 4)
    mae = round(mean_absolute_error(y_te, y_hat), 4)

    print(f"{name:25s} | R²={r2:.4f}  RMSE={rmse:.4f}  MAE={mae:.4f}")
    return {'Model': name, 'R²': r2, 'RMSE': rmse, 'MAE': mae}, y_hat


# Each entry is (display name, estimator, train inputs, test inputs).
# The linear and tree-based models use the raw features; the MLP uses the
# standardised copies.
raw_inputs = (X_train, X_test)
scaled_inputs = (X_train_sc, X_test_sc)

models = [
    ('Linear Regression', LinearRegression(), *raw_inputs),
    (
        'Decision Tree',
        DecisionTreeRegressor(max_depth=10, random_state=RANDOM_STATE),
        *raw_inputs,
    ),
    (
        'Random Forest',
        RandomForestRegressor(
            n_estimators=100, max_depth=10, random_state=RANDOM_STATE, n_jobs=-1
        ),
        *raw_inputs,
    ),
    (
        'MLP (Neural Net)',
        MLPRegressor(
            hidden_layer_sizes=(64, 32), max_iter=300, random_state=RANDOM_STATE
        ),
        *scaled_inputs,
    ),
]

# results collects one metrics dict per model; predictions maps each model
# name to its test-set predictions.
results, predictions = [], {}

print(f"{'Model':25s} | {'R²':>8}  {'RMSE':>10}  {'MAE':>10}")
print('-' * 60)
for name, model, X_tr, X_te in models:
    scores, test_preds = evaluate_model(name, model, X_tr, X_te, y_train, y_test)
    results.append(scores)
    predictions[name] = test_preds

## 6. Results Comparison

In [None]:
# Tabulate the metrics with one row per model.
results_df = pd.DataFrame(results).set_index('Model')
display(results_df)

# Bar chart comparison: one panel per metric, one bar per model.
metric_names = ['R²', 'RMSE', 'MAE']
bar_colors = sns.color_palette('muted', len(results_df))

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, metric in zip(axes, metric_names):
    metric_values = results_df[metric]
    metric_values.plot(kind='bar', ax=ax, color=bar_colors)
    ax.set_title(metric)
    ax.set_xlabel('')
    ax.tick_params(axis='x', rotation=25)

plt.suptitle('Model Comparison', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Actual vs predicted scatter for each model on a 2x2 grid; the dashed
# diagonal marks where predictions would equal the actual values.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for ax, (name, preds) in zip(axes, predictions.items()):
    ax.scatter(y_test, preds, alpha=0.3, s=10)

    # Span the diagonal across the combined range of actuals and predictions.
    lower = min(y_test.min(), preds.min())
    upper = max(y_test.max(), preds.max())
    ax.plot([lower, upper], [lower, upper], 'r--', linewidth=1.5, label='Perfect fit')

    ax.set_xlabel('Actual Tire Degradation')
    ax.set_ylabel('Predicted Tire Degradation')
    ax.set_title(name)
    ax.legend()

plt.suptitle('Actual vs Predicted — All Models', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Feature importances from the fitted Random Forest, looked up by name in the
# models list and plotted smallest-to-largest as horizontal bars.
rf_model = next(entry[1] for entry in models if entry[0] == 'Random Forest')
raw_importances = pd.Series(rf_model.feature_importances_, index=feature_cols)
importances = raw_importances.sort_values()

bar_colors = sns.color_palette('muted', len(importances))

fig, ax = plt.subplots(figsize=(7, 4))
importances.plot(kind='barh', ax=ax, color=bar_colors)
ax.set_title('Random Forest — Feature Importances')
ax.set_xlabel('Importance')
plt.tight_layout()
plt.show()