
# 🧠 Final Challenge – Machine Learning Engineer Bootcamp
This notebook presents the complete solution for the **Final Challenge** of the *Machine Learning Engineer Bootcamp*, using the `cars.csv` dataset.  
The pipeline follows the **7 key steps of a Machine Learning Engineer**, covering the entire process from exploratory data analysis to supervised modeling.


## 🥇 Step 1 – Understanding the problem and the dataset

In [None]:

import os

import pandas as pd
import numpy as np

# Location of the dataset. The default is the original Colab/Drive path; set the
# CARS_CSV environment variable to load the file from somewhere else without
# editing the notebook.
DATA_PATH = os.environ.get('CARS_CSV', '/content/drive/MyDrive/cars.csv')

# Load dataset
df = pd.read_csv(DATA_PATH)

# Initial exploration: size, column names, first rows, dtypes and summary statistics
print("Dataset shape:", df.shape)
print("\nAvailable columns:\n", df.columns.tolist())
display(df.head())
print("\nGeneral info:")
df.info()
display(df.describe())


## 🧩 Step 2 – Data collection and initial exploration

In [None]:

# Data-quality overview: nulls per column and number of fully duplicated rows
missing_per_column = df.isnull().sum()
print("Missing values per column:")
print(missing_per_column)

duplicate_count = df.duplicated().sum()
print("\nDuplicate rows:", duplicate_count)

# Cardinality of every column (computed in one pass, then reported line by line)
print("\nUnique values per column:")
for col, n_unique in df.nunique().items():
    print(f"{col}: {n_unique} unique values")

# Split the column names by dtype: numeric vs. everything else
numeric_part = df.select_dtypes(include=[np.number])
other_part = df.select_dtypes(exclude=[np.number])
num_cols = list(numeric_part.columns)
cat_cols = list(other_part.columns)

print("\nNumeric columns:", num_cols)
print("Categorical columns:", cat_cols)


## 🧹 Step 3 – Preprocessing (cleaning with `pd.to_numeric` and standardization with `StandardScaler`)

In [None]:

from sklearn.preprocessing import StandardScaler

# Safe numeric conversion: strip commas and surrounding whitespace, then let
# errors='coerce' turn anything that still is not a number into NaN.
for col in ['cubicinches', 'weightlbs']:
    cleaned = df[col].astype(str).str.replace(',', '', regex=False).str.strip()
    converted = pd.to_numeric(cleaned, errors='coerce')
    # Replace missing values with the column mean. The result is assigned back
    # explicitly: `df[col].fillna(..., inplace=True)` is chained assignment,
    # which warns on pandas 2.x and silently does nothing under Copy-on-Write.
    df[col] = converted.fillna(converted.mean())

# Create efficiency column based on mpg: (0, 20] -> Low, (20, 30] -> Medium, > 30 -> High.
# The top edge is open-ended (np.inf) so the bins stay strictly increasing even
# when no car exceeds 30 mpg, and the thresholds do not depend on the data.
df['efficiency'] = pd.cut(df['mpg'], bins=[0, 20, 30, np.inf], labels=['Low', 'Medium', 'High'])

# Normalize numerical features (zero mean, unit variance per column)
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed in Step 7, so test rows influence the scaling.
scaler = StandardScaler()
num_features = ['cylinders', 'cubicinches', 'hp', 'weightlbs', 'time-to-60']

# Keep `df` in original units; the scaled values live in a separate copy
df_scaled = df.copy()
df_scaled[num_features] = scaler.fit_transform(df[num_features])

print("Data types after conversion:\n", df_scaled.dtypes)
display(df_scaled.head())


## 🔍 Step 4 – Exploratory analysis and correlations

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations between mpg and the selected numeric columns
corr_columns = ['mpg', 'cylinders', 'cubicinches', 'hp', 'weightlbs', 'time-to-60']
corr = df_scaled[corr_columns].corr()

# Rank every variable by its correlation with mpg, strongest positive first
mpg_corr_ranked = corr['mpg'].sort_values(ascending=False)
print("Correlation with mpg:\n")
print(mpg_corr_ranked)

# Annotated heatmap with a diverging palette centred on zero
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='RdBu', center=0, ax=ax)
ax.set_title('Correlation Matrix – Numerical Variables')
plt.show()


## 🧭 Step 5 – Dimensionality reduction with PCA

In [None]:

from sklearn.decomposition import PCA

# Project the five scaled numeric features onto their principal components
pca_inputs = ['cylinders', 'cubicinches', 'hp', 'weightlbs', 'time-to-60']
X = df_scaled[pca_inputs]
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X)

# Share of variance captured by each component, and the running total
explained_var = pca.explained_variance_ratio_
cumulative_var = explained_var.cumsum()
print("Explained variance per component:\n", explained_var)
print("\nCumulative explained variance:", cumulative_var)

# Cumulative explained variance against the number of components kept
component_ids = range(1, len(explained_var) + 1)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_ids, cumulative_var, marker='o')
ax.set_xlabel('Principal Component Number')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('PCA – Cumulative Explained Variance')
ax.grid(True)
plt.show()


## 🧮 Step 6 – Clustering with K-Means

In [None]:

from sklearn.cluster import KMeans

# Cluster the vehicles in the plane spanned by the first two principal components
X_pca2 = X_pca[:, :2]
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_pca2)

# Store the assignment next to the scaled features
df_scaled['cluster'] = clusters

# Scatter of the 2-D projection, coloured by cluster id
pc1 = X_pca2[:, 0]
pc2 = X_pca2[:, 1]
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(pc1, pc2, c=clusters, cmap='viridis', s=60)
ax.set_title('Vehicle Clusters (K-Means + PCA)')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.grid(True)
plt.show()

# Number of vehicles in each cluster
cluster_sizes = df_scaled['cluster'].value_counts()
print("Cluster distribution:")
print(cluster_sizes)


## 🧠 Step 7 – Supervised modeling and evaluation

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Features: the scaled numeric columns. Target: the efficiency class derived from mpg.
X = df_scaled[['cylinders', 'cubicinches', 'hp', 'weightlbs', 'time-to-60']]
y = df_scaled['efficiency']

# 70/30 hold-out split; fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)

log_reg = LogisticRegression(random_state=42, max_iter=500)
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

print("=== Decision Tree ===")
print("Accuracy:", accuracy_score(y_test, y_pred_tree))
print(classification_report(y_test, y_pred_tree))

print("\n=== Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))

# Confusion matrices. Without an explicit `labels=` scikit-learn sorts the
# classes alphabetically (High, Low, Medium) and the heatmap shows bare 0/1/2
# ticks, which makes the matrices easy to misread. Fix the order to the natural
# Low -> Medium -> High (the labels assigned when `efficiency` was created) and
# label the ticks and axes.
class_order = ['Low', 'Medium', 'High']
panels = [
    ('Decision Tree', y_pred_tree, 'Blues'),
    ('Logistic Regression', y_pred_log, 'Greens'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (panel_title, panel_pred, panel_cmap) in zip(axes, panels):
    cm = confusion_matrix(y_test, panel_pred, labels=class_order)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap=panel_cmap,
        xticklabels=class_order,
        yticklabels=class_order,
        ax=ax,
    )
    ax.set_title(panel_title)
    # confusion_matrix puts true classes on rows and predicted classes on columns
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
plt.tight_layout()
plt.show()
