# Horsepower Prediction

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

## 1. Loading the Dataset

Dataset: [Fuel Consumption Based on HP - Linear Regression](https://www.kaggle.com/datasets/ohiedulhaquemdasad/fuel-consumption-based-on-hp-linear-regression/data)

This dataset contains:
- **Horse Power**: Engine horsepower (feature)
- **Fuel Economy (MPG)**: Miles per gallon (target variable)

We'll predict Fuel Economy based on Horse Power using linear regression.

In [51]:
# Load the dataset
df = pd.read_csv('FuelEconomy.csv')

# Display first few rows
df.head()

Unnamed: 0,Horse Power,Fuel Economy (MPG)
0,118.770799,29.344195
1,176.326567,24.695934
2,219.262465,23.95201
3,187.310009,23.384546
4,218.59434,23.426739


In [52]:
# Basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nColumn Names:")
print(df.columns.tolist())
print("\nDataset Info:")
df.info()
print("\nBasic Statistics:")
df.describe()

Dataset Shape: (100, 2)

Column Names:
['Horse Power', 'Fuel Economy (MPG)']

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Horse Power         100 non-null    float64
 1   Fuel Economy (MPG)  100 non-null    float64
dtypes: float64(2)
memory usage: 1.7 KB

Basic Statistics:


Unnamed: 0,Horse Power,Fuel Economy (MPG)
count,100.0,100.0
mean,213.67619,23.178501
std,62.061726,4.701666
min,50.0,10.0
25%,174.996514,20.439516
50%,218.928402,23.143192
75%,251.706476,26.089933
max,350.0,35.0


In [53]:
# Check for missing values
print("Missing Values:")
missing_counts = df.isnull().sum()
print(missing_counts)
print(f"\nTotal missing values: {missing_counts.sum()}")
print("\nPercentage of missing values:")
print((missing_counts / len(df) * 100).round(2))

# Visualize missing values
if missing_counts.sum() > 0:
    plt.figure(figsize=(8, 4))
    missing_counts.plot(kind='bar')
    plt.title('Missing Values by Column')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("\n✓ No missing values found in the dataset!")

print("\nDuplicate Rows:", df.duplicated().sum())

Missing Values:
Horse Power           0
Fuel Economy (MPG)    0
dtype: int64

Total missing values: 0

Percentage of missing values:
Horse Power           0.0
Fuel Economy (MPG)    0.0
dtype: float64

✓ No missing values found in the dataset!

Duplicate Rows: 0


## 2. Handling Missing Values

Even though our current dataset has no missing values, it's important to understand different strategies for handling missing data in real-world scenarios. Let's explore various approaches:

### Strategy 1: Deletion Methods

**When to use:**
- Missing data is < 5% of the dataset
- Missingness is completely at random (MCAR)
- Dataset is large enough to handle loss of rows

In [54]:
# Example: Deletion strategies (commented out since we have no missing values)

# 1. Listwise deletion (drop rows with ANY missing value)
# df_clean = df.dropna()

# 2. Pairwise deletion (drop rows only if BOTH target and feature are missing)
# df_clean = df.dropna(subset=['Horse Power', 'Fuel Economy (MPG)'])

# 3. Drop rows where target variable is missing (critical for supervised learning)
# df_clean = df.dropna(subset=['Fuel Economy (MPG)'])

# 4. Drop columns with too many missing values (e.g., > 50%)
# threshold = len(df) * 0.5
# df_clean = df.dropna(axis=1, thresh=threshold)

print("Deletion methods:")
print("- Listwise deletion: df.dropna() - removes rows with ANY missing value")
print("- Target-specific deletion: df.dropna(subset=['target_column']) - removes rows where target is missing")
print("- Column deletion: df.dropna(axis=1, thresh=threshold) - removes columns with too many missing values")

Deletion methods:
- Listwise deletion: df.dropna() - removes rows with ANY missing value
- Target-specific deletion: df.dropna(subset=['target_column']) - removes rows where target is missing
- Column deletion: df.dropna(axis=1, thresh=threshold) - removes columns with too many missing values


### Strategy 2: Imputation Methods

**When to use:**
- Missing data is > 5% of the dataset
- We want to preserve all observations
- Missingness is not completely random but can be handled appropriately

In [55]:
# Example: Imputation strategies

# Create a copy for demonstration (we'll create synthetic missing values)
df_demo = df.copy()

# Simulate missing values for demonstration (5% random missing)
np.random.seed(42)
missing_indices_hp = np.random.choice(df_demo.index, size=int(len(df_demo) * 0.05), replace=False)
missing_indices_mpg = np.random.choice(df_demo.index, size=int(len(df_demo) * 0.05), replace=False)

df_with_missing = df_demo.copy()
df_with_missing.loc[missing_indices_hp, 'Horse Power'] = np.nan
df_with_missing.loc[missing_indices_mpg, 'Fuel Economy (MPG)'] = np.nan

print("Original dataset missing values:", df_demo.isnull().sum().sum())
print("Simulated missing values:")
print(df_with_missing.isnull().sum())
print(f"\nTotal missing: {df_with_missing.isnull().sum().sum()} values")

Original dataset missing values: 0
Simulated missing values:
Horse Power           5
Fuel Economy (MPG)    5
dtype: int64

Total missing: 10 values


In [56]:
# 1. Mean/Median Imputation (for numerical features)
from sklearn.impute import SimpleImputer

# Mean imputation
mean_imputer = SimpleImputer(strategy='mean')
df_mean_imputed = df_with_missing.copy()
df_mean_imputed['Horse Power'] = mean_imputer.fit_transform(df_with_missing[['Horse Power']])

# Median imputation (more robust to outliers)
median_imputer = SimpleImputer(strategy='median')
df_median_imputed = df_with_missing.copy()
df_median_imputed['Horse Power'] = median_imputer.fit_transform(df_with_missing[['Horse Power']])

print("Mean imputation:")
print(f"  Mean of Horse Power: {df_demo['Horse Power'].mean():.2f}")
print(f"  Imputed values: {df_mean_imputed.loc[missing_indices_hp, 'Horse Power'].values[:3]}")

print("\nMedian imputation:")
print(f"  Median of Horse Power: {df_demo['Horse Power'].median():.2f}")
print(f"  Imputed values: {df_median_imputed.loc[missing_indices_hp, 'Horse Power'].values[:3]}")

Mean imputation:
  Mean of Horse Power: 213.68
  Imputed values: [213.69656262 213.69656262 213.69656262]

Median imputation:
  Median of Horse Power: 218.93
  Imputed values: [218.107081 218.107081 218.107081]


## 3. Train-Test Split

In [57]:
# Prepare features and target
X = df[['Horse Power']]  # Features
y = df['Fuel Economy (MPG)']  # Target variable

# 70/30 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Training set size: {X_train.shape[0]} ({X_train.shape[0]/len(df)*100:.1f}%)")
print(f"Test set size: {X_test.shape[0]} ({X_test.shape[0]/len(df)*100:.1f}%)")

Training set size: 70 (70.0%)
Test set size: 30 (30.0%)


## 4. Model Training

We'll train 4 different regression models to predict Fuel Economy from Horse Power:

### Model 1: Linear Regression

**Equation:** y = β₀ + β₁x  
**Description:** Fits a straight line to the data. The relationship between Horse Power and Fuel Economy is modeled as a linear function with one coefficient for the feature and an intercept term.

In [58]:
# Model 1: Linear Regression
model_linear = LinearRegression()
model_linear.fit(X_train, y_train)

# Predictions
y_pred_linear_train = model_linear.predict(X_train)
y_pred_linear_test = model_linear.predict(X_test)

### Model 2: Polynomial Regression (Degree 2)

**Equation:** y = β₀ + β₁x + β₂x²  
**Description:** Fits a quadratic curve to the data. This allows for a curved relationship where the effect of Horse Power on Fuel Economy can change direction, capturing non-linear patterns with a single bend.

In [59]:
# Model 2: Polynomial Regression (Degree 2)
poly_features_2 = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly2 = poly_features_2.fit_transform(X_train)
X_test_poly2 = poly_features_2.transform(X_test)

model_poly2 = LinearRegression()
model_poly2.fit(X_train_poly2, y_train)

# Predictions
y_pred_poly2_train = model_poly2.predict(X_train_poly2)
y_pred_poly2_test = model_poly2.predict(X_test_poly2)

### Model 3: Polynomial Regression (Degree 3)

**Equation:** y = β₀ + β₁x + β₂x² + β₃x³  
**Description:** Fits a cubic curve with two bends, allowing for more complex non-linear relationships. This can capture S-shaped patterns and more intricate variations in how Horse Power affects Fuel Economy.

In [60]:
# Model 3: Polynomial Regression (Degree 3)
poly_features_3 = PolynomialFeatures(degree=3, include_bias=False)
X_train_poly3 = poly_features_3.fit_transform(X_train)
X_test_poly3 = poly_features_3.transform(X_test)

model_poly3 = LinearRegression()
model_poly3.fit(X_train_poly3, y_train)

# Predictions
y_pred_poly3_train = model_poly3.predict(X_train_poly3)
y_pred_poly3_test = model_poly3.predict(X_test_poly3)

### Model 4: Polynomial Regression (Degree 4)

**Equation:** y = β₀ + β₁x + β₂x² + β₃x³ + β₄x⁴  
**Description:** Fits a quartic curve with three bends, providing maximum flexibility to capture highly complex non-linear patterns. This model can fit very intricate relationships but may be prone to overfitting with limited data.

In [61]:
# Model 4: Polynomial Regression (Degree 4)
poly_features_4 = PolynomialFeatures(degree=4, include_bias=False)
X_train_poly4 = poly_features_4.fit_transform(X_train)
X_test_poly4 = poly_features_4.transform(X_test)

model_poly4 = LinearRegression()
model_poly4.fit(X_train_poly4, y_train)

# Predictions
y_pred_poly4_train = model_poly4.predict(X_train_poly4)
y_pred_poly4_test = model_poly4.predict(X_test_poly4)

## 5. Evaluation Metrics

We'll evaluate each model using three key metrics: Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (R²).

### 5.1 Metric Descriptions

**Mean Squared Error (MSE):**
- **Formula:** MSE = (1/n) × Σ(yᵢ - ŷᵢ)²
- **Description:** Measures the average squared difference between predicted and actual values. It penalizes larger errors more heavily due to squaring. Lower values indicate better model performance. Units are squared (e.g., MPG²).

**Mean Absolute Error (MAE):**
- **Formula:** MAE = (1/n) × Σ|yᵢ - ŷᵢ|
- **Description:** Measures the average absolute difference between predicted and actual values. It treats all errors equally and is more interpretable than MSE since it's in the same units as the target variable (e.g., MPG). Lower values indicate better performance.

**R-squared (R²):**
- **Formula:** R² = 1 - (SS_res / SS_tot) where SS_res = Σ(yᵢ - ŷᵢ)² and SS_tot = Σ(yᵢ - ȳ)²
- **Description:** Measures the proportion of variance in the target variable explained by the model. Values range from -∞ to 1, where 1 indicates perfect predictions, 0 means the model performs as well as predicting the mean, and negative values indicate worse performance than the mean. Higher values indicate better model fit.

### 5.2 Practical Interpretation

**In the context of predicting Fuel Economy (MPG) from Horse Power:**

**Mean Absolute Error (MAE):**
- **What it means:** On average, how many MPG off is our prediction?
- **Example:** If MAE = 1.2 MPG, this means our model's predictions are typically off by about 1.2 miles per gallon. If a car actually gets 25 MPG, we might predict anywhere from 23.8 to 26.2 MPG on average.
- **Why it's useful:** Easy to understand - it's in the same units as what we're predicting. A car buyer can easily grasp "the model is usually within 1-2 MPG of the actual fuel economy."

**Mean Squared Error (MSE):**
- **What it means:** How much do large prediction errors hurt our model?
- **Example:** If MSE = 2.5 MPG², this tells us about the magnitude of errors, but it's harder to interpret directly. However, if we have two models and one has MSE = 2.0 and another has MSE = 3.0, the first is better.
- **Why it's useful:** Penalizes big mistakes more than small ones. If you're building a model for car manufacturers, you care more about avoiding predictions that are way off (like predicting 30 MPG when it's actually 20 MPG) than small errors. MSE helps identify models that avoid these large errors.

**R-squared (R²):**
- **What it means:** What percentage of the variation in fuel economy can we explain using horsepower?
- **Example:** If R² = 0.91 (or 91%), this means horsepower explains 91% of why different cars have different fuel economies. The remaining 9% is due to other factors (weight, aerodynamics, driving conditions, etc.).
- **Why it's useful:** Gives a sense of how well the model captures the relationship. An R² of 0.91 is excellent - it means horsepower is a very strong predictor. If R² was 0.50, it would mean horsepower only explains half the variation, suggesting other factors are equally important.

In [62]:
# Calculate evaluation metrics for all models on test set
models = {
    'Linear Regression': {
        'y_pred_train': y_pred_linear_train,
        'y_pred_test': y_pred_linear_test
    },
    'Polynomial (Degree 2)': {
        'y_pred_train': y_pred_poly2_train,
        'y_pred_test': y_pred_poly2_test
    },
    'Polynomial (Degree 3)': {
        'y_pred_train': y_pred_poly3_train,
        'y_pred_test': y_pred_poly3_test
    },
    'Polynomial (Degree 4)': {
        'y_pred_train': y_pred_poly4_train,
        'y_pred_test': y_pred_poly4_test
    }
}

# Create results dictionary
results = []

for model_name, predictions in models.items():
    # Training metrics
    mse_train = mean_squared_error(y_train, predictions['y_pred_train'])
    mae_train = mean_absolute_error(y_train, predictions['y_pred_train'])
    r2_train = r2_score(y_train, predictions['y_pred_train'])

    # Test metrics
    mse_test = mean_squared_error(y_test, predictions['y_pred_test'])
    mae_test = mean_absolute_error(y_test, predictions['y_pred_test'])
    r2_test = r2_score(y_test, predictions['y_pred_test'])

    results.append({
        'Model': model_name,
        'MSE (Train)': round(mse_train, 4),
        'MAE (Train)': round(mae_train, 4),
        'R² (Train)': round(r2_train, 4),
        'MSE (Test)': round(mse_test, 4),
        'MAE (Test)': round(mae_test, 4),
        'R² (Test)': round(r2_test, 4)
    })

# Create DataFrame and display as table
results_df = pd.DataFrame(results)
print("Evaluation Metrics for All Models:\n")
print(results_df.to_string(index=False))

Evaluation Metrics for All Models:

                Model  MSE (Train)  MAE (Train)  R² (Train)  MSE (Test)  MAE (Test)  R² (Test)
    Linear Regression       2.1157       1.2100      0.9063      1.6749      1.0313     0.9133
Polynomial (Degree 2)       2.1151       1.2103      0.9063      1.6570      1.0254     0.9142
Polynomial (Degree 3)       2.0606       1.2115      0.9088      1.9037      1.0872     0.9015
Polynomial (Degree 4)       1.9177       1.1683      0.9151      2.5485      1.2034     0.8681


In [63]:
# Display formatted table
results_df

Unnamed: 0,Model,MSE (Train),MAE (Train),R² (Train),MSE (Test),MAE (Test),R² (Test)
0,Linear Regression,2.1157,1.21,0.9063,1.6749,1.0313,0.9133
1,Polynomial (Degree 2),2.1151,1.2103,0.9063,1.657,1.0254,0.9142
2,Polynomial (Degree 3),2.0606,1.2115,0.9088,1.9037,1.0872,0.9015
3,Polynomial (Degree 4),1.9177,1.1683,0.9151,2.5485,1.2034,0.8681


## Handling Missing Values

Even though our current dataset has no missing values, it's important to understand different strategies for handling missing data in real-world scenarios. Let's explore various approaches:

### Strategy 1: Deletion Methods

**When to use:**
- Missing data is < 5% of the dataset
- Missingness is completely at random (MCAR)
- Dataset is large enough to handle loss of rows

In [64]:
# Example: Deletion strategies (commented out since we have no missing values)

# 1. Listwise deletion (drop rows with ANY missing value)
# df_clean = df.dropna()

# 2. Pairwise deletion (drop rows only if BOTH target and feature are missing)
# df_clean = df.dropna(subset=['Horse Power', 'Fuel Economy (MPG)'])

# 3. Drop rows where target variable is missing (critical for supervised learning)
# df_clean = df.dropna(subset=['Fuel Economy (MPG)'])

# 4. Drop columns with too many missing values (e.g., > 50%)
# threshold = len(df) * 0.5
# df_clean = df.dropna(axis=1, thresh=threshold)

print("Deletion methods:")
print("- Listwise deletion: df.dropna() - removes rows with ANY missing value")
print("- Target-specific deletion: df.dropna(subset=['target_column']) - removes rows where target is missing")
print("- Column deletion: df.dropna(axis=1, thresh=threshold) - removes columns with too many missing values")

Deletion methods:
- Listwise deletion: df.dropna() - removes rows with ANY missing value
- Target-specific deletion: df.dropna(subset=['target_column']) - removes rows where target is missing
- Column deletion: df.dropna(axis=1, thresh=threshold) - removes columns with too many missing values


### Strategy 2: Imputation Methods

**When to use:**
- Missing data is > 5% of the dataset
- We want to preserve all observations
- Missingness is not completely random but can be handled appropriately

In [65]:
# Example: Imputation strategies

# Create a copy for demonstration (we'll create synthetic missing values)
df_demo = df.copy()

# Simulate missing values for demonstration (5% random missing)
np.random.seed(42)
missing_indices_hp = np.random.choice(df_demo.index, size=int(len(df_demo) * 0.05), replace=False)
missing_indices_mpg = np.random.choice(df_demo.index, size=int(len(df_demo) * 0.05), replace=False)

df_with_missing = df_demo.copy()
df_with_missing.loc[missing_indices_hp, 'Horse Power'] = np.nan
df_with_missing.loc[missing_indices_mpg, 'Fuel Economy (MPG)'] = np.nan

print("Original dataset missing values:", df_demo.isnull().sum().sum())
print("Simulated missing values:")
print(df_with_missing.isnull().sum())
print(f"\nTotal missing: {df_with_missing.isnull().sum().sum()} values")

Original dataset missing values: 0
Simulated missing values:
Horse Power           5
Fuel Economy (MPG)    5
dtype: int64

Total missing: 10 values


In [66]:
# 1. Mean/Median Imputation (for numerical features)
from sklearn.impute import SimpleImputer

# Mean imputation
mean_imputer = SimpleImputer(strategy='mean')
df_mean_imputed = df_with_missing.copy()
df_mean_imputed['Horse Power'] = mean_imputer.fit_transform(df_with_missing[['Horse Power']])

# Median imputation (more robust to outliers)
median_imputer = SimpleImputer(strategy='median')
df_median_imputed = df_with_missing.copy()
df_median_imputed['Horse Power'] = median_imputer.fit_transform(df_with_missing[['Horse Power']])

print("Mean imputation:")
print(f"  Mean of Horse Power: {df_demo['Horse Power'].mean():.2f}")
print(f"  Imputed values: {df_mean_imputed.loc[missing_indices_hp, 'Horse Power'].values[:3]}")

print("\nMedian imputation:")
print(f"  Median of Horse Power: {df_demo['Horse Power'].median():.2f}")
print(f"  Imputed values: {df_median_imputed.loc[missing_indices_hp, 'Horse Power'].values[:3]}")

Mean imputation:
  Mean of Horse Power: 213.68
  Imputed values: [213.69656262 213.69656262 213.69656262]

Median imputation:
  Median of Horse Power: 218.93
  Imputed values: [218.107081 218.107081 218.107081]
