# Fuel Economy (MPG) Prediction from Horsepower

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn "whitegrid" style and a (10, 6) default figure size
sns.set_style(style="whitegrid")
plt.rc('figure', figsize=(10, 6))

## Loading the Dataset

Dataset: [Fuel Consumption Based on HP - Linear Regression](https://www.kaggle.com/datasets/ohiedulhaquemdasad/fuel-consumption-based-on-hp-linear-regression/data)

This dataset contains:
- **Horse Power**: Engine horsepower (feature)
- **Fuel Economy (MPG)**: Miles per gallon (target variable)

We'll predict Fuel Economy based on Horse Power using linear regression.

In [28]:
# Read the fuel-economy CSV (relative path) into a DataFrame
df = pd.read_csv(filepath_or_buffer='FuelEconomy.csv')

# Preview the top five rows
df.head(n=5)

Unnamed: 0,Horse Power,Fuel Economy (MPG)
0,118.770799,29.344195
1,176.326567,24.695934
2,219.262465,23.95201
3,187.310009,23.384546
4,218.59434,23.426739


In [29]:
# Quick overview of the dataset: dimensions, column labels,
# dtypes / non-null counts, and summary statistics.
print(f"Dataset Shape: {df.shape}")
print("\nColumn Names:", list(df.columns), sep="\n")

# info() writes its report straight to stdout
print("\nDataset Info:")
df.info()

# describe() is the cell's last expression so it renders as a table
print("\nBasic Statistics:")
df.describe()

Dataset Shape: (100, 2)

Column Names:
['Horse Power', 'Fuel Economy (MPG)']

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Horse Power         100 non-null    float64
 1   Fuel Economy (MPG)  100 non-null    float64
dtypes: float64(2)
memory usage: 1.7 KB

Basic Statistics:


Unnamed: 0,Horse Power,Fuel Economy (MPG)
count,100.0,100.0
mean,213.67619,23.178501
std,62.061726,4.701666
min,50.0,10.0
25%,174.996514,20.439516
50%,218.928402,23.143192
75%,251.706476,26.089933
max,350.0,35.0


In [30]:
# Data-quality check: missing values per column, their share of all rows,
# and the number of fully duplicated rows.
missing_counts = df.isna().sum()
total_missing = missing_counts.sum()
missing_pct = missing_counts.div(len(df)).mul(100).round(2)

print("Missing Values:")
print(missing_counts)
print(f"\nTotal missing values: {total_missing}")
print("\nPercentage of missing values:")
print(missing_pct)

# Only draw the bar chart when there is something to show
if total_missing == 0:
    print("\n✓ No missing values found in the dataset!")
else:
    fig, ax = plt.subplots(figsize=(8, 4))
    missing_counts.plot(kind='bar', ax=ax)
    ax.set_title('Missing Values by Column')
    ax.set_ylabel('Count')
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

print(f"\nDuplicate Rows: {df.duplicated().sum()}")

Missing Values:
Horse Power           0
Fuel Economy (MPG)    0
dtype: int64

Total missing values: 0

Percentage of missing values:
Horse Power           0.0
Fuel Economy (MPG)    0.0
dtype: float64

✓ No missing values found in the dataset!

Duplicate Rows: 0


## Train-Test Split

In [31]:
# Separate the single predictor from the response
X = df[['Horse Power']]        # double brackets keep the feature as a DataFrame
y = df['Fuel Economy (MPG)']   # target as a Series

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Report each partition's size and its share of the full dataset
n_rows = len(df)
for label, subset in (("Training", X_train), ("Test", X_test)):
    print(f"{label} set size: {len(subset)} ({len(subset) / n_rows * 100:.1f}%)")

Training set size: 70 (70.0%)
Test set size: 30 (30.0%)


## Handling Missing Values

Even though our current dataset has no missing values, it's important to understand different strategies for handling missing data in real-world scenarios. Let's explore various approaches:

### Strategy 1: Deletion Methods

**When to use:**
- Missing data is < 5% of the dataset
- Missingness is completely at random (MCAR)
- Dataset is large enough to handle loss of rows

In [32]:
# Example: Deletion strategies (kept commented out because this dataset has no missing values).
# The executable part of this cell only prints a short summary of the dropna() variants.

# 1. Listwise deletion (drop rows with ANY missing value)
# df_clean = df.dropna()

# 2. Subset deletion (drop rows where ANY of the listed columns is missing;
#    pass how='all' to drop only the rows where ALL listed columns are missing).
#    Note: this is not "pairwise deletion" in the statistical sense, which keeps
#    every row and uses whatever values are available for each calculation.
# df_clean = df.dropna(subset=['Horse Power', 'Fuel Economy (MPG)'])

# 3. Drop rows where target variable is missing (critical for supervised learning)
# df_clean = df.dropna(subset=['Fuel Economy (MPG)'])

# 4. Drop columns with too many missing values. `thresh` is the minimum number of
#    NON-missing values a column needs in order to be kept, so the line below
#    drops columns that are more than 50% missing.
# threshold = int(len(df) * 0.5)
# df_clean = df.dropna(axis=1, thresh=threshold)

print("Deletion methods:")
print("- Listwise deletion: df.dropna() - removes rows with ANY missing value")
print("- Target-specific deletion: df.dropna(subset=['target_column']) - removes rows where target is missing")
print("- Column deletion: df.dropna(axis=1, thresh=threshold) - removes columns with too many missing values")

Deletion methods:
- Listwise deletion: df.dropna() - removes rows with ANY missing value
- Target-specific deletion: df.dropna(subset=['target_column']) - removes rows where target is missing
- Column deletion: df.dropna(axis=1, thresh=threshold) - removes columns with too many missing values


### Strategy 2: Imputation Methods

**When to use:**
- Missing data is > 5% of the dataset
- We want to preserve all observations
- Missingness is not completely at random (e.g., it depends on other observed variables), so simply deleting the affected rows could bias the results

In [33]:
# Example: Imputation strategies
# Build a demo frame with artificial gaps so the imputers have something to fill.

# Work on a copy so the loaded data stays untouched
df_demo = df.copy()

# Blank out 5% of each column at randomly chosen rows (seeded, so reruns pick the same rows)
np.random.seed(42)
n_missing = int(len(df_demo) * 0.05)
missing_indices_hp = np.random.choice(df_demo.index, size=n_missing, replace=False)
missing_indices_mpg = np.random.choice(df_demo.index, size=n_missing, replace=False)

df_with_missing = df_demo.copy()
for column, rows in (
    ('Horse Power', missing_indices_hp),
    ('Fuel Economy (MPG)', missing_indices_mpg),
):
    df_with_missing.loc[rows, column] = np.nan

# Confirm the original is still complete and show how many gaps were introduced
simulated_counts = df_with_missing.isna().sum()
print(f"Original dataset missing values: {df_demo.isna().sum().sum()}")
print("Simulated missing values:")
print(simulated_counts)
print(f"\nTotal missing: {simulated_counts.sum()} values")

Original dataset missing values: 0
Simulated missing values:
Horse Power           5
Fuel Economy (MPG)    5
dtype: int64

Total missing: 10 values


In [34]:
# 1. Mean/Median Imputation (for numerical features)
# Each SimpleImputer learns its fill value from the non-missing entries of the
# column it is fitted on (here: 'Horse Power' of df_with_missing) and writes that
# single value into every gap. 'Fuel Economy (MPG)' is left as-is in both copies.
from sklearn.impute import SimpleImputer

hp_with_gaps = df_with_missing[['Horse Power']]

# Mean imputation
mean_imputer = SimpleImputer(strategy='mean')
df_mean_imputed = df_with_missing.copy()
# fit_transform returns a 2-D (n_rows, 1) result; flatten it before assigning to one column
df_mean_imputed['Horse Power'] = np.ravel(mean_imputer.fit_transform(hp_with_gaps))

# Median imputation (more robust to outliers)
median_imputer = SimpleImputer(strategy='median')
df_median_imputed = df_with_missing.copy()
df_median_imputed['Horse Power'] = np.ravel(median_imputer.fit_transform(hp_with_gaps))

# Report the statistic each imputer actually learned (statistics_) rather than the
# statistic of the complete data in df_demo: the two differ once values have been
# removed, and only the learned one matches the imputed values printed below.
# The complete-data value is still shown, clearly labelled, for comparison.
print("Mean imputation:")
print(f"  Mean of observed Horse Power (fill value): {mean_imputer.statistics_[0]:.2f}")
print(f"  Mean of Horse Power before values were removed: {df_demo['Horse Power'].mean():.2f}")
print(f"  Imputed values: {df_mean_imputed.loc[missing_indices_hp, 'Horse Power'].values[:3]}")

print("\nMedian imputation:")
print(f"  Median of observed Horse Power (fill value): {median_imputer.statistics_[0]:.2f}")
print(f"  Median of Horse Power before values were removed: {df_demo['Horse Power'].median():.2f}")
print(f"  Imputed values: {df_median_imputed.loc[missing_indices_hp, 'Horse Power'].values[:3]}")

Mean imputation:
  Mean of Horse Power: 213.68
  Imputed values: [213.69656262 213.69656262 213.69656262]

Median imputation:
  Median of Horse Power: 218.93
  Imputed values: [218.107081 218.107081 218.107081]
