### Regression Homework by Sameh Shehata

In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge

%matplotlib inline


### Data Preparation

### Question 3. 


In [66]:
# Load the raw dataset; the CSV path is relative to the current working directory.
df = pd.read_csv('car_fuel_efficiency.csv')

In [67]:
# Normalise column names: lower-case them first, then swap spaces for underscores.
lowered_columns = df.columns.str.lower()
df.columns = lowered_columns.str.replace(' ', '_')

In [68]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [69]:
# Collect the names of all columns whose dtype is `object`.
strings = [name for name, dtype in df.dtypes.items() if dtype == 'object']
strings

['origin', 'fuel_type', 'drivetrain']

In [70]:
# Give the values of every text column the same treatment as the column
# names: lower-case, with spaces replaced by underscores.
for name in strings:
    lowered = df[name].str.lower()
    df[name] = lowered.str.replace(' ', '_')

In [71]:
df.dtypes

engine_displacement      int64
num_cylinders          float64
horsepower             float64
vehicle_weight         float64
acceleration           float64
model_year               int64
origin                  object
fuel_type               object
drivetrain              object
num_doors              float64
fuel_efficiency_mpg    float64
dtype: object

### Question 1. Missing values



In [72]:
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

### Question 2. Median of horsepower


In [73]:
df['horsepower'].median()

149.0

### Question 3. Filling missing values: 0 vs. mean


In [74]:
# Total number of rows available for splitting.
n = len(df)

# Validation and test each take 20% of the rows (rounded down);
# the training set receives whatever is left.
n_val = n_test = int(n * 0.2)
n_train = n - (n_val + n_test)

In [75]:
n

9704

In [76]:
n_val, n_test, n_train

(1940, 1940, 5824)

In [77]:
# Row positions 0..n-1, used to build the train/validation/test split.
idx = np.arange(n)

In [78]:
# Seed NumPy's global RNG so the shuffle is reproducible.
np.random.seed(42)
# Rebuild the index before shuffling: np.random.shuffle works in place, so
# re-running this cell on its own would otherwise reshuffle an already
# shuffled array and silently produce a different split.
idx = np.arange(n)
np.random.shuffle(idx)

In [79]:


# Cut the shuffled positions into three consecutive segments
# (train, validation, test) at the two size boundaries.
train_idx, val_idx, test_idx = np.split(idx, [n_train, n_train + n_val])

In [80]:
# Features: every column except the target and the three text columns.
X = df.drop(columns=['fuel_efficiency_mpg', "origin", "fuel_type", "drivetrain"])
# Target values as a NumPy array.
y = df['fuel_efficiency_mpg'].values

In [81]:
def _rmse(actual, predicted):
    """Return the root-mean-squared error between two arrays as a Python float."""
    return float(np.sqrt(np.mean((actual - predicted) ** 2)))


# Materialise the three feature frames (with fresh 0..k-1 indexes) and the
# matching target arrays from the shuffled row positions.
X_train, X_val, X_test = (
    X.iloc[part].reset_index(drop=True) for part in (train_idx, val_idx, test_idx)
)
y_train, y_val, y_test = y[train_idx], y[val_idx], y[test_idx]

# Option A: fill missing values with 0
X_train_zero, X_val_zero = X_train.fillna(0), X_val.fillna(0)

model_zero = LinearRegression().fit(X_train_zero, y_train)
y_pred_zero = model_zero.predict(X_val_zero)
rmse_zero = _rmse(y_val, y_pred_zero)
rmse_zero_round = round(rmse_zero, 2)

# Option B: fill missing values with the per-column mean. The means come from
# the training frame only and are reused for the validation frame.
train_mean = X_train.mean()
X_train_mean, X_val_mean = X_train.fillna(train_mean), X_val.fillna(train_mean)

model_mean = LinearRegression().fit(X_train_mean, y_train)
y_pred_mean = model_mean.predict(X_val_mean)
rmse_mean = _rmse(y_val, y_pred_mean)
rmse_mean_round = round(rmse_mean, 2)

print("RMSE (fill 0):", rmse_zero_round)
print("RMSE (fill mean):", rmse_mean_round)



RMSE (fill 0): 0.51
RMSE (fill mean): 0.39


### Question 4


In [82]:
# Regularized linear regression (Ridge) — Question 4
# Fits one model per regularization strength on the zero-filled training
# features and records its validation RMSE, rounded to 4 decimals.
rs = [0, 0.01, 0.1, 1, 5, 10, 100]

scores = {}
for r in rs:
    # r == 0 means no penalty, i.e. ordinary least squares. scikit-learn
    # advises against Ridge(alpha=0) for numerical reasons and recommends
    # LinearRegression for that case, so use it here.
    model_r = LinearRegression() if r == 0 else Ridge(alpha=r)
    model_r.fit(X_train_zero, y_train)
    y_pred_r = model_r.predict(X_val_zero)
    rmse_r = float(np.sqrt(((y_val - y_pred_r) ** 2).mean()))
    scores[r] = round(rmse_r, 4)

print("RMSE (rounded) for each r:", scores)

# select best r (if ties, choose smallest r)
# The comparison is made on the rounded scores, so values that agree to
# 4 decimals count as a tie.
best_rmse = min(scores.values())
best_rs = [r for r, s in scores.items() if s == best_rmse]
best_r = min(best_rs)
print("Best r:", best_r)

RMSE (rounded) for each r: {0: 0.5061, 0.01: 0.5061, 0.1: 0.5061, 1: 0.5061, 5: 0.5061, 10: 0.5061, 100: 0.5061}
Best r: 0



### Question 5 

In [83]:

# Seeds 0 through 9: one independent shuffle/split/fit per seed.
seeds = list(range(10))

# Feature frame (target and text columns removed) and target array,
# rebuilt from the full data frame.
X = df.drop(columns=['fuel_efficiency_mpg', "origin", "fuel_type", "drivetrain"])
y = df['fuel_efficiency_mpg'].values

# Partition sizes: 20% validation, 20% test (both rounded down), rest training.
n = len(df)
n_val = n_test = int(n * 0.2)
n_train = n - (n_val + n_test)

rmse_scores = []
for seed in seeds:
    # Seed-specific permutation of the row positions.
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)

    # Only the training and validation segments are needed in this experiment.
    train_idx, val_idx = idx[:n_train], idx[n_train:n_train + n_val]

    X_train, X_val = (
        X.iloc[part].reset_index(drop=True) for part in (train_idx, val_idx)
    )
    y_train, y_val = y[train_idx], y[val_idx]

    # Missing values are replaced with 0 in both frames.
    X_train_filled, X_val_filled = X_train.fillna(0), X_val.fillna(0)

    # Plain linear regression, scored on the validation segment.
    model = LinearRegression().fit(X_train_filled, y_train)
    y_pred = model.predict(X_val_filled)

    rmse = np.sqrt(np.mean((y_val - y_pred) ** 2))
    rmse_scores.append(rmse)

# Report the per-seed scores so the spread is visible.
for seed, score in zip(seeds, rmse_scores):
    print(f"Seed: {seed}, RMSE: {score}")

# Spread of the validation RMSE across seeds, rounded to 3 decimals.
std_dev = np.std(rmse_scores)
rounded_std_dev = round(std_dev, 3)

print(f"\nStandard Deviation of RMSE scores: {rounded_std_dev}")

Seed: 0, RMSE: 0.5089438738515523
Seed: 1, RMSE: 0.5096433180914997
Seed: 2, RMSE: 0.5112824494606966
Seed: 3, RMSE: 0.506415096642651
Seed: 4, RMSE: 0.4980329555349279
Seed: 5, RMSE: 0.5153771300332379
Seed: 6, RMSE: 0.5191886267022854
Seed: 7, RMSE: 0.49584545408132663
Seed: 8, RMSE: 0.5015121580081838
Seed: 9, RMSE: 0.4997357052983891

Standard Deviation of RMSE scores: 0.007


### Question 6


In [None]:
from sklearn.linear_model import Ridge
import numpy as np
import pandas as pd


# Question 6: train on train + validation combined (split with seed 9),
# then report the RMSE on the held-out test segment.

# Feature frame (target and text columns removed) and target array.
X = df.drop(['fuel_efficiency_mpg', "origin", "fuel_type", "drivetrain"], axis=1)
y = df.fuel_efficiency_mpg.values

# Seed NumPy's global RNG so the shuffle below is reproducible.
seed = 9
np.random.seed(seed)

# Partition sizes: 20% validation, 20% test (both rounded down), rest training.
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

idx = np.arange(n)
np.random.shuffle(idx)

train_idx = idx[:n_train]
val_idx = idx[n_train:n_train + n_val]
test_idx = idx[n_train + n_val:]

# Merge the training and validation positions into one "full train" set.
full_train_idx = np.concatenate([train_idx, val_idx])

X_full_train = X.iloc[full_train_idx].reset_index(drop=True)
y_full_train = y[full_train_idx]

X_test = X.iloc[test_idx].reset_index(drop=True)
y_test = y[test_idx]

# Missing values are replaced with 0 in both frames.
X_full_train_filled = X_full_train.fillna(0)
X_test_filled = X_test.fillna(0)

r = 0.001
# No random_state is passed: scikit-learn documents it as used only by the
# 'sag' and 'saga' solvers, and no solver is requested here, so it would not
# affect the fit. Reproducibility comes from the seeded shuffle above.
model = Ridge(alpha=r)
model.fit(X_full_train_filled, y_full_train)

y_pred = model.predict(X_test_filled)

rmse = np.sqrt(((y_test - y_pred) ** 2).mean())

print(f"The RMSE on the test dataset is: {rmse}")

print(f"Rounded RMSE on the test dataset: {round(rmse, 3)}")

The RMSE on the test dataset is: 0.5017368518055851
Rounded RMSE on the test dataset: 0.502
