In [1]:
import pandas as pd
import numpy as np

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
!wget $data
df = pd.read_csv('car_fuel_efficiency.csv')
df.columns = df.columns.str.lower().str.replace(' ','_')
df.head()

--2025-10-14 14:31:23--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv.6’


2025-10-14 14:31:23 (111 MB/s) - ‘car_fuel_efficiency.csv.6’ saved [874188/874188]



Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [8]:
df.dtypes

engine_displacement      int64
num_cylinders          float64
horsepower             float64
vehicle_weight         float64
acceleration           float64
model_year               int64
origin                  object
fuel_type               object
drivetrain              object
num_doors              float64
fuel_efficiency_mpg    float64
dtype: object

In [7]:
strings = list(df.dtypes[df.dtypes=='object'].index)
strings

['origin', 'fuel_type', 'drivetrain']

In [3]:
#find the column with missing values
selected_columns = df[['engine_displacement','horsepower','vehicle_weight','model_year','fuel_efficiency_mpg']]
selected_columns.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


In [5]:
#find the column with missing values
columns_with_missing_values = selected_columns.columns[selected_columns.isna().any()].tolist()
print(columns_with_missing_values)

['horsepower']


In [4]:
#horsepower has missing values among the selected columns
#find the median value 
selected_columns.describe()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
count,9704.0,8996.0,9704.0,9704.0,9704.0
mean,199.708368,149.657292,3001.280993,2011.484027,14.985243
std,49.455319,29.879555,497.89486,6.659808,2.556468
min,10.0,37.0,952.681761,2000.0,6.200971
25%,170.0,130.0,2666.248985,2006.0,13.267459
50%,200.0,149.0,2993.226296,2012.0,15.006037
75%,230.0,170.0,3334.957039,2017.0,16.707965
max,380.0,271.0,4739.077089,2023.0,25.967222


In [5]:
#median for horsepower is 149
# split the dataset for training, validation and testing
#shuffle the data first with seed of 42
n= len(selected_columns)

n_test= int(n* 0.2)
n_val= int(n*0.2)
n_train= n-n_test-n_val

#check the values added are equal so we don't miss any values
n,n_val+n_test+n_train
#shuffle the dataset before you split it for training,test and validation
idx = np.arange(n)
#use random.seed to make the shuffle reproduceable
np.random.seed(42)
np.random.shuffle(idx)
#get the dataset for tain,val and test
df_train = selected_columns.iloc[idx[:n_train]]
df_val = selected_columns.iloc[idx[n_train:n_train+n_val]]
df_test = selected_columns.iloc[idx[n_train+n_val:]]
#check the length of datasets to match the original length
len(df_train),len(df_val),len(df_test)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
#find y_train, y_val & y_test
#turn fuel_efficiency_mpg into logarithmic values
y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
y_test = np.log1p(df_test.fuel_efficiency_mpg.values)
y_train,y_val, y_test
#delete the y values from the training datasets to avoid errors
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [12]:
df_train.dtypes

engine_displacement      int64
horsepower             float64
vehicle_weight         float64
model_year               int64
dtype: object

In [6]:
#crearte two training dataset
#one with replacing nan with mean
#two with replacing nan with zero
#calculate the mean from the training data
df_train.describe()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year
count,5824.0,5395.0,5824.0,5824.0
mean,199.429945,149.544764,3006.672919,2011.444712
std,49.543177,29.610498,497.856998,6.669193
min,10.0,46.0,952.681761,2000.0
25%,170.0,129.0,2669.313374,2006.0
50%,200.0,149.0,2993.348283,2011.0
75%,230.0,170.0,3342.15152,2017.0
max,380.0,246.0,4610.973229,2023.0


In [7]:
hp_mean = 149.544764
#since its only horsepower which has empty values
#we can replace the whole dataset with the mean
df_train_mean = df_train.fillna(hp_mean)
#we create another dataset for filling with 0
df_train_zero = df_train.fillna(0)
#create a linear_regression function
# first step is do the training without regularization
# the first row will be just 1s
def train_regression_model(X,y):
    ones =np.ones(X.shape[0])
    X = np.column_stack([ones,X])
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    return w_full[0],w_full[1:]
#calculate the predictions based on different train datasets    
w0, w_mean = train_regression_model(df_train_mean,y_train)
w1, w_zero = train_regression_model(df_train_zero,y_train)
y_pred_mean = w0 + df_train.dot(w_mean)
y_pred_zero = w1 + df_train.dot(w_zero)
#calculate the RMSE for both the predictions
def RMSE_func(y, y_pred):
    se = (y-y_pred)** 2
    mse = se.mean()
    return np.sqrt(mse)
#calculate the prediction values over the validation data
y_pred_val_mean = w0 + df_val.dot(w_mean) 
y_pred_val_zero = w1 + df_val.dot(w_zero)
#calculate the RMSE values for the two predictions made
RMSE_mean = RMSE_func(y_val, y_pred_val_mean) 
RMSE_zero =RMSE_func(y_val, y_pred_val_zero)
print('RMSE with mean:', round(RMSE_mean,2))
print('RMSE with zero:', round(RMSE_zero,2))
    
    

RMSE with mean: 0.04
RMSE with zero: 0.04


In [None]:
# for my model the values came the same when rounded to two decimal places
#for my evaluation both came equally good

In [9]:
#train a regression model with regulation
# we will use training data replaced with zero
def train_regression_model_reg(X,y,r):
    ones =np.ones(X.shape[0])
    X = np.column_stack([ones,X])
    XTX = X.T.dot(X)
    XTX =XTX + r * np.eye((XTX.shape[0]))
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    return w_full[0],w_full[1:]
    
#for a given list of r, find the best RMSE
r_list =[0, 0.01, 0.1, 1, 5, 10, 100]
rmses={}
for r in r_list:
    print(r)
    w0,w = train_regression_model_reg(df_train_zero,y_train,r)
    y_pred_new = w0 + df_val.dot(w)
    rmse_val = round (RMSE_func(y_val,y_pred_new),2)
    rmses[r] = rmse_val
print(rmses.items())

0
0.01
0.1
1
5
10
100
dict_items([(0, np.float64(0.04)), (0.01, np.float64(0.04)), (0.1, np.float64(0.04)), (1, np.float64(0.04)), (5, np.float64(0.04)), (10, np.float64(0.04)), (100, np.float64(0.04))])


In [22]:
# according to the observation all values of r gives 0.04 so the smallest r to be chosen is 0
# shuffle the data wit different seed values
# the length of the dataset remains the same, n as calculated previously
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
RMSE_values =[]
#find the rmse values for each seed value and calculate the std for all the scores
for seed in seeds:
    idx = np.arange(n)
#use random.seed to make the shuffle reproduceable
    np.random.seed(seed)
    np.random.shuffle(idx)
#get the dataset for tain,val and test
    df_train = selected_columns.iloc[idx[:n_train]]
    df_val = selected_columns.iloc[idx[n_train:n_train+n_val]]
    df_test = selected_columns.iloc[idx[n_train+n_val:]]
#find y_train, y_val & y_test
#turn fuel_efficiency_mpg into logarithmic values
    y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
    y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
    y_test = np.log1p(df_test.fuel_efficiency_mpg.values)
#we create another dataset for filling with 0
    df_train_zero = df_train.fillna(0)
#train regression model without regularization    
    w1, w_zero = train_regression_model(df_train_zero,y_train)
    y_pred = w1 + df_val.dot(w_zero)
# find the RMSE value against the validation data
    RMSE_values.append(round(RMSE_func(y_val,y_pred),2))
    
#print(RMSE_values)    
#calculate the standard deviation of the RMSE values
print('std:',round(np.std(RMSE_values),3)) 

std: 0.0


In [32]:
# the closest option I have for the stanadard deviation according question is 0.001
# find RMSE using seed =9, r=0.001 combining training and validation dataset and using test data for prediction

idx =np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)
df_train_new = selected_columns.iloc[idx[:n_train+n_val]]
df_test_new = selected_columns.iloc[idx[n_train +n_val:]]
y_train_new = np.log1p(df_train_new.fuel_efficiency_mpg.values)
y_test_new = np.log1p(df_test_new.fuel_efficiency_mpg.values)
df_train_zero_new = df_train_new.fillna(0)
w_r, w_zero_r = train_regression_model_reg(df_train_new,y_train_new,0.01)
w_r, w_zero_r
y_pred_new = w_r + df_test_new.dot(w_zero_r)
RMSE_val_new = RMSE_func(y_test_new,y_pred_new)
round(RMSE_val_new,2)

np.float64(nan)

In [None]:
# the value is nan so approximately close to zero so answer is 0.01