In [None]:
# importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# loading the car fuel efficiency records from the csv file
csv_path = 'car_fuel_efficiency.csv'
car_fuel_eff_df = pd.read_csv(csv_path)

# preview of the first five rows
car_fuel_eff_df.head(5)

In [None]:
# normalizing the values of the object-dtype columns:
# everything lowercased, every space swapped for an underscore
column_dtypes = car_fuel_eff_df.dtypes
object_cols = column_dtypes[column_dtypes == 'object'].index

normalized_cols = car_fuel_eff_df[object_cols].apply(
    lambda col: col.str.lower().str.replace(' ', '_')
)
car_fuel_eff_df[object_cols] = normalized_cols

car_fuel_eff_df.head()

In [None]:
# data prep for regression
# columns kept for the regression work (the last one is later used as the target)
feature_list = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]

# all rows, only the listed columns
cfe_reg_df = car_fuel_eff_df.loc[:, feature_list]

cfe_reg_df.head()

In [None]:
# exploring the data - summary statistics for every column
summary_stats = cfe_reg_df.describe(include='all')
summary_stats

In [None]:
# visualizing the variables - one histogram per column
hist_options = dict(figsize=(10, 12), bins=50, edgecolor='black')
cfe_reg_df.hist(**hist_options)

# tidy the spacing between the subplots before rendering
plt.tight_layout()
plt.show()

In [None]:
# QUESTION 1
# counting the missing values in each column
null_counts = cfe_reg_df.isna().sum()
print(null_counts)

In [None]:
# QUESTION 2
# median of the horsepower column (pandas skips the null entries by default)
hp_median = cfe_reg_df['horsepower'].median()
print('Median horsepower with null values:', hp_median)

In [None]:
# fn for splitting the train test validation sets

def split_train_val_test(df, val_prop, test_prop, seed):
    """Shuffle the rows of ``df`` and split them into train / validation / test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    val_prop, test_prop : float
        Fractions of the rows for the validation and test sets. Each set size
        is ``int(n * prop)``; all remaining rows go to the training set.
    seed : int
        Seed for the row shuffle. The same seed always gives the same split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val, df_test)``, each with a fresh 0..k-1 index.

    Raises
    ------
    ValueError
        If a proportion is negative or the validation and test sizes together
        exceed the number of rows.
    """
    # total records
    n = len(df)

    # train test validation sizes
    n_val = int(n * val_prop)
    n_test = int(n * test_prop)
    n_train = n - (n_val + n_test)

    # a negative train size would silently produce overlapping / wrong splits
    if val_prop < 0 or test_prop < 0 or n_train < 0:
        raise ValueError(
            'val_prop and test_prop must be non-negative and must not sum to more than 1'
        )

    # creating index
    idx = np.arange(n)

    # shuffling with a private legacy generator: yields the same permutation as
    # np.random.seed(seed) followed by np.random.shuffle(idx), but leaves the
    # process-wide random state untouched
    np.random.RandomState(seed).shuffle(idx)

    # splitting the sets
    val_end = n_train + n_val
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:val_end]]
    df_test = df.iloc[idx[val_end:]]

    # reset indices
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    return df_train, df_val, df_test

In [None]:
# performing the split (20% validation, 20% test, seed 42)
df_train, df_val, df_test = split_train_val_test(cfe_reg_df, val_prop=0.2, test_prop=0.2, seed=42)

# sanity check on the number of records in the train val test sets
n_total = len(cfe_reg_df)
split_sizes = [len(df_train), len(df_val), len(df_test)]

# total vs sum of the parts, then the sizes, then the shares of the total
print(n_total, sum(split_sizes))
print(*split_sizes)
print(*(round(size / n_total, 2) for size in split_sizes))

In [None]:
# pulling the target out of each split:
# pop returns the column and removes it from the dataframe in one step
y_train = df_train.pop('fuel_efficiency_mpg').values
y_val = df_val.pop('fuel_efficiency_mpg').values
y_test = df_test.pop('fuel_efficiency_mpg').values

# the remaining columns are the features
print(df_train.columns)

In [None]:
# fn to train a linear regression model

def train_linear_regression(X, y):
    """Fit a linear regression with the normal equation.

    A column of ones is put in front of ``X`` so the first solved weight is
    the bias term; the weights are ``inv(X^T X) X^T y``.

    Returns
    -------
    tuple
        ``(bias, weights)`` - the first weight and the remaining ones.
    """
    # feature matrix with the bias column in front
    n_rows = X.shape[0]
    design = np.column_stack([np.ones(n_rows), X])

    # gram matrix and its inverse
    gram = design.T.dot(design)
    gram_inv = np.linalg.inv(gram)

    # solve for the weights
    coeffs = gram_inv.dot(design.T).dot(y)

    bias, feature_weights = coeffs[0], coeffs[1:]
    return bias, feature_weights

In [None]:
# fn to calculate rmse

def rmse(y_actual, y_pred):
    """Root mean squared error between actual and predicted values."""
    # per-record prediction error
    residuals = y_actual - y_pred

    # mean of the squared errors
    mean_sq_err = (residuals ** 2).mean()

    # back to the units of the target
    return np.sqrt(mean_sq_err)

In [None]:
# QUESTION 3

# lin reg model trained with the missing values replaced by zeroes

zero_fill_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

# feature matrices - every null in the selected columns becomes 0
X_train_null_zero = df_train[zero_fill_cols].fillna(0).values
X_val_null_zero = df_val[zero_fill_cols].fillna(0).values

# fit on the training set
w0, w = train_linear_regression(X_train_null_zero, y_train)

# predict on the validation set
y_pred_null_zero = w0 + X_val_null_zero.dot(w)

# validation rmse
rmse_null_zero = rmse(y_val, y_pred_null_zero)
print('RMSE of lin reg model trained on horsepower nulls filled with zeroes:', round(rmse_null_zero, 4))

In [None]:
# lin reg model trained with the missing values replaced by the training horsepower mean

mean_fill_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

# the fill value comes from the training set only and is reused for validation
hp_train_mean = df_train['horsepower'].mean()

# feature matrices - every null in the selected columns becomes the training hp mean
X_train_null_mean = df_train[mean_fill_cols].fillna(hp_train_mean).values
X_val_null_mean = df_val[mean_fill_cols].fillna(hp_train_mean).values

# fit on the training set
w0, w = train_linear_regression(X_train_null_mean, y_train)

# predict on the validation set
y_pred_null_mean = w0 + X_val_null_mean.dot(w)

# validation rmse
rmse_null_mean = rmse(y_val, y_pred_null_mean)
print('RMSE of lin reg model trained on horsepower nulls filled with training horsepower mean:', round(rmse_null_mean, 4))

Since the RMSE with horsepower nulls filled with the training horsepower mean (0.4636) is lower than the RMSE with horsepower nulls filled with zeroes (0.5174),

the linear regression model trained on mean-filled horsepower gives more accurate predictions on the validation set, hence its lower RMSE.
<br /><br /><br />
(IMPORTANT: selecting a model solely on the basis of this difference may not be reliable, since it comes from a single train/validation split. Further investigation is needed.)
<br /><br /><br />



In [None]:
# custom fn for regularized linear regression
def train_lin_reg_regularized(X, y, reg_param=0.001):
    """Fit a linear regression with the normal equation and a regularized gram matrix.

    A column of ones is put in front of ``X`` so the first solved weight is
    the bias term. ``reg_param`` is added to every entry of the main diagonal
    of ``X^T X`` (including the bias entry) before it is inverted.

    Returns
    -------
    tuple
        ``(bias, weights)`` - the first weight and the remaining ones.
    """
    # feature matrix with the bias column in front
    n_rows = X.shape[0]
    design = np.column_stack([np.ones(n_rows), X])

    # gram matrix with the regularization term on its main diagonal
    gram = design.T.dot(design)
    ridge_term = reg_param * np.eye(gram.shape[0])
    gram_reg = gram + ridge_term

    # solve for the weights
    coeffs = np.linalg.inv(gram_reg).dot(design.T).dot(y)

    bias, feature_weights = coeffs[0], coeffs[1:]
    return bias, feature_weights

In [None]:
# QUESTION 4

rmse_dict = {}
reg_param_list = [0, 0.01, 0.1, 1, 5, 10, 100]

sweep_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

# zero-filled feature matrices - the same for every regularization value
X_train_null_zero = df_train[sweep_cols].fillna(0).values
X_val_null_zero = df_val[sweep_cols].fillna(0).values

# one regularized model per candidate value
for reg_param in reg_param_list:

    # fit on the training set
    w0, w = train_lin_reg_regularized(X_train_null_zero, y_train, reg_param)

    # predict on the validation set
    y_pred_null_zero = w0 + X_val_null_zero.dot(w)

    # validation rmse - printed in full, stored rounded to 2 decimals
    val_rmse = rmse(y_val, y_pred_null_zero)
    print('RMSE of lin reg model with reg param:', reg_param, '==>', val_rmse)
    rmse_dict[reg_param] = round(val_rmse, 2)


print(rmse_dict)

# (reg param, rmse) pair with the lowest rounded rmse;
# on ties min keeps the first pair in insertion order
print(min(rmse_dict.items(), key=lambda pair: pair[1]))

When the RMSE values are rounded to 2 decimal places, several regularization parameters give the same RMSE; as per the question, the smallest of these is selected, so the best regularization parameter value is 0.

If the RMSE values are not rounded, the lowest RMSE is obtained with the regularization parameter 0.01.

In [None]:
# QUESTION 5
# how much does the validation rmse move when only the split seed changes?

seed_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

rmse_list = []

seed_sweep_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

# testing all the seeds
for seed in seed_list:

    # performing the split; rebinding the names replaces the previous frames,
    # so no explicit `del` is needed (a `del` here raises NameError whenever
    # the names are not currently bound)
    # NOTE: these frames come straight from cfe_reg_df, so they still contain
    # the fuel_efficiency_mpg column
    df_train, df_val, df_test = split_train_val_test(cfe_reg_df, 0.2, 0.2, seed)

    # feature matrices - every null in the selected columns becomes 0
    X_train_null_zero = df_train[seed_sweep_cols].fillna(0).values
    X_val_null_zero = df_val[seed_sweep_cols].fillna(0).values

    # targets for this split (the test target is not needed for this question)
    y_train = df_train.fuel_efficiency_mpg.values
    y_val = df_val.fuel_efficiency_mpg.values

    # training the lin reg model
    w0, w = train_linear_regression(X_train_null_zero, y_train)

    # performing predictions on the validation set
    y_pred_null_zero = w0 + X_val_null_zero.dot(w)

    # calculating the rmse
    rmse_list.append(rmse(y_val, y_pred_null_zero))


# rmse scores for different seeds
print(rmse_list)
print()

# standard deviation of the rmse scores for different seeds
print('Standard deviation of rmse scores for the given list of seeds:', round(np.std(rmse_list), 3))


In [None]:
# QUESTION 6
# final model: trained on train + validation, evaluated once on the test set

# regularization value used for the final model; the printed message below is
# built from this variable so the reported value always matches the one used
final_reg_param = 0.001

final_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

# performing the split with seed set 9; rebinding the names replaces the
# previous frames, so no explicit `del` is needed
df_train, df_val, df_test = split_train_val_test(cfe_reg_df, 0.2, 0.2, 9)

# combining train and validation; ignore_index avoids duplicate row labels
df_full_train = pd.concat([df_train, df_val], ignore_index=True)

# feature matrices - every null in the selected columns becomes 0
X_full_train_null_zero = df_full_train[final_cols].fillna(0).values
X_test_null_zero = df_test[final_cols].fillna(0).values

# subsetting target variables from feature set
y_full_train = df_full_train.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

# training the regularized lin reg model
w0, w = train_lin_reg_regularized(X_full_train_null_zero, y_full_train, final_reg_param)

# performing predictions on the test set
y_pred_null_zero = w0 + X_test_null_zero.dot(w)

# calculating the rmse
print(f'RMSE of a lin reg model (trained on combined train & val data) with regression parameter {final_reg_param} and tested against test set is:', rmse(y_test, y_pred_null_zero), sep='\n\n')