In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


print("pandas version: "+pd.__version__)
print("numpy version: "+np.__version__)
print("python version: "+sys.version)

pandas version: 2.3.1
numpy version: 2.3.3
python version: 3.13.7 (main, Aug 15 2025, 12:34:02) [GCC 15.2.1 20250813]


In [2]:
# source file : wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv 
df = pd.read_csv('car_fuel_efficiency.csv')


In [3]:
# Data Preparation

# In this step the lowering of the column names happens.
# and the replacement of spaces ' ' with underscores '_'


df.columns = df.columns.str.lower().str.replace(' ', '_')

In [4]:
# select only the objects 
# the objects are the "values"
# the parameters is the "index"
# as the index is needed the index function is used 
# and converted to a list
strings = list(df.dtypes[df.dtypes == 'object'].index)

['origin', 'fuel_type', 'drivetrain']

In [5]:
for col in strings:
        df[col] = df[col].str.lower().str.replace(' ', '_')

In [9]:
# Splitting up 
n = len(df)

n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [10]:
n, n_val, n_test, n_train 

(9704, 1940, 1940, 5824)

In [11]:
df_val = df.iloc[:n_val]
df_test = df.iloc[n_val:n_val+n_test]
df_train = df.iloc[n_val+n_test:]

In [12]:
idx = np.arange(n)

In [13]:
np.random.seed(42)
np.random.shuffle(idx)

In [14]:
df_train = df.iloc[idx[n_train:]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

In [15]:
df_train.iloc[10]

engine_displacement                270
num_cylinders                      2.0
horsepower                       117.0
vehicle_weight             2443.507303
acceleration                      11.5
model_year                        2015
origin                            asia
fuel_type                       diesel
drivetrain             all-wheel_drive
num_doors                         -2.0
fuel_efficiency_mpg          17.802756
Name: 490, dtype: object

In [16]:
# Q1: There's one column with missing values. What is it?

columns_to_check = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
for col in columns_to_check:
    if col in df.columns:
        missing_count = df[col].isnull().sum()
        print(f"Column '{col}' has {missing_count} missing values.")
    else:
        print(f"Column '{col}' is not found in the DataFrame.")


Column 'engine_displacement' has 0 missing values.
Column 'horsepower' has 708 missing values.
Column 'vehicle_weight' has 0 missing values.
Column 'model_year' has 0 missing values.


In [17]:
# Q2: What's the median (50% percentile) for variable 'horsepower'?

median_value = df['horsepower'].median()
print(f"The median (50% percentile) for variable 'horsepower' is: {median_value}")

The median (50% percentile) for variable 'horsepower' is: 149.0


In [18]:
# Q3: Filling NAs

#   We need to deal with missing values for the column from Q1.
#   We have two options: fill it with 0 or with the mean of this variable.
#   Try both options. For each, train a linear regression model without regularization using the code from the lessons.
#   For computing the mean, use the training only!
#   Use the validation dataset to evaluate the models and compare the RMSE of each option.
#   Round the RMSE scores to 2 decimal digits using round(score, 2)
#   Which option gives better RMSE?

#  Values from lecture 
w0 = 7.17
w = [0.01,0.04,0.002]

# Create a list concatenation for prepending
w_new = [w0] + w

In [19]:
# vector vector multiplication
def dot(xi, w):
    n = len(xi)
    res = 0.0    
    for j in range(n):
        res = res + xi[j] * w[j]
    return res

In [20]:
# linear_regression - shorter notation with "list concatenation for prepending"
def linear_regression_short(xi):
    xi = [1] + xi
    return dot(xi, w_new)

In [21]:
def linear_regression(X):
    return X.dot(w_new)

In [22]:
def train_linear_regression(x, y):
    ones = np.ones(x.shape[0])
    x = np.column_stack([ones, x])
    XTX = x.T.dot(x)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(x.T).dot(y)
    return w_full[0], w_full[1:]

In [23]:
# logarithmic transformation to stabilize variance in regression targets due containing zero's.
y_train = np.log1p(df_train.horsepower.values)
y_val = np.log1p(df_val.horsepower.values)
y_test = np.log1p(df_test.horsepower.values)

In [24]:
base = ['horsepower']

In [25]:
#  Fill target 'y' with 0 
y_train_filled_with_Zero = df_train[base].fillna(0).values
y_val_filled_with_Zero = df_val[base].fillna(0).values

# Fill with 'y' with mean
y_train_filled_with_Mean = df_train[base].fillna(df_train[base].mean()).values
y_val_filled_with_Mean = df_val[base].fillna(df_val[base].mean()).values

In [26]:
# logarithmic transformation to stabilize variance in regression targets due containing zero's.

y_train_0 = np.log1p(y_train_filled_with_Zero)
y_val_0 = np.log1p(y_val_filled_with_Zero)

y_train_mean = np.log1p(y_train_filled_with_Mean)
y_val_mean = np.log1p(y_val_filled_with_Mean)

In [34]:
#### Step 1: Fill missing values
#  Fill with 0 
x_train_filled_with_Zero = df_train[base].fillna(0).values
x_val_filled_with_Zero = df_val[base].fillna(0).values

# Fill with mean
x_train_filled_with_Mean = df_train[base].fillna(df_train[base].mean()).values
x_val_filled_with_Mean = df_val[base].fillna(df_val[base].mean()).values

#### Step 2: Train models
bias_0, model_0        =train_linear_regression(x_train_filled_with_Zero, y_train_0)
bias_mean, model_mean  =train_linear_regression(x_train_filled_with_Mean, y_train_0)

y_train_back_from_0 = np.expm1(y_train_0)
y_val_back_from_0 = np.expm1(y_val_0)

y_train_back_from_Mean = np.expm1(y_train_mean)
y_val_back_from_Mean = np.expm1(y_val_mean)

#### Step 3: predictions
y_pred_0    = bias_0 + x_val_filled_with_Zero.dot(model_0)
y_pred_mean = bias_mean + x_val_filled_with_Mean.dot(model_mean)

#### Step 4: RMSE calculation
rmse_0 = np.sqrt(np.mean((y_val_back_from_0 - np.argmax(y_pred_0)) ** 2))
rmse_mean = np.sqrt(np.mean((y_val_back_from_Mean - np.argmax(y_pred_mean)) ** 2))

#### Step 5: Round
print("rmse_0:", round(rmse_0, 2))
print("rmse_mean:", round(rmse_mean, 2))
print()
print("A lower RMSE indicates that the model’s predictions are closer to the true values on average,\n meaning better predictive accuracy.")

rmse_0: 109.86
rmse_mean: 92.96

A lower RMSE indicates that the model’s predictions are closer to the true values on average,
 meaning better predictive accuracy.
