In [55]:
import httpx
import mlcroissant as mlc
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import math

from pathlib import Path

In [56]:
# CONSTANTS
PARENT_FOLDER = Path.cwd()
DATA_FOLDER = PARENT_FOLDER / "data"
DATASET_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"

In [57]:
print("### HW2 SOLUTIONS ####")
file_name = DATA_FOLDER / "hw2_car_fuel_efficiency.csv"
file_name.parent.mkdir(parents=True, exist_ok=True)


# check if the file already exists
if file_name.exists():
    print("Download skipped, file already exists.")
else:
    # Ensure the parent directory exists
    with httpx.Client(timeout=30.0) as client:
        res = client.get(DATASET_URL)

        with open(file_name, "wb") as f:
            f.write(res.content)

df_car = pd.read_csv(file_name)
print(df_car.columns)
print(df_car.describe())
print(df_car.count())

### HW2 SOLUTIONS ####
Download skipped, file already exists.
Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')
       engine_displacement  num_cylinders   horsepower  vehicle_weight  \
count          9704.000000    9222.000000  8996.000000     9704.000000   
mean            199.708368       3.962481   149.657292     3001.280993   
std              49.455319       1.999323    29.879555      497.894860   
min              10.000000       0.000000    37.000000      952.681761   
25%             170.000000       3.000000   130.000000     2666.248985   
50%             200.000000       4.000000   149.000000     2993.226296   
75%             230.000000       5.000000   170.000000     3334.957039   
max             380.000000      13.000000   271.000000     4739.077089   

       acceleration   model_year    num_doors  fuel_

In [58]:
# clean column names
df_car.columns = (
    df_car.columns.str.lower().str.replace(" ", "_")
)
df_car.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [59]:
cols_to_keep = [
    "engine_displacement",
    "horsepower",
    "vehicle_weight",
    "model_year",
    "fuel_efficiency_mpg",
]

df_car = df_car[cols_to_keep]
df_car

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.870990,2009,12.488369
...,...,...,...,...,...
9699,140,164.0,2981.107371,2013,15.101802
9700,180,154.0,2439.525729,2004,17.962326
9701,220,138.0,2583.471318,2008,17.186587
9702,230,177.0,2905.527390,2011,15.331551


In [60]:
# answerss
answers = {}

# Q1: col with missing values
print(df_car.isna().sum())

answers["q1"] = 'horsepower'

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64


In [61]:
# Q2: Median of horsepower
print(df_car["horsepower"].median())
answers["q2"] = df_car["horsepower"].median()

149.0


In [62]:
# Splitting data sets (validation, test, training)

# calculate the indices
n_total = len(df_car)
n_validation = int(0.2 * n_total)  # ~20% of the total data
n_test = int(0.2 * n_total)  # ~20% of the total data
n_training = n_total - (n_validation + n_test)  # remaining ~60%

# get the random shuffling of data
np.random.seed(42)  # for reproducibility
idx = np.arange(n_total)
np.random.shuffle(idx)

# create the shuffled datasets
df_shuffled = df_car.iloc[idx]
df_training = df_shuffled.iloc[
    :n_training
].copy()  # from 0-index but before n_training index
df_validation = df_shuffled.iloc[
    n_training : (n_training + n_validation)
].copy()  # from n_training index up to (n_training + n_validation) rows
df_test = df_shuffled.iloc[
    (n_training + n_validation) :
].copy()  # start from the `to` index from above to the rest of the rows. This is the remaining data

df_shuffled

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
483,220,144.0,2535.887591,2009,16.642943
7506,160,141.0,2741.170484,2019,16.298377
8795,230,155.0,2471.880237,2017,18.591822
1688,150,206.0,3748.164469,2015,11.818843
6217,300,111.0,2135.716359,2006,19.402209
...,...,...,...,...,...
5734,210,163.0,1972.029124,2011,19.961672
5191,160,126.0,3011.588014,2009,14.651056
5390,290,187.0,2440.508039,2019,18.404435
860,260,129.0,1865.404480,2019,20.502460


In [63]:
# apply distribution normalization with log1p to remove long tail
y_training = np.log1p(df_training["fuel_efficiency_mpg"])
y_validation = np.log1p(df_validation["fuel_efficiency_mpg"])
y_test = np.log1p(df_test["fuel_efficiency_mpg"])

# remove the target variable to prevent us from accidentally using it
df_training = df_training.drop(columns=["fuel_efficiency_mpg"])
df_validation = df_validation.drop(columns=["fuel_efficiency_mpg"])
df_test = df_test.drop(columns=["fuel_efficiency_mpg"])

In [64]:
# define the linear regression function
def train_linear_regression(X: np.ndarray, y: np.ndarray):
    ones = np.ones(
        X.shape[0]
    )  # create a 1's array based on the column size of X or X.shape[0]
    X = np.column_stack(
        [ones, X]
    )  # stack the 1's array in front of each observation/row of X

    # normal equation
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    weights = XTX_inv.dot(X.T).dot(y)

    return weights[0], weights[1:]

In [65]:
# Evaluate model quality with Root Mean Squared Error (RMSE)
def rmse(y_target: np.ndarray, y_prediction: np.ndarray):
    # calculate the diff between the prediction and target
    error = y_prediction - y_target
    # square the prediction to make obtain non-negative values, then get the mean
    mse = np.power(error, 2).mean()
    # obtain the square root (since we squared it)
    return np.sqrt(mse)

In [None]:
# Q3: Value imputation
horsepower_mean = df_training["horsepower"].mean()
print(horsepower_mean)

df_training_imputed_w_mean = df_training.copy().fillna(horsepower_mean)
df_training_imputed_w_0 = df_training.copy().fillna(0)

df_validation_imputed_w_mean = df_validation.copy().fillna(horsepower_mean)
df_validation_imputed_w_0 = df_validation.copy().fillna(0)

X_training_imputed_w_mean = df_training_imputed_w_mean.values
X_training_imputed_w_0 = df_training_imputed_w_0.values
X_validation_imputed_w_mean = df_validation_imputed_w_mean.values
X_validation_imputed_w_0 = df_validation_imputed_w_0.values

w_0, w1 = train_linear_regression(X_training_imputed_w_mean, y_training)
w_0, w2 = train_linear_regression(X_training_imputed_w_0, y_training)
w_0, w3 = train_linear_regression(X_validation_imputed_w_mean, y_validation)
w_0, w4 = train_linear_regression(X_validation_imputed_w_0, y_validation)


y_pred_1 = w_0 + X_training_imputed_w_mean.dot(w1)
y_pred_2 = w_0 + X_training_imputed_w_0.dot(w2)
y_pred_3 = w_0 + X_validation_imputed_w_mean.dot(w3)
y_pred_4 = w_0 + X_validation_imputed_w_0.dot(w4)


# rmse
rmse_training_imputed_w_mean = rmse(y_training, y_pred_1)
rmse_training_imputed_w_0 = rmse(y_training, y_pred_2)
rmse_validation_imputed_w_mean = rmse(y_validation, y_pred_3)
rmse_validation_imputed_w_0 = rmse(y_validation, y_pred_4)

print(round(rmse_training_imputed_w_mean, 2))
print(round(rmse_training_imputed_w_0, 2))
print(round(rmse_validation_imputed_w_mean, 2))
print(round(rmse_validation_imputed_w_0, 2))

answers["q3"] = "With mean"

149.54476367006487
0.26
0.26
0.07
0.04
