In [111]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns 

In [112]:
pd.__version__

'2.3.3'

In [113]:
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


In [114]:
# normalizing col names 
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.columns

Index(['laptop', 'status', 'brand', 'model', 'cpu', 'ram', 'storage',
       'storage_type', 'gpu', 'screen', 'touch', 'final_price'],
      dtype='object')

In [119]:
# filter to columns 
cols = ['ram', 'storage', 'screen', 'final_price']
df_filtered = df[cols].copy()
df.head()

cols2 = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
df2 = df[cols2].copy()
df2.head()

KeyError: "None of [Index(['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year'], dtype='object')] are in the [columns]"

In [116]:
df_filtered.dtypes

ram              int64
storage          int64
screen         float64
final_price    float64
dtype: object

In [117]:
df_filtered.dtypes == 'object'

ram            False
storage        False
screen         False
final_price    False
dtype: bool

### Question 1
#### There's one column with missing values. What is it?
* `'ram'`
* `'storage'`
* `'screen'` <---- 4 missing values
* `'final_price'`

In [118]:
# find which columns have missing values
df_filtered.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

### Question 2
#### What's the median (50% percentile) for variable `'ram'`?
- 8
- 16 <<--- Q2 value
- 24
- 32
### Question 2.b
#### What's the median (50% percentile) for variable `'horsepower'`?

In [None]:
df_filtered['ram'].median()

In [None]:
d

In [None]:
sns.histplot(df_filtered.ram, bins=20)

In [None]:
# * Shuffle the dataset (the filtered one you created above), use seed `42`.
# * Split your data in train/val/test sets, with 60%/20%/20% distribution.


### Question 3 
* We need to deal with missing values for the column from Q1.
* We have two options: fill it with 0 or with the mean of this variable.
* Try both options. For each, train a linear regression model without regularization using the code from the lessons.
* For computing the mean, use the training only!
* Use the validation dataset to evaluate the models and compare the RMSE of each option.
* Round the RMSE scores to 2 decimal digits using `round(score, 2)`
* Which option gives better RMSE?

Options:

- With 0
- With mean
- Both are equally good

In [None]:
# getting size of DataFrame
n = len(df_filtered) # 2160
print(f'df size: {n}')
n_val = int(n*0.2) 
n_test = int(n*0.2) 
n_train = n - int(n_val) - int(n_test)
print(f'train size: {n_train}, val size: {n_val}, test size: {n_test}')

In [None]:
# setting seed and shuffling data
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)

In [None]:
# shuffle DataFrames 
df_shuffled = df_filtered.iloc[idx]
df_shuffled.head()

In [None]:
df_train = df_shuffled.iloc[:n_train]
df_val = df_shuffled.iloc[idx[n_train:n_train+n_val]]
df_test = df_shuffled.iloc[idx[n_train+n_val:]]

In [None]:
df_train.head()

In [None]:
print(f'train size: {len(df_train)}\nval size: {len(df_val)}\ntest size: {len(df_test)}')

In [None]:
# dropping the indices 
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True) 
df_test = df_test.reset_index(drop=True)

In [None]:
y_train = np.log(df_train.final_price.values)
y_val = np.log(df_val.final_price.values)   
y_test = np.log(df_test.final_price.values)

In [None]:
# features we are using 
features = ['ram', 'storage', 'screen']

In [None]:
# calculation functions to answer 3-6
def rmse(y, y_pred):
    return np.sqrt(((y - y_pred) ** 2).mean())

# with regularization linear_regression
def linear_regression(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX_reg = XTX + reg
    XTX_inv = np.linalg.inv(XTX_reg)
    w = XTX_inv.dot(X.T).dot(y)
    return w[0], w[1:] #w0 is bias, w1 is weights

# without regularization linear_regression
def linear_regression_no_reg(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    return w[0], w[1:] #w0 is bias, w1 is weights

In [None]:
# comparing 0 vs mean for Question #3 ~ no regularization 
X_train_0 = df_train[features].fillna(0).values
X_val_0 = df_val[features].fillna(0).values

#training model with 0 fill
w0_0, w_0 = linear_regression_no_reg(X_train_0, y_train)
# predicting and evaluating
y_pred_0 = w0_0 + X_val_0.dot(w_0)
rmse_0 = round(rmse(y_val, y_pred_0),4)
print(f'RMSE with 0 fill: {rmse_0}')

# mean fill w/o regularization
mean_storage = df_train['storage'].mean()

X_train_mean = df_train[features].fillna(mean_storage).values
X_val_mean = df_val[features].fillna(mean_storage).values

w0_mean, w_mean = linear_regression_no_reg(X_train_mean, y_train)

y_pred_mean = w0_mean + X_val_mean.dot(w_mean) 
rmse_mean = round(rmse(y_val, y_pred_mean),4)
print(f'RMSE with mean fill: {rmse_mean}')