# Homework

##### Dataset

In this homework, we will use the Laptops price dataset from Kaggle.

The goal of this homework is to create a regression model for predicting the prices (column 'Final Price').

In [11]:
import pandas as pd
import numpy as np

In [12]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv')
df.head()

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


First, we'll normalize the names of the columns:

In [14]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

Now, instead of 'Final Price', we have 'final_price'.

Next, use only the following columns:
- 'ram',
- 'storage',
- 'screen',
- 'final_price'

In [15]:
base = ['ram', 'storage', 'screen', 'final_price']
df_num = df[base]

Look at the final_price variable. Does it have a long tail?

In [16]:
print(df_num[['final_price']].tail())

      final_price
2155      2699.99
2156      2899.99
2157      3399.99
2158      1899.99
2159      1699.99


#### Question 1
There's one column with missing values. What is it?
1) 'ram'
2) 'storage'
3) **'screen'**
4) 'final_price'

In [17]:
print("Columns with NaN: ", df_num.isnull().any()) 

Columns with NaN:  ram            False
storage        False
screen          True
final_price    False
dtype: bool


#### Question 2
What's the median (50% percentile) for variable 'ram'?
1) 8
2) **16**
3) 24
4) 32

In [18]:
print(df_num.ram.median())

16.0


##### Prepare and split the dataset
Shuffle the dataset (the filtered one you created above), use seed 42.
Split your data in train/val/test sets, with 60%/20%/20% distribution.
Use the same code as in the lectures

In [19]:
np.random.seed(42)

n = len(df_num)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df_num.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

y_train_orig = df_train.final_price.values
y_val_orig = df_val.final_price.values
y_test_orig = df_test.final_price.values


y_train = np.log1p(df_train.final_price.values)
y_val = np.log1p(df_val.final_price.values)
y_test = np.log1p(df_test.final_price.values)

df_train_def = df_train.drop(columns=['final_price'])
df_val_def = df_val.drop(columns=['final_price'])
df_test_def = df_test.drop(columns=['final_price'])

#### Question 3
- We need to deal with missing values for the column from Q1.
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization using the code from the lessons.
- For computing the mean, use the training only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)

Which option gives better RMSE?
1) With 0
2) With mean
3) **Both are equally good**

In [20]:
train_mean = df_train_def.mean(axis=0)
screen_mean = train_mean['screen']


def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

def prepare_X(df, fillable):
    df_num = df.fillna(fillable)
    X = df_num.values
    return X

def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

print('===== fillna with mean =====')
X_train = prepare_X(df_train_def,screen_mean )
w_0, w = train_linear_regression(X_train, y_train)

y_pred = w_0 + X_train.dot(w)

print('rmse train', round(rmse(y_train, y_pred),2))

X_val = prepare_X(df_val_def, screen_mean)
y_pred_val = w_0 + X_val.dot(w)

print('rmse validation', round(rmse(y_val, y_pred_val),2))
print('=========================')

print('===== fillna with 0 =====')
X_train = prepare_X(df_train_def,0 )
w_0, w = train_linear_regression(X_train, y_train)

y_pred = w_0 + X_train.dot(w)

print('rmse train', round(rmse(y_train, y_pred),2))

X_val = prepare_X(df_val_def, 0)
y_pred_val = w_0 + X_val.dot(w)

print('rmse validation', round(rmse(y_val, y_pred_val),2))
print('=========================')

===== fillna with mean =====
rmse train 0.45
rmse validation 0.43
===== fillna with 0 =====
rmse train 0.45
rmse validation 0.43


#### Question 4
- Now let's train a regularized linear regression.
- For this question, fill the NAs with 0.
- Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100].
- Use RMSE to evaluate the model on the validation dataset.
- Round the RMSE scores to 2 decimal digits.

Which r gives the best RMSE? If there are multiple options, select the smallest r.
1) 0
2) **0.01**
3) 1
4) 10
5) 100

In [23]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]


w_0, w = train_linear_regression_reg(X_train, y_train)

y_pred = w_0 + X_train.dot(w)


X_train = prepare_X(df_train_def,0 )
X_val = prepare_X(df_val_def,0)

for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val.dot(w)
    #print('%6s' %r, round(rmse(y_val, y_pred),2))
    print('%6s' %r, rmse(y_val, y_pred))

     0 0.4288635315474407
  0.01 0.4288516360534068
   0.1 0.42877560097076367
     1 0.43059329897434
     5 0.46320952575805846
    10 0.5075657769236878
   100 0.6735026061725873


#### Question 5
- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3))

What's the value of std?
1) 19.176
2) 29.176
3) 39.176
4) 49.176

Note: Standard deviation shows how different the values are. If it's low, then all values are approximately the same. If it's high, the values are different. If standard deviation of scores is low, then our model is stable.

In [24]:
base = ['ram', 'storage', 'screen', 'final_price']

df_num = df[base]

rmse_value = np.ones(10)

for s in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:

    np.random.seed(s)

    n = len(df_num)

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    np.random.shuffle(idx)

    df_shuffled = df_num.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()

    y_train_orig = df_train.final_price.values
    y_val_orig = df_val.final_price.values
    y_test_orig = df_test.final_price.values

    y_train = np.log1p(df_train.final_price.values)
    y_val = np.log1p(df_val.final_price.values)
    y_test = np.log1p(df_test.final_price.values)

    df_train_def = df_train.drop(columns=['final_price'])
    df_val_def = df_val.drop(columns=['final_price'])
    df_test_def = df_test.drop(columns=['final_price'])



    X_train = prepare_X(df_train_def,0 )
    w_0, w = train_linear_regression(X_train, y_train)
    y_pred = w_0 + X_train.dot(w)


    X_val = prepare_X(df_val_def, 0)
    y_pred_val = w_0 + X_val.dot(w)
    print('rmse validation', round(rmse(y_val, y_pred_val),4))
    
    rmse_value[s]=rmse(y_val, y_pred_val)
    standard_dev =round(np.std(rmse_value),3)
    
    print('std dev at ',s,': ',standard_dev)
    print(rmse_value)
    print('=========================')

rmse validation 0.43
std dev at  0 :  0.171
[0.43004216 1.         1.         1.         1.         1.
 1.         1.         1.         1.        ]
rmse validation 0.4521
std dev at  1 :  0.224
[0.43004216 0.45207945 1.         1.         1.         1.
 1.         1.         1.         1.        ]
rmse validation 0.4463
std dev at  2 :  0.255
[0.43004216 0.45207945 0.44626256 1.         1.         1.
 1.         1.         1.         1.        ]
rmse validation 0.4465
std dev at  3 :  0.273
[0.43004216 0.45207945 0.44626256 0.44645156 1.         1.
 1.         1.         1.         1.        ]
rmse validation 0.42
std dev at  4 :  0.281
[0.43004216 0.45207945 0.44626256 0.44645156 0.41998054 1.
 1.         1.         1.         1.        ]
rmse validation 0.4255
std dev at  5 :  0.276
[0.43004216 0.45207945 0.44626256 0.44645156 0.41998054 0.42545763
 1.         1.         1.         1.        ]
rmse validation 0.4488
std dev at  6 :  0.258
[0.43004216 0.45207945 0.44626256 0.44645156

#### Question 6
- Split the dataset like previously, use seed 9.
- Combine train and validation datasets.
- Fill the missing values with 0 and train a model with r=0.001.

What's the RMSE on the test dataset?
1) 598.60
2) 608.60
3) 618.60
4) 628.60

In [27]:
np.random.seed(9)

n = len(df_num)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df_num.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

y_train_orig = df_train.final_price.values
y_val_orig = df_val.final_price.values
y_test_orig = df_test.final_price.values

y_train = np.log1p(df_train.final_price.values)
y_val = np.log1p(df_val.final_price.values)
y_test = np.log1p(df_test.final_price.values)

df_train_def = df_train.drop(columns=['final_price'])
df_val_def = df_val.drop(columns=['final_price'])
df_test_def = df_test.drop(columns=['final_price'])

    
df_full_train = pd.concat([df_train_def,df_val_def])
y_full_train = np.concatenate([y_train,y_val])
    
X_train = prepare_X(df_full_train,0)
w_0, w = train_linear_regression_reg(X_train, y_full_train, r=0.001)

X_test = prepare_X(df_test_def,0)
y_pred = w_0 + X_test.dot(w)

    
print('rmse (test)', round(rmse(y_test, y_pred),4))


rmse (test) 0.4553
