In [1]:
import pandas as pd
import numpy as np

In [None]:
# URL of the raw laptops CSV; interpolated into the download command in the next cell.
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv'

In [None]:
!wget $data

In [None]:
# Read laptops.csv from the current working directory into a DataFrame.
df = pd.read_csv('laptops.csv')

In [None]:
# Normalize column names: lowercase, with spaces replaced by underscores.
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

In [None]:
# Display the frame to check the normalized column names.
df

In [None]:
# Show the dtype of every column.
df.dtypes

In [None]:
# Columns kept for the rest of the notebook: three numeric inputs plus final_price,
# which the later cells use as the regression target.
columns = ['ram', 'storage', 'screen', 'final_price']

In [None]:
# Keep only the selected columns. This rebinds `df`, so the full frame is only
# recoverable by re-running the read_csv cell.
df = df[columns]
df

### Question 1

In [None]:

# Count missing values per column.
df.isna().sum()

### Question 2

In [None]:


# Median of the `ram` column.
median_ram = df.ram.median()
median_ram

### Question 3

In [None]:
# Split sizes: 20% validation, 20% test, and the remainder (including any
# truncation left over from int()) goes to train.
n = len(df)
n_val = n_test = int(n * 0.2)
n_train = n - (n_val + n_test)

# Seed the global NumPy RNG, then shuffle the row positions in place.
np.random.seed(42)
index = np.arange(n)
np.random.shuffle(index)

In [None]:
# Cut the shuffled positions at the train/val and val/test boundaries and
# select the corresponding rows for each split.
df_train, df_val, df_test = (
    df.iloc[rows] for rows in np.split(index, [n_train, n_train + n_val])
)

In [None]:
# Drop the shuffled original index so every split is indexed 0..len-1.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [None]:
# Display the training split.
df_train

### Fill missing values with 0

In [None]:
# Count missing values per column in the training split.
df_train.isna().sum()

In [None]:
# Targets: log(1 + final_price) for each split.
y_train, y_val, y_test = (
    np.log1p(part.final_price.values) for part in (df_train, df_val, df_test)
)

In [None]:
def train_linear_regression(X, y):
    """Fit linear regression with the normal equation.

    A column of ones is prepended to `X` so the first solved coefficient
    acts as the bias term.

    Args:
        X: feature matrix with one row per sample.
        y: target vector with one entry per row of `X`.

    Returns:
        Tuple ``(w0, w)``: the bias coefficient and the array of the
        remaining per-feature coefficients.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    # Solve w = (X^T X)^-1 X^T y with an explicit inverse of the Gram matrix.
    gram_inv = np.linalg.inv(np.dot(design.T, design))
    coeffs = np.dot(np.dot(gram_inv, design.T), y)

    intercept, weights = coeffs[0], coeffs[1:]
    return intercept, weights
  

In [None]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between `y` and `y_pred`."""
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())

In [None]:
# Columns used as model inputs; read as a global by the prepare_X* helpers.
features = ['ram', 'storage', 'screen']

In [None]:
def prepare_X(df):
    """Build the feature matrix, filling missing `screen` values with 0.

    The input frame is not modified. The columns listed in the module-level
    `features` are returned as a NumPy array.
    """
    filled = df.assign(screen=df['screen'].fillna(0))
    return filled[features].values

In [None]:
# Build both zero-filled feature matrices up front.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# Fit on train, predict on validation.
w0, w = train_linear_regression(X_train, y_train)
y_pred = X_val.dot(w) + w0

# Validation RMSE, rounded to 2 decimals.
round(rmse(y_val, y_pred), 2)

### Fill missing values with mean

In [None]:
# Mean of `screen` computed on the training split only; read as a global by
# prepare_X_with_mean.
mean = df_train.screen.mean()
mean

In [None]:
def prepare_X_with_mean(df):
    """Build the feature matrix, filling missing `screen` values with `mean`.

    `mean` and `features` are module-level names. The input frame is not
    modified. The selected columns are returned as a NumPy array.
    """
    filled = df.assign(screen=df['screen'].fillna(mean))
    return filled[features].values

In [None]:
# Train on the mean-imputed training matrix.
X_train = prepare_X_with_mean(df_train)
w0, w = train_linear_regression(X_train, y_train)

# Validation must use the same imputation as training. Building it with
# prepare_X (zero fill) would score the mean-fill model on inputs prepared
# differently from the ones it was fit on.
X_val = prepare_X_with_mean(df_val)
y_pred = w0 + X_val.dot(w)

# Validation RMSE for the mean-fill option, rounded to 2 decimals.
round(rmse(y_val, y_pred), 2)

### Question 4

In [None]:
def train_linear_regression_reg(X, y,r=0.001):
    """Fit linear regression with the normal equation plus a ridge term.

    A column of ones is prepended to `X` for the bias, and `r` is added to
    every diagonal entry of X^T X (the bias entry included) before inverting.

    Args:
        X: feature matrix with one row per sample.
        y: target vector with one entry per row of `X`.
        r: regularization strength added to the diagonal.

    Returns:
        Tuple ``(w0, w)``: the bias coefficient and the array of the
        remaining per-feature coefficients.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    gram = np.dot(design.T, design)
    gram = gram + r * np.eye(gram.shape[0])

    gram_inv = np.linalg.inv(gram)
    coeffs = np.dot(np.dot(gram_inv, design.T), y)

    intercept, weights = coeffs[0], coeffs[1:]
    return intercept, weights
  

In [None]:
# The feature matrices do not depend on r, so build them once instead of on
# every iteration of the sweep.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# Fit one regularized model per r and report its bias and validation RMSE.
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    y_pred = w0 + X_val.dot(w)
    score = round(rmse(y_val, y_pred), 2)
    print(f"r:{r}, w0:{w0}, score: {score}")

### Question 5

In [None]:
def shuffle_and_split_data(seed_value):
    """Shuffle the rows of the global `df` and split them into three frames.

    Seeds the global NumPy RNG with `seed_value`, shuffles the row positions
    0..n-1, and cuts them using the module-level `n_train` and `n_val` sizes.
    Each returned frame has its index reset.

    Returns:
        Tuple ``(train, val, test)`` of DataFrames.
    """
    np.random.seed(seed_value)
    order = np.arange(n)
    np.random.shuffle(order)

    pieces = np.split(order, [n_train, n_train + n_val])
    train_part, val_part, test_part = (
        df.iloc[rows].reset_index(drop=True) for rows in pieces
    )
    return train_part, val_part, test_part
    

In [None]:
# Re-split with seeds 0..9, fit an unregularized model on each split, and
# collect the validation RMSE.
# NOTE(review): the targets here are raw final_price values, while the
# Question 3/4 cells use log1p targets. Confirm this difference is intended.
scores = []
for seed in range(10):
    df_train_1, df_val_1, df_test_1 = shuffle_and_split_data(seed)
    y_train_1, y_val_1, y_test_1 = (
        part.final_price.values for part in (df_train_1, df_val_1, df_test_1)
    )

    X_train_1, X_val_1 = prepare_X(df_train_1), prepare_X(df_val_1)
    w0, w = train_linear_regression(X_train_1, y_train_1)
    y_pred = w0 + X_val_1.dot(w)

    scores.append(rmse(y_val_1, y_pred))

# Spread of the validation RMSE across seeds.
round(np.std(scores), 3)

### Question 6

In [None]:
# Re-split with seed 9. This rebinds the df_* and y_* names used by earlier
# cells, and the targets are the raw final_price values (no log1p here).
df_train, df_val, df_test = shuffle_and_split_data(9)
y_train, y_val, y_test = (
    part.final_price.values for part in (df_train, df_val, df_test)
)

In [None]:
# Stack train and validation rows into one frame with a fresh 0..len-1 index.
df_full_train = pd.concat([df_train, df_val], ignore_index=True)

In [None]:
# Display the combined train + validation frame.
df_full_train

In [None]:
# Feature matrix for the combined frame; prepare_X fills missing `screen` with 0.
X_full_train = prepare_X(df_full_train)

In [None]:
# Targets in the same train-then-validation order as the rows of df_full_train.
y_full_train = np.concatenate([y_train, y_val])

In [None]:
# Fit the regularized model on train + validation and score it on the test split.
X_test = prepare_X(df_test)
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)
y_pred = X_test.dot(w) + w0
score = round(rmse(y_test, y_pred), 2)

In [None]:
# Test RMSE of the final model, rounded to 2 decimals.
score