In [2]:
! pip install ipynb

Collecting ipynb
  Downloading ipynb-0.5.1-py3-none-any.whl (6.9 kB)
Installing collected packages: ipynb
Successfully installed ipynb-0.5.1


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from ipynb.fs.full.songs_data_csv import train_features, train_label, test_features, test_label, songs_train

In [2]:
class LinearRegression:
    def fit(self, X, y):
        X_pinv = np.linalg.inv(X.T @ X) @ X.T
        self.w = X_pinv @ y
        return self

    def get_weight(self):
        return self.w

    def predict(self, X):
        return np.sum(self.w.ravel() * X, axis=1)

def model_rmse(y_test, y_pred):
    return np.sqrt(np.mean((y_test.ravel() - y_pred.ravel())**2))

In [3]:
model1 = LinearRegression().fit(train_features, train_label)

In [4]:
test_predict = model1.predict(test_features)
print(model_rmse(test_label, test_predict))
pd.DataFrame(test_predict, columns=['popularity']).head()

9.342984792844721


Unnamed: 0,popularity
0,66.608796
1,79.174602
2,79.751109
3,35.944587
4,47.880575


In [5]:
bias_train = np.hstack((np.ones((len(train_features), 1)), train_features))
bias_test = np.hstack((np.ones((len(test_features), 1)), test_features))

model2 = LinearRegression().fit(bias_train, train_label)

In [6]:
bias_predict = model2.predict(bias_test)
print(model_rmse(test_label, bias_predict))
pd.DataFrame(bias_predict, columns=['popularity']).head()

9.186977328430418


Unnamed: 0,popularity
0,66.378405
1,78.713158
2,78.882348
3,35.152727
4,47.373682


In [7]:
def train_each_feature_cross_validation(train, fold = 5):
    feature = {k: 0 for k in train.columns if k != 'popularity'}

    for train_split, test_split in model_selection.KFold(n_splits=fold, shuffle=True).split(train):
        for column in feature.keys():
            feature_train = np.array(train.iloc[train_split].loc[:,[column]])
            label_train = np.array(train.iloc[train_split].loc[:,['popularity']])

            feature_test = np.array(train.iloc[test_split].loc[:,[column]])
            label_test = np.array(train.iloc[test_split].loc[:,['popularity']])
                
            model = LinearRegression().fit(feature_train, label_train)
            pred = model.predict(feature_test)
            rmse = model_rmse(label_test, pred)

            feature[column] += rmse

    return {k: v/fold for k, v in feature.items()}

In [8]:
feature = train_each_feature_cross_validation(songs_train)
best_feature = min(feature, key=feature.get)
best_feature

'album popularity'

In [10]:
best_feature_train = train_features.loc[: best_feature]
best_feature_test = test_features.loc[: best_feature]

model3 = LinearRegression().fit(best_feature_train, train_label)
best_pred = model3.predict(best_feature_test)
print(model_rmse(test_label, best_pred))
pd.DataFrame(best_pred).head()

9.342984792844721


Unnamed: 0,0
0,66.608796
1,79.174602
2,79.751109
3,35.944587
4,47.880575


In [1]:
# GRADIENT DESCENT
def grad(w, one_X, y):
    N = one_X.shape[0]
    return 1/N * one_X.T.dot(one_X @ w - y)

def cost(w, one_X, y):
    N = one_X.shape[0]
    return .5/N*np.linalg.norm(y - one_X @ w, 2)**2

In [12]:
def add(a, b):
    return a+b

def multiply(a, b):
    return a*b

def perform(a, b, op1, op2):
    return op2(op1(a, b), a)

perform(3, 5, add, multiply)

24