In [None]:
! pip install ipynb

In [3]:
import sys
sys.path.insert(0, '../../data_exploration/components/')
sys.path.insert(0, '../../data')

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from ipynb.fs.full.songs_data_exploration import songs_model_df

In [12]:
songs_train = songs_model_df.iloc[300:]
songs_test = songs_model_df.iloc[:300]

train_features = songs_train.iloc[:, :-1]
train_label = songs_train.iloc[:, -1]
test_features = songs_test.iloc[:, :-1]
test_label = songs_test.iloc[:, -1]

In [7]:
train_features, train_label, test_features, test_label = model_selection.train_test_split(songs_model_df.iloc[:, :-1], songs_model_df.iloc[:, -1], test_size = 0.1)

In [8]:
class LinearRegression:
    def fit(self, X, y):
        X_pinv = np.linalg.inv(X.T @ X) @ X.T
        self.w = X_pinv @ y
        return self

    def get_weight(self):
        return self.w

    def predict(self, X):
        return np.sum(self.w.ravel() * X, axis=1)

def model_rmse(y_test, y_pred):
    return np.sqrt(np.mean((y_test.ravel() - y_pred.ravel())**2))

In [13]:
model1 = LinearRegression().fit(train_features, train_label)

In [14]:
test_predict = model1.predict(test_features)
print(model_rmse(test_label, test_predict))
pd.DataFrame(test_predict, columns=['popularity']).head()

8.995476942809034


Unnamed: 0,popularity
0,64.144658
1,77.160063
2,46.891743
3,27.602482
4,66.54071


In [15]:
bias_train = np.hstack((np.ones((len(train_features), 1)), train_features))
bias_test = np.hstack((np.ones((len(test_features), 1)), test_features))

model2 = LinearRegression().fit(bias_train, train_label)

In [16]:
bias_predict = model2.predict(bias_test)
print(model_rmse(test_label, bias_predict))
pd.DataFrame(bias_predict, columns=['popularity']).head()

9.057773328247688


Unnamed: 0,popularity
0,64.123941
1,77.27317
2,47.090455
3,27.471317
4,66.742759


In [17]:
def train_each_feature_cross_validation(train, fold = 5):
    feature = {k: 0 for k in train.columns if k != 'popularity'}

    for train_split, test_split in model_selection.KFold(n_splits=fold, shuffle=True).split(train):
        for column in feature.keys():
            feature_train = np.array(train.iloc[train_split].loc[:,[column]])
            label_train = np.array(train.iloc[train_split].loc[:,['popularity']])

            feature_test = np.array(train.iloc[test_split].loc[:,[column]])
            label_test = np.array(train.iloc[test_split].loc[:,['popularity']])
                
            model = LinearRegression().fit(feature_train, label_train)
            pred = model.predict(feature_test)
            rmse = model_rmse(label_test, pred)

            feature[column] += rmse

    return {k: v/fold for k, v in feature.items()}

In [18]:
feature = train_each_feature_cross_validation(songs_train)
best_feature = min(feature, key=feature.get)
best_feature

'album_popularity'

In [19]:
best_feature_train = train_features.loc[: best_feature]
best_feature_test = test_features.loc[: best_feature]

model3 = LinearRegression().fit(best_feature_train, train_label)
best_pred = model3.predict(best_feature_test)
print(model_rmse(test_label, best_pred))
pd.DataFrame(best_pred).head()

8.995476942809034


Unnamed: 0,0
0,64.144658
1,77.160063
2,46.891743
3,27.602482
4,66.54071


In [20]:
from sklearn.linear_model import LinearRegression as lr 

model4 = lr().fit(train_features, train_label)
pred = model4.predict(test_features)
assert round(model_rmse(test_label, pred), 11) == round(model_rmse(test_label, bias_predict), 11)
model_rmse(test_label, pred), model_rmse(test_label, bias_predict)

(9.057773328247656, 9.057773328247688)

In [None]:
# GRADIENT DESCENT
def grad(w, one_X, y):
    N = one_X.shape[0]
    return 1/N * one_X.T.dot(one_X @ w - y)

def cost(w, one_X, y):
    N = one_X.shape[0]
    return .5/N*np.linalg.norm(y - one_X @ w, 2)**2