# Linear Regression

In [None]:
from sklearn import datasets
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import seaborn as sns
import numpy as np
import pandas as pd

In [None]:
diabetes = datasets.load_diabetes()

data = diabetes.data
target = diabetes.target
features=diabetes.feature_names

# train test split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.33, random_state=42)

# just for understand data
df = pd.DataFrame(X_train, columns=features)
df['target'] = y_train

In [None]:
from sklearn.decomposition import PCA

pca = PCA(2)

pca.fit(X_train)
X_reduc = pca.transform(X_train)

pca.explained_variance_ratio_

In [None]:
# import seaborn as sns

# sns.pairplot(df, x_vars=features, y_vars='target', height=7, aspect=.3, kind='reg')

Hàm mất mát của Linear Regression là:
$~~~\mathcal{L}(\mathbf{w,b}) = \frac{1}{2N}||\mathbf{y - (\bar{X}w} +b)||_2^2$

Đạo hàm của hàm mất mát là:
$~~~\nabla_{\mathbf{w}}\mathcal{L}(\mathbf{w,b}) = \frac{1}{N}\mathbf{\bar{X}}^T \mathbf{((\bar{X}w + b) - y)} ~~~~~(1)$

$~~~\nabla_{\mathbf{b}}\mathcal{L}(\mathbf{w,b}) = \frac{1}{N}\mathbf((\bar{X}w + b) - y) ~~~~~(2)$

Cập nhật gradient cho một biến: $~~~~~x_{t+1} = x_{t} - \eta f’(x_{t})$

=> Cập nhật gradient cho Weight và bias:
    $\mathbf{w} \leftarrow \mathbf{w} - \alpha\nabla_{w}~~~$;
    $~~~~~\mathbf{b} \leftarrow \mathbf{b} - \alpha\nabla_{b}$

In [None]:
class LinearRegression():
    # Constructor
    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None
    
    # Fit
    def fit(self,X, y):
        # init parameters
        n_samples, n_features = X.shape
        self.weights = np.random.rand(n_features)
        self.bias = 0
        threshold = 0.001
        
        # gradient descent
        for _ in range(self.n_iters):
            y_predicted = self.predict(X)
            dW = 1/n_samples*X.T@(y_predicted-y)
            dB = 1/n_samples*(y_predicted-y)
            if np.linalg.norm(dW,2) > threshold:
                self.weights -= self.lr*dW
                self.bias -= self.lr*np.sum(dB)
            else: break
        return self
    
    # Predict 
    def predict(self, X):
        return X@self.weights +self.bias
    
    # Evaluate by RMSE
    def rmse(self,y, y_predicted):
        return 1/(2*len(y))*np.sqrt(np.sum((y-y_predicted)**2))

In [None]:
model = LinearRegression()
model.fit(X_train,y_train)

y_predicted = model.predict(X_test)

model.rmse(y_test, y_predicted)

In [None]:
from sklearn.metrics import mean_squared_error
model_lib = linear_model.LinearRegression().fit(X_train,y_train)

y_predicted_lib = model_lib.predict(X_test)

model.rmse(y_test, y_predicted_lib)

# Cross Validation

In [53]:
from sklearn.model_selection import KFold
import numpy as np

In [None]:
# for i, (train_index, test_index) in enumerate(kf):
#     print(f"Fold {i}:")
#     print(f"  Train: index={train_index}")
#     print(f"  Test:  index={test_index}")

## Manual

In [78]:
n_folds = 5
models = []
models_rmse = []
n_samples,n_features = data.shape
for i in range(n_folds):
    test_index = np.array([index for index in range(i*(n_samples // n_folds),(i+1)*(n_samples // n_folds) if (i+1)*(n_samples // n_folds) < data.shape[0] else data.shape[0])])
    train_index = np.array(list(set([j for j in range(n_samples)]) - set(test_index)))
    model = LinearRegression()
    model.fit(data[train_index],target[train_index])

    y_predicted = model.predict(data[test_index])
    rmse = model.rmse(target[test_index], y_predicted)
    models.append(model)
    models_rmse.append(rmse)

In [80]:
np.average(np.array(models_rmse))

8.829932441584251