# Imports

In [1]:
import sys
import os
sys.path.append("../")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from fscoreai.utils import 
from fscoreai.loss import loss
from sample_data import sample_data

ImportError: cannot import name 'utils' from 'fscoreai.utils' (/home/sardarchitect/repos/fscoreai-ml/exploration/../fscoreai/utils.py)

In [None]:
from sklearn.preprocessing import normalize
from sklearn.linear_model import LinearRegression

# One-Dimensional Data

In [None]:
# Fetch the simulated linear dataset from the project's sample_data helper
# and print the shapes of the feature array and the target array.
X, y = sample_data.simulate_linear_data()
print(X.shape, y.shape)

In [None]:
# Quick visual check of the raw data: X on the horizontal axis, y on the vertical.
plt.scatter(X, y)

## Simple Linear Regression with scikit-learn

In [None]:
# Reference model: scikit-learn's least-squares fit with an intercept term.
# `fit` returns the estimator itself, so construction and fitting are chained.
model_sklearn = LinearRegression(fit_intercept=True).fit(X, y)
print(model_sklearn.coef_, model_sklearn.intercept_)

In [None]:
# Shape of the array scikit-learn returns when predicting on the training inputs.
model_sklearn.predict(X).shape

## Custom Simple Linear Regression

In [None]:
class SimpleLinearRegression():
    """Least-squares straight-line fit with two interchangeable solvers.

    Attribute names follow scikit-learn's ``LinearRegression``: ``coef_``
    holds the slope(s) and ``intercept_`` the offset. Both are ``0`` until
    one of the ``fit_*`` methods has been called.
    """

    def __init__(self):
        self.coef_ = 0
        self.intercept_ = 0

    @staticmethod
    def _as_2d(X):
        """Return ``X`` as a float array of shape (n_samples, n_features)."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            # A bare 1-D feature vector is treated as a single column.
            X = X.reshape(-1, 1)
        return X

    def fit_stat(self, X, y):
        """Fit with the closed-form solution.

        Each column of ``X`` gets the slope ``cov(x, y) / var(x)`` and the
        intercept is ``mean(y) - slope * mean(x)``.

        Args:
            X: Features, shape (n_samples, n_features) or (n_samples,).
            y: Targets, shape (n_samples,) or (n_samples, 1).

        Returns:
            self, with ``coef_`` and ``intercept_`` set (one entry per column
            of ``X``).
        """
        X = self._as_2d(X)
        # Flatten the target so a column-shaped y cannot broadcast against
        # the (n_features, n_samples) transpose below.
        y = np.asarray(y, dtype=float).ravel()

        X_mean = np.mean(X, axis=0)
        y_mean = np.mean(y)
        X_centered = X - X_mean

        covariance = np.sum(X_centered.T * (y - y_mean), axis=1)
        variance = np.sum(X_centered ** 2, axis=0)
        self.coef_ = covariance / variance
        self.intercept_ = y_mean - (self.coef_ * X_mean)
        return self

    def fit_grad(self, X, y, lr=1e-8, epochs=50):
        """Fit with batch gradient descent on the mean squared error.

        Training starts from the current ``coef_`` / ``intercept_`` (zero for
        a fresh model), so calling this method again continues the descent.

        Args:
            X: Features, shape (n_samples, n_features) or (n_samples,).
            y: Targets, shape (n_samples,) or (n_samples, 1).
            lr: Learning rate (step size).
            epochs: Number of full-batch update steps.

        Returns:
            self, with ``coef_`` of shape (n_features,) and a scalar
            ``intercept_``.

        Raises:
            ValueError: If ``X`` is empty, ``X`` and ``y`` disagree on the
                number of samples, or the current parameters cannot be used
                as a starting point for ``X``.
        """
        X = self._as_2d(X)
        # Flatten the target: subtracting an (n, 1) prediction from an (n,)
        # target would broadcast to an (n, n) matrix and corrupt the gradient.
        y = np.asarray(y, dtype=float).ravel()

        self.lr = lr  # Learning rate
        self.epochs = epochs
        self.n, self.d = X.shape
        if self.n == 0:
            raise ValueError("fit_grad needs at least one sample")
        if y.shape[0] != self.n:
            raise ValueError(
                f"X has {self.n} samples but y has {y.shape[0]}"
            )

        # Work on a (d,) coefficient vector so predictions stay 1-D.
        # broadcast_to raises ValueError if the stored shape is incompatible.
        coef = np.broadcast_to(
            np.asarray(self.coef_, dtype=float).ravel(), (self.d,)
        ).copy()
        start_intercept = np.asarray(self.intercept_, dtype=float).ravel()
        if start_intercept.size != 1:
            raise ValueError("intercept_ must hold a single value to resume training")
        intercept = float(start_intercept[0])

        # d(MSE)/d(coef) = -(2/n) * X^T (y - y_pred); d(MSE)/d(intercept) =
        # -(2/n) * sum(y - y_pred). Descending means adding lr * (2/n) * (...).
        step = 2.0 * self.lr / self.n
        for _ in range(self.epochs):
            residual = y - (X @ coef + intercept)
            coef += step * (X.T @ residual)
            intercept += step * residual.sum()

        self.coef_ = coef
        self.intercept_ = intercept
        return self

    def predict(self, X):
        """Return ``X @ coef_ + intercept_`` for the given features."""
        return np.dot(self._as_2d(X), self.coef_) + self.intercept_

In [None]:
# Using Closed-form Solution
# Analytic fit: fit_stat returns the model itself, so build and fit in one step.
model_fscore_1 = SimpleLinearRegression().fit_stat(X, y)
print(model_fscore_1.coef_, model_fscore_1.intercept_)

In [None]:
# Using Batch Gradient Descent Solution
# Iterative fit: 10000 full-batch gradient steps with a learning rate of 1e-6.
model_fscore_2 = SimpleLinearRegression().fit_grad(X, y, lr=1e-6, epochs=10000)
print(model_fscore_2.coef_, model_fscore_2.intercept_)

## Plotting the models

In [None]:
# Predictions of the three fitted models on the training inputs.
preds_sklearn, preds_fscore_1, preds_fscore_2 = (
    fitted.predict(X)
    for fitted in (model_sklearn, model_fscore_1, model_fscore_2)
)

# Raw data as points, each model's fit as a line in its own colour.
plt.figure(figsize=(10, 4))
plt.scatter(X, y)
line_styles = (
    (preds_sklearn, 'red'),
    (preds_fscore_1, 'yellow'),
    (preds_fscore_2, 'green'),
)
for line_values, line_color in line_styles:
    plt.plot(X, line_values, color=line_color)

plt.show()

# Multi-Dimensional Data

In [None]:
# Load the diabetes dataset through the project's sample_data helper.
# The following cells use X as a pandas DataFrame (.drop / .columns) and call
# .to_numpy() on both X and y.
X, y = sample_data.diabetes_data()

In [None]:
# Leave the 'sex' column out. X itself is reassigned, so every later cell
# works with the reduced feature set as well.
X = X.drop(columns='sex')
features = X.columns

# One scatter panel (feature vs. target) per grid position, filled row by row.
fig, axs = plt.subplots(3, 3)
fig.suptitle('Diabetes Dataset')
for n, panel in enumerate(axs.flat):
    feature = features[n]
    panel.scatter(X[feature], y, s=1)
    panel.set_xlabel(feature)
    panel.set_ylabel('target')
plt.tight_layout()
plt.show()

## Multiple Linear Regression with SkLearn

In [None]:
# Switch from pandas objects to plain NumPy arrays; the code below relies on
# X.shape and positional column indexing such as X[:, n].
X, y = X.to_numpy(), y.to_numpy()

In [None]:
# SkLearn for Comparision
model_sklearn = LinearRegression(fit_intercept=True)
model_sklearn.fit(X,y)
print("Coef_:\n", model_sklearn.coef_)
print("\nIntercept_:\n", model_sklearn.intercept_)
print("\nMSE:\n",loss.mean_squared_error(y, model_sklearn.predict(X)))

## Custom Multiple Linear Regression

In [None]:
class MultipleLinearRegression():
    """Linear regression on several features with two solvers.

    The fitted parameters live in ``betas`` (intercept first, then one
    weight per feature); ``intercept_`` and ``coef_`` are views of it named
    after scikit-learn's attributes. ``fit_grad`` additionally records, per
    iteration, the gradient, the parameters and the loss in the
    ``*_history`` dictionaries.
    """

    def __init__(self):
        self.betas = None
        self.coef_ = None
        self.intercept_ = None

        self.betas_history = {}
        self.loss_history = {}
        self.gradients_history = {}

    @staticmethod
    def _with_bias_column(X):
        """Prepend a column of ones to ``X`` so ``betas[0]`` acts as the intercept."""
        X = np.asarray(X, dtype=float)
        return np.hstack((np.ones((X.shape[0], 1)), X))

    def fit_stat(self, X, y):
        """Fit with the closed-form least-squares solution.

        Args:
            X: Features, shape (n_samples, n_features).
            y: Targets, shape (n_samples,) or (n_samples, 1).

        Returns:
            self, with ``betas`` shaped like ``y`` (1-D for a 1-D target,
            column-shaped for a column target).
        """
        X = np.asarray(X, dtype=float)
        self.n, self.d = X.shape
        x = self._with_bias_column(X)

        # Solve the least-squares problem directly instead of inverting
        # x^T x: explicit inversion fails (or returns garbage) when features
        # are collinear, while lstsq returns the minimum-norm solution.
        self.betas, *_ = np.linalg.lstsq(x, np.asarray(y, dtype=float), rcond=None)

        self.intercept_ = self.betas[0]
        self.coef_ = self.betas[1:]
        return self

    def fit_grad(self, X, y, lr, iters, regularization=False):
        """Fit with batch gradient descent from a random starting point.

        The step uses ``2 * x^T (y_pred - y)``, i.e. the gradient of the
        *sum* of squared errors (not divided by the sample count), so
        workable learning rates shrink as the dataset grows.

        Args:
            X: Features, shape (n_samples, n_features).
            y: Targets, shape (n_samples,) or (n_samples, 1).
            lr: Learning rate (step size).
            iters: Number of full-batch update steps.
            regularization: When true, adds ``0.1 * betas`` to the gradient.
                The term covers every entry of ``betas``, intercept included.

        Returns:
            self. ``betas`` is 1-D when ``y`` was 1-D and column-shaped
            otherwise, matching ``fit_stat``; the history dictionaries hold
            one column-shaped entry per iteration of this run.
        """
        reg = 0.1 if regularization else 0

        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)  # also accepts lists / pandas Series
        flat_target = y.ndim == 1
        y = y.reshape(-1, 1)
        self.n, self.d = X.shape
        x = self._with_bias_column(X)

        # Start each run with empty logs; otherwise a shorter run would keep
        # stale tail entries from a previous, longer one.
        self.betas_history = {}
        self.loss_history = {}
        self.gradients_history = {}

        # Initialize betas
        self.betas = np.random.randn(self.d + 1, 1)

        for i in range(iters):
            y_pred = np.dot(x, self.betas)
            delta_betas = 2 * np.dot(x.T, (y_pred - y)) + reg * self.betas
            self.betas -= (lr * delta_betas)

            # Logging: gradient used for this step, parameters after the
            # step, and the loss of the predictions made before the step.
            self.gradients_history[i] = delta_betas
            self.betas_history[i] = self.betas.copy()  # copy: betas is updated in place
            self.loss_history[i] = loss.mean_squared_error(y_pred, y)

        if flat_target:
            # Return 1-D parameters for a 1-D target so predict() yields
            # shape (n,) and lines up element-wise with y.
            self.betas = self.betas.ravel()

        self.intercept_ = self.betas[0]
        self.coef_ = self.betas[1:]
        return self

    def predict(self, X):
        """Return predictions for ``X`` using the fitted ``betas``."""
        return np.dot(self._with_bias_column(X), self.betas)

In [None]:
# Closed-form solution
# Closed-form fit of the custom model; fit_stat hands back the model itself.
model_fscore_1 = MultipleLinearRegression().fit_stat(X, y)

# Printed in order: coefficients, intercept, training-set MSE.
closed_form_report = (
    ("Coef_:\n", model_fscore_1.coef_),
    ("\nIntercept_:\n", model_fscore_1.intercept_),
    ("\nMSE:\n", loss.mean_squared_error(y, model_fscore_1.predict(X))),
)
for heading, value in closed_form_report:
    print(heading, value)

# Linear Regression Batch Gradient Descent Analysis

In [None]:
# Gradient-descent fit without the penalty term: 100000 full-batch steps at lr=1e-8.
model = MultipleLinearRegression().fit_grad(
    X, y, lr=1e-8, iters=100000, regularization=False
)

# Printed in order: coefficients, intercept, training-set MSE.
gradient_report = (
    ("Coef_:\n", model.coef_),
    ("\nIntercept_:\n", model.intercept_),
    ("\nMSE:\n", loss.mean_squared_error(y, model.predict(X))),
)
for heading, value in gradient_report:
    print(heading, value)

In [None]:
# Stack the per-iteration logs recorded by fit_grad into arrays whose first
# axis is the iteration index.
model_betas = np.array(list(model.betas_history.values()))
model_gradients = np.array(list(model.gradients_history.values()))
model_loss = np.array(list(model.loss_history.values()))

# Gradients first, then parameters: a 3x3 grid with one panel per entry,
# filled row by row.
history_figures = (
    ('Model Gradients vs. Iterations', model.gradients_history, model_gradients),
    ('Model Parameters vs. Iterations', model.betas_history, model_betas),
)
for figure_title, history, stacked in history_figures:
    fig, axs = plt.subplots(3, 3, constrained_layout=True)
    fig.suptitle(figure_title)
    for n, panel in enumerate(axs.flat):
        panel.plot(history.keys(), stacked[:, n])
    plt.show()

# Loss curve over the same iterations.
plt.plot(model.loss_history.keys(), model_loss)
plt.title("Loss vs. Iterations")
plt.show()

In [None]:
# Overlay the model's predictions on the data, one panel per feature column.
fitted_values = model.predict(X)  # the same predictions are reused in every panel

fig, axs = plt.subplots(3, 3)
fig.suptitle('Diabetes Dataset')
for n, panel in enumerate(axs.flat):
    column = X[:, n]
    panel.scatter(column, y, s=1)
    panel.scatter(column, fitted_values, s=1)
plt.tight_layout()
plt.show()