# lets implement Ridge regression from scratch


## import section

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
reg_dataset = datasets.make_regression(n_samples=1000, n_features=100, n_informative=10, n_targets=1, shuffle=True, random_state=42)
X = reg_dataset[0]
y = reg_dataset[1]

In [3]:
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size = .80,shuffle= True, random_state = 42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(800, 100)
(200, 100)
(800,)
(200,)


In [4]:
standardscalar = StandardScaler()
standardscalar.fit(X_train)
standardscalar.transform(X_train)
standardscalar.transform(X_test)


array([[ 0.75070067, -0.3481925 , -1.17378933, ..., -1.90940973,
         0.46984207,  0.62394679],
       [ 0.84896602,  0.31452426, -0.21710923, ...,  1.1251983 ,
         1.96654319,  0.40548378],
       [ 0.66993726,  1.23443248,  0.99119684, ..., -0.464584  ,
         1.02210749,  1.26164339],
       ...,
       [-0.62975098,  0.57138191,  1.33530621, ..., -1.29090787,
        -1.27575008,  0.22965224],
       [-0.84372868, -1.52121785,  0.47398936, ..., -0.86898062,
         1.74069389, -0.06974126],
       [ 0.29565289,  0.92703245,  0.36029547, ..., -0.17267686,
         0.77579379, -1.19959866]])

# lets write code for Ridge regression

In [5]:
class Ridge:

	def __init__(self, learning_rate = 1e-3, alpha = 1.0, max_iter = 1000):

		self.num_feats = int
		self.train_size = int
		self.weights = np.array 
		self.y_train = np.array 
		self.input_matrix = np.array

		self.learning_rate = learning_rate   #Learning rate for gradient descent
		self.alpha = alpha 	 #Regularization parameter, to control bias-variance tradeoff
		self.max_iter = max_iter 	#Number of iterations to run gradient descent
		self.cost_threshold = 0.1 * learning_rate  #stopping criterion for gradien descent

	def fit(self, X, y):

		"""
			Adjust weights to training data
		"""
		X = pd.DataFrame(X)
		self.train_size = X.shape[0]
		self.num_feats = X.shape[1]
		self.input_matrix = np.append(X, np.ones(self.train_size).reshape(-1, 1), axis = 1)   #Add Column with Ones for intercept term 
		self.y_train = y
		self.weights = np.zeros(self.num_feats + 1) #Extra +1 for the intercept


		#optimize weights
		prev_cost = float("inf")
		for i in range(self.max_iter):
			cost = self._update_weights()

			if i%100 ==0 or i == self.max_iter:
				print("Cost after {} iterations is: {}".format(i, cost))
			if abs(prev_cost -cost) < self.cost_threshold*prev_cost:
				print("Cost after {} iterations is: {}".format(i, cost))
				break
			prev_cost = cost

	def _update_weights(self):

		"""
			Cost Function:
				l(w) = (1/n) * (((y - wX)^2) + alpha * (w^2))
			Gradient:
				delta_w = dl/dw = (2/n)*( ((y - wX)*(-X)) + alpha * w) 
							
							 (or)
				delta_w = dl/dw = (2/n)*( ((wX - y)*(X)) + alpha * w)
			Gradient Descent:
				w = w - (learning_rate * delta_w)
		"""

		y_pred = (self.weights * self.input_matrix).sum(axis = 1)  # y_pred = wX

		cost = (1/self.train_size) * (((self.y_train - y_pred) ** 2).sum(axis = 0) + (self.alpha * (self.weights ** 2)).sum(axis = 0))  

		err = (y_pred - self.y_train).reshape(-1, 1)  # err = wX - y

		delta_w = (2/self.train_size) * (((err * self.input_matrix).sum(axis = 0)) + (self.alpha * self.weights)) #delta_w = (2/n)*( ((wX - y)*(X)) + alpha * w)

		self.weights = self.weights - (self.learning_rate * delta_w) 

		return cost


	def predict(self, X):

		""" Make predictions on given X using trained model """

		size = X.shape[0]
		X = np.append(X, np.ones(size).reshape(-1, 1), axis = 1)

		y_pred = (self.weights * X).sum(axis = 1)

		return y_pred 
		


## Lets fit the model and predict 

In [22]:
ridge_reg = Ridge(learning_rate = 1e-3, alpha = 20.0,max_iter = 3000)
ridge_reg.fit(X_train, y_train)


Cost after 0 iterations is: 24752.939027679462
Cost after 100 iterations is: 16129.251559871554
Cost after 200 iterations is: 10768.485590417115
Cost after 300 iterations is: 7377.921131501019
Cost after 400 iterations is: 5196.182002175037
Cost after 500 iterations is: 3768.2659043401313
Cost after 600 iterations is: 2818.16171291126
Cost after 700 iterations is: 2175.8647164479316
Cost after 800 iterations is: 1735.0394480378955
Cost after 900 iterations is: 1428.1435267070544
Cost after 1000 iterations is: 1211.6165840968051
Cost after 1100 iterations is: 1056.9418099497134
Cost after 1200 iterations is: 945.1770566501388
Cost after 1300 iterations is: 863.5621636371444
Cost after 1400 iterations is: 803.3849786227806
Cost after 1500 iterations is: 758.6205430146339
Cost after 1600 iterations is: 725.0514941715383
Cost after 1700 iterations is: 699.6919311024607
Cost after 1800 iterations is: 680.4051502773326
Cost after 1900 iterations is: 665.6468294933371
Cost after 2000 iteratio

In [23]:
print('Ridge Regression Model Coefficients (W): {}'.format(ridge_reg.weights[:-1]))
print('Ridge Regression Model Intercept (b): {}'.format(ridge_reg.weights[-1]))

Ridge Regression Model Coefficients (W): [ 4.65239599e+01  6.05294488e-02  3.53405665e-01 -1.55491626e+00
  1.04272654e+00 -3.40424221e-02 -1.42492912e-01 -1.04160183e-01
  4.55317201e-01 -8.01411620e-01 -9.09029108e-01 -3.43859871e-01
 -1.09765518e+00  4.61972024e-03  7.89072644e+01  4.97324473e-01
  1.02525566e+00  7.29387223e+01  5.00856882e-01  4.29212537e-01
  6.66839081e-01  2.61300682e-01  4.10154781e-01 -4.78135538e-01
 -4.47904812e-01 -8.46157519e-01  1.87447794e+00  5.86562208e-01
  7.85168667e-01  7.30232247e-01 -3.82361935e-01  6.11037854e-01
 -9.37802050e-01 -3.42972355e-01 -2.80974677e-01 -9.55984922e-01
 -4.69255212e-01 -8.50180645e-01  1.36964780e-01  1.21194893e-01
  8.01229736e-03 -2.96712251e-01  5.44024959e+01  2.68391478e-01
 -2.40261724e-01  6.03502013e-01 -2.34399973e-01 -7.80001374e-03
  1.17251294e+00  8.18815734e-02  3.21643785e-01 -2.18026226e-01
  5.24997494e-01 -1.07155064e+00 -9.13833888e-01  8.00289825e-01
  5.37578468e-01  2.74646835e+01  4.71292608e-01 

# lets see the MAPE of the model

In [24]:
#Evaluating Model through MAPE
print("\nMean Absolute Percentage Error(for train data): {}".format(mean_absolute_percentage_error(y_train, ridge_reg.predict(X_train))))
print("Mean Absolute Percentage Error(for test data): {}".format(mean_absolute_percentage_error(y_test, ridge_reg.predict(X_test))))


Mean Absolute Percentage Error(for train data): 0.12831811741648103
Mean Absolute Percentage Error(for test data): 0.18029095504445578
