<a href="https://colab.research.google.com/github/pudgyhauscat/beginner_linear_regression/blob/main/Beginner_Linear_Regression_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
#below is code for a linear regression on a generated data set. the noiseless 
#target is y = 2 + 3*x_1 + x_2 - 6*x_3 and the noisy target is 
#y = 2 + 3*x_1 + x_2 - 6*x_3 + u, where u is random noise. 

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

#initialize three independent variables. each variable has n_observations (100)
#integer observations drawn uniformly from 1 to 19 inclusive (np.random.randint
#excludes the `high` bound, so high = 20 means the largest value is 19). all of
#this info can be customized.

n_observations = 100

x_1 = np.random.randint(low=1, high=20, size=n_observations)
x_2 = np.random.randint(low=1, high=20, size=n_observations)
x_3 = np.random.randint(low=1, high=20, size=n_observations)

#calculate the dependent variable from the independent variables and the
#intercept: y = 2 + 3*x_1 + x_2 - 6*x_3. numpy arithmetic is element-wise, so
#the arrays are combined directly instead of walking them with an index.
#the intermediate names are kept so each term of the equation can be inspected.

linear_y = 3 * x_1
linear_y_2 = linear_y + x_2
linear_y_3 = linear_y_2 - 6 * x_3 + 2

#we have the option to add noise to the model. here we opt to add normally
#distributed noise. later we'll use the dependent variable with noise and the
#dependent variable without noise to compare outcomes.
#note: the noise is drawn with mean 10 and standard deviation 3, not mean 0,
#so a regression on the noisy target absorbs that offset into its intercept
#(expect an intercept near 2 + 10 rather than 2).

noise = np.random.normal(10, 3, n_observations)
linear_y_3_with_noise = linear_y_3 + noise


In [53]:
#build the design matrix X_0 by placing the three independent variables side 
#by side, one column per variable and one row per observation. linear 
#regression can be written as Ax = b: X_0 plays the role of A, the dependent 
#variable is b, and the fitted intercept and coefficients are the x we want. 
#once the data is laid out like this, training is a single call. worth noting: 
#sk_learn's regression model solves this with a singular value decomposition, 
#which is computationally different than other methods we might use. 

X_0 = np.column_stack((x_1, x_2, x_3))
sk_learn_regression_noiseless = LinearRegression().fit(X_0, linear_y_3)
(sk_learn_regression_noiseless.intercept_, sk_learn_regression_noiseless.coef_)

(2.000000000000014, array([ 3.,  1., -6.]))

In [54]:
#same design matrix X_0 as above, but the target is now linear_y_3_with_noise 
#instead of linear_y_3. the noise changes the outcome of the regression: the 
#fitted intercept and coefficients no longer match the true parameters exactly. 

sk_learn_regression_with_noise = LinearRegression().fit(X_0, linear_y_3_with_noise)
(sk_learn_regression_with_noise.intercept_, sk_learn_regression_with_noise.coef_)

(14.382809971178743, array([ 2.96997359,  0.93850205, -6.10174819]))

In [55]:
#generally we'd split the data into a train set and a test set. here we split the
#observations (X_0) and the noiseless target (linear_y_3) with train_test_split.
#train_test_split returns its pieces in the order
#(X_train, X_test, y_train, y_test), so the names below must follow that order;
#with test_size = .2 the train set gets 80% of the rows and the test set 20%.
#then we create a LinearRegression() and fit it on the train set only.

x_train_noiseless, x_test_noiseless, y_train_noiseless, y_test_noiseless = train_test_split(
    X_0, linear_y_3, test_size=.2
)

sk_learn_regression_with_split_noiseless = LinearRegression()
sk_learn_regression_with_split_noiseless.fit(x_train_noiseless, y_train_noiseless)


LinearRegression()

In [56]:
#with the noiseless data, fitting on the training set alone still gives us the
#intercept and coefficients of the generating equation

(
    sk_learn_regression_with_split_noiseless.intercept_,
    sk_learn_regression_with_split_noiseless.coef_,
)

(2.000000000000007, array([ 3.,  1., -6.]))

In [57]:
#we use the model built on the train set to make predictions on the test set.
#we score them with mean squared error, the standard error measure for linear
#regression, and get a very small error. mean_squared_error is documented as
#taking (y_true, y_pred), so the true test targets go first.

predictions_with_split_noiseless = sk_learn_regression_with_split_noiseless.predict(x_test_noiseless)

metrics.mean_squared_error(y_test_noiseless, predictions_with_split_noiseless)

2.7544064581923155e-28

In [58]:
#here is the split and the model training process for the regression with noise.
#train_test_split returns (X_train, X_test, y_train, y_test) in that order, so
#the names are unpacked train-first; with test_size = .2 the model is fitted on
#80% of the rows and the remaining 20% are held out for testing.

x_train_noise, x_test_noise, y_train_noise, y_test_noise = train_test_split(
    X_0, linear_y_3_with_noise, test_size=.2
)

sk_learn_regression_with_split_noise = LinearRegression()
sk_learn_regression_with_split_noise.fit(x_train_noise, y_train_noise)


LinearRegression()

In [59]:
#with the noisy data, the model fitted on the training set does not recover the
#generating intercept and coefficients exactly

(
    sk_learn_regression_with_split_noise.intercept_,
    sk_learn_regression_with_split_noise.coef_,
)

(12.436620787837544, array([ 3.1072658 ,  0.96871125, -6.0698519 ]))

In [60]:
#since we didn't find the exact parameters we're left with less accurate predictions.
#this is much more likely than the noiseless outcome above. train_test_split was called
#without a random_state, so it won't split the data the same way on every run; the model
#will have varying coefficients and varying errors. in practice we'd want to find the best
#model that doesn't overfit or underfit.
#mean_squared_error is documented as taking (y_true, y_pred), so the true test targets go first.

predictions_with_split_noise = sk_learn_regression_with_split_noise.predict(x_test_noise)

metrics.mean_squared_error(y_test_noise, predictions_with_split_noise)

8.527457668676409

In [61]:
#the data generation step included above can be fit into a class and ported out of this program
#this will be useful, because we're going to use the same data generation steps in later notebooks. 
#this class will initialize data sets in the exact same way that we initialized the data set here and 
#we won't waste time coding it later

class data_set_generator:
    """Generate a synthetic data set for a three-variable linear regression.

    The independent variables ``x_1``, ``x_2`` and ``x_3`` are class
    attributes: each is an array of 100 random integers from 1 to 19
    inclusive, drawn once when the class body executes and therefore shared
    by every instance.

    Attributes set by ``__init__``:
        intercept: length-1 integer array holding a random intercept in 1..19.
        X_0: (100, 3) design matrix whose columns are x_1, x_2, x_3.

    Attributes set by ``calculate_y``:
        linear_y: noiseless dependent variable (array).
        linear_y_noise: dependent variable with noise added (list).
    """

    x_1 = np.random.randint(low=1, high=20, size=100)
    x_2 = np.random.randint(low=1, high=20, size=100)
    x_3 = np.random.randint(low=1, high=20, size=100)

    def __init__(self):
        """Draw a random intercept and assemble the design matrix."""
        # size=1 keeps the intercept a length-1 array; it broadcasts across
        # every observation when added in calculate_y.
        self.intercept = np.random.randint(low=1, high=20, size=1)
        # Go through self so the class's own variables are used, not any
        # same-named module-level globals that happen to exist.
        self.X_0 = np.column_stack([self.x_1, self.x_2, self.x_3])

    def calculate_y(self, weight_1, weight_2, weight_3):
        """Compute the dependent variable with and without noise.

        Sets ``self.linear_y`` to
        ``intercept + weight_1*x_1 + weight_2*x_2 - weight_3*x_3`` and
        ``self.linear_y_noise`` to the same values plus normally distributed
        noise (mean 10, standard deviation 3).

        Args:
            weight_1: coefficient applied to x_1.
            weight_2: coefficient applied to x_2.
            weight_3: coefficient applied to x_3. It is subtracted, so
                passing 6 yields a coefficient of -6.

        Returns:
            None. Results are stored on the instance.
        """
        # Element-wise numpy arithmetic replaces the index-based loops.
        weighted_sum = (
            weight_1 * self.x_1 + weight_2 * self.x_2 - weight_3 * self.x_3
        )
        self.linear_y = weighted_sum + self.intercept
        noise = np.random.normal(10, 3, len(self.linear_y))
        # The noisy target is built from linear_y so it includes the
        # intercept. It is stored as a list, as callers have received before.
        self.linear_y_noise = list(self.linear_y + noise)


In [62]:
#create an instance of the class and compute its dependent variable by 
#supplying the three weights

data = data_set_generator()
data.calculate_y(weight_1=3, weight_2=1, weight_3=6)

In [63]:
#sanity check: run a regression on the data held by the class instance and see
#that it behaves as we expect. the only difference from the earlier cells is
#that the class generates the intercept randomly.

sk_learn_regression_class_test = LinearRegression().fit(data.X_0, data.linear_y)
(sk_learn_regression_class_test.intercept_, sk_learn_regression_class_test.coef_)

(1.0000000000000142, array([ 3.,  1., -6.]))