# Multiple Linear Regression Using Sklearn Libarary

## Load import libararies

In [1]:
import numpy as np
import pandas as pd
import scipy
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
sns.set()

# Load Data

In [11]:
data = pd.read_csv('1.02. multiple linear regression.csv')

In [12]:
data.head()

Unnamed: 0,SAT,GPA,Attend
0,1714,2.4,NO
1,1664,2.52,NO
2,1760,2.54,NO
3,1685,2.74,NO
4,1693,2.83,NO


In [13]:
data.tail()

Unnamed: 0,SAT,GPA,Attend
79,1936,3.71,YES
80,1810,3.71,YES
81,1987,3.73,YES
82,1962,3.76,YES
83,2050,3.81,YES


In [14]:
data.describe()

Unnamed: 0,SAT,GPA
count,84.0,84.0
mean,1845.27381,3.330238
std,104.530661,0.271617
min,1634.0,2.4
25%,1772.0,3.19
50%,1846.0,3.38
75%,1934.0,3.5025
max,2050.0,3.81


In [15]:
data['Attend'] = data['Attend'].map({'YES':1,'NO':0})

In [16]:
data.describe()

Unnamed: 0,SAT,GPA,Attend
count,84.0,84.0,84.0
mean,1845.27381,3.330238,0.559524
std,104.530661,0.271617,0.499426
min,1634.0,2.4,0.0
25%,1772.0,3.19,0.0
50%,1846.0,3.38,1.0
75%,1934.0,3.5025,1.0
max,2050.0,3.81,1.0


In [17]:
data

Unnamed: 0,SAT,GPA,Attend
0,1714,2.40,0
1,1664,2.52,0
2,1760,2.54,0
3,1685,2.74,0
4,1693,2.83,0
...,...,...,...
79,1936,3.71,1
80,1810,3.71,1
81,1987,3.73,1
82,1962,3.76,1


In [18]:
data.shape

(84, 3)

# Create Regression

## Declare target and independed variable

In [20]:
y = data['GPA']
x = data[['SAT','Attend']]

# Standardization

In [62]:
from sklearn.preprocessing import StandardScaler

In [63]:
scaler = StandardScaler()

In [64]:
scaler.fit(x)

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [65]:
x_scaled = scaler.transform(x)

  """Entry point for launching an IPython kernel.


In [67]:
x_scaled
# all the input data standardize

array([[-1.26338288, -1.12706267],
       [-1.74458431, -1.12706267],
       [-0.82067757, -1.12706267],
       [-1.54247971, -1.12706267],
       [-1.46548748, -1.12706267],
       [-1.68684014, -1.12706267],
       [-0.78218146, -1.12706267],
       [-0.78218146, -1.12706267],
       [-0.51270866, -1.12706267],
       [ 0.04548499, -1.12706267],
       [-1.06127829, -1.12706267],
       [-0.67631715, -1.12706267],
       [-1.06127829, -1.12706267],
       [-1.28263094, -1.12706267],
       [-0.6955652 , -1.12706267],
       [ 0.25721362, -1.12706267],
       [-0.86879772, -1.12706267],
       [-1.64834403, -1.12706267],
       [-0.03150724, -1.12706267],
       [-0.57045283, -1.12706267],
       [-0.81105355, -1.12706267],
       [-1.18639066, -1.12706267],
       [-1.75420834, -1.12706267],
       [-1.52323165, -1.12706267],
       [ 1.23886453, -1.12706267],
       [-0.18549169, -1.12706267],
       [-0.5608288 , -1.12706267],
       [-0.23361183, -1.12706267],
       [ 1.68156984,

# Regression with scaled features

In [68]:
reg = LinearRegression()

In [69]:
reg.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [71]:
reg.coef_

array([0.00083844, 0.31740009])

In [72]:
reg.intercept_

1.6054900105802536

# Creatring a summary table

In [73]:
reg_summary = pd.DataFrame([['Intercept'],['SAT'],['Attend']], columns=['Features'])
reg_summary['Weights'] = reg.intercept_, reg.coef_[0], reg.coef_[1]

In [74]:
reg_summary 

Unnamed: 0,Features,Weights
0,Intercept,1.60549
1,SAT,0.000838
2,Attend,0.3174


### weights is a the 'Machine Learning word' for coefficeints
### Intercept is a the 'Machine Learning word' for bias

# Making predication with the standardized coefficients (Weights)

In [76]:
new_data = pd.DataFrame(data=[[1700,0],[1645,1]], columns=['SAT','Attend'])
new_data

Unnamed: 0,SAT,Attend
0,1700,0
1,1645,1


In [77]:
reg.predict(new_data)

array([3.03084152, 3.30212729])

In [78]:
new_data_scaled = scaler.transform(new_data)
new_data_scaled

  """Entry point for launching an IPython kernel.


array([[-1.39811928, -1.12706267],
       [-1.92744085,  0.8872621 ]])

In [80]:
reg.predict(new_data_scaled)

array([1.24658798, 1.88549103])

# Train Test Split

### Import the relevant libraries

In [82]:
from sklearn.model_selection import train_test_split

## Generate some data we are going to split

In [83]:
a = np.arange(1,101)

In [84]:
a

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [85]:
a.shape

(100,)

In [86]:
b = np.arange(501,601)
b

array([501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513,
       514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526,
       527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539,
       540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552,
       553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565,
       566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578,
       579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591,
       592, 593, 594, 595, 596, 597, 598, 599, 600])

# Split the data

In [87]:
train_test_split(a)

[array([ 9, 58, 39, 24, 38, 28, 61, 40, 82, 30, 18,  6, 74, 33, 27, 96,  1,
        19, 60, 51, 20, 29, 63, 54, 55, 11, 97, 73, 77, 17, 85, 23, 15, 41,
        79, 10, 34, 46, 50, 92,  4, 88, 59, 22, 36, 42, 94, 64,  3, 52, 72,
        47, 53, 90, 75, 65, 12, 25, 78, 87, 16, 48, 69, 31, 84, 56,  7, 81,
        26, 14,  8, 62,  2, 89, 49]),
 array([ 57,  95, 100,  67,  91,  99,  32,  76,  68,  66,  13,  80,  86,
         83,   5,  35,  21,  98,  45,  44,  93,  71,  43,  70,  37])]

In [89]:
a_train, a_test = train_test_split(a)

# Explore the result

In [90]:
a_train.shape, a_test.shape

((75,), (25,))

In [91]:
a_train

array([ 33,   1,  71,   3,  99,  82,  55,  72,  93,  50,  80,  81,  75,
        77,  30,  45,  24,  46,  68,  64,  28,  67,  20,  52,  11,  95,
        25,   9,  40,  70,  83, 100,  42,  43,  13,   2,  19,  61,  39,
        58,   5,  37,  76,  18,  57,  66,   4,  14,  54,  78,  69,  63,
        31,  41,  47,  51,  90,  96,  16,  91,  88,  73,  17,  23,  62,
        59,  49,  44,  35,  48,  26,  10,  22,  84,  94])

In [92]:
a_test

array([56, 98,  7, 79, 89, 15, 65, 27,  8, 34,  6, 87, 38, 74, 32, 86, 12,
       85, 21, 97, 92, 60, 36, 53, 29])

In [93]:
a_train, a_test = train_test_split(a, test_size=0.2)

In [94]:
a_train.shape, a_test.shape

((80,), (20,))

In [101]:
a_train,a_test,b_train,b_test = train_test_split(a,b,random_state=36,test_size=0.2)

In [102]:
a_train.shape,a_test.shape

((80,), (20,))

In [103]:
b_train.shape,b_test.shape

((80,), (20,))

In [104]:
a_train

array([ 93,  61,  62,  96,  64,  12,  49,  63,  40,  35,  51,  17,  77,
        84,  54,  24,   8,  70,  55,  39,  16, 100,  80,  73,  44,  11,
        97,  72,  79,  33,   9,  94,  87,  90,  85,  66,   5,  27,  52,
        18,  58,  50,  67,  82,  21,  19,  20,  89,  42,  25,  68,  26,
        47,  83,  92,  14,  22,  46,  98,  78,  15,  37,  86,   2,  32,
        23,  88,  71,  65,   7,  56,  38,  10,  45,  91,  34,  41,  31,
        99,   6])

In [105]:
a_test

array([60, 43,  3, 28, 29, 76, 59, 69, 53, 75,  4, 74, 36, 48, 81, 30, 95,
       13, 57,  1])

In [106]:
b_train

array([593, 561, 562, 596, 564, 512, 549, 563, 540, 535, 551, 517, 577,
       584, 554, 524, 508, 570, 555, 539, 516, 600, 580, 573, 544, 511,
       597, 572, 579, 533, 509, 594, 587, 590, 585, 566, 505, 527, 552,
       518, 558, 550, 567, 582, 521, 519, 520, 589, 542, 525, 568, 526,
       547, 583, 592, 514, 522, 546, 598, 578, 515, 537, 586, 502, 532,
       523, 588, 571, 565, 507, 556, 538, 510, 545, 591, 534, 541, 531,
       599, 506])

In [107]:
b_test

array([560, 543, 503, 528, 529, 576, 559, 569, 553, 575, 504, 574, 536,
       548, 581, 530, 595, 513, 557, 501])

In [95]:
y.shape

(84,)

In [22]:
x.shape

(84, 2)

# Regression itself

In [23]:
reg = LinearRegression()

In [24]:
reg.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

# R-Squared

# Formula for Adjusted R^2

$R^2_{adj.} = 1 - (a-R^2)*\frac{n-1}{n-p-1}$

In [39]:
x.shape

(84, 2)

In [42]:
r2 = reg.score(x,y)
print("R-Squared =",r2)
n = x.shape[0]
p = x.shape[1]

adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
adjusted_r2

R-Squared = 0.6476831211425227


0.6389839389485109

In [43]:
reg.score(x,y)

0.6476831211425227

# Coeficient 

In [26]:
reg.coef_

array([0.00083844, 0.31740009])

# intercept

In [27]:
reg.intercept_

1.6054900105802536

# Feature selection

In [44]:
from sklearn.feature_selection import f_regression

In [45]:
f_regression(x,y)

(array([ 56.04804786, 110.40014127]), array([7.19951844e-11, 7.49054135e-17]))

In [46]:
p_values = f_regression(x,y)[1]
p_values

array([7.19951844e-11, 7.49054135e-17])

In [55]:
p_values.round(3)

array([0., 0.])

# should be noted that both SAT and Attend are valueable features for our model
# Note: these are the univariate p-values reached from single linear models.
# they do not reflect the interconnection of the features in our multiple linear regression

# Creating a Summary table

In [57]:
reg_summary = pd.DataFrame(data=['SAT','Attend'],columns=['Features'])
reg_summary 

Unnamed: 0,Features
0,SAT
1,Attend


In [58]:
reg_summary = pd.DataFrame(data=x.columns.values,columns=['Features'])
reg_summary 

Unnamed: 0,Features
0,SAT
1,Attend


In [59]:
reg_summary['Coeffiecient'] = reg.coef_
reg_summary['p-values'] = p_values.round(3)

In [61]:
reg_summary

Unnamed: 0,Features,Coeffiecient,p-values
0,SAT,0.000838,0.0
1,Attend,0.3174,0.0
