# Multiple linear regression

## 1- Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set()

## 2- Load the data

In [2]:
# Load the dataset from the CSV file; describe() summarises its numeric columns
# (SAT, GPA and 'Rand 1,2,3') so ranges and row counts can be checked at a glance.
data = pd.read_csv('13_2, Multiple Linear Regression.csv')
data.describe()

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
count,84.0,84.0,84.0
mean,1845.27381,3.330238,2.059524
std,104.530661,0.271617,0.855192
min,1634.0,2.4,1.0
25%,1772.0,3.19,1.0
50%,1846.0,3.38,2.0
75%,1934.0,3.5025,3.0
max,2050.0,3.81,3.0


## 3- Create your multiple regression

In [3]:
# Target: GPA. Predictors: the SAT score plus the 'Rand 1,2,3' column.
y = data ['GPA']
x = data [['SAT','Rand 1,2,3']]
# Preview the first rows of the predictor frame.
x.head()

Unnamed: 0,SAT,"Rand 1,2,3"
0,1714,1
1,1664,3
2,1760,3
3,1685,3
4,1693,2


In [4]:
# Fit a linear regression of GPA on both predictors (unscaled values).
reg=LinearRegression()
reg.fit(x,y)

In [5]:
reg.coef_

array([ 0.00165354, -0.00826982])

In [6]:
reg.intercept_

0.29603261264909486

In [7]:
# score() returns R^2; it is evaluated here on the same x, y the model was fitted on.
r2=reg.score(x,y)
r2

0.40668119528142843

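# We have two inputs, so we also want to see the adjusted R-squared

$$R^2_{adj} = 1 - (1 - R^2)\,\frac{n - 1}{n - p - 1}$$

where $n$ is the number of observations and $p$ is the number of predictors.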

In [8]:
x.shape

(84, 2)

In [9]:
# Rows of x are the observations (n); columns are the predictors (p).
n, p = x.shape

In [10]:
n

84

In [11]:
p

2

In [12]:
# Adjusted R^2: rescales the unexplained share (1 - r2) by (n - 1) / (n - p - 1),
# with n observations and p predictors taken from x.shape.
adjusted_r2= 1- (1-r2)* (n-1)/ (n-p-1)
adjusted_r2

0.39203134825134023

In [13]:
from sklearn.feature_selection import f_regression

In [14]:
f_regression(x,y)

(array([56.04804786,  0.17558437]), array([7.19951844e-11, 6.76291372e-01]))

In [15]:
# f_regression returns a pair (F-statistics, p-values); index 1 keeps the p-values.
# NOTE: these are univariate tests (each feature against y on its own), so they are
# not the p-values of the coefficients in the multiple regression fitted above.
p_value=f_regression(x,y)[1]
p_value

array([7.19951844e-11, 6.76291372e-01])

In [16]:
p_value.round(3)

array([0.   , 0.676])

# Creating a Summary Table 

In [17]:
# Start the summary table with one row per predictor, labelled by x's column names.
reg_summary = pd.DataFrame({'Features': x.columns.values})

In [18]:
reg_summary

Unnamed: 0,Features
0,SAT
1,"Rand 1,2,3"


In [19]:
# Attach the fitted coefficients of `reg`, one per row of the summary table.
reg_summary['coefficients'] = reg.coef_

# Attach the p-values from f_regression, rounded to three decimals for readability.
reg_summary['P-values'] = p_value.round(3)

In [20]:
reg_summary

Unnamed: 0,Features,coefficients,P-values
0,SAT,0.001654,0.0
1,"Rand 1,2,3",-0.00827,0.676


------------------------------------------------------------------------------------------------

# Function for adjusted R-squared
def adjusted_r_squared(r_squared, n, k):
    """Return the adjusted R-squared of a regression.

    Args:
        r_squared: Ordinary R-squared of the fitted model.
        n: Number of observations.
        k: Number of predictors.

    Returns:
        ``1 - (1 - r_squared) * (n - 1) / (n - k - 1)``.

    Raises:
        ValueError: If ``n - k - 1`` is not positive. The formula divides by
            this quantity, so it is undefined at zero and not meaningful
            when negative.
    """
    residual_dof = n - k - 1
    if residual_dof <= 0:
        raise ValueError(
            "adjusted R-squared requires n > k + 1 "
            f"(got n={n}, k={k})"
        )
    adjusted_r2 = 1 - ((1 - r_squared) * (n - 1) / residual_dof)
    return adjusted_r2

In [26]:
# Re-select the same target (GPA) and predictors (SAT, 'Rand 1,2,3') from `data`
# so the standardization section starts from the unscaled values.
y = data ['GPA']
x = data [['SAT','Rand 1,2,3']]

In [36]:
y

0     2.40
1     2.52
2     2.54
3     2.74
4     2.83
      ... 
79    3.71
80    3.71
81    3.73
82    3.76
83    3.81
Name: GPA, Length: 84, dtype: float64

In [28]:
from sklearn.preprocessing import StandardScaler

# Create the scaler; it holds no statistics until fit() is called
scaler = StandardScaler()

# Learn the per-column mean and standard deviation of the predictors in x
scaler.fit(x)

In [34]:
# Standardize the predictors using the statistics learned by scaler.fit(x);
# the result is a NumPy array with one column per predictor.
x_scaled=scaler.transform(x)
x_scaled

array([[-1.26338288, -1.24637147],
       [-1.74458431,  1.10632974],
       [-0.82067757,  1.10632974],
       [-1.54247971,  1.10632974],
       [-1.46548748, -0.07002087],
       [-1.68684014, -1.24637147],
       [-0.78218146, -0.07002087],
       [-0.78218146, -1.24637147],
       [-0.51270866, -0.07002087],
       [ 0.04548499,  1.10632974],
       [-1.06127829,  1.10632974],
       [-0.67631715, -0.07002087],
       [-1.06127829, -1.24637147],
       [-1.28263094,  1.10632974],
       [-0.6955652 , -0.07002087],
       [ 0.25721362, -0.07002087],
       [-0.86879772,  1.10632974],
       [-1.64834403, -0.07002087],
       [-0.03150724,  1.10632974],
       [-0.57045283,  1.10632974],
       [-0.81105355,  1.10632974],
       [-1.18639066,  1.10632974],
       [-1.75420834,  1.10632974],
       [-1.52323165, -1.24637147],
       [ 1.23886453, -1.24637147],
       [-0.18549169, -1.24637147],
       [-0.5608288 , -1.24637147],
       [-0.23361183,  1.10632974],
       [ 1.68156984,

In [35]:
reg2=LinearRegression()

In [37]:
# Fit on the standardized predictors, so any input given to reg2 later must be
# transformed with the same `scaler` first.
reg2.fit(x_scaled,y)

In [38]:
reg2.coef_

array([ 0.17181389, -0.00703007])

In [39]:
reg2.intercept_

3.330238095238095

In [41]:
reg2.score(x_scaled,y)

0.4066811952814283

In [42]:
# Summary of the standardized model: the intercept ("Bias") followed by one
# weight per predictor, in the same order as the columns of x_scaled.
reg2_summary=pd.DataFrame([['Bias'],['SAT'],['Rand 1,2,3']],columns=['Features'])
# Column label fixed from the mis-typed 'weîght' so it can be addressed with plain ASCII.
reg2_summary['weight']=reg2.intercept_,reg2.coef_[0],reg2.coef_[1]

In [43]:
reg2_summary

Unnamed: 0,Features,weîght
0,Bias,3.330238
1,SAT,0.171814
2,"Rand 1,2,3",-0.00703


# Making predictions with standardized data

In [45]:
# Two new observations to predict. `columns` must be a flat list: a nested list
# ([[...]]) makes pandas build a MultiIndex, and the labels then no longer match
# the plain column names the scaler was fitted with.
new_data=pd.DataFrame(data=[[1700,2],[1800,1]],columns=['SAT','Rand 1,2,3'])

In [46]:
new_data

Unnamed: 0,SAT,"Rand 1,2,3"
0,1700,2
1,1800,1


In [47]:
# NOTE(review): new_data is passed here without scaler.transform, although reg2 was
# fitted on x_scaled. The values returned are therefore far outside the GPA range.
# The prediction made on scaler.transform(new_data) further down is the meaningful one.
# Presumably this cell is kept to show the effect of skipping the scaling — confirm.
reg2.predict(new_data)

array([295.39979563, 312.58821497])

In [None]:
# Standardize the new observations with the scaler fitted on x (no re-fitting),
# so they are on the same scale as the data reg2 was fitted on.
new_data_scaled=scaler.transform(new_data)
new_data_scaled

In [49]:
reg2.predict(new_data_scaled)

array([3.09051403, 3.26413803])

# What if we removed 'Rand 1,2,3'?

In [51]:
reg_simple=LinearRegression()

In [52]:
# Keep only the first standardized column (SAT) as a 2-D, single-column matrix.
x_simple_matrix = x_scaled[:, :1]

In [53]:
x_simple_matrix

array([[-1.26338288],
       [-1.74458431],
       [-0.82067757],
       [-1.54247971],
       [-1.46548748],
       [-1.68684014],
       [-0.78218146],
       [-0.78218146],
       [-0.51270866],
       [ 0.04548499],
       [-1.06127829],
       [-0.67631715],
       [-1.06127829],
       [-1.28263094],
       [-0.6955652 ],
       [ 0.25721362],
       [-0.86879772],
       [-1.64834403],
       [-0.03150724],
       [-0.57045283],
       [-0.81105355],
       [-1.18639066],
       [-1.75420834],
       [-1.52323165],
       [ 1.23886453],
       [-0.18549169],
       [-0.5608288 ],
       [-0.23361183],
       [ 1.68156984],
       [-0.4934606 ],
       [-0.73406132],
       [ 0.85390339],
       [-0.67631715],
       [ 0.09360513],
       [ 0.33420585],
       [ 0.03586096],
       [-0.35872421],
       [ 1.04638396],
       [-0.65706909],
       [-0.13737155],
       [ 0.18984542],
       [ 0.04548499],
       [ 1.1618723 ],
       [-1.37887123],
       [ 1.39284898],
       [ 0

In [55]:
reg_simple.fit(x_simple_matrix,y)

In [58]:
# Predict from the standardized SAT column only, kept 2-D as predict() expects.
reg_simple.predict(new_data_scaled[:, :1])

array([3.08970998, 3.25527879])