In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


from sklearn.linear_model import LinearRegression

In [4]:
#load data
data = pd.read_csv("rand_dummy.csv")
data.head()

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
0,1714,2.4,1
1,1664,2.52,3
2,1760,2.54,3
3,1685,2.74,3
4,1693,2.83,2


In [5]:
data.describe()

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
count,84.0,84.0,84.0
mean,1845.27381,3.330238,2.059524
std,104.530661,0.271617,0.855192
min,1634.0,2.4,1.0
25%,1772.0,3.19,1.0
50%,1846.0,3.38,2.0
75%,1934.0,3.5025,3.0
max,2050.0,3.81,3.0


In [6]:
#create the multiple linear regression
x = data[['SAT', 'Rand 1,2,3']]
y = data['GPA']

In [7]:
#Using SKlearn

regression = LinearRegression()

regression.fit(x,y)

In [8]:
#get the score to know the r-squared value
regression.score(x,y)

0.40668119528142843

In [9]:
#get the intercept
regression.intercept_

0.29603261264909486

In [11]:
#get the coefficient
regression.coef_

array([ 0.00165354, -0.00826982])

### Compare with Statmodel 

In [13]:
import statsmodels.api as sm

In [15]:
x1 = sm.add_constant(x)
results = sm.OLS(y,x1).fit()
results.summary()

0,1,2,3
Dep. Variable:,GPA,R-squared:,0.407
Model:,OLS,Adj. R-squared:,0.392
Method:,Least Squares,F-statistic:,27.76
Date:,"Thu, 03 Oct 2024",Prob (F-statistic):,6.58e-10
Time:,13:22:50,Log-Likelihood:,12.72
No. Observations:,84,AIC:,-19.44
Df Residuals:,81,BIC:,-12.15
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.2960,0.417,0.710,0.480,-0.533,1.125
SAT,0.0017,0.000,7.432,0.000,0.001,0.002
"Rand 1,2,3",-0.0083,0.027,-0.304,0.762,-0.062,0.046

0,1,2,3
Omnibus:,12.992,Durbin-Watson:,0.948
Prob(Omnibus):,0.002,Jarque-Bera (JB):,16.364
Skew:,-0.731,Prob(JB):,0.00028
Kurtosis:,4.594,Cond. No.,33300.0


### Calculating Adjusted R-Squared in Sklearn

In Sklearn, there is no in-built module for the adjusted r-squared. So we have to manually calculate it using the formular.
The formular for adjusted r-squared is:

$R^2_{adj.} = 1 - (1-R^2)*\frac{n-1}{n-p-1}$

In [16]:
#n = number of rows or records
#p = number of columns or features

In [19]:
x.shape

(84, 2)

In [21]:
aR = 1 - (1- 0.407) * ((84-1)/(84-2-1))
aR

0.39235802469135794

### Determining Feature Seelection Using f_regression in Sklearn

The f_regression finds the f-statistics for the *simple* regression created with each of the independent variables. So that means it assumes that the linear regression is performed using each of the variables separately. So it doesn't take into account any form of mutual effect of the variables

In [23]:
from sklearn.feature_selection import f_regression
f_regression(x,y)

(array([56.04804786,  0.17558437]), array([7.19951844e-11, 6.76291372e-01]))

3.1245e-6 = 3.1245 x 10 -6 = 0.0000031245

In [24]:
p_values = f_regression(x,y)[1]
p_values

array([7.19951844e-11, 6.76291372e-01])

In [26]:
p_values.round(3)

array([0.   , 0.676])

In [27]:
x

Unnamed: 0,SAT,"Rand 1,2,3"
0,1714,1
1,1664,3
2,1760,3
3,1685,3
4,1693,2
...,...,...
79,1936,3
80,1810,1
81,1987,3
82,1962,1


## Standardization

Data standardization is a critical aspect of data preprocessing in analytics, machine learning, and various applications requiring data-driven insights. It refers to the process of converting data into a uniform format, ensuring its consistency across different datasets or variables


n1 = 123, 456, 301, 565
n2 = 1.1, 2.1, 3.1

Standardization is also nknown as feature scaling.

This is done by subtracting each value from the mean value and dividing by the standard deviation.


In [30]:
n1 = 123, 456, 301, 565
n2 = 1.1, 2.1, 3.1, 1.4

n1 = np.array(n1)
n1
n2 = np.array(n2)
n2

array([1.1, 2.1, 3.1, 1.4])

In [31]:
df1 = pd.DataFrame(data=n1)
df1

Unnamed: 0,0
0,123
1,456
2,301
3,565


In [33]:
df1.mean()

0    361.25
dtype: float64

In [34]:
df1.std()

0    192.253955
dtype: float64

In [35]:
(123 - 361)/192

-1.2395833333333333

In [36]:
from sklearn.preprocessing import StandardScaler

In [38]:
scaler = StandardScaler()

In [39]:
scaler.fit(x)

In [40]:
scaler

In [41]:
x_scaled = scaler.transform(x)
x_scaled

array([[-1.26338288, -1.24637147],
       [-1.74458431,  1.10632974],
       [-0.82067757,  1.10632974],
       [-1.54247971,  1.10632974],
       [-1.46548748, -0.07002087],
       [-1.68684014, -1.24637147],
       [-0.78218146, -0.07002087],
       [-0.78218146, -1.24637147],
       [-0.51270866, -0.07002087],
       [ 0.04548499,  1.10632974],
       [-1.06127829,  1.10632974],
       [-0.67631715, -0.07002087],
       [-1.06127829, -1.24637147],
       [-1.28263094,  1.10632974],
       [-0.6955652 , -0.07002087],
       [ 0.25721362, -0.07002087],
       [-0.86879772,  1.10632974],
       [-1.64834403, -0.07002087],
       [-0.03150724,  1.10632974],
       [-0.57045283,  1.10632974],
       [-0.81105355,  1.10632974],
       [-1.18639066,  1.10632974],
       [-1.75420834,  1.10632974],
       [-1.52323165, -1.24637147],
       [ 1.23886453, -1.24637147],
       [-0.18549169, -1.24637147],
       [-0.5608288 , -1.24637147],
       [-0.23361183,  1.10632974],
       [ 1.68156984,

In [42]:
reg = LinearRegression()

reg.fit(x_scaled,y)

In [43]:
reg.score(x_scaled, y)

0.4066811952814283