In [1]:
# Numerical / data-handling stack.
import numpy as np
import pandas as pd
# Plotting libraries.
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): statsmodels is not referenced in any cell visible in this excerpt — confirm it is still needed.
import statsmodels.api as sm
# Apply seaborn's 'whitegrid' style globally; no figure is drawn in the cells shown here.
sns.set_style('whitegrid')

In [2]:
# Simple linear regression (one predictor): GPA modelled from SAT with scikit-learn
from sklearn.linear_model import LinearRegression

In [3]:
# Load the SAT/GPA observations; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='sat.csv')

In [4]:
# Display the loaded frame; pandas elides the middle rows in the rendered output.
data

Unnamed: 0,SAT,GPA
0,1714,2.40
1,1664,2.52
2,1760,2.54
3,1685,2.74
4,1693,2.83
...,...,...
79,1936,3.71
80,1810,3.71
81,1987,3.73
82,1962,3.76


In [5]:
# Predictor: the SAT column as a 1-D Series (reshaped to 2-D when passed to scikit-learn).
X = data.loc[:, 'SAT']

In [6]:
# Inspect the predictor Series (name, length and dtype are shown in the output footer).
X

0     1714
1     1664
2     1760
3     1685
4     1693
      ... 
79    1936
80    1810
81    1987
82    1962
83    2050
Name: SAT, Length: 84, dtype: int64

In [7]:
# Target: the GPA column as a 1-D Series.
y = data.loc[:, 'GPA']

In [8]:
# Inspect the target Series.
y

0     2.40
1     2.52
2     2.54
3     2.74
4     2.83
      ... 
79    3.71
80    3.71
81    3.73
82    3.76
83    3.81
Name: GPA, Length: 84, dtype: float64

In [9]:
# The target is one-dimensional: a single value per observation.
y.shape

(84,)

In [10]:
# Ordinary least squares estimator; fit_intercept=True is scikit-learn's default, spelled out here.
reg = LinearRegression(fit_intercept=True)

In [11]:
# scikit-learn expects a 2-D feature matrix (n_samples, n_features), so the single
# SAT column is reshaped from (n,) to (n, 1) before fitting.
reg.fit(X.to_numpy().reshape(-1, 1), y)

LinearRegression()

In [12]:
# Two unseen SAT scores to run through the fitted model.
new_data = pd.DataFrame({'SAT': [1740, 1860]})

In [13]:
# Display the hand-built frame of SAT scores to predict for.
new_data

Unnamed: 0,SAT
0,1740
1,1860


In [14]:
# The model was fitted on a bare NumPy array (no column names), so predict() is given
# the same kind of input. Passing the DataFrame itself makes recent scikit-learn
# releases warn about a feature-name mismatch; the predicted values are unchanged.
reg.predict(new_data.to_numpy())

array([3.15593751, 3.35462007])

In [15]:
# Fitted slope(s): one coefficient per feature, here the change in predicted GPA per SAT point.
reg.coef_

array([0.00165569])

In [16]:
# Fitted intercept: the predicted GPA when SAT equals 0 (an extrapolation far outside the data shown).
reg.intercept_

0.2750402996602803

In [17]:
# R^2 (coefficient of determination) computed on the same data the model was fitted on,
# i.e. an in-sample figure, not a held-out estimate.
reg.score(X.to_numpy().reshape(-1, 1), y)

0.40600391479679765

# Multiple Regression

In [18]:
# NOTE(review): these repeat the imports and style call from the first cell. Re-running
# them is harmless and lets this section run on its own — confirm that is intended.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

In [19]:
from sklearn.linear_model import LinearRegression

In [20]:
# Rebinds `data`: from here on it refers to the multisat.csv frame, not the sat.csv
# frame loaded earlier in the notebook.
data = pd.read_csv(filepath_or_buffer='multisat.csv')

In [21]:
# Display the multi-feature frame; pandas elides the middle rows in the rendered output.
data

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
0,1714,2.40,1
1,1664,2.52,3
2,1760,2.54,3
3,1685,2.74,3
4,1693,2.83,2
...,...,...,...
79,1936,3.71,3
80,1810,3.71,1
81,1987,3.73,3
82,1962,3.76,1


In [22]:
# Feature matrix with two predictors; a column-list selection yields a 2-D DataFrame,
# so no reshape is needed before fitting.
X = data.loc[:, ['SAT', 'Rand 1,2,3']]

In [23]:
# Inspect the two-column feature matrix.
X

Unnamed: 0,SAT,"Rand 1,2,3"
0,1714,1
1,1664,3
2,1760,3
3,1685,3
4,1693,2
...,...,...
79,1936,3
80,1810,1
81,1987,3
82,1962,1


In [24]:
# Target for the multiple regression: the GPA column of the multisat frame.
y = data.loc[:, 'GPA']

In [25]:
# Separate estimator for the two-feature model; fit_intercept=True is scikit-learn's default.
mul_reg = LinearRegression(fit_intercept=True)

In [26]:
# Fit on the DataFrame directly, so the estimator is given the column names along with the values.
mul_reg.fit(X=X, y=y)

LinearRegression()

In [27]:
# Fitted coefficients, in the same order as the columns of X ('SAT', then 'Rand 1,2,3').
mul_reg.coef_

array([ 0.00165354, -0.00826982])

In [28]:
# In-sample R^2 of the two-feature model. R^2 cannot decrease when a predictor is added,
# so a small rise over the single-feature model is not evidence that the extra column helps.
mul_reg.score(X=X, y=y)

0.40668119528142843

In [29]:
from sklearn.feature_selection import f_regression

In [30]:
# Univariate F-tests: each feature is tested against y on its own.
# Returns a tuple (F-statistics, p-values), one entry per column of X.
f_regression(X, y)

(array([56.04804786,  0.17558437]), array([7.19951844e-11, 6.76291372e-01]))

In [31]:
# Element 1 of the returned tuple holds the p-values.
f_regression(X, y)[1]

array([7.19951844e-11, 6.76291372e-01])

In [32]:
# Keep the per-feature p-values (second element of f_regression's result) for the summary table.
p_value = f_regression(X, y)[1]

In [33]:
# Inspect the raw (unrounded) p-values, in the column order of X.
p_value

array([7.19951844e-11, 6.76291372e-01])

In [34]:
# Round to three decimals for readability; a very small p-value then displays as 0.0,
# which means "below 0.0005", not exactly zero.
np.round(p_value, 3)

array([0.   , 0.676])

In [35]:
# Start the summary table with one row per feature, in the same order as the columns of X.
reg_summary = pd.DataFrame({'features': ['SAT', 'Rand 1,2,3']})

In [36]:
# Display the summary table so far (feature names only).
reg_summary

Unnamed: 0,features
0,SAT
1,"Rand 1,2,3"


In [37]:
# Attach the fitted coefficients; mul_reg.coef_ follows the column order of X, which
# matches the row order of reg_summary. The column label was previously misspelled
# 'coefficent'; re-run the cells below to refresh their recorded output.
reg_summary['coefficient'] = mul_reg.coef_

In [38]:
# Display the summary table after the coefficient column has been added.
reg_summary

Unnamed: 0,features,coefficent
0,SAT,0.001654
1,"Rand 1,2,3",-0.00827


In [39]:
# Attach the rounded p-values. These come from f_regression, i.e. univariate tests of
# each feature on its own, not from the joint two-feature model whose coefficients
# sit in the neighbouring column.
reg_summary['p_value'] = np.round(p_value, 3)

In [40]:
# Final summary table: feature name, fitted coefficient and rounded p-value per feature.
reg_summary

Unnamed: 0,features,coefficent,p_value
0,SAT,0.001654,0.0
1,"Rand 1,2,3",-0.00827,0.676
