In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.linear_model import LinearRegression

In [2]:
# Keep the input file name in one place so it is easy to spot and change.
DATA_PATH = '1.02. Multiple linear regression.csv'

data = pd.read_csv(DATA_PATH)
data.head()  # preview the first rows only, not the whole frame

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
0,1714,2.4,1
1,1664,2.52,3
2,1760,2.54,3
3,1685,2.74,3
4,1693,2.83,2


In [3]:
# Target: GPA. Predictors: SAT score plus the 'Rand 1,2,3' column.
predictor_cols = ['SAT', 'Rand 1,2,3']
x = data[predictor_cols]
y = data['GPA']

# fit() returns the estimator itself, so the model is created and trained in one step.
reg = LinearRegression().fit(x, y)
reg



LinearRegression()

In [4]:
reg.get_params()

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': False,
 'positive': False}

In [43]:
# Spread of each predictor on its original scale.
# NOTE: pandas' .std() defaults to the sample standard deviation (ddof=1), while
# scikit-learn's StandardScaler divides by the population standard deviation
# (ddof=0), so the scale applied during standardization is slightly smaller
# than the values shown here.
x.std()

SAT           104.530661
Rand 1,2,3      0.855192
dtype: float64

In [5]:
reg.score(x,y)

0.40668119528142843

In [6]:
reg.intercept_

0.29603261264909486

In [7]:
reg.coef_

array([ 0.00165354, -0.00826982])

# Let's create our own adjusted $R^2$ function

In [8]:
def get_adj_r2(model=None, features=None, target=None):
    """Return the adjusted R^2 of a fitted regression model.

    Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the
    number of observations (rows of ``features``) and p is the number of
    predictors (columns of ``features``).

    Parameters
    ----------
    model : object with a ``score(X, y)`` method, optional
        Fitted estimator to evaluate. Defaults to the notebook-level ``reg``.
    features : 2-D table exposing ``shape``, optional
        Inputs the model is scored on. Defaults to the notebook-level ``x``.
    target : array-like, optional
        True target values. Defaults to the notebook-level ``y``.

    Returns
    -------
    float
        The adjusted R^2.

    Raises
    ------
    ValueError
        If ``n - p - 1`` is not positive, in which case the adjustment is
        undefined (division by zero) or meaningless (negative denominator).
    """
    # Calling with no arguments keeps the original behaviour: evaluate the
    # notebook's first model on the data it was trained on.
    if model is None:
        model = reg
    if features is None:
        features = x
    if target is None:
        target = y

    n_obs = features.shape[0]
    n_predictors = features.shape[1]
    residual_dof = n_obs - n_predictors - 1
    if residual_dof <= 0:
        raise ValueError(
            "adjusted R^2 needs more observations than predictors + 1 "
            f"(got n={n_obs}, p={n_predictors})"
        )

    r2 = model.score(features, target)
    return 1 - (1 - r2) * (n_obs - 1) / residual_dof
    
adj_r2 = get_adj_r2()
adj_r2

0.39203134825134023

# Feature Selection

In [9]:
from sklearn.feature_selection import f_regression
# f_regression tests each feature on its own (one simple linear regression per
# column of x against y). The F-statistics and p-values are therefore
# univariate, not the joint multiple-regression tests a statsmodels summary
# would report.
feature_results = f_regression(x,y)

In [10]:
feature_results # tuple of two arrays, one entry per feature:
                # 1st array = F-statistics
                # 2nd array = p-values
                # only the p-values are used further down

(array([56.04804786,  0.17558437]), array([7.19951844e-11, 6.76291372e-01]))

In [11]:
# Second element of the f_regression result holds the p-values.
# Rounding to 3 decimals means very small p-values are displayed as 0.0.
raw_p_values = feature_results[1]
p_val_of_feature_results = raw_p_values.round(3)

In [12]:
p_val_of_feature_results

array([0.   , 0.676])

# Summary Table like statsmodels

In [14]:
# Start the summary table with one row per predictor, labelled by its column name.
reg_summary = pd.DataFrame({'Features': x.columns.values})
reg_summary

Unnamed: 0,Features
0,SAT
1,"Rand 1,2,3"


In [17]:
# Add the fitted weight of each feature; reg was trained on x, so coef_ follows
# the same column order as the 'Features' column.
reg_summary['Coefficients'] = reg.coef_ #adding another column as 'Coefficients'
reg_summary

Unnamed: 0,Features,Coefficients
0,SAT,0.001654
1,"Rand 1,2,3",-0.00827


In [19]:
# NOTE: these p-values come from f_regression (one univariate test per feature),
# whereas the coefficients above them come from the joint multiple regression.
# The two columns are therefore not derived from the same fit.
reg_summary['P-values'] = p_val_of_feature_results #adding another column as 'P-values'
reg_summary

Unnamed: 0,Features,Coefficients,P-values
0,SAT,0.001654,0.0
1,"Rand 1,2,3",-0.00827,0.676


In [28]:
# Recompute both goodness-of-fit measures of the unscaled model so they can be
# displayed side by side.
r2 = reg.score(x,y)
adj_r2 = get_adj_r2()

In [29]:
r2,adj_r2

(0.40668119528142843, 0.39203134825134023)

# Standardization

In [36]:
from sklearn.preprocessing import StandardScaler

# fit_transform learns each column's mean and scale from x and returns the
# standardized values in one call. The fitted scaler is kept so that new
# observations can be transformed with the same parameters.
scaler = StandardScaler()
scaled = scaler.fit_transform(x)


In [46]:
# Same linear model, trained on the standardized features; the target is left unscaled.
reg_std = LinearRegression().fit(scaled, y)
reg_std

LinearRegression()

In [47]:
reg_std.intercept_

3.330238095238095

In [48]:
reg_std.coef_

array([ 0.17181389, -0.00703007])

In [95]:
# Side-by-side comparison of the intercept ("Bias") and the feature weights of
# the model fitted on the raw data (N) and the one fitted on standardized
# data (S). The table is built in a single step: feature names come from
# x.columns and weights are unpacked from coef_, so the rows cannot drift out
# of sync with the fitted models. Column names and order match the table
# displayed for reg_std_summary.
reg_std_summary = pd.DataFrame({
    'Features': ['Bias', *x.columns],
    'Weights(N)': [reg.intercept_, *reg.coef_],
    'Weights(S)': [reg_std.intercept_, *reg_std.coef_],
})

In [124]:
reg_std_summary

Unnamed: 0,Features,Weights(N),Weights(S)
0,Bias,0.296033,3.330238
1,SAT,0.001654,0.171814
2,"Rand 1,2,3",-0.00827,-0.00703


# Predict with Standardized model

In [180]:
# Two hypothetical students to score; column names must match the training features.
new_data = pd.DataFrame({
    'SAT': [1700, 1800],
    'Rand 1,2,3': [2, 1],
})

In [181]:
new_data

Unnamed: 0,SAT,"Rand 1,2,3"
0,1700,2
1,1800,1


In [182]:
# Reuse the scaler that was fitted on x (transform only, no re-fit) so the new
# rows are standardized with the same parameters the model was trained with.
new_data_scaled = scaler.transform(new_data)

In [183]:
new_data_scaled

array([[-1.39811928, -0.07002087],
       [-0.43571643, -1.24637147]])

In [184]:
# Predicted GPA for the new rows from the model that uses both standardized features.
result1 = reg_std.predict(new_data_scaled)
result1

array([3.09051403, 3.26413803])

# Predict after removing the unneeded feature (Rand 1,2,3 in this case)

In [149]:
# Second estimator, kept separate from reg_std so both sets of predictions can be compared.
reg_std_no_rand = LinearRegression()

In [150]:
# Keep only the first standardized column (SAT). Slicing with :1 keeps the
# result two-dimensional, i.e. shape (n_samples, 1).
scaled_data_refine = scaled[:, :1]

In [151]:
# Train on the standardized SAT column only; the target is the same y as before.
reg_std_no_rand.fit(scaled_data_refine,y)

LinearRegression()

In [185]:
# Apply the same column selection to the new rows (first column only, kept 2-D),
# then predict with the SAT-only model.
new_data_scaled_refine = new_data_scaled[:, :1]
result2 = reg_std_no_rand.predict(new_data_scaled_refine)
result2

array([3.08970998, 3.25527879])

In [186]:
# One row per new observation, one column per model. Building the frame from
# the prediction arrays keeps every prediction, instead of hard-coding exactly
# two rows by index.
compare = pd.DataFrame({
    'Prediction with Rand 1,2,3': result1,
    'Prediction without Rand 1,2,3': result2,
})

In [187]:
compare

Unnamed: 0,"Prediction with Rand 1,2,3","Prediction without Rand 1,2,3"
0,3.090514,3.08971
1,3.264138,3.255279
