# This document was built for Linear Regression / Predictions
# Author: Rick Ou
# Version: 1.0
Updated 2/9/2021
Packages used: scikit-learn, pandas, NumPy, Seaborn, statsmodels
(Supervised Linear Model)

### <font color='firebrick'>Setting Up</font>

In [14]:
import pandas as pd
import plotly.graph_objs as go
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

In [16]:
# Load the two CSV inputs used by the rest of the notebook.
# `height` feeds the simple (one-predictor) regressions; the first CSV column
# is used as the row index rather than as data.
height = pd.read_csv("LR_galton.csv", index_col=0)
# `house` feeds the multiple (several-predictor) regressions.
house = pd.read_csv("LR_Housing.csv")

### <font color='firebrick'>No Splitting - Sklearn</font>

Simple:

In [17]:
from sklearn.linear_model import LinearRegression

In [18]:
# Build the simple model on the whole data set (no train/test split):
# the target is the 'child' column, the predictors are every other column.
y = height['child'].to_numpy()
X = height.drop(columns='child').to_numpy()
linear_model = LinearRegression()
linear_model.fit(X, y)

LinearRegression()

In [19]:
# Report the fit: R^2 on the same X, y the model was fitted to, followed by
# the fitted intercept and coefficient array.
r_sq = linear_model.score(X, y)
for label, value in (
    ('Rsq:', r_sq),
    ('intercept:', linear_model.intercept_),
    ('slope:', linear_model.coef_),
):
    print(label, value)

Rsq: 0.5712707984937203
intercept: 25.6411764132815
slope: [0.62145545]


In [20]:
# Predictions from the fitted simple model.
# In-sample predictions for the rows the model was fitted on.
y_pred = linear_model.predict(X)
# Ten new predictor values 50..59 as a single-column 2-D array, the layout
# predict() expects (one row per sample, one column per feature).
x_new = np.arange(50, 60)[:, np.newaxis]
y_new = linear_model.predict(x_new)
print(y_new)

[56.71394872 57.33540416 57.95685961 58.57831506 59.1997705  59.82122595
 60.4426814  61.06413684 61.68559229 62.30704773]


Multiple:

In [21]:
# Quick look at how the housing columns relate to each other: pairwise
# correlations rounded to two decimals (a heat map would show the same thing).
correlation_matrix = house.corr()
correlation_matrix = correlation_matrix.round(2)
print(correlation_matrix)

         crim    zn  river    rm  ptratio  medv
crim     1.00 -0.20  -0.06 -0.22     0.29 -0.39
zn      -0.20  1.00  -0.04  0.31    -0.39  0.36
river   -0.06 -0.04   1.00  0.09    -0.12  0.18
rm      -0.22  0.31   0.09  1.00    -0.36  0.70
ptratio  0.29 -0.39  -0.12 -0.36     1.00 -0.51
medv    -0.39  0.36   0.18  0.70    -0.51  1.00


In [22]:
# Multiple regression on the whole housing data set (no split):
# 'medv' is the target; 'ptratio', 'rm' and 'river' are the predictors.
y2 = house['medv'].to_numpy()
X2 = house.loc[:, ['ptratio', 'rm', 'river']].to_numpy()
linear_model_2 = LinearRegression()
linear_model_2.fit(X2, y2)

LinearRegression()

In [23]:
# Report the multiple-regression fit: R^2 on the fitting data, then the
# intercept and one coefficient per predictor (same order as the X2 columns).
r_sq_house = linear_model_2.score(X2, y2)
report = {
    "Rsq:": r_sq_house,
    "intercept:": linear_model_2.intercept_,
    "Coefficient:": linear_model_2.coef_,
}
for label, value in report.items():
    print(label, value)

Rsq: 0.5686477306723617
intercept: -3.0793824867043966
Coefficient: [-1.22958628  7.6516124   3.14115085]


### <font color='firebrick'>No Splitting - Statsmodels</font>

In [27]:
import statsmodels.api as sm

Simple:

In [30]:
# Build the same simple regression with statsmodels.
# Target is 'child'; predictors are every other column of `height`.
y = height['child'].to_numpy()
X = height.drop(columns='child').to_numpy()
# Add an explicit constant column to the design matrix so the fit includes
# an intercept term (reported as `const` in the summary).
X3 = sm.add_constant(X)
ols = sm.OLS(y, X3)
ols_result = ols.fit()
ols_result.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.571
Model:,OLS,Adj. R-squared:,0.571
Method:,Least Squares,F-statistic:,1234.0
Date:,"Wed, 03 Feb 2021",Prob (F-statistic):,1.7399999999999998e-172
Time:,12:00:56,Log-Likelihood:,-1316.3
No. Observations:,928,AIC:,2637.0
Df Residuals:,926,BIC:,2646.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,25.6412,1.209,21.212,0.000,23.269,28.013
x1,0.6215,0.018,35.127,0.000,0.587,0.656

0,1,2,3
Omnibus:,12.622,Durbin-Watson:,0.156
Prob(Omnibus):,0.002,Jarque-Bera (JB):,11.736
Skew:,-0.23,Prob(JB):,0.00283
Kurtosis:,2.698,Cond. No.,2520.0


Multiple:

In [32]:
# Multiple regression on the housing data with statsmodels:
# 'medv' is the target; 'ptratio', 'rm' and 'river' are the predictors.
y3 = house['medv'].to_numpy()
# Add an explicit constant column so the fit includes an intercept term
# (reported as `const` in the summary).
X3 = sm.add_constant(house[['ptratio', 'rm', 'river']].to_numpy())
ols1 = sm.OLS(y3, X3)
ols1_result = ols1.fit()
ols1_result.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.569
Model:,OLS,Adj. R-squared:,0.566
Method:,Least Squares,F-statistic:,220.6
Date:,"Wed, 03 Feb 2021",Prob (F-statistic):,2.98e-91
Time:,12:06:29,Log-Likelihood:,-1627.5
No. Observations:,506,AIC:,3263.0
Df Residuals:,502,BIC:,3280.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.0794,4.161,-0.740,0.460,-11.255,5.096
x1,-1.2296,0.134,-9.187,0.000,-1.493,-0.967
x2,7.6516,0.411,18.614,0.000,6.844,8.459
x3,3.1412,1.071,2.933,0.004,1.037,5.245

0,1,2,3
Omnibus:,165.871,Durbin-Watson:,0.802
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1219.6
Skew:,1.224,Prob(JB):,1.47e-265
Kurtosis:,10.201,Cond. No.,304.0


### <font color='firebrick'>Splitting - Sklearn</font>

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix

In [27]:
# Load the wine data set from the UCI repository. header=None tells pandas
# not to treat the first row as column names, so columns get integer labels.
WINE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
df_wine = pd.read_csv(WINE_URL, header=None)
df_wine

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [28]:
# X holds the predictors (every column after the first) and y the variable
# being predicted (the first column).
# 30% of the rows are held out for testing; the remaining 70% are for training.
from sklearn.model_selection import train_test_split

y = df_wine.iloc[:, 0].values
X = df_wine.iloc[:, 1:].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,  # fixed seed so the split is reproducible
    stratify=y,  # stratify the split on the values of y
)

In [29]:
# Build the model, fitting on the training portion only.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

LinearRegression()

In [30]:
# Output of the model fitted on the training split.
# NOTE(review): this R^2 is scored on the full X, y (the training rows plus
# the held-out rows), not on X_test / y_test alone.
r_sq = linear_model.score(X, y)
for label, value in (
    ('Rsq:', r_sq),
    ('intercept:', linear_model.intercept_),
    ('slope:', linear_model.coef_),
):
    print(label, value)

Rsq: 0.8978058595457641
intercept: 4.546420646208332
slope: [-0.12513277  0.04273739 -0.07449586  0.03005712 -0.00098181  0.17766913
 -0.38843476 -0.37358106  0.01103795  0.07904957 -0.02978961 -0.26339015
 -0.00076404]


In [31]:
# Predict the held-out test rows with the model fitted on the training rows.
# The predictions are continuous values, one per row of X_test.
y_pred = linear_model.predict(X_test)
print(y_pred)

[1.07014351 2.19672438 0.818916   0.98270616 1.5489863  2.97204665
 2.23112173 3.08734032 1.16514404 3.01599338 1.05133087 1.85582846
 2.99957892 1.136675   3.18152061 3.11665136 1.18946997 1.18826177
 3.01469928 1.54055079 0.85365725 2.94132931 2.08334918 1.99048162
 1.92322139 1.8586196  0.80023998 2.42140762 2.90422408 1.97753294
 1.5824685  2.81181759 1.28304785 2.241602   2.13509484 1.10726656
 2.17189711 1.26921464 1.33772734 2.83079053 2.75761483 1.96803339
 1.99475072 1.80037841 1.22587622 1.9358688  2.08406889 2.54786005
 2.21543174 2.68518804 2.23428746 2.44619913 1.33656904 1.24645804]


In [32]:
# Evaluate the model on the held-out test split.
# Mean squared error between the true and predicted test targets.
mse = mean_squared_error(y_test, y_pred)
# Coefficient of determination (R^2): 1 is a perfect prediction.
r2 = r2_score(y_test, y_pred)

# The fitted coefficients, one per predictor column.
print('Coefficients: \n', linear_model.coef_)
print('Mean squared error:', mse)
print('Coefficient of determination:', r2)

Coefficients: 
 [-0.12513277  0.04273739 -0.07449586  0.03005712 -0.00098181  0.17766913
 -0.38843476 -0.37358106  0.01103795  0.07904957 -0.02978961 -0.26339015
 -0.00076404]
Mean squared error: 0.05389563039120559
Coefficient of determination: 0.9113594708286771
