### Multiple Linear Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv("50_Startups.csv")
dataset

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [3]:
X = dataset.iloc[:,:-1]
y = dataset.iloc[:,-1]

In [5]:
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [6]:
states = pd.get_dummies(X['State'], drop_first=True)

In [7]:
X = X.drop('State', axis=1)

In [8]:
X = pd.concat([X,states],axis=1)

In [9]:
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
# Model Multiple Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [13]:
y_pred = regressor.predict(X_test)

In [14]:
from sklearn.metrics import r2_score
score = r2_score(y_test, y_pred)

In [15]:
# If score value is closer to 1 it shows the model is good
score

0.9347068473282424

### Multicollinerity in Linear Regression

In [16]:
import pandas as pd

In [19]:
!pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.12.0-cp37-none-win_amd64.whl (9.1 MB)
Collecting patsy>=0.5


The system cannot find the path specified.
You should consider upgrading via the 'C:\Users\AG20459\AppData\Local\Continuum\anaconda3\envs\tf2_2\python.exe -m pip install --upgrade pip' command.


  Downloading patsy-0.5.1-py2.py3-none-any.whl (231 kB)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.12.0


In [22]:
import statsmodels.api as sm
df_adv = pd.read_csv("data/advertising.csv",index_col=0)
X = df_adv[['TV','radio','newspaper']]
y = df_adv[['sales']]
df_adv.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [25]:
# OLS(Ordinary Least Square) model with intercept on TV and Radio
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

In [26]:
model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,570.3
Date:,"Wed, 28 Oct 2020",Prob (F-statistic):,1.58e-96
Time:,22:41:31,Log-Likelihood:,-386.18
No. Observations:,200,AIC:,780.4
Df Residuals:,196,BIC:,793.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9389,0.312,9.422,0.000,2.324,3.554
TV,0.0458,0.001,32.809,0.000,0.043,0.049
radio,0.1885,0.009,21.893,0.000,0.172,0.206
newspaper,-0.0010,0.006,-0.177,0.860,-0.013,0.011

0,1,2,3
Omnibus:,60.414,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.241
Skew:,-1.327,Prob(JB):,1.44e-33
Kurtosis:,6.332,Cond. No.,454.0


In [27]:
import matplotlib.pyplot as plt
X.iloc[:,:-1].corr()

Unnamed: 0,const,TV,radio
const,,,
TV,,1.0,0.054809
radio,,0.054809,1.0


In [29]:
df_salary = pd.read_csv('data/Salary_Data.csv')
df_salary.head()

Unnamed: 0,YearsExperience,Age,Salary
0,1.1,21.0,39343
1,1.3,21.5,46205
2,1.5,21.7,37731
3,2.0,22.0,43525
4,2.2,22.2,39891


In [30]:
X = df_salary[['YearsExperience','Age']]
y= df_salary[['Salary']]

In [31]:
X=sm.add_constant(X)
model=sm.OLS(y,X).fit()

In [32]:
model.summary()

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.96
Model:,OLS,Adj. R-squared:,0.957
Method:,Least Squares,F-statistic:,323.9
Date:,"Thu, 29 Oct 2020",Prob (F-statistic):,1.35e-19
Time:,07:00:44,Log-Likelihood:,-300.35
No. Observations:,30,AIC:,606.7
Df Residuals:,27,BIC:,610.9
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6661.9872,2.28e+04,-0.292,0.773,-5.35e+04,4.02e+04
YearsExperience,6153.3533,2337.092,2.633,0.014,1358.037,1.09e+04
Age,1836.0136,1285.034,1.429,0.165,-800.659,4472.686

0,1,2,3
Omnibus:,2.695,Durbin-Watson:,1.711
Prob(Omnibus):,0.26,Jarque-Bera (JB):,1.975
Skew:,0.456,Prob(JB):,0.372
Kurtosis:,2.135,Cond. No.,626.0


In [34]:
X.iloc[:,1:].corr()

Unnamed: 0,YearsExperience,Age
YearsExperience,1.0,0.987258
Age,0.987258,1.0
