# Multiple Linear Regression
* Read the data from CSV
* Read all independent columns to X
* Read dependent (predictable) column to y
* X has a categorical column called 'State'; **do one-hot encoding for categorical variables**
* Split the X,y to training and test data-sets

In [1]:
# Import pandas (tabular data) and numpy (array operations)
import pandas as pd
import numpy as np

In [2]:
# Read the 50-startups dataset from a CSV file (relative path) into a DataFrame
df_startup = pd.read_csv('../data-csv/csv-files/50_Startups.csv')
# Preview the first rows to check the columns that were loaded
df_startup.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Feature matrix X: every column of the dataset except the target column 'Profit'
X = df_startup.drop(columns='Profit')
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [4]:
# Target vector y: the 'Profit' column.
# Selected by name rather than by a hard-coded position, so the cell keeps
# picking the right column if columns are ever added or reordered.
y = df_startup['Profit']
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [5]:
# "State" is a categorical variable, so encode it with one-hot encoding:
# each category becomes its own 0/1 indicator column that ML algorithms can consume.
# drop_first=True drops the first category (California) to avoid the "DUMMY VARIABLE TRAP";
# a row with 0 in both remaining columns therefore means California.
# dtype=int keeps the indicators as 0/1 integers. Newer pandas versions default to bool,
# which would turn the feature matrix into an object array once it is combined with the
# float columns and converted to NumPy for the statsmodels fits further down.
states = pd.get_dummies(X['State'], drop_first=True, dtype=int)
states.head()

Unnamed: 0,Florida,New York
0,0,1
1,0,0
2,1,0
3,0,1
4,1,0


In [6]:
# Remove the raw text column 'State' from X (its one-hot encoded form lives in `states`).
# Note: X is reassigned, so re-running this cell alone raises KeyError ('State' is already gone).
X = X.drop(columns='State')
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,165349.2,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


In [7]:
# Concatenate the one-hot encoded State columns onto X, side by side
# (axis='columns' appends columns rather than rows)
X = pd.concat(objs=[X, states], axis='columns')
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [8]:
# Split into training and test sets: 20% of the rows are held out for testing,
# and random_state=0 makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Fit an Ordinary Least Squares linear model, coefficients w = (w1, ..., wp),
# on the training rows. fit() returns the estimator itself, so it can be chained.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression().fit(X_train, y_train)
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
# Predict Profit for the held-out test rows with the fitted model
y_pred = regressor.predict(X_test)

In [11]:
# Model evaluation using mean absolute error (MAE):
# the average absolute difference between actual and predicted Profit
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_pred)

# The value displayed is the mean absolute error on the test data-set

7514.293659640612

## Feature selection using backward elimination

* We have 4 features (also called independent variables) in the given data-set
* Some features may be statistically more significant to predict profit than the others
* Some features may have very little statistical significance
* We need to identify and remove the insignificant features; one technique for doing this is called backward elimination

In [12]:
# Backward elimination removes the least significant feature one at a time,
# judged by a statistic called the "p-value".
# The OLS class in statsmodels.regression.linear_model reports a p-value per column.
# Requires statsmodels; if it is missing, install it with: %pip install statsmodels
import statsmodels.regression.linear_model as sm_regrsr_lmodel
# statsmodels' OLS does not add an intercept term by itself, so prepend a column of ones.
# The number of rows is taken from X instead of being hard-coded to 50.
# Resulting column layout:
#   0 = constant, 1 = R&D Spend, 2 = Administration, 3 = Marketing Spend,
#   4 = Florida, 5 = New York
# NOTE: this overwrites X with a NumPy array; re-running this cell on its own
# prepends another column of ones.
X = np.append(arr=np.ones((len(X), 1)).astype(int), values=X, axis=1)

In [13]:
# Backward elimination, first fit: OLS on the constant plus columns 1-4
# (x1 = R&D Spend, x2 = Administration, x3 = Marketing Spend, x4 = Florida).
# NOTE(review): X has six columns at this point (index 5 is the 'New York' dummy),
# so this starting model does not contain every feature - confirm whether leaving
# out column 5 is intended.
X_opt = X[:,[0,1,2,3,4]]
regressor_OLS = sm_regrsr_lmodel.OLS(endog = y, exog = X_opt).fit()
# summary() lists the coefficient and p-value (P>|t|) of every column
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Tue, 10 Mar 2020",Prob (F-statistic):,8.49e-29
Time:,15:05:40,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,0.8060,0.046,17.606,0.000,0.714,0.898
x2,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x3,0.0270,0.017,1.592,0.118,-0.007,0.061
x4,220.1585,2900.536,0.076,0.940,-5621.821,6062.138

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [14]:
# Backward elimination, next fit: x4 (column 4, the Florida dummy) had the largest
# p-value in the previous summary (0.940), so drop it and refit on columns 0-3.
X_opt = X[:, [0, 1, 2, 3]]
regressor_OLS = sm_regrsr_lmodel.OLS(
    endog=y,
    exog=X_opt,
).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Tue, 10 Mar 2020",Prob (F-statistic):,4.53e-30
Time:,15:05:41,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [15]:
# Backward elimination, next fit: x2 (column 2, Administration) had the largest
# p-value in the previous summary (0.602), so drop it.
# Remaining columns: constant (0), R&D Spend (1), Marketing Spend (3).
X_opt = X[:, [0, 1, 3]]
regressor_OLS = sm_regrsr_lmodel.OLS(
    endog=y,
    exog=X_opt,
).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Tue, 10 Mar 2020",Prob (F-statistic):,2.1600000000000003e-31
Time:,15:05:42,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [16]:
# Backward elimination, last fit: Marketing Spend (column 3, shown as x2 in the previous
# summary) had p = 0.060 - presumably judged against a 0.05 significance level - so drop it.
# Remaining columns: constant (0) and R&D Spend (1).
X_opt = X[:, [0, 1]]
regressor_OLS = sm_regrsr_lmodel.OLS(
    endog=y,
    exog=X_opt,
).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Tue, 10 Mar 2020",Prob (F-statistic):,3.5000000000000004e-32
Time:,15:05:42,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


In [17]:
# Fit the scikit-learn linear regression once more, this time on the columns kept by
# backward elimination. X_opt holds the constant column plus R&D Spend.
# The split uses the same test_size and random_state as the earlier split.
from sklearn.model_selection import train_test_split
X_opt_train, X_opt_test, y_opt_train, y_opt_test = train_test_split(
    X_opt, y,
    test_size=0.2,
    random_state=0,
)
# Ordinary Least Squares fit of a linear model with coefficients w = (w1, ..., wp);
# note that this replaces the earlier `regressor` object.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression().fit(X_opt_train, y_opt_train)
y_opt_pred = regressor.predict(X_opt_test)
# Model evaluation using mean absolute error on the held-out test rows
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_opt_test, y_opt_pred)

6772.453280477901