In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
dataset = pd.read_csv(r'D:\Trainer\DSc ML\Datasets\Linear Reg\50_Startups.csv')
# dataset = pd.read_csv(r'D:\Trainer\DSc ML\Datasets\Linear Reg\Salary_Data.csv')

In [3]:
dataset.head()

# Profit: DV

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Status of the dataset

dataset.info()

# No missing values
# State is a categorical variable but it is necessary so we can't remove
# So, we need to encode it because the model won't work on categorical values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.8+ KB


In [5]:
# No missing values
dataset.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

#### Split data into independent variable, x and dependent variable, y

In [6]:
x = dataset.iloc[:, 0 : 4].values
x

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [131876.9, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [130298.13, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 108679.17, 304981.62, 'California'],
       [101913.08, 110594.11, 229160.95, 'Florida'],
       [100671.96, 91790.61, 249744.55, 'California'],
       [93863.75, 127320.38, 249839.44, 'Florida'],
       [91992.39, 135495.07, 252664.93, 'California'],
       [119943.24, 156547.42, 256512.92, 'Florida'],
       [114523.61, 122616.84, 261776.23, 'New York'],
       [78013.11, 121597.55, 264346.06, 'California'],
       [94657.16, 145077.58, 282574.31, 'New York'],
       [91749.16, 114175.79, 29491

In [7]:
# No endogeneity problem, as we rightly found dependent variable
y = dataset.iloc[:, 4].values
y

array([192261.83, 191792.06, 191050.39, 182901.99, 166187.94, 156991.12,
       156122.51, 155752.6 , 152211.77, 149759.96, 146121.95, 144259.4 ,
       141585.52, 134307.35, 132602.65, 129917.04, 126992.93, 125370.37,
       124266.9 , 122776.86, 118474.03, 111313.02, 110352.25, 108733.99,
       108552.04, 107404.34, 105733.54, 105008.31, 103282.38, 101004.64,
        99937.59,  97483.56,  97427.84,  96778.92,  96712.8 ,  96479.51,
        90708.19,  89949.14,  81229.06,  81005.76,  78239.91,  77798.83,
        71498.49,  69758.98,  65200.33,  64926.08,  49490.75,  42559.73,
        35673.41,  14681.4 ])

### Encoding

In [8]:
# x1 = pd.DataFrame(x) # Converting x into dataframe to do encoding
# x1.head()

### We will not use manual approach as we have to convert both x and y into dataframes

In [9]:

# x1['State'] = x1['State'].astype('category')
# x1['State'] = x1['State'].cat.codes #Encodes starting from 0 as per ascending order
# x2  = pd.get_dummies(x1, columns = ['State']) # Get One Hot Encoding

# x1 = x1.iloc[:, 0 : 4] # get dummy variables

### We will use sklearn package

In [10]:
from sklearn.preprocessing import LabelEncoder
encoding = LabelEncoder()
x[:, 3] = encoding.fit_transform(x[:, 3]) # Encoding into serial numbers

In [11]:
x

array([[165349.2, 136897.8, 471784.1, 2],
       [162597.7, 151377.59, 443898.53, 0],
       [153441.51, 101145.55, 407934.54, 1],
       [144372.41, 118671.85, 383199.62, 2],
       [142107.34, 91391.77, 366168.42, 1],
       [131876.9, 99814.71, 362861.36, 2],
       [134615.46, 147198.87, 127716.82, 0],
       [130298.13, 145530.06, 323876.68, 1],
       [120542.52, 148718.95, 311613.29, 2],
       [123334.88, 108679.17, 304981.62, 0],
       [101913.08, 110594.11, 229160.95, 1],
       [100671.96, 91790.61, 249744.55, 0],
       [93863.75, 127320.38, 249839.44, 1],
       [91992.39, 135495.07, 252664.93, 0],
       [119943.24, 156547.42, 256512.92, 1],
       [114523.61, 122616.84, 261776.23, 2],
       [78013.11, 121597.55, 264346.06, 0],
       [94657.16, 145077.58, 282574.31, 2],
       [91749.16, 114175.79, 294919.57, 1],
       [86419.7, 153514.11, 0.0, 2],
       [76253.86, 113867.3, 298664.47, 0],
       [78389.47, 153773.43, 299737.29, 2],
       [73994.56, 122782.75, 30331

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
col_transformer = ColumnTransformer([('encoder', OneHotEncoder(), [3])], remainder = 'passthrough')

# Setting remainder=’passthrough’ will mean that all columns not specified in the list of “transformers” will be passed through
# without transformation, instead of being dropped.

In [13]:
col_transformer

ColumnTransformer(remainder='passthrough',
                  transformers=[('encoder', OneHotEncoder(), [3])])

In [14]:
x1 = col_transformer.fit_transform(x)

In [15]:
x = np.hstack((x1[:, 3 : 6], x1[:, 0 : 3]))

In [16]:
pd.DataFrame(x).head()

Unnamed: 0,0,1,2,3,4,5
0,165349.2,136897.8,471784.1,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,0.0,1.0,0.0


In [17]:
len(x[0])

6

In [18]:
x = x[:, : len(x[0]) - 1] # Getting Dummy Variables

In [19]:
pd.DataFrame(x)

Unnamed: 0,0,1,2,3,4
0,165349.2,136897.8,471784.1,0.0,0.0
1,162597.7,151377.59,443898.53,1.0,0.0
2,153441.51,101145.55,407934.54,0.0,1.0
3,144372.41,118671.85,383199.62,0.0,0.0
4,142107.34,91391.77,366168.42,0.0,1.0
5,131876.9,99814.71,362861.36,0.0,0.0
6,134615.46,147198.87,127716.82,1.0,0.0
7,130298.13,145530.06,323876.68,0.0,1.0
8,120542.52,148718.95,311613.29,0.0,0.0
9,123334.88,108679.17,304981.62,1.0,0.0


### Split into training and test

In [20]:
from sklearn.model_selection import train_test_split
# from sklearn.cross_validation import train_test_split # for Python 2.7

x_train, x_test, y_train, y_test = train_test_split(x, y) # By default we will get 75% : 25% split

In [21]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(37, 5)
(13, 5)
(37,)
(13,)


In [22]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()

# Arguments of LinearRegression() if used or tuned, it is called as Hyperparameter tuning

In [23]:
regressor.fit(x_train, y_train)

LinearRegression()

In [24]:
y_pred = regressor.predict(x_test)

In [25]:
y_pred

array([149365.88460003,  64244.87799749,  55343.53312077,  99493.92185129,
       110132.03212471, 116301.09307534, 114467.00379459, 101718.98555035,
        53877.83411307, 192264.20526624,  75393.11820325, 182399.87175684,
       103461.41230143])

In [26]:
y_test

array([132602.65,  65200.33,  49490.75,  97483.56, 108733.99, 108552.04,
       110352.25, 107404.34,  35673.41, 192261.83,  90708.19, 191050.39,
       101004.64])

In [27]:
regressor.coef_

array([ 8.17160098e-01, -5.40170479e-02,  1.76189642e-02, -2.24484862e+03,
       -9.40112275e+02])

In [28]:
regressor.intercept_

56229.90469566645

In [29]:
from sklearn.metrics import r2_score

In [30]:
r2_score(y_test, y_pred)

0.9577807294290395

## 95% Accuracy

It is a multiple linear regression problem so we have to go for OLS method

#### It is a Multiple Linear Regression, hence we have to use OLS method, find Durbin Watson test and verify other assumptions

In [31]:
# import statsmodels.formula.api as sm # deprecated
import statsmodels.api as sm

In [32]:
x_train.dtype

dtype('O')

In [33]:
x_train = np.array(x_train, dtype=float)

In [34]:
x_train.dtype

dtype('float64')

In [35]:
regressor_ols = sm.OLS(endog = y_train, exog = x_train).fit()

# Endogeneity: Dependent Variable
# Exogeneity: Independent Variable

In [36]:
regressor_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.988
Model:,OLS,Adj. R-squared (uncentered):,0.986
Method:,Least Squares,F-statistic:,507.3
Date:,"Wed, 02 Feb 2022",Prob (F-statistic):,1.7800000000000001e-29
Time:,18:23:35,Log-Likelihood:,-404.09
No. Observations:,37,AIC:,818.2
Df Residuals:,32,BIC:,826.2
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.7017,0.086,8.179,0.000,0.527,0.876
x2,0.3069,0.046,6.616,0.000,0.212,0.401
x3,0.0864,0.029,2.993,0.005,0.028,0.145
x4,6000.1366,5281.703,1.136,0.264,-4758.340,1.68e+04
x5,6020.8599,6001.365,1.003,0.323,-6203.521,1.82e+04

0,1,2,3
Omnibus:,0.003,Durbin-Watson:,1.691
Prob(Omnibus):,0.999,Jarque-Bera (JB):,0.106
Skew:,-0.004,Prob(JB):,0.949
Kurtosis:,2.738,Cond. No.,797000.0


### P > |t|: P is p-value.
### If P <= 0.05: Significant Variable
### If P > 0.05: Not significant

#### for State variable, p-value is more than 0.05.
#### So, State is not significant. We can remove it

**Adj R-Square is calculated considering only the significant variables**

**R-Square is calculates considering both significant and insignificant variables**

## Decision Tree & Random Forest Usng Regression Model

In [37]:
from sklearn.tree import DecisionTreeRegressor

In [38]:
dt_reg = DecisionTreeRegressor()
dt_reg.fit(x_train, y_train)

DecisionTreeRegressor()

In [39]:
y_pred_dt = dt_reg.predict(x_test)

In [40]:
r2_score(y_test, y_pred_dt)

0.8878645746933184

In [41]:
from sklearn.ensemble import RandomForestRegressor

In [42]:
rf_reg = RandomForestRegressor()
rf_reg.fit(x_train, y_train)

RandomForestRegressor()

In [43]:
y_pred_rf = rf_reg.predict(x_test)

In [44]:
r2_score(y_test, y_pred_rf)

0.9343187894086853