

## Multiple linear regression

50-startups problem


In [1]:
#import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import influence_plot

In [2]:
# loading the data
# The path is relative, so the CSV is looked up in the current working directory.
startup=pd.read_csv('50_Startups.csv')
# Preview the first rows to confirm the file was parsed as expected.
startup.head()

**EDA**

In [3]:
# Column dtypes and non-null counts of the raw frame.
startup.info()

All datatypes are correct.


In [4]:
# Descriptive analysis (summary statistics)
startup.describe()

In [5]:
# count the missing entries in each column
startup.isna().sum()

There are no missing values in the dataset.

In [6]:
# (number of rows, number of columns) of the raw frame
startup.shape

In [7]:
# Map the original headers to short identifiers that can be used directly
# inside the regression formulas further down.
column_aliases = {
    'R&D Spend': 'rds',
    'Administration': 'admin',
    'Marketing Spend': 'marketing',
    'State': 'state',
    'Profit': 'profit',
}
data = startup.rename(columns=column_aliases)
data.head()

In [8]:
# show any row that repeats an earlier row exactly
data.loc[data.duplicated()]

There are no duplicate rows in the dataset.

**Correlation**

In [9]:
# Pairwise correlation between the numeric columns.
# The frame also carries the `state` column, which is presumably non-numeric;
# pandas >= 2.0 raises on such columns unless they are excluded explicitly,
# so the computation is restricted to numeric data.
data.corr(numeric_only=True)

In [10]:
# Dark-grid theme, then a grid of pairwise plots for the columns of `data`.
sns.set_style('darkgrid')
sns.pairplot(data)

**Model Building**

In [11]:
# Ordinary least squares fit of profit on the three spend variables,
# specified through the formula interface.
model = smf.ols("profit~rds+admin+marketing",data=data).fit()

**Model Testing**

In [12]:
# Estimated coefficients of the fitted model
model.params

In [13]:
# t-statistics, alongside the p-values rounded to five decimals
model.tvalues, model.pvalues.round(5)

In [14]:
# R-squared and adjusted R-squared of the full model
model.rsquared , model.rsquared_adj

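Model fit (on the R-squared scale) ≈ 94.75%. Note that R-squared measures the share of the variance in profit explained by the model; it is a goodness-of-fit measure rather than a predictive accuracy.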

In [15]:
# build SLR and MLR models for insignificant variables 'admin' and 'marketing'
# First: simple regression of profit on admin alone.
slr_admin=smf.ols("profit~admin",data=data).fit()

In [16]:
# t-values and p-values of the admin-only model
slr_admin.tvalues , slr_admin.pvalues

In [17]:
# Simple regression of profit on marketing alone, with its t-values and p-values
slr_mark=smf.ols("profit~marketing",data=data).fit()
slr_mark.tvalues,slr_mark.pvalues

In [18]:
# Regression of profit on admin and marketing together (rds left out),
# with its t-values and p-values
mlr_am=smf.ols("profit~admin+marketing",data=data).fit()
mlr_am.tvalues,mlr_am.pvalues

**Model Validation**

**1. Collinearity check**

In [19]:
# 1) Collinearity check
# VIF = 1 / (1 - R^2), where R^2 comes from regressing one predictor on the
# remaining predictors. This cell handles `rds`.

rds_aux_fit = smf.ols("rds~admin+marketing", data=data).fit()
rsq_r = rds_aux_fit.rsquared
vif_r = 1 / (1 - rsq_r)

In [20]:
# VIF for `admin`: regress it on the other two predictors.
admin_aux_fit = smf.ols("admin~rds+marketing", data=data).fit()
rsq_a = admin_aux_fit.rsquared
vif_a = 1 / (1 - rsq_a)

In [21]:
# VIF for `marketing`: regress it on the other two predictors.
marketing_aux_fit = smf.ols("marketing~rds+admin", data=data).fit()
rsq_m = marketing_aux_fit.rsquared
vif_m = 1 / (1 - rsq_m)

In [22]:
# Collect the three VIFs in a small table for display
vif_by_variable = {'rds': vif_r, 'admin': vif_a, 'marketing': vif_m}
data1 = {
    'Variables': list(vif_by_variable),
    'Vif': list(vif_by_variable.values()),
}
Vif_df = pd.DataFrame(data1)
Vif_df

**2.Residual Analysis**

In [23]:
# Normal Q-Q plot of the residuals: points lying close to the reference line
# indicate approximately normally distributed errors.
# `sm` (statsmodels.api) is imported with the other libraries in the imports
# cell at the top of the notebook instead of in the middle of the analysis.
model=smf.ols("profit~rds+admin+marketing",data=data).fit()
qqplot=sm.qqplot(model.resid,line='q')
plt.title("Normal Q-Q plot of residuals")
plt.show()

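Data showing homoscedasticity (note: the Q-Q plot above checks the normality of the residuals; homoscedasticity itself is checked with the residual plot below).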

In [24]:
# Positions of the observations whose residual is greater than 10.
# NOTE(review): residuals are in the units of `profit`; confirm that 10 is
# the intended cutoff for this data.
list(np.where(model.resid>10))

**Residual Plot for Homoscedasticity**

In [25]:
# Same formula and data as the earlier fit; `model` is simply rebound here.
model=smf.ols("profit~rds+admin+marketing",data=data).fit()

In [26]:
# Full regression report for the fitted model
model.summary()

In [27]:
def get_standardized_values(vals):
    """Return ``vals`` as z-scores: centred on its mean and scaled by its std.

    ``vals`` must provide ``mean()`` and ``std()`` and support arithmetic with
    their results (for example a pandas Series or a NumPy array). The kind of
    standard deviation used is whatever ``vals.std()`` returns by default.
    """
    centred = vals - vals.mean()
    spread = vals.std()
    return centred / spread

In [28]:
# Standardized residuals against standardized fitted values; an even,
# patternless band around zero is what constant error variance looks like.
std_fitted = get_standardized_values(model.fittedvalues)
std_resid = get_standardized_values(model.resid)

plt.figure(figsize=(15, 10))
plt.scatter(std_fitted, std_resid)
plt.title('Residual Plot')
plt.xlabel('Standardized Fitted values')
plt.ylabel('Standardized residual values')
plt.show()

**Residual Vs Regressors plots**

In [29]:
# Regression diagnostic panels for the `rds` regressor, drawn on a 15x10 figure.
fig = sm.graphics.plot_regress_exog(model, "rds", fig=plt.figure(figsize=(15, 10)))
plt.show()

In [30]:
# Regression diagnostic panels for the `admin` regressor, drawn on a 15x10 figure.
fig = sm.graphics.plot_regress_exog(model, "admin", fig=plt.figure(figsize=(15, 10)))
plt.show()

In [31]:
# Regression diagnostic panels for the `marketing` regressor, drawn on a 15x10 figure.
fig = sm.graphics.plot_regress_exog(model, "marketing", fig=plt.figure(figsize=(15, 10)))
plt.show()

**Identify the outliers**

Using Cook's distance

In [32]:
# Cook's distance for every observation of the fitted model.
# `cooks_distance` is unpacked as a pair in the original; only its first
# element (the distances) is needed, so it is selected by index. This avoids
# binding `_`, which IPython uses for the last cell output.
# `influence_plot` is not used in this cell; it is imported where the other
# libraries are imported and again right before its use.
model_influence = model.get_influence()
c = model_influence.cooks_distance[0]

In [33]:
# Plot the Cook's distance of every observation as a stem plot.
# The x positions are derived from `c` itself so they always match the
# observations the model was fitted on, and `fig`/`ax` are unpacked so that
# `fig` is the Figure rather than the (figure, axes) tuple.
fig, ax = plt.subplots(figsize=(20, 7))
ax.stem(np.arange(len(c)), np.round(c, 3))
ax.set_xlabel('Row index')
ax.set_ylabel('Cooks Distance')
plt.show()

Index and value of the most influential observation (largest Cook's distance); a Cook's distance above 0.5 is treated as an influencer.

In [34]:
# position and size of the largest Cook's distance
c.argmax(), c.max()

**High Influence points**

In [35]:
# Leverage cutoff 3*(k + 1)/n, where k is the number of predictors in the
# model and n the number of observations it was fitted on.
# Both are read from the fitted model: counting the columns of the raw frame
# would also count the response and the column that is not in the formula.
k = int(model.df_model)   # regressors, intercept not counted
n = int(model.nobs)       # observations used in the fit
leverage_cutoff = 3*((k + 1)/n)
leverage_cutoff

In [36]:
from statsmodels.graphics.regressionplots import influence_plot


In [37]:
# Influence plot of the fitted model.
# The original call passed a misspelled `alhpa=0.5` keyword, which is not a
# parameter of `influence_plot`; it is dropped, so the function's default
# `alpha` is used.
influence_plot(model)

# Mark the leverage cutoff with a vertical column of red '+' markers.
y = list(range(-2, 8))
x = [leverage_cutoff] * len(y)
plt.plot(x, y, 'r+')

plt.show()

We can see in the plot above that data point 49 is the influencer.

In [38]:
# inspect the row flagged as the influencer
data.loc[data.index.isin([49])]

**Improving the model**

In [39]:
# Discard the influential observation (position 49) and reassign the row
# numbers so the index runs from 0 again.
influencer_labels = data.index[[49]]
data2 = data.drop(index=influencer_labels).reset_index(drop=True)

In [40]:
# Display the frame with the influential row removed
data2

**Final Model**

In [41]:
# Model deletion diagnostics
# Fit on the reduced data and keep dropping the single most influential
# observation until no Cook's distance exceeds the cutoff.
#
# Differences from the original loop:
#   * the reduced frame is fed back into the next fit (the original stored it
#     in an unused variable, so a second iteration could never make progress);
#   * the baseline `model` and its distances `c` are left untouched, so the
#     later comparison between `model` and `final_model` stays meaningful;
#   * `final_model` is always defined, without relying on `while ... else`.
cooks_cutoff = 0.5
data_final = data2
final_model = smf.ols("profit~rds+admin+marketing", data=data_final).fit()
cooks_final = final_model.get_influence().cooks_distance[0]

while np.max(cooks_final) > cooks_cutoff:
    # Drop the observation with the largest Cook's distance, then refit.
    worst_row = data_final.index[[np.argmax(cooks_final)]]
    data_final = data_final.drop(index=worst_row).reset_index(drop=True)
    final_model = smf.ols("profit~rds+admin+marketing", data=data_final).fit()
    cooks_final = final_model.get_influence().cooks_distance[0]

print("Thus model accuracy is improved to",final_model.rsquared)

In [42]:
# R-squared of the final model
final_model.rsquared

In [43]:
# Full regression report for the final model
final_model.summary()

**Model prediction**

In [44]:
# A single hypothetical startup, described by its three spend figures,
# to be scored with the fitted model.
new_startup_spend = {'rds': 70000, "admin": 90000, "marketing": 140000}
startup1 = pd.DataFrame(new_startup_spend, index=[0])
startup1

In [45]:
# Predicted profit for the single hand-built startup row (`startup1`)
final_model.predict(startup1)

In [46]:
# Predicted profit for every row of `data`
# (`data` still contains the row that was dropped to build `data2`).
pred_y=final_model.predict(data)
pred_y

In [47]:
# Side-by-side R-squared of the baseline and the final model
fitted_models = {'Model': model, 'Final_Model': final_model}
d2 = {
    'Prep_Models': list(fitted_models),
    'Rsquared': [fit.rsquared for fit in fitted_models.values()],
}
table = pd.DataFrame(d2)
table