<a href="https://colab.research.google.com/github/revati2013/Assignment05_Multi-Linear-Regression/blob/main/Assignment_5(50_startup).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.graphics.regressionplots import influence_plot

In [None]:
# Load the dataset from '50_Startups.csv' (path is relative to the working directory) into `d`.
d=pd.read_csv('50_Startups.csv')

In [None]:
# Display the raw frame exactly as loaded, before any columns are renamed.
d

In [None]:
# Shorten the three predictor column names so they can be used as plain
# identifiers inside the regression formulas; all other columns keep their names.
short_names={'R&D Spend':'RDS','Administration':'Adms','Marketing Spend':'MS'}
data=d.rename(columns=short_names)

In [None]:
# Print a structural summary of the renamed frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Show every row that is an exact repeat of an earlier row; an empty result means no duplicates.
data.loc[data.duplicated()]

No duplicate rows were found in the data.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Correlation analysis
# Restrict to numeric columns before calling corr(): the rename step only touches
# three columns, so any text column in the CSV is still present here, and
# pandas >= 2.0 raises on it instead of silently skipping it.
# NOTE(review): the stock 50_Startups file carries a text 'State' column - confirm.
# select_dtypes behaves the same on old and new pandas, and the result is
# unchanged when every column is already numeric.
data.select_dtypes(include='number').corr()

In [None]:
# Pairwise scatter plots of the numeric columns, with each column's own distribution on the diagonal.
sns.pairplot(data)

In [None]:
# Model building: ordinary least squares of Profit on all three spend variables.
full_formula="Profit~RDS+Adms+MS"
model=smf.ols(formula=full_formula,data=data).fit()

In [None]:
# Fitted coefficients of the full model (intercept plus one slope per predictor).
model.params

In [None]:
# t-statistics of each coefficient, and their p-values rounded to 5 decimals
rounded_pvalues=model.pvalues.round(5)
model.tvalues,rounded_pvalues

In [None]:
# R-squared and adjusted R-squared of the full model (goodness of fit, shown as a tuple)
model.rsquared , model.rsquared_adj 

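Model goodness of fit is about 94.75% (this is an R-squared value, i.e. the share of variance in Profit explained by the model, rather than a prediction accuracy).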

In [None]:
# Build simple (SLR) and multiple (MLR) regressions for the two predictors the
# author flags as insignificant, 'Adms' and 'MS', and inspect their t-values and p-values.
# This cell: Profit regressed on Adms alone.
slr_a=smf.ols("Profit~Adms", data=data).fit()
slr_a.tvalues, slr_a.pvalues

In [None]:
# Simple regression of Profit on MS alone, with its t-values and p-values.
slr_m=smf.ols("Profit~MS",data=data).fit()
slr_m.tvalues, slr_m.pvalues

In [None]:
# Multiple regression of Profit on Adms and MS together (RDS left out), with t-values and p-values.
mlr_am=smf.ols("Profit~Adms+MS",data=data).fit()
mlr_am.tvalues, mlr_am.pvalues

In [None]:
# 1) Collinearity problem check
# Each predictor is regressed on the other two; its variance inflation factor
# is VIF = 1/(1 - R^2) of that auxiliary regression.
aux_formulas={'RDS':"RDS~Adms+MS",'Adms':"Adms~RDS+MS",'MS':"MS~RDS+Adms"}
aux_rsq={name:smf.ols(formula,data=data).fit().rsquared for name,formula in aux_formulas.items()}

# Keep the individual names so later cells can still refer to them.
rsq_r,rsq_a,rsq_m=aux_rsq['RDS'],aux_rsq['Adms'],aux_rsq['MS']
vif_r,vif_a,vif_m=(1/(1-rsq) for rsq in (rsq_r,rsq_a,rsq_m))

# Collect the three VIFs in a small table for display
d1={'Variables':list(aux_formulas),'Vif':[vif_r,vif_a,vif_m]}
Vif_df=pd.DataFrame(d1)
Vif_df

No variable has a VIF above 20, so there is no serious collinearity among the predictors.

In [None]:
# 2) Residual Analysis
# Normality check: Q-Q plot of the full model's residuals (model.resid).
# line='q' adds a reference line fitted through the quartiles.
sm.qqplot(model.resid,line='q')
plt.title("Normal QQ plot of residual")
plt.show()

In [None]:
# Test for Homoscedasticity or Heteroscedasticity (plotting model's standardized fitted values vs standardized residual values)

def standard_values(vals):
    """Return the z-scores of ``vals``: ``(vals - mean) / std``.

    The mean and standard deviation are taken from ``vals`` itself through
    its own ``.mean()`` and ``.std()`` methods, so the degrees-of-freedom
    convention is whatever that container uses.
    """
    centred = vals - vals.mean()
    return centred / vals.std()

# Standardize both axes first, then scatter residuals against fitted values;
# a patternless cloud is what constant error variance would look like.
z_fitted=standard_values(model.fittedvalues)
z_resid=standard_values(model.resid)
plt.scatter(x=z_fitted,y=z_resid)
plt.xlabel('standardized fitted values')
plt.ylabel('standardized residual values')
plt.title('Residual Plot')
plt.show()

In [None]:
# Residuals vs. regressors: diagnostic plots of the full model against one
# predictor at a time. This cell covers 'RDS'.
# The figure is created up front so its size can be set, then handed to statsmodels.
fig=plt.figure(figsize=(15,8))
sm.graphics.plot_regress_exog(model,'RDS',fig=fig)
plt.show()

In [None]:
# Regression diagnostic plots of the full model against the 'Adms' predictor.
fig=plt.figure(figsize=(15,8))
sm.graphics.plot_regress_exog(model,'Adms',fig=fig)
plt.show()

In [None]:
# Regression diagnostic plots of the full model against the 'MS' predictor.
fig=plt.figure(figsize=(15,8))
sm.graphics.plot_regress_exog(model,'MS',fig=fig)
plt.show()

In [None]:
# Model deletion diagnostics (checking for outliers / influential observations)
# Two techniques: 1. Cook's distance  2. Leverage value
# cooks_distance is a (distances, p-values) pair; only the distances are used.
# Index the pair rather than unpacking into `_`: assigning `_` in IPython stops
# it from tracking the last cell output for the rest of the session.
c=model.get_influence().cooks_distance[0]

In [None]:
# Display the Cook's distance values computed from the full model.
c

In [None]:
# Stem plot of Cook's distance (rounded to 5 decimals) against row position,
# so unusually influential rows stand out as tall stems.
fig=plt.figure(figsize=(15,7))
plt.stem(np.arange(len(data)),np.round(c,5))
plt.xlabel('Row Index')
plt.ylabel('Cooks Distance')
plt.show()

In [None]:
# Position and value of the single largest Cook's distance.
# NOTE(review): no C>0.5 threshold is applied here; this only reports the maximum,
# which still has to be compared against that cutoff by eye.
np.argmax(c),np.max(c)

In [None]:
# 2. Leverage: statsmodels influence plot of the full model
# (studentized residuals against leverage), used to spot high-influence points.
influence_plot(model)
plt.show()

In [None]:
# Show the row whose index label is 49.
# NOTE(review): 49 is hard-coded; presumably it is the row flagged by the influence diagnostics above - confirm.
data[data.index.isin([49])]

In [None]:
# Build a new frame without the row at position 49; `data` itself is left untouched.
# NOTE(review): 49 is hard-coded; presumably the influential row identified above - confirm.
data1=data.drop(index=data.index[[49]])

In [None]:
# Display the frame that remains after dropping the row.
data1

In [None]:
 # Final Model: refit the same three-predictor formula on data1 instead of data.
 # NOTE(review): every line of this cell starts with one stray space. IPython
 # tolerates it, but it would be an IndentationError if exported to a plain .py script.
 final_model=smf.ols("Profit~RDS+Adms+MS",data=data1).fit()
 # Show R-squared and AIC of the refit model.
 final_model.rsquared , final_model.aic

After refitting without the dropped row, the model's R-squared improves to 96.13%.

In [None]:
# One hypothetical company to score: a single-row frame whose columns
# match the predictor names used in the regression formula.
new_row={'RDS':[70000],"Adms":[90000],"MS":[140000]}
new_data=pd.DataFrame(new_row,index=[0])
new_data

In [None]:
# Predicted Profit from the final model for the single row in new_data.
final_model.predict(new_data)

In [None]:
# In-sample predictions: the final model's predicted Profit for every row of data1.
# NOTE(review): this comment used to quote "90.02% accuracy" for "Price"; the target
# is Profit and that figure does not match the R-squared noted for final_model - confirm.
pred_y=final_model.predict(data1)
pred_y

In [None]:
# Side-by-side R^2 of the two fitted models (before and after dropping the row)
model_labels=['Model','Final_Model']
rsq_values=[fit.rsquared for fit in (model,final_model)]
d2={'Prep_Models':model_labels,'Rsquared':rsq_values}
table=pd.DataFrame(data=d2)

In [None]:
# Display the R-squared comparison table.
table