In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm 
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import sweetviz as sv 
import seaborn as sns
from statsmodels.graphics.regressionplots import influence_plot

In [None]:
pip install seaborn

In [None]:
import pandas as pd 

In [None]:
strt=pd.read_excel("50_Startups.xlsx")
strt.head()

In [None]:
strt.info()

In [None]:
strt.plot(kind='box',subplots=True, layout=(3,3),figsize=(20,20))

In [None]:
strt1=pd.get_dummies(data=strt,columns=['State'])

In [None]:
strt1.head()

In [None]:
from scipy import stats
# Cast to float before z-scoring: pd.get_dummies can emit boolean indicator
# columns (the default in recent pandas releases), and a frame mixing float and
# bool columns is converted to an object array, on which the standard-deviation
# step of zscore can fail. When the dummies are already numeric the cast leaves
# the computed z-scores unchanged.
zsc = stats.zscore(strt1.astype(float))

In [None]:
zscores=np.abs(zsc)

In [None]:
filter_zscores=(zscores<3).all(axis=1)

In [None]:
filtered=strt1[filter_zscores]

In [None]:
filtered.shape #seems like there are no outliers as we can see after transforming df and trying to eliminate the zvalues above 3 and below -3 no row got eliminated 

In [None]:
# Let's try the Isolation Forest method (left commented out)
#clf=IsolationForest(random_state=20,contamination=0.05) # the contamination value should be set based on domain knowledge
#clf.fit(strt1)

In [None]:
#strt1['scores']=clf.decision_function(strt1)

In [None]:
#strt1['anamoly']=clf.predict(strt1)

In [None]:
#strt1

In [None]:
strt1.corr() #we can observe that there is no multicollinearity problem between the independent variables

In [None]:
sns.heatmap(strt1.corr(),annot=True)
#we can consider multicollinearity prob when correlation 'r' value is above 0.75 its nominal to consider for multicollinearity
#there is barely any correlation between State_New_York vs Profit which is 0.031 we can drop this column

In [None]:
strt2=strt1.drop(['State_New York'],axis=1)
strt2.head()

In [None]:
scaler=MinMaxScaler(feature_range=(0,1))
columns=strt2.columns
d=scaler.fit_transform(strt2)
normdf=pd.DataFrame(d,columns=columns)
normdf.head()

In [None]:
sweet_report=sv.analyze(strt2)
sweet_report.show_html('EDA_of_50Startups.html')

In [None]:
normdf.corr()

In [None]:
# Give the spend columns formula-safe names (no '&' or spaces) in one
# non-inplace rename; rebinding normdf avoids mutating the frame in place.
normdf = normdf.rename(
    columns={
        'R&D Spend': 'RD_Spend',
        'Marketing Spend': 'Marketing_Spend',
    }
)
normdf.head()

In [None]:
model= smf.ols('Profit~RD_Spend+Administration+Marketing_Spend+State_California+State_Florida',data=normdf).fit()
#here had to change the column name as the code was showing error because the smf was treating R as separate in R&D_Spend column


In [None]:
model.params

In [None]:
model.tvalues, '\n', model.pvalues

In [None]:
#lets build model2 only on RD_SPend,ADministration,Marketing_Spend
model2=smf.ols('Profit~RD_Spend+Administration+Marketing_Spend',data=normdf).fit()

In [None]:
model2.params

In [None]:
model2.tvalues, '\n',model2.pvalues 

In [None]:
normdf1=normdf.drop(['State_California','State_Florida'],axis=1)

In [None]:
normdf1.corr()

In [None]:
#Let us build model3 by excluding Administration
model3=smf.ols('Profit~Marketing_Spend+RD_Spend',data=normdf1).fit()

In [None]:
model3.params

In [None]:
model3.tvalues,'\n', model3.pvalues

In [None]:
model3.rsquared,model2.rsquared,model.rsquared
#we can see that after removing insignificant columns (Administration,State) the model's accuracy(R2 value) has barely changed

In [None]:
# Helper for the residual plots used to check homoscedasticity.
def get_standardized_values(vals):
    """Return ``vals`` centred on its mean and divided by its standard deviation.

    ``vals`` must expose ``.mean()`` and ``.std()`` and support element-wise
    arithmetic (e.g. a pandas Series or a NumPy array). The standard deviation
    is whatever ``vals.std()`` returns by default for that type (pandas uses
    the sample estimate, NumPy the population estimate).
    """
    centre = vals.mean()
    spread = vals.std()
    return (vals - centre) / spread

In [None]:
# Standardized residuals against standardized fitted values for model3;
# the axis labels are kept as in the original figure.
plt.scatter(
    x=get_standardized_values(model3.fittedvalues),
    y=get_standardized_values(model3.resid),
)
plt.title('Residuals plot for Model3')
plt.xlabel('Fittedvalues')
plt.ylabel('Residuals')
plt.show()


In [None]:
# Q-Q plot of the model3 residuals with a reference line fitted through the
# quartiles, used to judge normality of the residuals.
qqplot = sm.qqplot(data=model3.resid, line='q')
plt.title('Normality of Residuals Plot')
plt.show()


In [None]:
fig=plt.figure(figsize=(15,8))
fig=sm.graphics.plot_regress_exog(model3,'Marketing_Spend',fig=fig)
plt.show()

In [None]:
fig=plt.figure(figsize=(15,8))
fig=sm.graphics.plot_regress_exog(model3,'RD_Spend',fig=fig)
plt.show()

In [None]:
# Influence diagnostics for model3.
model_influence = model3.get_influence()
# cooks_distance is a pair whose first element holds the distances; take it by
# index rather than unpacking the second element into `_`, which would rebind
# IPython's `_` (last-output) variable in the notebook namespace.
c = model_influence.cooks_distance[0]
summary_cooks = model_influence.summary_frame()
summary_cooks

In [None]:
# Stem plot of Cook's distance per row. plt.subplots returns (figure, axes),
# so unpack both instead of binding the whole tuple to `fig`, and draw on the
# axes explicitly.
fig, ax = plt.subplots(figsize=(20, 7))
ax.stem(np.arange(len(normdf1)), np.round(c, 3))
ax.set_title("influential points")
ax.set_xlabel("row index")
ax.set_ylabel("Cook's Distance")
plt.show()

In [None]:
# Position and value of the largest Cook's distance.
np.argmax(c), np.max(c)

In [None]:
pip install tabulate

In [None]:
from tabulate import tabulate

In [None]:
table=[['Model No.','Rsquared','Adj_Rsquared','AIC','BIC'],
       ['Model 1 ❌',model.rsquared,model.rsquared_adj,model.aic,model.bic],
       ['Model 2 ❌',model2.rsquared,model2.rsquared_adj,model2.aic,model2.bic],
       ['Model 3 ✔️',model3.rsquared,model3.rsquared_adj,model3.aic,model3.bic]]

In [None]:
print(tabulate(table,headers='firstrow',tablefmt='grid',showindex=range(1,4)))

In [None]:
model3.summary()

In [None]:
pip install lazypredict

In [None]:
from lazypredict.Supervised import LazyRegressor

In [None]:
strt.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [None]:
strt["State"]=le.fit_transform(strt["State"])

In [None]:
strt.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler(feature_range=(1,2))
d=scaler.fit_transform(strt)
df=pd.DataFrame(d,columns=strt.columns)
df.head()

In [None]:
X=df.iloc[:,:-1]
Y=df.iloc[:,-1:]
Y

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test= train_test_split(X,Y,test_size=0.33,random_state=9)


In [None]:
from lazypredict.Supervised import LazyRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
# Fit LazyRegressor on the split and print the resulting model comparison table.
reg = LazyRegressor(
    verbose=0,
    ignore_warnings=False,
    custom_metric=None,
)
models, predictions = reg.fit(x_train, x_test, y_train, y_test)
print(models)

In [None]:
pip install xelatex