In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

In [None]:
# Load the startup data; the path is relative to the notebook's working directory.
startup_df=pd.read_csv("50_Startups.csv")
# Preview the first rows of the frame.
startup_df.head()

In [None]:
# Preview the last rows of the frame.
startup_df.tail()

In [None]:
# Print the frame summary (columns, non-null counts, dtypes).
startup_df.info()

In [None]:
# Count the missing values in each column.
startup_df.isna().sum()

In [None]:
# Show the dtype of each column.
startup_df.dtypes

In [None]:
# Count the rows flagged as duplicates of an earlier row.
startup_df.duplicated().sum()

In [None]:
# Rename the two columns whose labels contain '&' or a space so they can be
# referenced in model formulas and via attribute access later on.
startup_df = startup_df.rename(
    columns={'R&D Spend': 'R_DSpend', 'Marketing Spend': 'MarketingSpend'}
)

In [None]:
# Draw one box plot per numeric column on a 2x2 grid to look for outliers.
fig, axis = plt.subplots(nrows=2, ncols=2)

# (column, panel title) pairs in row-major panel order.
boxplot_panels = [
    ('R_DSpend', 'R  & D Spent'),
    ('Administration', 'Administration spent'),
    ('MarketingSpend', 'Marketing Spent'),
    ('Profit', 'Profit'),
]
for panel_ax, (column_name, panel_title) in zip(axis.flat, boxplot_panels):
    panel_ax.boxplot(startup_df[column_name])
    panel_ax.set_title(panel_title)

fig.tight_layout()

In [None]:
# There are no outliers in the data

#
CORRELATION AND LINEARITY TEST

In [None]:
# Pairwise correlation of the numeric columns only. Restricting the frame first
# keeps this cell working on pandas versions where DataFrame.corr() no longer
# silently drops non-numeric columns and raises on them instead.
startup_df.select_dtypes(include='number').corr()

In [None]:
# Scatter-plot matrix of the frame's variables to eyeball linear relationships.
sns.pairplot(data=startup_df)

In [None]:
### MarketingSpend and R&D Spend have good correlation coefficients

#
BUILDING MODEL

In [None]:
# Baseline OLS model: Profit regressed on all three spend columns.
model_basic = smf.ols(
    formula='Profit~R_DSpend+Administration+MarketingSpend',
    data=startup_df,
).fit()
# In-sample predictions on the same rows the model was fitted on.
predicted_basic = model_basic.predict(startup_df)
model_basic.summary()

In [None]:
# Residuals of the baseline model (actual minus predicted) and their total.
error_basic = startup_df['Profit'].sub(predicted_basic)
error_basic.sum()

#
RMSE AND VIF METHODS


In [None]:
def RMSE(actual,predicted):
    """Return the root-mean-square error between two sets of values.

    Parameters
    ----------
    actual, predicted
        Operands that support element-wise subtraction (the notebook passes
        pandas Series).

    Returns
    -------
    The square root of the mean of the squared differences
    ``actual - predicted``.
    """
    residual = actual - predicted
    return np.sqrt(np.mean(np.square(residual)))

def vif(rSquared):
    """Return ``1 / (1 - rSquared)``, the variance-inflation-factor formula.

    Parameters
    ----------
    rSquared
        An R-squared value. NOTE(review): for a VIF this is presumably the
        R-squared of one predictor regressed on the others -- confirm at the
        call site.

    Returns
    -------
    The reciprocal of ``1 - rSquared``.
    """
    tolerance = 1 - rSquared
    return 1 / tolerance

    

In [None]:
# In-sample RMSE of the baseline model.
RMSE_basic = RMSE(actual=startup_df.Profit, predicted=predicted_basic)
RMSE_basic

In [None]:
# Administration has a high p-value in the model summary, so it is not a statistically significant predictor

# BUILDING MODEL BY REMOVING THE ADMINISTRATION FROM INPUT

In [None]:
# OLS model without Administration: Profit on R&D and marketing spend.
model_rem_admin = smf.ols(
    formula='Profit~R_DSpend+MarketingSpend',
    data=startup_df,
).fit()
# In-sample predictions on the fitting data.
predict_rem_admin = model_rem_admin.predict(startup_df)
model_rem_admin.summary()

In [None]:
# In-sample RMSE of the model without Administration.
RMSE_rem_admin = RMSE(actual=startup_df.Profit, predicted=predict_rem_admin)
RMSE_rem_admin

In [None]:
# Single-predictor OLS model: Profit on R&D spend only.
model_rem_admin_market = smf.ols(
    formula='Profit~R_DSpend',
    data=startup_df,
).fit()
# In-sample predictions on the fitting data.
predict_rem_admin_market = model_rem_admin_market.predict(startup_df)
model_rem_admin_market.summary()

In [None]:
# In-sample RMSE of the R&D-only model.
RMSE_rem_admin_market = RMSE(
    actual=startup_df.Profit,
    predicted=predict_rem_admin_market,
)
RMSE_rem_admin_market

# Regression plot

In [None]:
import statsmodels.api as sm

# Partial-regression plots for the baseline model, drawn on a 15x8 figure.
fig = sm.graphics.plot_partregress_grid(
    model_basic,
    fig=plt.figure(figsize=(15, 8)),
)
plt.show()

# Cook's distance and hat (H) leverage to remove row outliers

In [None]:
# Influence diagnostics for the R&D-only model.
model_influence = model_rem_admin_market.get_influence()
# cooks_distance is a pair; only its first element (the distances) is used
# here. Indexing instead of unpacking into `_` avoids rebinding the name
# IPython uses for the last cell output.
cooks_values = model_influence.cooks_distance[0]
cooks_values

In [None]:
# Plot the Cook's distance of every observation to spot influential rows.
# plt.subplots returns a (Figure, Axes) pair, so unpack it instead of binding
# the whole tuple to `fig`, and draw on the Axes explicitly.
fig, ax = plt.subplots(figsize=(20, 7))
# One stem per Cook's value; the x positions are taken from the values
# themselves so the two sequences always have the same length.
ax.stem(np.arange(len(cooks_values)), np.round(cooks_values, 3))
ax.set_xlabel('Indexes')
ax.set_ylabel('cooks values')

In [None]:
# Position and size of the largest Cook's distance.
(cooks_values.argmax(), cooks_values.max())

In [None]:
from statsmodels.graphics.regressionplots import influence_plot

# Influence plot for the R&D-only model.
influence_plot(results=model_rem_admin_market)
plt.show()

In [None]:
# Leverage cutoff 3 * (k + 1) / r for the model shown in the influence plot.
# k must be the number of predictors in that model, not the number of columns
# in the DataFrame (which also counts the target and columns left out of the
# formula); r is the number of observations the model was fitted on.
k = int(model_rem_admin_market.df_model)
r = int(model_rem_admin_market.nobs)
leverage_cutoff = 3 * ((k + 1) / r)
leverage_cutoff

In [None]:
# Work on an independent copy so the original frame stays untouched.
startup_df_new = startup_df.copy(deep=True)

In [None]:
# Drop the row at position 49 and check the resulting shape.
# NOTE(review): the position is hard-coded; presumably it is the row flagged by
# the Cook's distance / influence plots -- confirm against the argmax of
# cooks_values.
startup_df_changed = startup_df_new.drop(index=startup_df_new.index[49])
startup_df_changed.shape

In [None]:
# After removing the row outliers checking the model parameters

In [None]:
# Refit Profit on R&D and marketing spend using the frame with the row removed.
model_new = smf.ols(
    formula='Profit~R_DSpend+MarketingSpend',
    data=startup_df_changed,
).fit()
# In-sample predictions on the reduced frame.
predicted_new = model_new.predict(startup_df_changed)
model_new.summary()

In [None]:
# In-sample RMSE of the refitted model on the reduced frame.
RMSE_new = RMSE(actual=startup_df_changed.Profit, predicted=predicted_new)
RMSE_new

In [None]:
# Side-by-side comparison of the four fitted models.
# Note: model_new was fitted on the frame with one row dropped, so its metrics
# are computed on different data than the other three.
compared_models = [model_basic, model_rem_admin, model_rem_admin_market, model_new]
df_rsq = {
    'Model': ['basic', 'rem_admin', 'rem_admin_market', 'model_new'],
    'R^2': [fitted.rsquared for fitted in compared_models],
    'AIC': [fitted.aic for fitted in compared_models],
    'RMSE': [RMSE_basic, RMSE_rem_admin, RMSE_rem_admin_market, RMSE_new],
}
rsq_df = pd.DataFrame(df_rsq)
rsq_df

In [None]:
# Histogram of the refitted model's errors (predicted minus actual).
error_changed = predicted_new - startup_df_changed.Profit
plt.hist(error_changed)
# The error values lie along the x-axis; the y-axis of a histogram is the
# number of observations per bin.
plt.xlabel("error")
plt.ylabel("Frequency")

In [None]:
# Residual plot: error against the original profit.
# The Axes is bound to its own name so that error_changed keeps holding the
# residuals; rebinding it to the Axes made this cell fail when run a second
# time.
residual_ax = sns.scatterplot(
    x=startup_df_changed.Profit,
    y=error_changed,
    data=startup_df_changed,
)
residual_ax.set(xlabel="Original Profit", ylabel="Error", title="Residual Plot")