In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the 50-startups CSV from the Kaggle input directory into a DataFrame
csv_path = '/kaggle/input/startup-logistic-regression/50_Startups.csv'
df = pd.read_csv(csv_path)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts)
df.info()

In this dataset we have R&D Spend, Administration, Marketing Spend, State and Profit. Our goal is to build a model that best predicts the profit of each company, so Profit is our dependent variable and the other features are the independent variables.

In [None]:
# Preview the first four rows of the data
df.head(4)

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns
df.describe()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Separate the target (Profit) from the predictor columns
y = df['Profit']
X = df.drop('Profit', axis=1)
# Report the shape of the predictors first, then the target
for part in (X, y):
    print(part.shape)

In [None]:
# List the distinct values of the State column
X['State'].unique()

In [None]:
# Convert the object-typed State column into numeric dummy columns.
# drop_first=True drops the first category so the remaining dummies are not perfectly
# collinear with an intercept term.
# dtype=int forces 0/1 integers: newer pandas releases emit boolean dummies by default,
# which turns X.values into an object array and breaks the statsmodels VIF / OLS cells
# further down.
X = pd.get_dummies(X, drop_first=True, dtype=int)

In [None]:
#Check any outlier on features having numeric values
import matplotlib.pyplot as plt
%matplotlib inline
for i in X.iloc[:,0:3]:
    plt.boxplot(df[i],notch=True,patch_artist=True)
    plt.show()

In [None]:
# Check multicollinearity with the Variance Inflation Factor (VIF).
# VIF_i = 1 / (1 - R2_i), where R2_i is the R-squared obtained by regressing feature i
# on all the other features.
# The greater R2_i, the greater the VIF. A value above 5 is considered high collinearity.

from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF

# variance_inflation_factor regresses each column on the remaining columns exactly as
# given and does not add an intercept itself, so a constant column is appended here;
# without it the R2 is uncentered and the VIFs come out inflated.
# The float cast keeps the design matrix numeric even when the dummy columns are boolean.
vif_exog = X.astype(float).assign(const=1.0)
# 'const' is the last column, so indices 0..len(X.columns)-1 are the real features
pd.DataFrame({'Features': X.columns,
              'VIF': [VIF(vif_exog.values, i) for i in range(len(X.columns))]})

From the above table we can see that R&D Spend and Marketing Spend show only a little collinearity, so we keep both features for model building.

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)
# Shapes in the order: X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Fit a multiple linear regression on the training split
from sklearn.linear_model import LinearRegression
model_mlr = LinearRegression().fit(X_train, y_train)
# Show the fitted coefficients, then the intercept, one per line
print(model_mlr.coef_, model_mlr.intercept_, sep='\n')

In [None]:
# x_pred holds the predictions for the training split, y_pred those for the test split
x_pred = model_mlr.predict(X_train)
y_pred = model_mlr.predict(X_test)

In [None]:
#Check model performance for train and test
from sklearn.metrics import mean_squared_error
# score() returns the R^2 of the model on the given split
train_score = model_mlr.score(X_train, y_train)
test_score = model_mlr.score(X_test, y_test)
print(train_score, test_score, sep='\n')
# RMSE is the square root of the mean squared error on each split
rmse_train = np.sqrt(mean_squared_error(y_train, x_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE score of training dataset is : {rmse_train}")
print(f"RMSE score of testing dataset is {rmse_test}")

In [None]:
#Test this model with unseen or new data
# Build the sample as a one-row DataFrame carrying the training column names, so every
# value is tied to its feature and the model (fitted on a DataFrame) receives matching
# feature names instead of an unlabeled array.
# NOTE(review): this cell previously also held an unused literal
# [110221,120223,423112,0,0,1]. If that was meant as a three-way one-hot for the third
# state, the two dummy values below would have to be (0, 1) rather than (1, 0) - confirm.
new_startup = pd.DataFrame([[110221, 120223, 423112, 1, 0]], columns=X_train.columns)
np.round(model_mlr.predict(new_startup), 2)

In [None]:
# Gap between the two R^2 scores; a negative value means the test score is below the train score
print(f"The difference between train score and test score is {test_score-train_score}")

To minimize this difference we use "Backward Elimination", a method that keeps only the most important features, i.e. those that give a good prediction result.

In [None]:
#Preparation of Backward Elimination
import statsmodels.api as sma

# statsmodels OLS does not add an intercept on its own, so the constant term b0 is
# supplied as an explicit column of ones (x0) that is not tied to any original column.
# The float cast returns a fresh copy of X and keeps the design matrix purely numeric:
# boolean dummy columns would make it object-typed, which statsmodels rejects.
dataset = X.astype(float)
dataset['x0'] = 1.0
# Fit OLS on all columns, reordered as: x0, the two State dummies, then the three numeric
# features. Afterwards check which p-values exceed the significance level (SL = 0.05).
dataset_opt = dataset.iloc[:, [5, 3, 4, 0, 1, 2]]
sma_ola = sma.OLS(endog=y, exog=dataset_opt).fit()
sma_ola.summary()

From the above OLS summary we can see that the features "State_Florida", "State_New York" and "Administration" have high p-values, greater than the SL of 0.05. So we will remove those features and refit the model.

In [None]:
# Keep the intercept plus R&D Spend and Marketing Spend.
# The columns are selected by name from `dataset` instead of slicing dataset_opt by
# position, so the cell can be re-run without shrinking dataset_opt any further.
dataset_opt = dataset[['x0', 'R&D Spend', 'Marketing Spend']]
sma_ola = sma.OLS(y, dataset_opt).fit()
sma_ola.summary()

Marketing Spend also has a high p-value, greater than the SL value of 0.05, so we need to remove it and keep only R&D Spend.

In [None]:
# Last elimination step: only the intercept (x0) and R&D Spend remain as regressors
dataset_opt = dataset_opt.loc[:, ['x0', 'R&D Spend']]
results = sma.OLS(endog=y, exog=dataset_opt).fit()
results.summary()

In [None]:
# Rebuild the regression with the single feature kept by backward elimination and
# compare its scores with the full model.
# X is the first column of df (R&D is the only independent variable here), y is the last column.

X = df.iloc[:, :1].values
y = df.iloc[:, -1].values

# Same 80/20 split and seed as before
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

# Fit the model and show its coefficient and intercept
from sklearn.linear_model import LinearRegression
model_mlr = LinearRegression().fit(X_train, y_train)
print(model_mlr.coef_, model_mlr.intercept_, sep='\n')

# x_pred: predictions on the training split, y_pred: predictions on the test split
x_pred = model_mlr.predict(X_train)
y_pred = model_mlr.predict(X_test)

# Model performance on train and test: R^2 first, then RMSE
from sklearn.metrics import mean_squared_error
train_score = model_mlr.score(X_train, y_train)
test_score = model_mlr.score(X_test, y_test)
print(train_score, test_score, sep='\n')
rmse_train = np.sqrt(mean_squared_error(y_train, x_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE score of training dataset is : {rmse_train}")
print(f"RMSE score of testing dataset is {rmse_test}")

In [None]:
# Gap between the two R^2 scores; a negative value means the test score is below the train score
print(f"The difference between train score and test score is {test_score-train_score}")