<a href="https://colab.research.google.com/github/sandipanpaul21/ML-Code-in-Python/blob/master/07_Simple_and_Multiple_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Libraries 

from sklearn import datasets 
import pandas as pd
import seaborn as sns
import numpy as np

  import pandas.util.testing as tm


In [None]:
# Load the Datasets (For indepth analysis please refer to Part 01 All About Datasets)

# Boston Dataset for Regression
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where accessing it
# raises ImportError. Try the original loader first so older environments behave exactly
# as before, then fall back to the replacement recipe recommended by scikit-learn, which
# reads the raw file from the original source (needs network access).
try:
    boston = datasets.load_boston()
except (ImportError, AttributeError):
    from sklearn.utils import Bunch

    # The raw file has 22 header lines; each record is then split over two physical lines
    # (11 values on the first, 3 on the second: 2 remaining features + the target).
    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    boston = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array(
            ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
             "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]
        ),
    )

# One row per observation, one column per feature, plus the target as HOUSEPRICE.
boston_pd = pd.DataFrame(boston.data, columns=boston.feature_names)
boston_pd["HOUSEPRICE"] = boston.target
boston_pd.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,HOUSEPRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [None]:
# Dataset overall information: prints the index range, every column's
# non-null count and dtype, and the frame's memory usage
boston_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CRIM        506 non-null    float64
 1   ZN          506 non-null    float64
 2   INDUS       506 non-null    float64
 3   CHAS        506 non-null    float64
 4   NOX         506 non-null    float64
 5   RM          506 non-null    float64
 6   AGE         506 non-null    float64
 7   DIS         506 non-null    float64
 8   RAD         506 non-null    float64
 9   TAX         506 non-null    float64
 10  PTRATIO     506 non-null    float64
 11  B           506 non-null    float64
 12  LSTAT       506 non-null    float64
 13  HOUSEPRICE  506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [None]:
# Let's set the BASE MODEL on which we will improve

# Split the frame into the regression target and the regressors.
# The target is kept as a one-column DataFrame (note the list selector).
dependent_variable = boston_pd.loc[:, ['HOUSEPRICE']]

# Regressors: the first 12 columns of the frame, selected by label.
# NOTE(review): this slice stops before 'LSTAT' (column 12), so LSTAT is not part of
# the base model -- confirm that leaving it out is intentional.
independent_variables = boston_pd[boston_pd.columns[:12]]

# Show which columns ended up on each side of the model.
print("DEPENDENT VARIABLE : ",dependent_variable.columns)
print("INDEPENDENT VARIABLES : ")
print(independent_variables.columns)

DEPENDENT VARIABLE :  Index(['HOUSEPRICE'], dtype='object')
INDEPENDENT VARIABLES : 
Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B'],
      dtype='object')


In [32]:
# Base Model

import statsmodels.api as sm
from statsmodels.api import OLS

# Ordinary least squares of HOUSEPRICE on the selected regressors.
# NOTE(review): no constant column is added to the regressors, so the fit has no
# intercept term and the summary reports *uncentered* R-squared -- confirm this is
# intended (sm.add_constant(independent_variables) would add an intercept).
base_model = OLS(endog=dependent_variable, exog=independent_variables)
base_model_results = base_model.fit()

# Last expression of the cell, so the summary tables are rendered as the cell output.
base_model_results.summary()

0,1,2,3
Dep. Variable:,HOUSEPRICE,R-squared (uncentered):,0.954
Model:,OLS,Adj. R-squared (uncentered):,0.953
Method:,Least Squares,F-statistic:,846.6
Date:,"Mon, 31 Aug 2020",Prob (F-statistic):,2.38e-320
Time:,14:27:25,Log-Likelihood:,-1556.1
No. Observations:,506,AIC:,3136.0
Df Residuals:,494,BIC:,3187.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
CRIM,-0.1439,0.036,-3.990,0.000,-0.215,-0.073
ZN,0.0413,0.015,2.696,0.007,0.011,0.071
INDUS,-0.0370,0.068,-0.540,0.589,-0.172,0.098
CHAS,3.2525,0.961,3.384,0.001,1.364,5.141
NOX,-10.8653,3.422,-3.175,0.002,-17.590,-4.141
RM,7.1436,0.289,24.734,0.000,6.576,7.711
AGE,-0.0449,0.014,-3.235,0.001,-0.072,-0.018
DIS,-1.2292,0.206,-5.980,0.000,-1.633,-0.825
RAD,0.2008,0.071,2.829,0.005,0.061,0.340

0,1,2,3
Omnibus:,277.013,Durbin-Watson:,0.927
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3084.31
Skew:,2.148,Prob(JB):,0.0
Kurtosis:,14.307,Cond. No.,8130.0


In [None]:
# Output Explanation

# Omnibus/Prob(Omnibus) - 
# A test of the skewness and kurtosis of the residuals (characteristic #2). 
# We hope to see an Omnibus value close to zero, which would indicate normality. 
# Prob(Omnibus) is the p-value of that test, under the null hypothesis 
# that the residuals are normally distributed. 
# We hope to see something close to 1 here. 
# In this case Omnibus = 277 (far above 0) and Prob(Omnibus) = 0 (far below 1),
# so the residuals are not normally distributed, which is not ideal. 

# Skew - 
# Measure of data symmetry.
# We want to see something close to zero, indicating the residual distribution is normal. 
# Note that this value also drives the Omnibus
# In this case, Skewness = 2.148, well above 0, so the residuals are skewed

# Kurtosis - 
# Measure of "peakiness", or curvature of the data. 
# Kurtosis of the normal distribution is 3.0.
# In this case, Kurtosis = 14.3, which is far higher than 3

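# Durbin-Watson - 
# Tests for autocorrelation (serial correlation) of the residuals.
# The statistic lies between 0 and 4; a value close to 2 means no autocorrelation,
# and we hope to have a value between 1 and 2. 
# In this case, Durbin-Watson = 0.927 falls just below that range,
# which points to positive autocorrelation in the residuals.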

# Jarque-Bera (JB)/Prob(JB) - 
# like the Omnibus test in that it tests both skew and kurtosis. 
# It is also performed for the distribution analysis of the regression errors.
# A large value of JB test indicates that the errors are not normally distributed.
# In this case, JB = 3084, which is very large, so the errors are not normally distributed

# Condition Number -
# This test measures the sensitivity of a function's output as compared to its input 
# When we have multicollinearity, we can expect much higher fluctuations to small changes in the data,
# hence, we hope to see a relatively small number, something below 30. 
# In this case, Condition Number = 8130, well above 30, so multicollinearity is present

# R Square and Adjusted R Square - 
# Both measure model performance, and possible values range from 0.0 to 1.0. 
# The Adjusted R Squared value penalizes the number of explanatory variables,
# so it is always a bit lower than the Multiple R-Squared value 
# Adjusted R Square consequently is a more accurate measure of model performance.
# Adding an additional explanatory variable to the model will never decrease
# the Multiple R-Squared value, but it can decrease the Adjusted R-Squared value
# Adjusted R Square will only increase when good variables are added in the model
# Higher the Adjusted R Square, better is the model
# In this case, Adjusted R Square = 0.95 which is good for the model

# F-Statistics - 
# This test for overall significance has the following two hypotheses:
# Null hypothesis : Model with no independent variables fits the data as well as your model.
# Alternative hypothesis : Model fits the data better than the intercept-only model.
# In this case, Prob(F-statistic) = 2.38e-320, far below 0.05, which suggests the independent variables are jointly important

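# Log-Likelihood, AIC and BIC -
# Likelihood-based measures used to compare models fitted to the same data
# (higher Log-Likelihood and lower AIC/BIC indicate a better model).
# They are reported for this OLS model and are also commonly used for LOGISTIC REGRESSION MODEL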