In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as pt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' recursively and print the full path of every file found,
# so the exact dataset path needed by later cells can be read off the output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing data (Boston Housing Data to predict house prices)
1. Link - https://www.kaggle.com/schirmerchad/bostonhoustingmlnd

In [None]:
# Read the housing CSV into the analytical dataset (ADS) used by every later cell
input_ads = pd.read_csv('../input/bostonhoustingmlnd/housing.csv')

# Quick summary: (rows, columns) followed by the first five records
print(input_ads.shape)
input_ads.head(5)

# Null Check

In [None]:
# Number of missing values per column (isna is the canonical name for isnull)
input_ads.isna().sum()

# Description of the target variable

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the target column
input_ads.loc[:, 'MEDV'].describe()

# Data Splitting & Pre-Processing 

In [None]:
#Splitting of the ADS into X (features) and y (target) components
#X is rebuilt from input_ads on every run, so re-executing this cell is idempotent
X = input_ads[[col for col in input_ads.columns if 'MEDV' not in col]]
y = input_ads['MEDV']

#Train-test split creation (70/30, fixed seed for reproducibility)
#NOTE: X and y are deliberately rebound to the TRAINING partition; later cells rely on that
X, X_test, y, y_test = train_test_split(X, y, test_size=0.30, random_state=100)

#--------------------------------------------------------------------------------
#Scaling the datasets
scaler = StandardScaler()

#Fit the scaler on the training data ONLY, then apply the same mean/std to the test data.
#Re-fitting on the test set would standardize it with its own statistics, so the trained
#weights would be applied to features on a different scale and test information would leak.
X_arr = scaler.fit_transform(X)
X_test_arr = scaler.transform(X_test)

#Targets as column vectors so they line up with the (n_rows, 1) model output
y_arr = np.array(y).reshape(-1,1)
y_test_arr = np.array(y_test).reshape(-1,1)

#--------------------------------------------------------------------------------
#Summary
print('Training x rows :',X_arr.shape)
print('Testing x rows :',X_test_arr.shape)
print('Training y rows :',y_arr.shape)

# Linear Regression from scratch

### Defining the forward-propagation UDF, the cost-function UDF & initializing the weights and intercept

In [None]:
#For forward propagation in the model
def fwd_prop(X_arr,w,b):
    """Forward pass of the linear model.

    Parameters
    ----------
    X_arr : feature matrix.
    w     : weight array, multiplied with X_arr via np.dot.
    b     : bias, added (with broadcasting) to the product.

    Returns
    -------
    The model output np.dot(X_arr, w) + b.
    """
    return np.dot(X_arr,w) + b

#Cost function as per regularization
def cost_fn(y_true,y_pred,n_examples,reg_alpha,reg_type,w_):
    """Regularized squared-error cost.

    cost = 1/(2*n_examples) * sum((y_pred - y_true)^2) + reg_alpha * penalty

    Parameters
    ----------
    y_true     : array of true target values.
    y_pred     : array of model predictions (same shape as y_true).
    n_examples : divisor used to average the squared error.
    reg_alpha  : regularization strength multiplying the penalty.
    reg_type   : 'L1' (sum of |w|), 'L2' (0.5 * sum of w^2) or None (no penalty).
    w_         : weight array the penalty is computed on.

    Returns
    -------
    The scalar cost.

    Raises
    ------
    ValueError if reg_type is not one of 'L1', 'L2' or None.
    """
    if reg_type=='L1':
        reg = np.sum(np.abs(w_))
    elif reg_type=='L2':
        reg = 0.5 * np.sum(np.square(w_))
    elif reg_type is None:
        #Plain (un-regularized) linear regression
        reg = 0.0
    else:
        #Previously this fell through and crashed with an UnboundLocalError on `reg`
        raise ValueError("reg_type must be 'L1', 'L2' or None, got %r" % (reg_type,))

    cost = (1/(2*n_examples)) * np.sum(np.square(y_pred-y_true)) + (reg_alpha*reg)
    return cost

#Setting seed so the random weight initialization below is reproducible
np.random.seed(100)

#Initializing weights(w) and bias(b) vectors 
#-------------------------------------------
#One uniformly-random weight in [0, 1) per feature column of X, stored as a column vector
w = np.random.rand(X.shape[1],1)
print(w)
#-------------------------------------------
#Bias starts at zero; shape (1,) so it broadcasts over the model output
b = np.zeros(1)
b

## UDF for batch_gradient_descent
#### 1. If batch_size=1, it becomes stochastic gradient descent

In [None]:
def batch_gradient_descent(y_arr_overall,n_examples,X_arr_overall,w_,b_,n_iters=10,lr=0.01,batch_size=1,reg_alpha=1,reg_type='L1'):
    """Train the linear model with (mini-)batch gradient descent.

    Parameters
    ----------
    y_arr_overall : training targets, one row per row of X_arr_overall.
    n_examples    : divisor applied to the cost and to both gradients
                    (the caller passes the total number of training rows).
    X_arr_overall : training feature matrix.
    w_, b_        : initial weights and bias; not modified in place.
    n_iters       : number of epochs (full passes over the batches).
    lr            : learning rate.
    batch_size    : approximate rows per batch; 1 gives stochastic gradient descent.
    reg_alpha     : regularization strength.
    reg_type      : 'L1', 'L2' or None. None adds no penalty to the gradient; it also
                    needs cost_fn to accept reg_type=None.

    Returns
    -------
    (w_, b_, cost_history) where cost_history holds one value per completed epoch:
    the cost of the LAST batch of that epoch. If a non-finite cost is hit, training
    stops early and cost_history is shorter than n_iters.

    Raises
    ------
    ValueError if reg_type is not one of 'L1', 'L2' or None.

    Notes
    -----
    The penalty gradient is divided by n_examples here while cost_fn does not divide
    the penalty; this is kept as-is ("customized for regularization") so that results
    are unchanged.
    """
    if reg_type not in ('L1','L2',None):
        raise ValueError("reg_type must be 'L1', 'L2' or None, got %r" % (reg_type,))

    n_rows = X_arr_overall.shape[0]
    print('Total training rows :',n_rows)

    #----------------------------------------------------------------------------------------
    #Creating x-y batches according to the provided batch_size
    #Use the function's own arguments (not notebook globals) and never ask for 0 batches
    n_batches = max(1,n_rows//batch_size)
    print('Total Batches to create in each epoch/iter :',n_batches)

    batches_x = np.array_split(X_arr_overall,n_batches)
    print('Total Batches of X:',len(batches_x))

    batches_y = np.array_split(y_arr_overall,n_batches)
    print('Total Batches of y:',len(batches_y))

    cost_history = [] #Cache for cost function o/p at necessary intervals for plotting later

    #----------------------------------------------------------------------------------------
    for i in range(n_iters): #Total iterations/epochs to train on

        if i%1000==0:
            print('#-------------------- Epoch number :',i,'--------------------#')

        for X_arr_,y_arr_ in zip(batches_x,batches_y): #For each batch created for each epoch/iter

            #Forward propagation of the model - calculation of the model prediction
            a_temp = fwd_prop(X_arr_,w_,b_)

            cost = cost_fn(y_arr_,a_temp,n_examples,reg_alpha,reg_type,w_)

            #Stop on inf AND nan: once the gradients explode, inf - inf turns the cost into nan
            if not np.isfinite(cost):
                print('---- Inf encountered due to exploding gradients ----')
                return w_,b_,cost_history

            #----------------------------------------------------------------------------------------
            error = a_temp-y_arr_ #The residual calculation

            #Derivative of the regularization penalty w.r.t. the weights
            if reg_type=='L1':
                reg_derivative = np.sign(w_) #Sub-gradient of |w|: +1 / -1, and 0 where w == 0
            elif reg_type=='L2':
                reg_derivative = w_
            else:
                reg_derivative = np.zeros_like(w_) #No regularization

            #Calculating the gradients for the current batch
            dw = 1/n_examples * (np.dot(X_arr_.T,error) + (reg_alpha*reg_derivative)) #Customized for regularization
            db = 1/n_examples * np.sum(error)

            #Updating the weight and the intercept
            w_ = w_ - (lr * dw)
            b_ = b_ - (lr * db)

        #Updating cost into the cache (cost of the last batch of this epoch)
        cost_history.append(cost)
        #-------------------------------------------------
        #Progress at regular intervals
        if (i%5000==0):
            print(i,': Cost ------->',cost)

            f_train_a = fwd_prop(X_arr_overall,w_,b_) #Results on whole training data after every 5k epochs
            print(f_train_a.shape)

            train_mse = mean_squared_error(y_arr_overall,f_train_a)
            print('MSE of training set :',train_mse)
            print('RMSE of training set :',np.sqrt(train_mse))

    return w_,b_,cost_history
    


### Training the linear regression model

In [None]:
# Train on the scaled training set with mini-batches of 20 rows and an L1 penalty.
# n_iters=20001 makes the last epoch index 20000, so the in-function progress report
# (printed every 5000 epochs) also fires on the final epoch.
w_final,b_final,cost_history = batch_gradient_descent(y_arr_overall=y_arr,
                                                      n_examples=X_arr.shape[0],
                                                      X_arr_overall=X_arr,
                                                      w_=w,
                                                      b_=b,
                                                      n_iters=20001,
                                                      lr=0.001,
                                                      batch_size=20,
                                                      reg_alpha=0.05,
                                                      reg_type='L1')

### Plotting cost over epochs (Should have a sharp decrease)

In [None]:
#Cost plot over epochs (1 value at end of each epoch) - over the last batch
sns.set_style('darkgrid')
#Derive the x-axis from cost_history itself: a hard-coded epoch count breaks as soon as
#n_iters changes or training stops early and returns a shorter history
ax = sns.lineplot(x=list(range(len(cost_history))),y=cost_history)
ax.set(xlabel='No of epochs',ylabel='Cost',title='Cost vs Epochs-Linear/Lasso/Ridge Regression')

### UDF for predicting

In [None]:
def predict(w_,b_,test_x,test_y):
    """Score the trained linear model on a dataset and report its error.

    Parameters
    ----------
    w_, b_ : trained weights and bias, passed to fwd_prop.
    test_x : feature matrix to predict on.
    test_y : true targets, used only for the printed MSE / RMSE.

    Returns
    -------
    The predictions produced by fwd_prop(test_x, w_, b_).
    """
    print("Testing on :",test_x.shape[0],'rows')

    #Applying the trained weights(w_) and bias(b_)
    preds = fwd_prop(test_x,w_,b_)
    print('Shape of prediction :',preds.shape)

    #Compute the MSE once and derive the RMSE from it
    mse = mean_squared_error(test_y,preds)
    print('MSE of test set :',mse)
    print('RMSE of test set :',np.sqrt(mse))

    #Peek at the first three predictions
    print(preds[:3])

    return preds


# Predictions from the manually created linear regression model

In [None]:
# Evaluate the from-scratch model on the held-out test set
predictions_ = predict(w_=w_final,b_=b_final,test_x=X_test_arr,test_y=y_test_arr)

# Linear Regression from sklearn as benchmark

In [None]:
from sklearn.linear_model import LinearRegression

#---------------------------------------------------------------------------------------
# Fit ordinary least squares on the same scaled training data as the manual model
lin_reg = LinearRegression().fit(X_arr,y_arr)

# Predict on the same scaled test data so both models are compared like for like
prediction_sklearn = lin_reg.predict(X_test_arr)

#---------------------------------------------------------------------------------------
# Compute the test MSE once and derive the RMSE from it
mse_sklearn = mean_squared_error(y_test_arr,prediction_sklearn)
print('MSE of test set :',mse_sklearn)
print('RMSE of test set :',np.sqrt(mse_sklearn))

## Comparing the delta between manual predictions and sklearn predictions

In [None]:
# Element-wise difference between the sklearn and the from-scratch predictions
delta = prediction_sklearn-predictions_

#---------------------------------------------------------
# Report the quartiles of the difference
for label,q in (('25th Quantile of delta :',0.25),
                ('Median of delta :',0.5),
                ('75th Quantile of delta :',0.75)):
    print(label,np.quantile(delta,q))

# Checking the assumptions of linear regression

## Calculating the residuals

In [None]:
# Residual = prediction of the from-scratch model minus the true test target
residuals = predictions_ - y_test_arr
residuals[:3]

## Assumption 1 : Linear relation between dependent & independent variable

In [None]:
print('-- Pair-plot for all the features on the whole data--')
# `height` replaces the old `size` argument, which current seaborn no longer accepts.
# Note: pairplot returns a PairGrid (not a single Axes); its .set() applies to every subplot.
ax = sns.pairplot(input_ads,x_vars = ['RM','LSTAT','PTRATIO'],y_vars = ['MEDV'],height=7,aspect=0.7)
ax.set(title='Pair-plot for all the features on the whole data')

### Insights : 
1. We observe that RM and LSTAT show a roughly linear relationship with the target variable, which aligns with the linearity assumption of linear regression

In [None]:
#Residual vs fitted plot for test data
sns.set_style("darkgrid")
# Flatten the column vectors and rescale both axes to units of 1,000
fitted_k = np.divide(predictions_,1000).ravel()
resid_k = np.divide(residuals,1000).ravel()
ax = sns.scatterplot(x=fitted_k,y=resid_k,marker='o')
ax.set(xlabel='Predicted-y in 1k units', ylabel='Residuals in 1k units',title='Residuls vs Fitted Plot for Test Data')

### Insights - 
1. The above residual vs fitted plot shows only very loose signs of a pattern, which is what we want: as per the assumptions of linear regression, the residuals should not exhibit any pattern among themselves

## Assumption 2 : Mean of residuals should be close to 0

In [None]:
print('Mean of residuals (Should be 0):',np.mean(residuals))

### Insights : 
1. The mean of the residuals is not close to 0, so the model does not abide by this assumption

## Assumption 3 : Absence of Multi-Collinearity 

In [None]:
# Import library for VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

#------------------------------------------------------------------------------------
def calc_vif(X, add_intercept=False):
    """Variance inflation factor (VIF) of every column of a feature DataFrame.

    Parameters
    ----------
    X             : pandas DataFrame of explanatory variables.
    add_intercept : when True, a column of ones is appended to the design matrix
                    before the VIFs are computed (it is not reported in the result).
                    statsmodels' variance_inflation_factor regresses each column on
                    the others exactly as given and does NOT add a constant itself,
                    so without an intercept the VIFs of un-centered features can be
                    strongly inflated. Defaults to False to keep the original output.

    Returns
    -------
    DataFrame with columns "variables" (column name) and "VIF", in the order of X.columns.
    """
    design = X.values
    if add_intercept:
        #Intercept goes last so indices 0..n_features-1 still refer to the columns of X
        design = np.column_stack([design, np.ones(design.shape[0])])

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(design, i) for i in range(X.shape[1])]

    return vif

#------------------------------------------------------------------------------------
# VIF per training feature, largest first.
# Rule of thumb: a VIF above roughly 4-5 is usually considered problematic.
X_VIF = calc_vif(X).sort_values(by=['VIF'],ascending=False)
X_VIF

### Insights : 
1. From above, we can see PTRATIO and RM have very high multi-collinearity, We'll have to validate this with correlation analysis

### Validating the above findings of VIF through Spearman's rank correlation

In [None]:
sns.set_style("darkgrid")
# Spearman rank-correlation matrix of all columns (features and target), annotated with values
corr_matrix = input_ads.corr(method='spearman')
sns.heatmap(corr_matrix,annot=True)

## Assumption 4 : Homoscedasticity check
### Definition : The residuals of the model should have constant variance, i.e. they should not show any pattern

In [None]:
#Residual vs fitted plot for test data

sns.set_style("darkgrid")
ax = sns.scatterplot(x=np.divide(predictions_,1000).ravel(),y=np.divide(residuals,1000).ravel(),marker='o')
# Red reference line at residual = 0. x and y must be passed by keyword:
# current seaborn rejects them as positional arguments.
sns.lineplot(x=[0,800],y=[0,0],color='red')
ax.set(xlabel='Predicted-y in 1k units', ylabel='Residuals in 1k units',title='Residuls vs Fitted Plot for Test Data')

### Hypothesis Test for Homoscedasticity

In [None]:
import statsmodels.stats.api as sms
import statsmodels.api as sm
import pylab as py
from statsmodels.compat import lzip

#-------------------------------------------------------
# Labels for the first two values returned by the test
name = ['F statistic', 'p-value']
# Goldfeld-Quandt test, called with the test-set residuals and the UNSCALED test features.
# NOTE(review): the first argument of het_goldfeldquandt is presumably the regression's
# dependent variable rather than precomputed residuals - confirm this usage against the docs.
test = sms.het_goldfeldquandt(residuals, X_test)
# Pair each label with its value; zip stops at the shorter sequence, so any extra
# return values beyond the two labels are dropped
lzip(name, test)

### Insights : 
1. From the plot, we can see there is no definite pattern, suggesting no heteroscedasticity
2. From the hypothesis test above, we can see that p-value (~0.976) > alpha (0.05). Hence, we cannot reject the null hypothesis that the residuals are homoscedastic

## Assumption 5 : Normality of residuals

In [None]:
# Q-Q plot of the flattened residuals against a normal distribution that has the
# residuals' own mean and standard deviation; the 45-degree line marks a perfect match
resid_flat = residuals.ravel()
sm.qqplot(resid_flat, line ='45', loc=np.mean(resid_flat), scale=np.std(resid_flat))
py.show()

### Insights : 
1. The residuals are fairly aligned with normal distribution with limited deviation at the initial quantiles

## Hypothesis testing to check for Gaussian spread of residuals

In [None]:
import scipy.stats as stats

#-----------------------------------------------------------
# Shapiro-Wilk test on the flattened residuals; returns the test statistic and the p-value
shap_stat,shap_p = stats.shapiro(residuals.ravel())
print('Stat :',shap_stat)
print('p-value from SHAPIRO_WILKS test :',shap_p)

In [None]:
# Histogram + KDE of the residuals. sns.distplot is deprecated and has been removed from
# recent seaborn releases; histplot with kde=True and a density-normalized histogram is
# the documented replacement.
sns.histplot(residuals.ravel(),kde=True,stat='density',color='g')

### Insights : 
1. From the Q-Q plot, we could see that there was some minor deviation from normal distribution at the initial quantiles
2. From Shapiro-Wilks test we see that the p-value (~0.003) < alpha (0.05). Hence, we can reject the null hypothesis that the residual is normally distributed
3. From the kde-plot above, we can see that the distribution seems slightly left skewed (matching with initial quantiles of Q-Q plot)
4. All in all, the residuals are not normally distributed and hence the assumption is violated

# END