In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
from sklearn.datasets import load_boston

In [18]:
# Load the Boston housing dataset bundle; later cells read its 'DESCR',
# `data`, `feature_names` and `target` entries.
# NOTE(review): load_boston is not available in newer scikit-learn releases —
# confirm the installed version before a Restart & Run All.
data = load_boston()

In [19]:
# Show the free-text description that ships with the dataset (attribute list, etc.).
print(data['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [26]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the dataset's feature names.
X = pd.DataFrame(data.data, columns=data.feature_names)

In [27]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [28]:
# Regression target taken from the dataset bundle.
y = data.target

#### Splitting data

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

#### Scaling data

In [32]:
from sklearn.preprocessing import StandardScaler

In [33]:
# Scaler instance shared by the train and test transforms below.
sc = StandardScaler()

In [34]:
# Fit the scaler on the training split only and overwrite X_train with the scaled values.
# The result is a NumPy array (see the output of the inspection cell below), so the
# DataFrame column labels are lost from this point on.
X_train = sc.fit_transform(X_train)

In [35]:
# Apply the scaling learned from the training split to the test split (transform only, no refit).
X_test = sc.transform(X_test)

In [36]:
# Inspect the scaled training features.
X_train

array([[-0.42704101,  4.09805388, -1.2821606 , ..., -1.75847274,
         0.36848849, -1.47206228],
       [ 0.36426841, -0.48006999,  0.95531935, ...,  0.78573951,
         0.38490745, -1.46049687],
       [ 0.01647159, -0.48006999,  0.95531935, ...,  0.78573951,
         0.34736276, -0.35889127],
       ...,
       [-0.42578825, -0.48006999, -0.91795342, ...,  0.78573951,
         0.41511834, -0.36178262],
       [-0.41521993,  0.12231473, -0.52907882, ..., -1.52718072,
         0.43799542,  0.02999575],
       [-0.41982916,  2.41137667, -1.42581204, ..., -0.09317018,
         0.15208658, -1.09474068]])

#### Creating Linear Regression model

In [37]:
from sklearn.linear_model import LinearRegression

In [38]:
# Linear regression estimator with default settings.
lin_reg = LinearRegression()

In [39]:
# Fit on the scaled training features.
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [44]:
# NOTE: show_stats is defined in a later cell of this notebook; run that cell
# before this one. The scaled arrays are re-wrapped as DataFrame/Series because
# show_stats expects pandas inputs with the original column labels.
show_stats(
    lin_reg,
    pd.DataFrame(X_train, columns=X.columns),
    pd.DataFrame(X_test, columns=X.columns),
    pd.Series(y_train),
    pd.Series(y_test),
)

[34mR Squared Score:
[30mR Squared Score (Train Model):  0.74713
[30mR Squared Score (Test Model):  0.7124
 
[34mAdjusted R Squared Score:
[30mAdjusted R Squared (Train Model): 0.73746
[30mAdjusted R Squared (Test Model): 0.6853
 
[34mErrors : 
[30mMean Absolute Error : 3.8356963614189383
[30mRoot Mean Squared Error : 5.3429940362560915

[34mBias / Intercept: 
[30mIntercept for the model:  21.97684
 
[34mWeights / Coefficients: 
[0m
         Coefficients  F-stat(p-value) Significant
CRIM         -0.72909     1.173987e-19         Yes
ZN            1.04362     5.713584e-17         Yes
INDUS         0.14023     4.900260e-31         Yes
CHAS          0.94381     7.390623e-05         Yes
NOX          -2.01829     7.065042e-24         Yes
RM            2.18918     2.487229e-74         Yes
AGE           0.33464     1.569982e-18         Yes
DIS          -2.90740     1.206612e-08         Yes
RAD           2.31811     5.465933e-19         Yes
TAX          -1.77438     5.637734e-29  

In [45]:
#### Fitting SVM Regressor

In [46]:
from sklearn.svm import SVR

In [50]:
# Support vector regressor with a linear kernel.
svr = SVR(kernel='linear')

In [51]:
# Fit on the same scaled training features used for the linear regression.
svr.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [52]:
# NOTE: show_stats is defined in a later cell of this notebook; run that cell
# before this one. The scaled arrays are re-wrapped as DataFrame/Series because
# show_stats expects pandas inputs with the original column labels.
show_stats(
    svr,
    pd.DataFrame(X_train, columns=X.columns),
    pd.DataFrame(X_test, columns=X.columns),
    pd.Series(y_train),
    pd.Series(y_test),
)

[34mR Squared Score:
[30mR Squared Score (Train Model):  0.70662
[30mR Squared Score (Test Model):  0.6861
 
[34mAdjusted R Squared Score:
[30mAdjusted R Squared (Train Model): 0.6954
[30mAdjusted R Squared (Test Model): 0.65653
 
[34mErrors : 
[30mMean Absolute Error : 3.6685142988034025
[30mRoot Mean Squared Error : 5.581901921476146

[34mBias / Intercept: 
[30mIntercept for the model:  [21.04399]
 
[34mWeights / Coefficients: 
[0m
         Coefficients  F-stat(p-value) Significant
CRIM         -1.06792     1.173987e-19         Yes
ZN            0.97777     5.713584e-17         Yes
INDUS         0.27320     4.900260e-31         Yes
CHAS          0.67414     7.390623e-05         Yes
NOX          -0.94952     7.065042e-24         Yes
RM            2.82113     2.487229e-74         Yes
AGE          -0.29934     1.569982e-18         Yes
DIS          -1.60839     1.206612e-08         Yes
RAD           1.35165     5.465933e-19         Yes
TAX          -1.72805     5.637734e-29 

In [40]:
def show_stats(regression_model, X_train, X_test, y_train, y_test):
    """Print a summary of fit statistics for an already-fitted regression model.

    Printed sections, in order:
      1. R squared on the train and test splits (via ``regression_model.score``).
      2. Adjusted R squared on both splits.
      3. Mean absolute error and root mean squared error on the test split.
      4. The model intercept.
      5. A table with one row per feature: the model coefficient, the
         univariate F-test p-value and a Yes/No flag for p < 0.05.

    The p-values come from ``f_regression`` run on train and test rows
    together; they do not depend on ``regression_model``.

    Parameters
    ----------
    regression_model : fitted estimator exposing ``score``, ``predict``,
        ``intercept_`` and ``coef_`` (one coefficient per feature column).
    X_train, X_test : pandas.DataFrame
        Feature tables; ``X_test.columns`` labels the coefficient table.
    y_train, y_test : pandas.Series
        Targets matching the feature tables.

    Returns
    -------
    None on success, or the string ``'Exiting Method'`` when any argument is
    not of the expected pandas type (a diagnostic is printed first).
    """
    # Local imports: the notebook's import cell does not load these names.
    from colorama import Fore, Style
    from sklearn import metrics
    from sklearn.feature_selection import f_regression

    # 7. Validate the input types. Bail out if ANY argument is wrong; a single
    # bad argument would otherwise fail later with an obscure AttributeError.
    supplied = (
        ('X_train', X_train, pd.DataFrame),
        ('X_test', X_test, pd.DataFrame),
        ('y_train', y_train, pd.Series),
        ('y_test', y_test, pd.Series),
    )
    if not all(isinstance(value, expected) for _, value, expected in supplied):
        print(Fore.RED + '**** Please pass the data in expected format ****')
        print('You passed the data in the following format:')
        for name, value, _ in supplied:
            print(name + ' type you passsed: ', type(value))
        print(Fore.BLUE + 'Expected Formats are: ')
        for name, _, expected in supplied:
            print(name + ' type: ', expected)
        # Do not leave the console coloured after the diagnostic.
        print(Style.RESET_ALL)
        return 'Exiting Method'

    def _adjusted_r2(r2, n_rows, n_features):
        # Adjusted R squared; undefined (NaN) when there are no residual
        # degrees of freedom left.
        dof = n_rows - n_features - 1
        if dof <= 0:
            return float('nan')
        return 1 - (1 - r2) * (n_rows - 1) / dof

    # 1. R squared. float() + built-in round() work whether score() returns a
    # NumPy scalar or a plain Python float.
    print(Fore.BLUE + 'R Squared Score:')
    train_rsq = float(regression_model.score(X_train, y_train))
    test_rsq = float(regression_model.score(X_test, y_test))
    print(Fore.BLACK + 'R Squared Score (Train Model): ', round(train_rsq, 5))
    print(Fore.BLACK + 'R Squared Score (Test Model): ', round(test_rsq, 5))
    print(" ")

    # 2. Adjusted R squared
    print(Fore.BLUE + 'Adjusted R Squared Score:')
    train_adrsq = _adjusted_r2(train_rsq, X_train.shape[0], X_train.shape[1])
    test_adrsq = _adjusted_r2(test_rsq, X_test.shape[0], X_test.shape[1])
    print(Fore.BLACK + 'Adjusted R Squared (Train Model):', round(train_adrsq, 5))
    print(Fore.BLACK + 'Adjusted R Squared (Test Model):', round(test_adrsq, 5))
    print(' ')

    # 8. MAE and RMSE on the test split
    print(Fore.BLUE + "Errors : ")
    y_pred = regression_model.predict(X_test)
    print(Fore.BLACK + "Mean Absolute Error :", metrics.mean_absolute_error(y_test, y_pred))
    print(Fore.BLACK + "Root Mean Squared Error :", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print('')

    # 3. Intercept (np.round handles both scalar and array intercepts)
    print(Fore.BLUE + "Bias / Intercept: ")
    print(Fore.BLACK + "Intercept for the model: ", np.round(regression_model.intercept_, 5))
    print(' ')

    # =========== Coefficient & p-value table =====================
    # 4. Coefficients, flattened so a (1, n_features) coef_ also fits one column.
    print(Fore.BLUE + "Weights / Coefficients: ")
    print(Style.RESET_ALL)
    coefs = np.asarray(regression_model.coef_).reshape(-1)

    # 5. Univariate F-test p-values over train + test rows.
    # pd.concat replaces Series.append, which pandas 2.0 removed.
    X_all = pd.concat([X_train, X_test])
    y_all = pd.concat([y_train, y_test])
    p_values = f_regression(X_all, y_all)[1]

    # 6. Significance flag, compared on the unrounded p-value so values just
    # below the threshold are not rounded up to it.
    sig = ['Yes' if p_value < 0.05 else 'No' for p_value in p_values]

    coefs_df = pd.DataFrame(
        {
            'Coefficients': np.round(coefs, 5),
            'F-stat(p-value)': p_values,
            'Significant': sig,
        },
        index=X_test.columns,
    )
    print(coefs_df)
    print(' ')
    