In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Boston housing CSV from the Kaggle input directory and preview the first rows.
boston=pd.read_csv('../input/boston-housing/boston_housing.csv')
boston.head()

In [None]:
# Number of missing values in each column.
boston.isna().sum()

In [None]:
# Show missing values: render the missingno matrix for the whole frame.
msno.matrix(boston);

***Descriptive Statistics of Each Feature***

In [None]:
def MissingUniqueStatistics(df):
    """Summarise every column of ``df`` in a single table.

    For each column the table reports the number of entries, the count and
    ratio of missing values, the dtype, the distinct values (and how many
    there are) and, for numeric columns, the mean, standard deviation,
    minimum, quartiles (Q1, Q2, Q3) and maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is read only; nothing is modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, indexed by ``Variable`` and sorted by
        ``%_Missing_Value`` in descending order. Non-numeric (and boolean)
        columns hold NaN in the numeric statistic columns.
    """
    stat_names = ['Mean', 'STD', 'Min', 'Q1', 'Q2', 'Q3', 'Max']
    columns = ['Variable', '#_Total_Entry', '#_Missing_Value', '%_Missing_Value',
               'Data_Type', 'Unique_Values', '#_Unique_Values'] + stat_names

    records = []
    # Work on the frame that was passed in, not on a notebook-level global.
    for col in df.columns:
        series = df[col]
        n_total = series.shape[0]
        n_missing = series.isna().sum()
        unique_values = list(series.unique())  # NaN counts as a distinct value here

        record = {
            'Variable': col,
            '#_Total_Entry': n_total,
            '#_Missing_Value': n_missing,
            # The ratio is undefined for a frame without rows.
            '%_Missing_Value': round(n_missing / n_total, 4) if n_total else np.nan,
            'Data_Type': series.dtype,
            'Unique_Values': unique_values,
            '#_Unique_Values': len(unique_values),
        }

        # Only numeric columns have these statistics; everything else gets a real NaN
        # (not the string 'NaN') so the statistic columns stay numeric.
        is_numeric = (pd.api.types.is_numeric_dtype(series)
                      and not pd.api.types.is_bool_dtype(series))
        if is_numeric:
            q1, q2, q3 = series.quantile([0.25, 0.5, 0.75])
            stat_values = [series.mean(), series.std(), series.min(),
                           q1, q2, q3, series.max()]
        else:
            stat_values = [np.nan] * len(stat_names)
        record.update(zip(stat_names, stat_values))
        records.append(record)

    data_info_df = pd.DataFrame(records, columns=columns).set_index('Variable')
    return data_info_df.sort_values(by='%_Missing_Value', ascending=False)



In [None]:
# Build the per-column summary table for the Boston frame and display it.
data_info = MissingUniqueStatistics(boston)
data_info

In [None]:
#Target Value Distribution
# Histogram + KDE of `medv` on a density scale, with a fitted normal curve on top.
(mu, sigma) = stats.norm.fit(boston['medv'])

fig, ax = plt.subplots(figsize=(12, 9))
# sns.distplot is deprecated; histplot(stat='density', kde=True) draws the histogram/KDE pair.
sns.histplot(boston['medv'], stat='density', kde=True, ax=ax)

# Draw the fitted normal density explicitly and label that curve itself, so the legend
# entry cannot end up attached to a different line.
grid = np.linspace(boston['medv'].min(), boston['medv'].max(), 200)
ax.plot(grid, stats.norm.pdf(grid, mu, sigma), color='black',
        # raw string: '\m' and '\s' are not valid escape sequences in a normal string
        label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma = $ {: .2f})'.format(mu, sigma))
ax.legend(loc = 'best')
ax.set_ylabel('Frekans')

#Probability Plot
fig = plt.figure()
stats.probplot(boston['medv'], plot = plt)
plt.show()

In [None]:
# Preview the last rows of the frame.
boston.tail()

In [None]:
# Pairwise correlation matrix of all columns.
boston.corr()

In [None]:
# Column pairs whose absolute correlation exceeds 0.8
corr_matrix = boston.corr().abs()
row_pos, col_pos = np.where(corr_matrix > 0.8)
high_corr_var = [
    (corr_matrix.columns[r], corr_matrix.columns[c])
    for r, c in zip(row_pos, col_pos)
    if r < c  # upper triangle only: skips the diagonal and mirrored duplicates
]
high_corr_var

In [None]:
# Heatmap of the k columns with the largest absolute correlation to medv
k = 10  # number of variables for heatmap
corr = boston.corr().abs()
cols = corr['medv'].nlargest(k).index
cm = np.corrcoef(boston[cols].to_numpy().T)

sns.set(font_scale=1.25)
fig, ax = plt.subplots(figsize=(10, 10))
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
    cmap='RdYlGn',
)
plt.show()

In [None]:
# Features (target excluded) whose absolute correlation with at least one feature
# that precedes them in column order is above 0.7.
correlation_matrix = boston.loc[:, boston.columns != 'medv'].corr()

correlated_features = {
    correlation_matrix.columns[i]
    for i in range(len(correlation_matrix.columns))
    for j in range(i)
    if abs(correlation_matrix.iloc[i, j]) > 0.7
}

correlated_features

In [None]:
# Absolute correlation of every column with the dependent variable
cor_target = boston.corr()["medv"].abs()
# Keep only the columns whose absolute correlation is above 0.7
relevant_features = cor_target.loc[cor_target > 0.7]
relevant_features

Based on the correlation results above, I will drop the variable 'RAD' ('TAX' affects the target variable more than 'RAD' does).

In [None]:
# Pair plot of all columns, requested with corner=True, '+' scatter markers and
# unfilled diagonal plots.
# NOTE(review): `palette` is passed without `hue` — presumably it has no effect; confirm.
sns.pairplot(boston,palette='coolwarm',height=1.5,corner=True,plot_kws=dict(marker="+", linewidth=1),diag_kws=dict(fill=False));

In [None]:
# Scatter every explanatory variable against medv in a single row of panels
feature_cols = ['crim','zn','indus','chas','nox','rm','age','dis','rad','tax','ptratio','black','lstat']
pp = sns.pairplot(
    data=boston,
    x_vars=feature_cols,
    y_vars=['medv'],
    plot_kws={'marker': "D", 'linewidth': 1},
)
pp.fig.set_size_inches(20, 3)

As you can see, there are strong relationships among the explanatory variables. Multicollinearity can be reduced by standardizing the data.

# ***Detecting Multicollinearity with VIF***

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from statsmodels.tools.tools import add_constant


# Independent variables: every column except the last one, plus a constant column
X = add_constant(boston.iloc[:, :-1])

# One VIF per design-matrix column, stored next to the column name
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(X.values, idx) for idx in range(len(X.columns))],
})

print(vif_data)

No variable has a VIF value greater than 10, but the VIF values of 'rad' and 'tax' are greater than 5.

We can use ridge regression or principal components to solve the multicollinearity problem. Since the degree of multicollinearity is low, there is no need to remove variables.

# ***Outlier Plotting***

In [None]:
#Box Plot Each Numeric Features in Data
# One figure per column: draw the box plot, label the y-axis with the column name, show it.
for col in boston.columns:
    sns.boxplot(data = [boston[col]], linewidth = 1, width = 0.5) 
    plt.ylabel(col)
    plt.title("IQR")
    plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Features are all columns but the last; the target is the last column.
X=boston.iloc[:,:-1]
y=boston.iloc[:,-1]
# Hold out 30% of the rows as the test split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# ***Variable Definitions and OLS Regression Results***

In [None]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

# Fit an OLS model of y on X (with an added constant column) and print its summary.
# Note that this uses the full X / y, not the train split.
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

When we fit the basic multiple regression model without any transformation:
* our adjusted R^2 is 0.73. (The regression result of the given model shows that 73% of the variation in medv is explained jointly by these explanatory variables.)
* the F-statistic is 108.1
* the p-values of the indus and age features are > 0.05


In [None]:
#Partial Regression Plots
# Grid of partial regression plots for the fitted OLS model, resized and re-laid-out.
fig = sm.graphics.plot_partregress_grid(est2)
fig.set_size_inches(15.5, 18.5)
fig.tight_layout(pad=1.0)

# ***Multiple Linear Regression***

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split and predict the test split.
regressor = LinearRegression()

regressor.fit(X_train,y_train)
y_pred= regressor.predict(X_test)

# Show the first five predictions.
print(y_pred[0:5])

In [None]:
# Print the fitted coefficients and the intercept of the linear regression.
print('coefficients of all features (ß1,ß2,...): ' + str(regressor.coef_))
print('intercept of model (ß0): ' + str(regressor.intercept_))

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# RMSE and R-squared of the linear regression predictions on the test split.
test_set_rmse = (np.sqrt(mean_squared_error(y_test, y_pred)))
test_set_r2 = r2_score(y_test, y_pred)

print(test_set_rmse)
print(test_set_r2)

In [None]:
# Actual vs. predicted values on the test split, with a fitted straight trend line.
# R-squared is computed from the predictions instead of being hard-coded, so the
# annotation always matches the model that produced y_pred.
r_squared = r2_score(y_test, y_pred)
plt.scatter(y_test,y_pred)
plt.xlabel('Actual values')
plt.ylabel('Predicted values')

# Degree-1 polynomial fit of predictions on actual values, drawn over the sorted unique actuals.
plt.plot(np.unique(y_test), np.poly1d(np.polyfit(y_test, y_pred, 1))(np.unique(y_test)))

plt.text(7,0.5, 'R-squared = %0.2f' % r_squared)
plt.show()

As you can see here, there are observations that break normality in the data (such as influential or high-leverage points).

# *Generate Prediction Intervals*

In [None]:
# Fit a fresh linear regression on the training split and predict the test split.
model = LinearRegression()
fit_model = model.fit(X_train, y_train)
predictions = fit_model.predict(X_test)

def get_prediction_interval(prediction, y_test, test_predictions, pi=.95):
    """Return ``(lower, prediction, upper)`` for a single predicted value.

    The half-width of the interval is a normal quantile for coverage ``pi``
    multiplied by the residual standard deviation, which is estimated from
    ``y_test`` and ``test_predictions`` as the square root of the sum of
    squared errors divided by ``len(y_test) - 2``.
    """
    # Residual standard deviation from the test errors
    squared_error_total = np.sum((y_test - test_predictions)**2)
    residual_std = np.sqrt(1 / (len(y_test) - 2) * squared_error_total)

    # Two-sided critical value of the standard normal for the requested coverage
    tail_mass = 1 - pi
    critical_value = stats.norm.ppf(1 - (tail_mass / 2))

    # Symmetric bounds around the point prediction
    half_width = critical_value * residual_std
    return prediction - half_width, prediction, prediction + half_width
# Interval for the first test prediction, using the default 95% coverage.
print('prediction interval of first value :')
get_prediction_interval(predictions[0], y_test, predictions)

# *Residual Plotting*
To analyze the variance of the error of the regressor. 

In [None]:
#1-
# Residuals of the linear regression on the test split, plotted against X_test.
# NOTE(review): X_test holds several columns, so every feature is presumably drawn on
# the same axes in one colour — confirm this is the intended plot.
residuals = y_test-y_pred
plt.plot(X_test,residuals, 'o', color='darkblue')
plt.title("Residual Plot")
plt.xlabel("Independent Variable")
plt.ylabel("Residual");

In [None]:
#2-
from yellowbrick.regressor import ResidualsPlot
from sklearn.linear_model import Ridge

# Yellowbrick residuals plot for a Ridge model: fit on the training split,
# score on the test split, then render the figure.
model = Ridge()
visualizer = ResidualsPlot(model)

visualizer.fit(X_train, y_train)  
visualizer.score(X_test, y_test)  
visualizer.show();                

If the points are randomly dispersed around the horizontal axis, a linear regression model is usually appropriate for the data; otherwise, a non-linear model is more appropriate. 

In [None]:
# Same residuals visualizer for `model`, this time with hist=False and qqplot=True.
visualizer = ResidualsPlot(model, hist=False, qqplot=True)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show();

A Q-Q plot is a common way to check that residuals are normally distributed. The Q-Q plot shows that there are outliers.

# ***Influence plots***

In [None]:
# Influence plot of the fitted OLS model (est2), called with criterion="cooks".
fig = sm.graphics.influence_plot(est2, criterion="cooks")

As you can see, there are a few worrisome observations. 380, 418, 405 and 410 have high leverage but a low residual. 364, 368, 372, 371, 369 and 370 have high residuals and small leverage.

# *Heteroscedasticity*
* In regression analysis, heteroscedasticity refers to the unequal scatter of residuals.
* Heteroscedasticity is a problem because ordinary least squares (OLS) regression assumes that the residuals come from a population that has homoscedasticity (constant variance)

In [None]:
import statsmodels.formula.api as smf

#fit regression model
# Formula-based OLS of medv on the thirteen listed columns of the full frame.
fit = smf.ols('medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat', data=boston).fit()

#view model summary
print(fit.summary())

In [None]:
from statsmodels.compat import lzip
import statsmodels.stats.api as sms

#perform Breusch-Pagan test
# Run it on the residuals and the design matrix of the formula model `fit`.
names = ['Lagrange multiplier statistic', 'p-value',
        'f-value', 'f p-value']
test = sms.het_breuschpagan(fit.resid, fit.model.exog)

# Pair each label in `names` with the corresponding element of the test output.
lzip(names, test)

The p-value is less than 0.05, so we reject the null hypothesis (H0: homoscedasticity is present).

Variable transformations can be done, but we will use the standardize method to minimize variance while setting up the final model.

# ***Feature Selection and Modelling***

In [None]:
#Drop RAD
# Re-assign instead of dropping in place, and ignore a missing column, so that
# re-running this cell does not raise a KeyError once 'rad' is already gone.
boston = boston.drop(columns=['rad'], errors='ignore')

In [None]:
#Drop Outlier according to LOF
from sklearn.neighbors import LocalOutlierFactor
# Local Outlier Factor with 20 neighbours, fitted on every column of `boston`;
# `pred` holds the label that fit_predict returns for each row.
clf=LocalOutlierFactor(n_neighbors=20)

pred=clf.fit_predict(boston)
pred

* 1:Normal observation
* -1: Anomaly observation

In [None]:
# Keep only the rows whose outlier label equals 1.
boston = boston.loc[pred == 1]

In [None]:
#Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()

# Feature columns to rescale; the target 'medv' is left untouched.
scale_cols = ['crim', 'zn', 'indus', 'chas',
              'nox', 'rm', 'age', 'dis', 'tax',
              'ptratio', 'black', 'lstat']

# Work on an explicit copy: `boston` is a filtered selection at this point, and
# assigning columns into it directly can trigger SettingWithCopyWarning.
boston = boston.copy()
boston[scale_cols] = mms.fit_transform(boston[scale_cols])

In [None]:
"""#Standardization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
boston[['crim', 'zn', 'indus', 'chas',
      'nox', 'rm', 'age', 'dis', 'tax',
       'ptratio', 'black', 'lstat', 'medv']] = scaler.fit_transform(boston[['crim', 'zn', 'indus', 'chas',
                                                                           'nox', 'rm', 'age', 'dis', 'tax',
                                                                           'ptratio', 'black', 'lstat', 'medv']])"""

In [None]:
# Preview the first rows of the frame at this point.
boston.head()

In [None]:
# Features: every column except the last one. Copy so later edits to X never touch `boston`.
X = boston.iloc[:, :-1].copy()
# Target as a 1-D Series rather than a one-column DataFrame: the scikit-learn
# estimators used below expect a 1-D y and warn when given a column vector.
Y = boston['medv']

In [None]:
#LassoCV Feature Selection
from sklearn.linear_model import LassoCV

# Fit a 10-fold LassoCV on the full X / Y, then report the chosen alpha and the
# score on that same data.
reg=LassoCV(cv=10)
reg.fit(X,Y)
print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
print("Best score using built-in LassoCV: %f" %reg.score(X,Y))
# Coefficients labelled by feature name.
coef = pd.Series(reg.coef_, index = X.columns)

In [None]:
# Count the features with non-zero and zero Lasso coefficients and report both.
n_kept = sum(coef != 0)
n_eliminated = sum(coef == 0)
print("Lasso picked " + str(n_kept) + " variables and eliminated the other " + str(n_eliminated) + " variables")

In [None]:
# Horizontal bar chart of the Lasso coefficients, sorted in ascending order.
imp_coef = coef.sort_values()
import matplotlib
# This changes the default figure size for every later figure as well.
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind = "barh")
plt.title("Feature importance using Lasso Model");

Here the Lasso model has kept all the features except CRIM, so I will drop CRIM.

In [None]:
# Drop CRIM from the feature matrix. Re-assigning with errors='ignore' keeps the cell
# safe to re-run (an in-place drop raises KeyError once the column is already gone).
X = X.drop(columns=['crim'], errors='ignore')

In [None]:
# Hold out 20% of the rows as the test split; x / y keep the remaining 80%.
x, X_test, y, y_test = train_test_split(X, Y, train_size=0.8,test_size=0.2, random_state=101)

In [None]:
# Split the remaining x / y 75/25 into training and validation splits.
X_train, X_valid, y_train, y_valid = train_test_split(x, y, train_size=0.75,test_size=0.25, random_state=101)

Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Train / validation RMSE of a random forest for each max_depth from 1 to 15
param_range = list(range(1, 16))
train_errors, valid_errors = [], []

for depth in param_range:
    random_forest = RandomForestRegressor(max_depth=depth, n_estimators=100, random_state=1)
    random_forest.fit(X_train, y_train)

    train_mse = mean_squared_error(y_train, random_forest.predict(X_train))
    valid_mse = mean_squared_error(y_valid, random_forest.predict(X_valid))
    train_errors.append(np.sqrt(train_mse))
    valid_errors.append(np.sqrt(valid_mse))

# Plot both error curves against the depth
plt.xlabel('max_depth')
plt.ylabel('root mean_squared_error')
plt.plot(param_range, train_errors, label="train rmse")
plt.plot(param_range, valid_errors, label="validation rmse")
plt.legend()
plt.show()

In [None]:
# Refit the random forest with max_depth=4 on the training split.
random_forest = RandomForestRegressor(max_depth=4, n_estimators=100, random_state=1)
random_forest.fit(X_train, y_train)

In [None]:
# RMSE and R-squared of the random forest on the training split
train_pred = random_forest.predict(X_train)

root_mean_squared_error = np.sqrt(mean_squared_error(y_train, train_pred))
print(root_mean_squared_error)

train_set_r2 = r2_score(y_train, train_pred)
print(train_set_r2)

In [None]:
# RMSE and R-squared of the random forest on the validation split
valid_pred = random_forest.predict(X_valid)

root_mean_squared_error = np.sqrt(mean_squared_error(y_valid, valid_pred))
print(root_mean_squared_error)

valid_set_r2 = r2_score(y_valid, valid_pred)
print(valid_set_r2)

In [None]:
# RMSE and R-squared of the random forest on the test split
test_pred = random_forest.predict(X_test)

root_mean_squared_error = np.sqrt(mean_squared_error(y_test, test_pred))
print(root_mean_squared_error)

test_set_r2 = r2_score(y_test, test_pred)
print(test_set_r2)

In [None]:
#Random Forest Regressor with CV
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated RMSE of the depth-4 random forest.
# Cross-validate on the training split: the test split is the final hold-out and must
# not be used to fit models, which is what cross-validating on it would do.
cross_val_scores = cross_val_score(
    RandomForestRegressor(max_depth=4, n_estimators=100, random_state=1),
    X_train, y_train, scoring='neg_mean_squared_error', cv=5)
# The scorer returns negative MSE; convert each fold to RMSE.
cross_val_scores = np.sqrt(np.abs(cross_val_scores))
print(cross_val_scores)
print("mean:", np.mean(cross_val_scores))

In [None]:
from sklearn.model_selection import validation_curve
# Cross-validated train / validation RMSE of a random forest over `param_range` depths.
# param_name and param_range are passed by keyword: they are keyword-only in current
# scikit-learn releases, so passing them positionally raises a TypeError.
train_scores, valid_scores = validation_curve(
    RandomForestRegressor(n_estimators=100, random_state=1), X_train, y_train,
    param_name="max_depth", param_range=param_range,
    scoring='neg_mean_squared_error', cv=5)
# The scorer returns negative MSE; convert to RMSE.
train_scores = np.sqrt(np.abs(train_scores))
valid_scores = np.sqrt(np.abs(valid_scores))

# Average over the CV folds (axis 1) to get one value per depth.
train_scores_mean = np.mean(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)

plt.title("Validation Curve with Random Forest")
plt.xlabel("max_depth")
plt.ylabel("RMSE")
plt.plot(param_range, train_scores_mean, label="train rmse")
plt.plot(param_range, valid_scores_mean, label="validation rmse")

plt.legend()
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression, Ridge 
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor 
import xgboost as xgb 
import lightgbm as lgb

# Candidate regressors. The randomised ensembles get a fixed random_state so that
# re-running this cell reproduces the same scores.
mods = [LinearRegression(), Ridge(), GradientBoostingRegressor(random_state=1),
        RandomForestRegressor(random_state=1), BaggingRegressor(random_state=1),
        xgb.XGBRegressor(), lgb.LGBMRegressor()]

fitted = [mod.fit(X_train,y_train) for mod in mods]

# Score of every fitted model on the training split.
model_df = pd.DataFrame({
    'Model': [type(i).__name__ for i in fitted],
    'Score': [i.score(X_train,y_train) for i in fitted]
    })

model_df

In [None]:
# Candidate regressors. The randomised ensembles get a fixed random_state so that
# re-running this cell reproduces the same scores.
mods = [LinearRegression(), Ridge(), GradientBoostingRegressor(random_state=1),
        RandomForestRegressor(random_state=1), BaggingRegressor(random_state=1),
        xgb.XGBRegressor(), lgb.LGBMRegressor()]

fitted = [mod.fit(X_train,y_train) for mod in mods]

# Score of every fitted model on the validation split.
model_df = pd.DataFrame({
    'Model': [type(i).__name__ for i in fitted],
    'Score': [i.score(X_valid,y_valid) for i in fitted]
    })

model_df

In [None]:
# Candidate regressors. The randomised ensembles get a fixed random_state so that
# re-running this cell reproduces the same scores.
mods = [LinearRegression(), Ridge(), GradientBoostingRegressor(random_state=1),
        RandomForestRegressor(random_state=1), BaggingRegressor(random_state=1),
        xgb.XGBRegressor(), lgb.LGBMRegressor()]

fitted = [mod.fit(X_train,y_train) for mod in mods]

# Score of every fitted model on the test split.
model_df = pd.DataFrame({
    'Model': [type(i).__name__ for i in fitted],
    'Score': [i.score(X_test,y_test) for i in fitted]
    })

model_df

In [None]:
# Bar chart of the scores in `model_df` (whichever comparison cell ran last),
# with the y-axis fixed to the 0.50-0.95 range and vertical model names.
plt.bar(model_df['Model'], model_df['Score'], color = (0.5,0.1,0.5,0.6))
plt.title('Performance Compare')
plt.xlabel('Algorithms')
plt.ylabel('Values')
plt.ylim(0.50,0.95)
plt.xticks(model_df['Model'],rotation='vertical');