The Boston Housing Dataset

The Boston Housing Dataset is derived from information collected by the U.S. Census Service concerning housing in the area of Boston, MA. The following describes the dataset columns:

CRIM - per capita crime rate by town

ZN - proportion of residential land zoned for lots over 25,000 sq.ft.

INDUS - proportion of non-retail business acres per town.

CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)

NOX - nitric oxides concentration (parts per 10 million)

RM - average number of rooms per dwelling

AGE - proportion of owner-occupied units built prior to 1940

DIS - weighted distances to five Boston employment centres

RAD - index of accessibility to radial highways

TAX - full-value property-tax rate per $10,000

PTRATIO - pupil-teacher ratio by town

N - 1000(N - 0.63)^2 where N is the proportion of Non-Americans by town

LSTAT - % lower status of the population

MEDV - Median value of owner-occupied homes in $1000's


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory
# and print the paths one per line, in os.walk order.
input_files = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The target is MEDV (Median value of owner-occupied homes)

In [None]:
# I will use a deep learning method, and MinMaxScaler for scaling.
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
# First of all, get the dataset ready for modelling.
# The CSV is read without a header row and split on runs of whitespace, so
# the column names are supplied explicitly.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'N', 'LSTAT', 'MEDV',
]
housing = pd.read_csv(
    "/kaggle/input/boston-house-prices/housing.csv",
    sep=r"\s+",
    names=column_names,
    header=None,
)
housing.head()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
print(housing.shape)

In [None]:
# Summary statistics for every column of the data
summary_stats = housing.describe()
print(summary_stats)

In [None]:
# For predicting MEDV, the ZN and CHAS columns are not necessary,
# and the rows where MEDV is 50.00 or above are not necessary either.
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Draw one box plot per column on a 2 x 7 grid of axes (flattened so the
# axes can be addressed with a single running position).
fig, axs = plt.subplots(ncols=7, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for position, (_name, column) in enumerate(housing.items()):
    sns.boxplot(y=column, ax=axs[position])
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

In [None]:
    # The box plots above suggest outliers in several columns (e.g. CRIM, ZN,
    # RM and N). Report, for every column, the percentage of rows that fall
    # outside Tukey's fences [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
    n_rows = len(housing)
    for k, v in housing.items():
        q1 = v.quantile(0.25)
        q3 = v.quantile(0.75)
        iqr = q3 - q1
        # Strict comparisons: a value lying exactly on a fence is not an
        # outlier. With <= / >=, a column whose IQR is 0 (q1 == q3) would
        # have every value sitting on the quartiles counted as an outlier.
        is_outlier = (v < q1 - 1.5 * iqr) | (v > q3 + 1.5 * iqr)
        perc = is_outlier.sum() * 100.0 / n_rows
        print("Column %s outliers = %.2f%%" % (k, perc))

In [None]:
# Drop the rows whose MEDV is 50.0 or more; every row for which that
# comparison is False is kept. Then show the new dimensions.
at_or_above_cap = housing['MEDV'] >= 50.0
housing = housing[~at_or_above_cap]
print(housing.shape)

In [None]:
# Plot the distribution of every column (histogram with a KDE overlay) on a
# 2 x 7 grid of axes, using the filtered data.
fig, axs = plt.subplots(ncols=7, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for position, (_name, column) in enumerate(housing.items()):
    # sns.distplot is deprecated; histplot on a density scale with a KDE
    # curve is its documented axes-level replacement (cut=3 mirrors the
    # old KDE extent).
    sns.histplot(
        column,
        kde=True,
        stat="density",
        kde_kws={"cut": 3},
        ax=axs[position],
    )
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

In [None]:
# Heat map of the absolute pairwise correlations between all columns,
# with each cell annotated by its value.
abs_corr = housing.corr().abs()
plt.figure(figsize=(20, 10))
sns.heatmap(abs_corr, annot=True)

In [None]:
# In the matrix above, the TAX and RAD columns appear highly correlated, but the LSTAT, INDUS, RM, TAX, NOX and PTRATIO columns have low correlation.
from sklearn import preprocessing
# Scale the selected predictor columns with MinMaxScaler, then plot each one
# against MEDV with a fitted regression line on a 2 x 4 grid of axes.
min_max_scaler = preprocessing.MinMaxScaler()
column_sels = ['LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE']
x = housing.loc[:, column_sels]
y = housing['MEDV']
# fit_transform returns a bare array, so the frame is rebuilt with y's row
# labels. Rows were filtered out of `housing` earlier, so a fresh 0..n-1
# index would no longer match y and any label-aligned operation between x
# and y would silently pair the wrong rows.
x = pd.DataFrame(
    data=min_max_scaler.fit_transform(x),
    columns=column_sels,
    index=y.index,
)
fig, axs = plt.subplots(ncols=4, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for i, k in enumerate(column_sels):
    sns.regplot(y=y, x=x[k], ax=axs[i])
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

In [None]:
# Reduce skewness: apply log(1 + value) to the target, and to every feature
# column whose absolute skewness is greater than 0.3.
y = np.log1p(y)
skewed_cols = [col for col in x.columns if np.abs(x[col].skew()) > 0.3]
for col in skewed_cols:
    x[col] = np.log1p(x[col])

In [None]:
# Start modelling: split the data into train and test sets, then fit the scaler on the train set and transform both sets with it.
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training and hold out the rest for testing;
# random_state pins the shuffle so the split is the same on every run.
# (A previous extra train_test_split(x, y) call whose result was discarded
# has been removed.)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=20
)
# Learn the (-1, 1) scaling from the training features only and reuse the
# same fitted scaler for the test features.
scaler = MinMaxScaler((-1, 1))
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic net regression: alpha is the total penalty strength and l1_ratio
# is the share of that penalty given to the L1 norm.
elastic = ElasticNet(alpha=0.05, l1_ratio=0.5, max_iter=1000)

# Fit on the training data; the fitted estimator is kept as `model`.
model = elastic.fit(x_train, y_train)

# score() is R2 for this regressor; evaluate on both splits.
r2_train, r2_test = model.score(x_train, y_train), model.score(x_test, y_test)

print("R2 Score for train data is ", r2_train)
print("R2 Score for test data is ", r2_test)
model.coef_

In [None]:
# import ridge from linear model
from sklearn.linear_model import Ridge

# Ridge regression (instead of plain linear regression) with penalty
# strength alpha = 0.3, trained on the already prepared train split.
ridge = Ridge(alpha=0.3)
model = ridge.fit(x_train, y_train)

# score() is R2 for this regressor; evaluate on both splits.
r2_train, r2_test = model.score(x_train, y_train), model.score(x_test, y_test)

print("R2 Score for train data is ", r2_train)
print("R2 Score for test data is ", r2_test)
model.coef_

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary linear regression fitted on the train split.
lr = LinearRegression()
model = lr.fit(x_train, y_train)

# score() is R2 for this regressor; evaluate on both splits.
r2_train, r2_test = model.score(x_train, y_train), model.score(x_test, y_test)

print("R2 Score for train data is ", r2_train)
print("R2 Score for test data is ", r2_test)

# Keep the fitted coefficients under their own name and display them.
linear_coef = model.coef_
linear_coef

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the training features. Note that `scaler` is rebound here to a
# StandardScaler; the result is stored under a new name so x_train itself is
# left untouched.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_transformed = scaler.transform(x_train)

In [None]:
from sklearn.linear_model import SGDRegressor
# Linear regression trained with stochastic gradient descent; alpha=0
# switches the regularisation penalty off.
# SGD shuffles the training data between epochs, so without a fixed
# random_state the scores printed below change from run to run. The seed
# matches the one used for the train/test split.
predictor = SGDRegressor(alpha=0, max_iter=10000, random_state=20)

# Fit the model with the standardised train data.
model = predictor.fit(x_train_transformed, y_train)

# Apply the scaling learned from the train data to the test data.
x_test_transformed = scaler.transform(x_test)

# score() is R2 for this regressor; evaluate on both splits.
r2_train = model.score(x_train_transformed, y_train)
r2_test = model.score(x_test_transformed, y_test)

print("R2 Score for train data is ", r2_train)
print("R2 Score for test data is ", r2_test)
model.coef_