In [None]:
import sys
import logging

import numpy as np
import scipy as sp
import sklearn
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Global plotting style: seaborn context, default figure size and grid style.
import seaborn as sn
sn.set_context("poster")
# NOTE(review): sn.set() applies its own default context, which likely
# overrides the "poster" context chosen on the previous line — confirm the
# intended order of these calls.
sn.set(rc={'figure.figsize': (16, 9.)})
sn.set_style("whitegrid")

# Let pandas display up to 120 rows / 120 columns before truncating output.
import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)

__Column Definitions__

1. CRIM: per capita crime rate by town 

2. ZN: proportion of residential land zoned for lots over 25,000 sq.ft. 

3. INDUS: proportion of non-retail business acres per town 

4. CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) 

5. NOX: nitric oxides concentration 

6. RM: average number of rooms per dwelling 

7. AGE: proportion of owner-occupied units built prior to 1940 

8. DIS: weighted distances to five Boston employment centers 

9. RAD: index of accessibility to radial highways 

10. TAX: full-value property-tax rate per \$10,000 

11. PTRATIO: pupil-teacher ratio by town 

12. B: $1000(B_k - 0.63)^2$, where $B_k$ is the proportion of Black residents by town 

13. LSTAT: % lower status of the population 

14. MEDV: Median value of owner-occupied homes in $1000s



In [None]:
# Column labels for the housing table, in file order; MEDV (the last one) is
# the value the models below try to predict.
columns_names = [
    'CRIM',
    'ZN',
    'INDUS',
    'CHAS',
    'NOX',
    'RM',
    'AGE',
    'DIS',
    'RAD',
    'TAX',
    'PTRATIO',
    'B',
    'LSTAT',
    'MEDV',
]

In [None]:
# Load the whitespace-delimited housing table. Because `names=` is supplied,
# pandas treats every row of the file as data (no header row is expected).
# `sep=r'\s+'` is the documented replacement for the deprecated
# `delim_whitespace=True` and parses the file the same way.
data_set = pd.read_csv('../input/housing.csv', names=columns_names, sep=r'\s+')

In [None]:
# Peek at the first rows to confirm the columns were parsed as expected.
data_set.head()

In [None]:
# Draw one histogram per column of the raw (unscaled) data, without grid lines.
data_set.hist(figsize=(15,10),grid=False)
plt.show()

In [None]:
# Summary statistics of the raw columns.
data_set.describe()

In [None]:
# Annotated heat map of the pairwise correlations between all columns.
corr_matrix = data_set.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sn.heatmap(corr_matrix, annot=True, ax=ax)

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize every column of the table -- the features and the MEDV target
# alike -- and wrap the result back into a labelled DataFrame.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split performed later in the notebook, so held-out rows influence
# the scaling statistics. Consider fitting it on the training split only.
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(data_set), columns=columns_names)

In [None]:
# Split the standardized table into the predictor matrix and the target vector.
feature_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]
target_name = 'MEDV'

X = scaled_df[feature_names]
y = scaled_df[target_name]

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
def pretty_print_linear(coefs, names = None, sort = False):
    """Render linear-model coefficients as a readable ``coef * name`` sum.

    Parameters
    ----------
    coefs : sequence of numbers
        Model coefficients, one per feature.
    names : sequence of str, optional
        Feature labels paired positionally with ``coefs``. May be a list,
        numpy array or pandas Index. When omitted, ``X0, X1, ...`` are used.
    sort : bool, default False
        If true, order the terms by decreasing absolute coefficient value.

    Returns
    -------
    str
        Terms of the form ``"<coef rounded to 3 decimals> * <name>"`` joined
        with ``" + "``.
    """
    # Identity check on purpose: `names == None` is evaluated element-wise by
    # numpy arrays / pandas Index objects and then fails in a boolean context.
    if names is None:
        names = ["X%s" % idx for idx in range(len(coefs))]
    terms = zip(coefs, names)
    if sort:
        # Largest magnitude first, regardless of sign.
        terms = sorted(terms, key = lambda term: -abs(term[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name)
                      for coef, name in terms)

In [None]:
# Fit an ordinary least-squares model on the training split.
linear_regression = LinearRegression()
model = linear_regression.fit(X_train, y_train)

# `score` on a regressor reports R^2 (coefficient of determination), not a
# classification accuracy. The two lines are labelled so the training and test
# figures can be told apart in the output.
print('The training R^2 of the Linear Regression is: {:.2f}'.format(model.score(X_train, y_train)))
print('The test R^2 of the Linear Regression is: {:.2f}'.format(model.score(X_test, y_test)))

# Show the fitted equation with the real feature names instead of X0..X12.
# A plain list is passed so the helper's default-name check sees an ordinary
# Python object.
pretty_print_linear(model.coef_, names=list(X_train.columns))

In [None]:
# Fit an L2-regularized linear model (default regularization strength) on the
# training split.
ridge = Ridge()
model = ridge.fit(X_train, y_train)

# `score` on a regressor reports R^2 (coefficient of determination), not a
# classification accuracy. The two lines are labelled so the training and test
# figures can be told apart in the output.
print('The training R^2 of the Ridge Regression is: {:.2f}'.format(model.score(X_train, y_train)))
print('The test R^2 of the Ridge Regression is: {:.2f}'.format(model.score(X_test, y_test)))

# Show the fitted equation with the real feature names instead of X0..X12.
# A plain list is passed so the helper's default-name check sees an ordinary
# Python object.
pretty_print_linear(model.coef_, names=list(X_train.columns))

In [None]:
# Fit an L1-regularized linear model (default regularization strength) on the
# training split.
# NOTE(review): with the default alpha and a standardized target, Lasso may
# shrink most or all coefficients to zero — confirm whether alpha should be
# tuned.
lasso = Lasso()
model = lasso.fit(X_train, y_train)

# `score` on a regressor reports R^2 (coefficient of determination), not a
# classification accuracy. The two lines are labelled so the training and test
# figures can be told apart in the output.
print('The training R^2 of the Lasso Regression is: {:.2f}'.format(model.score(X_train, y_train)))
print('The test R^2 of the Lasso Regression is: {:.2f}'.format(model.score(X_test, y_test)))

# Show the fitted equation with the real feature names instead of X0..X12.
# A plain list is passed so the helper's default-name check sees an ordinary
# Python object.
pretty_print_linear(model.coef_, names=list(X_train.columns))

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree with default (unconstrained) growth settings on the
# training split. A fixed seed makes the fitted tree -- and the scores and
# feature importances derived from it -- reproducible across runs; 42 matches
# the seed used for the train/test split.
model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# `score` on a regressor reports R^2 (coefficient of determination), not a
# classification accuracy. The two lines are labelled so the training and test
# figures can be told apart in the output.
print('The training R^2 of the Decision Tree Regression is: {:.2f}'.format(model.score(X_train, y_train)))
print('The test R^2 of the Decision Tree Regression is: {:.2f}'.format(model.score(X_test, y_test)))

In [None]:
# Horizontal bar chart of the importances exposed by the most recently fitted
# `model`, one bar per column of X.
n_features = X.shape[1]
bar_positions = np.arange(n_features)

fig, ax = plt.subplots()
ax.barh(bar_positions, model.feature_importances_, align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(X.columns)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
# Leave one unit of padding below the first bar and above the last one.
ax.set_ylim(-1, n_features)