## Boston Housing Exercise

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import datasets
# Single fixed seed: applied to NumPy's global RNG here and passed as
# random_state to every train_test_split call below so results are repeatable.
seed = 0
np.random.seed(seed)

In [None]:
from sklearn.datasets import load_boston

In [None]:
# Load the Boston Housing dataset that ships with sklearn.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell
# presumably needs an older scikit-learn release - confirm the environment.
boston = load_boston()
# Build the dataframe with the feature names as column labels in one step
bos = pd.DataFrame(boston.data, columns=boston.feature_names)
# Append the target variable as the last column
bos['Price'] = boston.target

### Our goal will be to predict the price of housing based on the features in this data set

In [None]:
# For student reference: the descriptions of the features in the Boston housing
# data set. print() renders the line breaks in the text; a bare `boston.DESCR`
# expression would display the string's repr with escaped "\n" on one long line.
print(boston.DESCR)

In [None]:
# Preview the first rows: the feature columns plus the appended 'Price' target
bos.head()

In [None]:
# Separate the target (y) from the features (X) by column name.
# 'Price' is the last column of `bos`, so X is every column before it.
y = bos['Price']
X = bos.drop(columns='Price')

In [None]:
# Hold out 25% of the rows as a test set; the split is shuffled and seeded
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=seed,
    shuffle=True,
)

### Exercise:  Use sklearn to fit a multiple linear regression model.  How will you decide which features to include?

In [None]:
# Baseline: ordinary least squares on the current training split
lreg = LinearRegression().fit(x_train, y_train)
pred = lreg.predict(x_test)

# Root mean squared error of the test-set predictions
residuals = y_test - pred
rmse = np.sqrt(np.mean(residuals ** 2))

# R^2 on the test set, and the version adjusted for the number of predictors
r2 = lreg.score(x_test, y_test)
n_obs = len(y_test)
n_feat = x_test.shape[1]
adj_r2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print("Root Mean Squared Error: {}".format(rmse))
print("R^2: {}".format(r2))
print("Adjusted R^2: {}".format(adj_r2))

In [None]:
# Fit a ridge (L2-regularised) linear regression with sklearn's default settings
ridge = Ridge()
ridge.fit(x_train, y_train)

# Predict and score with `ridge` so the metrics describe this model rather
# than the earlier unregularised `lreg` fit.
pred = ridge.predict(x_test)
rmse = np.sqrt(np.mean((y_test - pred) ** 2))

# R^2 on the test set, and the version adjusted for the number of predictors
r2 = ridge.score(x_test, y_test)
n_obs, n_feat = x_test.shape
adj_r2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print("Root Mean Squared Error: {}".format(rmse))
print("R^2: {}".format(r2))
print("Adjusted R^2: {}".format(adj_r2))

In [None]:
# Fit a lasso (L1-regularised) linear regression with sklearn's default settings
lasso = Lasso()
lasso.fit(x_train, y_train)

# Predict and score with `lasso` so the metrics describe this model rather
# than the earlier unregularised `lreg` fit.
pred = lasso.predict(x_test)
rmse = np.sqrt(np.mean((y_test - pred) ** 2))

# R^2 on the test set, and the version adjusted for the number of predictors
r2 = lasso.score(x_test, y_test)
n_obs, n_feat = x_test.shape
adj_r2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print("Root Mean Squared Error: {}".format(rmse))
print("R^2: {}".format(r2))
print("Adjusted R^2: {}".format(adj_r2))

In [None]:
# Candidate feature subset: this list leaves out the 'TAX' column
feature_cols = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM',
    'AGE', 'DIS', 'RAD', 'PTRATIO', 'B', 'LSTAT',
]
X = bos[feature_cols]
# Re-split with the same seed and test fraction as before
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=seed, shuffle=True
)

In [None]:
# Refit ordinary least squares on the current (reduced) training split
lreg = LinearRegression().fit(x_train, y_train)
pred = lreg.predict(x_test)

# Root mean squared error of the test-set predictions
squared_errors = (y_test - pred) ** 2
rmse = np.sqrt(np.mean(squared_errors))

# R^2 on the test set, and the version adjusted for the number of predictors
r2 = lreg.score(x_test, y_test)
n_obs = len(y_test)
n_feat = x_test.shape[1]
adj_r2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print("Root Mean Squared Error: {}".format(rmse))
print("R^2: {}".format(r2))
print("Adjusted R^2: {}".format(adj_r2))

In [None]:
# Candidate feature subset: this list leaves out both 'TAX' and 'AGE'
feature_cols = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM',
    'DIS', 'RAD', 'PTRATIO', 'B', 'LSTAT',
]
X = bos[feature_cols]

# Re-split with the same seed and test fraction, then fit ordinary least squares
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=seed, shuffle=True
)
lreg = LinearRegression().fit(x_train, y_train)
pred = lreg.predict(x_test)

# Root mean squared error of the test-set predictions
rmse = np.sqrt(np.mean((y_test - pred) ** 2))

# R^2 on the test set, and the version adjusted for the number of predictors
r2 = lreg.score(x_test, y_test)
n_obs = len(y_test)
n_feat = x_test.shape[1]
adj_r2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print("Root Mean Squared Error: {}".format(rmse))
print("R^2: {}".format(r2))
print("Adjusted R^2: {}".format(adj_r2))

### What is the coefficient of determination (r-squared) for your model?  What about the mean squared error?

In [None]:
# Model on the thirteen predictors listed here, including 'TAX' and 'AGE'
feature_cols = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'DIS',
    'RAD', 'TAX', 'AGE', 'PTRATIO', 'B', 'LSTAT',
]
X = bos[feature_cols]

# Re-split with the same seed and test fraction, then fit ordinary least squares
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=seed, shuffle=True
)
lreg = LinearRegression().fit(x_train, y_train)
pred = lreg.predict(x_test)

# Root mean squared error of the test-set predictions (printed in the next cell)
rmse = np.sqrt(np.mean((y_test - pred) ** 2))

In [None]:
# Report test-set metrics for the most recently fitted `lreg`
r2 = lreg.score(x_test, y_test)
n_obs = len(y_test)
n_feat = x_test.shape[1]
# Adjusted R^2 penalises R^2 for the number of predictors in the model
adj_r2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print("Root Mean Squared Error: {}".format(rmse))
print("R^2: {}".format(r2))
print("Adjusted R^2: {}".format(adj_r2))

### Can you improve upon your original model?

#### Hint 1:  Look at the correlations of your features to your target - are there features you think are more important than others?  This is exploratory - just play with building different models.

#### Hint 2:  Are there features you can engineer (categorical features based on binning the numeric features in the dataset) that may be useful?  How do you handle categorical features in MLR?

In [None]:
# Pairwise correlations between every column of `bos` (features and 'Price')
bos_corr = bos.corr()
# Show the matrix as a heat-mapped table; the last expression is the cell output
bos_corr.style.background_gradient(cmap='coolwarm')

### Make a scatterplot of the observations in the test data, where the x-axis is the actual price and the y-axis is the predicted price from your favorite model.  What does this plot tell you about your regression model?

In [None]:
# Actual test-set prices (x) against the predictions currently held in `pred` (y)
fig, ax = plt.subplots()
ax.scatter(y_test, pred)
ax.set_xlabel('Price')
ax.set_ylabel('Predicted_Price')
plt.show()