## Boston Housing Exercise

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets
# Single seed reused as `random_state` for the train/test splits below so the
# splits are repeatable; NumPy's global RNG is seeded with the same value.
seed = 0
np.random.seed(seed)

In [None]:
from sklearn.datasets import load_boston

In [None]:
# Fetch the Boston Housing data bundled with sklearn and wrap the feature
# matrix in a DataFrame labelled with the dataset's own feature names.
# NOTE(review): load_boston has reportedly been deprecated and later removed
# from scikit-learn - confirm the installed version still ships it.
boston = load_boston()
bos = pd.DataFrame(boston.data, columns=boston.feature_names)
# Append the regression target as the last column of the frame.
bos['Price'] = boston.target

### Our goal will be to predict the price of housing based on the features in this data set

In [None]:
# For student reference: print the description text that ships with the
# dataset object (boston.DESCR), which lists the features in the
# Boston housing data set.
print(boston.DESCR)

In [None]:
# Only the last expression of a cell is rendered, so the first two summaries
# are printed explicitly; otherwise they would be evaluated and discarded.
print(bos.head())
print(bos.columns)
bos.describe()

In [None]:
# Features (X) are every column except the last one; the target (y) is the
# last column of the frame.
X = bos.loc[:, bos.columns[:-1]]
y = bos.loc[:, bos.columns[-1]]


In [None]:
# Hold out 30% of the rows for testing; the shared seed keeps the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=seed,
    shuffle=True,
)

### Exercise:  Use sklearn to fit a multiple linear regression model.  How will you decide which features to include?

In [None]:
# Plot CRIM against Price. The title and axis labels are set on the Axes
# returned by this plot so they cannot end up on a different figure
# (NOX vs Price has its own cell).
crim_ax = bos.plot(x='CRIM', y='Price', style='o',c='black')
crim_ax.set_title('CRIM vs Price')
crim_ax.set_xlabel('crime')
crim_ax.set_ylabel('Price')
plt.show()

In [None]:
# Plot NOX against Price, labelling the Axes object the plot call returns.
nox_ax = bos.plot(x='NOX', y='Price', style='o', c='coral')
nox_ax.set_title('NOX vs Price')
nox_ax.set_xlabel('NOX')
nox_ax.set_ylabel('Price')
plt.show()

In [None]:
# Overlay TAX and AGE against Price on shared axes. Both series use the
# x-axis and Price is on the y-axis, so a legend tells the series apart.
plt.scatter(bos.TAX, bos.Price, color='r', label='TAX')
plt.scatter(bos.AGE, bos.Price, color='g', label='AGE')
plt.xlabel('TAX / AGE')
plt.ylabel('Price')
plt.legend()
plt.show()

In [None]:
# List the column labels of the frame (the feature names plus 'Price').
bos.columns

In [None]:

# Box plot of the CRIM column, with the mean drawn in addition to the median.
plt.boxplot(bos['CRIM'], showmeans=True)

In [None]:
# Pairwise correlations between all columns (features and Price), rounded to
# two decimals and rendered as a table shaded with the 'coolwarm' colour map.
corr = bos.corr().round(decimals=2)
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Baseline model: a linear regression fitted on the training split, which at
# this point contains every column of `bos` except 'Price'.
from sklearn.linear_model import LinearRegression
slm = LinearRegression()
slm.fit(x_train, y_train)

In [None]:
# One fitted coefficient per feature, indexed by feature name.
coeff_df = pd.DataFrame({'Coefficient': slm.coef_}, index=X.columns)
coeff_df


In [None]:
# The fitted coefficients are used as a rough importance score. No feature
# scaling is applied before fitting, so each magnitude depends on the units
# of its feature.
importance = slm.coef_
feature_names = x_train.columns  # columns the model was fitted on
# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance, labelling each bar with its feature name
positions = range(len(importance))
plt.bar(positions, importance)
plt.xticks(positions, feature_names, rotation=90)
plt.ylabel('Coefficient')
plt.show()

### What is the coefficient of determination (r-squared) for your model?  What about the mean squared error?

In [None]:
# Predict on the held-out rows and line the predictions up against the true
# prices, keeping the index of y_test.
y_pred = slm.predict(x_test)
actual_vs_predict = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
actual_vs_predict.head()

In [None]:
def metrics(m,X,y):
    """Return the R-squared and adjusted R-squared of a fitted model.

    Parameters
    ----------
    m : fitted estimator exposing ``predict(X)``.
    X : feature matrix passed to ``m.predict``; ``X.shape[1]`` is used as the
        number of predictors in the adjustment.
    y : observed target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(r_squared, adj_r_squared)``, each rounded to 4 decimal places.
    """
    y_true = np.asarray(y, dtype=float)
    y_hat = np.asarray(m.predict(X), dtype=float)

    ss_residual = np.sum((y_true - y_hat) ** 2)
    ss_total = np.sum((y_true - y_true.mean()) ** 2)

    r_squared = 1 - ss_residual / ss_total

    n_obs = len(y_true)
    n_features = X.shape[1]
    # Adjust the unrounded R-squared: rounding first would feed a rounding
    # error, scaled by (n_obs - 1) / (n_obs - n_features - 1), into the result.
    adj_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_features - 1)

    return round(r_squared, 4), round(adj_r_squared, 4)
# Score the model once and reuse both values instead of predicting twice.
r_squared, adj_r_squared = metrics(slm,x_test,y_test)
print('R squared Value:', r_squared)
print('Adjusted R sqaure Value:', adj_r_squared)

In [None]:
# Import the scorers by name. `from sklearn import metrics` would rebind the
# name `metrics` and shadow the notebook's own metrics() helper, so re-running
# the R-squared cell afterwards would fail.
from sklearn.metrics import mean_absolute_error, mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

### Can you improve upon your original model?

#### Hint 1:  Look at the correlations of your features to your target - are there features you think are more important than others?  This is exploratory - just play with building different models

#### Hint 2:  Are there features you can engineer (categorical features based on binning the numeric features in the dataset) that may be useful?  How do you handle categorical features in MLR?

In [None]:
#Iteration 2
# Reduced feature set picked from the correlation table together with the
# coefficient sizes: the list below leaves out TAX, INDUS and AGE
# (DIS is still included).
columns=['CRIM', 'ZN', 'CHAS','DIS', 'NOX', 'RM', 'RAD','PTRATIO', 'B', 'LSTAT']
X = bos.loc[:, columns]
y = bos.loc[:, bos.columns[-1]]

# Same 70/30 split and seed as before, now on the reduced feature set
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=seed,
    shuffle=True,
)

from sklearn.linear_model import LinearRegression
slm = LinearRegression().fit(x_train, y_train)
# One fitted coefficient per selected feature, indexed by feature name.
coeff_df = pd.DataFrame({'Coefficient': slm.coef_}, index=X.columns)
coeff_df

In [None]:
# Held-out predictions of the iteration-2 model next to the true prices.
y_pred = slm.predict(x_test)
actual_vs_predict = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
actual_vs_predict.head()

In [None]:
def metrics(m,X,y):
    """Return the R-squared and adjusted R-squared of a fitted model.

    Parameters
    ----------
    m : fitted estimator exposing ``predict(X)``.
    X : feature matrix passed to ``m.predict``; ``X.shape[1]`` is used as the
        number of predictors in the adjustment.
    y : observed target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(r_squared, adj_r_squared)``, each rounded to 4 decimal places.
    """
    y_true = np.asarray(y, dtype=float)
    y_hat = np.asarray(m.predict(X), dtype=float)

    ss_residual = np.sum((y_true - y_hat) ** 2)
    ss_total = np.sum((y_true - y_true.mean()) ** 2)

    r_squared = 1 - ss_residual / ss_total

    n_obs = len(y_true)
    n_features = X.shape[1]
    # Adjust the unrounded R-squared: rounding first would feed a rounding
    # error, scaled by (n_obs - 1) / (n_obs - n_features - 1), into the result.
    adj_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_features - 1)

    return round(r_squared, 4), round(adj_r_squared, 4)

# Score the model once and reuse both values instead of predicting twice.
r_squared, adj_r_squared = metrics(slm,x_test,y_test)
print('R squared Value:', r_squared)
print('Adjusted R sqaure Value:', adj_r_squared)
# Import the scorers by name so the notebook's own metrics() helper is not
# shadowed by the sklearn.metrics module.
from sklearn.metrics import mean_absolute_error, mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))


In [None]:
#iteration3
# Feature set chosen from correlation and coefficient values combined.
# Compared with the iteration-2 list, this one swaps ZN for INDUS
# (AGE is not included).
columns=['CRIM', 'CHAS','DIS', 'NOX', 'RM','B','INDUS', 'RAD','PTRATIO', 'LSTAT']
X = bos[columns]
y = bos.iloc[:,-1]

# Split the data into a train test split
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=seed, shuffle=True)

from sklearn.linear_model import LinearRegression
slm = LinearRegression()
slm.fit(x_train, y_train)
coeff_df = pd.DataFrame(slm.coef_, X.columns, columns=['Coefficient'])
# This cell keeps going after these lines, so a bare expression here would
# not be rendered; print the tables explicitly.
print(coeff_df)
y_pred = slm.predict(x_test)
actual_vs_predict = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(actual_vs_predict.head())

def metrics(m,X,y):
    """Return the R-squared and adjusted R-squared of a fitted model.

    Parameters
    ----------
    m : fitted estimator exposing ``predict(X)``.
    X : feature matrix passed to ``m.predict``; ``X.shape[1]`` is used as the
        number of predictors in the adjustment.
    y : observed target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(r_squared, adj_r_squared)``, each rounded to 4 decimal places.
    """
    y_true = np.asarray(y, dtype=float)
    y_hat = np.asarray(m.predict(X), dtype=float)

    ss_residual = np.sum((y_true - y_hat) ** 2)
    ss_total = np.sum((y_true - y_true.mean()) ** 2)

    r_squared = 1 - ss_residual / ss_total

    n_obs = len(y_true)
    n_features = X.shape[1]
    # Adjust the unrounded R-squared: rounding first would feed a rounding
    # error, scaled by (n_obs - 1) / (n_obs - n_features - 1), into the result.
    adj_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_features - 1)

    return round(r_squared, 4), round(adj_r_squared, 4)

# Score the model once and reuse both values instead of predicting twice.
r_squared, adj_r_squared = metrics(slm,x_test,y_test)
print('R squared Value:', r_squared)
print('Adjusted R sqaure Value:', adj_r_squared)
# Import the scorers by name so the notebook's own metrics() helper is not
# shadowed by the sklearn.metrics module.
from sklearn.metrics import mean_absolute_error, mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Draw histograms of the columns in `bos` to eyeball their distributions.
bos.hist()

In [None]:
## using log transform to normalise data

In [None]:
from sklearn.preprocessing import FunctionTransformer
# Apply log1p, i.e. log(1 + x), to every column of the frame.
transformer = FunctionTransformer(np.log1p)
bos_new=pd.DataFrame(transformer.transform(bos))
bos_new.columns=bos.columns
X_new = bos_new.iloc[:,:-1]
# Only the features are log-transformed. The target is read from the
# untransformed frame explicitly rather than relying on a `y` left over
# from an earlier cell.
y = bos.iloc[:,-1]
x_train, x_test, y_train, y_test = train_test_split(X_new,y,test_size=0.3, random_state=seed, shuffle=True)


from sklearn.linear_model import LinearRegression
slm = LinearRegression()
slm.fit(x_train, y_train)
# Coefficients of the model fitted on the log-transformed features.
coeff_df = pd.DataFrame(slm.coef_, X_new.columns, columns=['Coefficient'])
y_pred = slm.predict(x_test)
actual_vs_predict = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

def metrics(m,X,y):
    """Return the R-squared and adjusted R-squared of a fitted model.

    Parameters
    ----------
    m : fitted estimator exposing ``predict(X)``.
    X : feature matrix passed to ``m.predict``; ``X.shape[1]`` is used as the
        number of predictors in the adjustment.
    y : observed target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(r_squared, adj_r_squared)``, each rounded to 4 decimal places.
    """
    y_true = np.asarray(y, dtype=float)
    y_hat = np.asarray(m.predict(X), dtype=float)

    ss_residual = np.sum((y_true - y_hat) ** 2)
    ss_total = np.sum((y_true - y_true.mean()) ** 2)

    r_squared = 1 - ss_residual / ss_total

    n_obs = len(y_true)
    n_features = X.shape[1]
    # Adjust the unrounded R-squared: rounding first would feed a rounding
    # error, scaled by (n_obs - 1) / (n_obs - n_features - 1), into the result.
    adj_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_features - 1)

    return round(r_squared, 4), round(adj_r_squared, 4)
print('co efficients are:',coeff_df)
# Score the model once and reuse both values instead of predicting twice.
r_squared, adj_r_squared = metrics(slm,x_test,y_test)
print('R squared Value:', r_squared)
print('Adjusted R sqaure Value:', adj_r_squared)
# Import the scorers by name so the notebook's own metrics() helper is not
# shadowed by the sklearn.metrics module.
from sklearn.metrics import mean_absolute_error, mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))


In [None]:
# Preview actual vs predicted prices for the model fitted on the
# log-transformed features in the previous cell.
actual_vs_predict.head()

In [None]:
from sklearn.linear_model import Ridge

# Fit a ridge regression, constructed with no arguments, on the current
# training split.
ridge = Ridge()
ridge.fit(x_train, y_train)

# Show the fitted intercept followed by the coefficient vector.
for fitted_param in (ridge.intercept_, ridge.coef_):
    print(fitted_param)

In [None]:
from sklearn.linear_model import Lasso

# Fit a lasso regression, constructed with no arguments, on the current
# training split.
lasso = Lasso()
lasso.fit(x_train, y_train)

# Show the fitted intercept followed by the coefficient vector.
for fitted_param in (lasso.intercept_, lasso.coef_):
    print(fitted_param)

In [None]:
# Preview the log1p-transformed copy of the data.
bos_new.head()

### Make a scatterplot of the observations in the test data, where the x-axis is the actual price and the y axis is the predicted price from your favorite model.  What does this plot tell you about your regression model?

In [None]:
#Iteration 1
# Rebuild the iteration-1 model (all original features) inside this cell.
# By now `actual_vs_predict` holds the predictions of the most recently
# fitted model, so plotting it would not show iteration 1.
X_iter1 = bos.iloc[:, :-1]
y_iter1 = bos.iloc[:, -1]
x_train_iter1, x_test_iter1, y_train_iter1, y_test_iter1 = train_test_split(
    X_iter1, y_iter1, test_size=0.3, random_state=seed, shuffle=True)
slm_iter1 = LinearRegression().fit(x_train_iter1, y_train_iter1)

plt.scatter(y_test_iter1, slm_iter1.predict(x_test_iter1), color='r')
plt.title('Iteration 1: actual vs predicted price')
plt.xlabel('Actual')
plt.ylabel('Predict')

In [None]:
#Iteration 4 after Log transform
# Actual vs predicted prices for the most recently fitted model, drawn
# through an explicit Axes handle.
fig, ax = plt.subplots()
ax.scatter(actual_vs_predict['Actual'], actual_vs_predict['Predicted'], color='r')
ax.set_xlabel('Actual')
ax.set_ylabel('Predict')
