## Coding Exercise #0302

### 1. Linear regression and diagnostics:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn import metrics
%matplotlib inline

#### 1.1. Load the 'Boston' dataset from Scikit-Learn:

In [None]:
# Load the Boston housing dataset that ships with scikit-learn.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release — confirm the environment.
data = load_boston()

In [None]:
# Display the dictionary keys of the loaded dataset object
# ('data', 'feature_names', 'target' and 'DESCR' are the ones used below).
data.keys()

In [None]:
# Print the dataset's description text stored under the 'DESCR' key.
description = data['DESCR']
print(description)

In [None]:
# Explanatory variables (X) together with their column names (header).
X, header = data['data'], data['feature_names']

In [None]:
# Response variable, reshaped into a single column so that it can be
# placed next to X later on.
Y = data['target'].reshape(-1, 1)

#### 1.2. Convert the data into a DataFrame and then explore:

In [None]:
# Put the explanatory variables and the response side by side in one
# DataFrame; the response column is labelled 'PRICE'.
df = pd.DataFrame(np.hstack((X, Y)), columns=[*header, 'PRICE'])

In [None]:
# First five rows.
df.head(n=5)

In [None]:
# Last five rows.
df.tail(n=5)

In [None]:
# Descriptive statistics of the variables (columns), PRICE included.
df.describe()

In [None]:
# Pair-wise correlation matrix of all columns, rounded to two decimals.
df.corr().round(2)

In [None]:
# Draw the pair-wise correlation matrix as a heatmap.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, cmap='coolwarm')
plt.show()

In [None]:
# Scatter plot of RM (column 5 of X) against PRICE.
rm_values = X[:, 5]
price_values = Y[:, 0]
plt.xlabel('RM')
plt.ylabel('PRICE')
plt.scatter(rm_values, price_values, c='g', s=15, alpha=0.5)
plt.show()

#### 1.3. Train by linear regression:

In [None]:
# Fit an ordinary linear regression (with intercept) on the whole dataset.
# `fit` returns the estimator itself, so construction and fitting are chained.
lm = LinearRegression(fit_intercept=True).fit(X, Y)
lm

In [None]:
# The fitted intercept term of the model.
lm.intercept_

In [None]:
# The remaining coefficients (parameters): the fitted slope for each explanatory variable.
lm.coef_

In [None]:
# Collect the fitted parameters in a one-row DataFrame: one column per
# explanatory variable, followed by an 'Intercept' column.
parametersDF = pd.DataFrame(
    lm.coef_,
    index=['Parameter Value'],
    columns=header,
).assign(Intercept=lm.intercept_[0])
parametersDF

#### 1.4. Diagnostics:

In [None]:
# In-sample prediction: predict the response for the explanatory variables X.
predY = lm.predict(X)

In [None]:
# Scatter plot of the real response (x-axis) against the model's
# prediction (y-axis).
plt.xlabel('REAL PRICE')
plt.ylabel('PREDICTED PRICE')
plt.scatter(Y, predY, c='blue', s=15, alpha=0.5)
plt.show()

In [None]:
# Correlation between the real Y and the predicted Y.
real_series = pd.Series(Y[:, 0])
predicted_series = pd.Series(predY[:, 0])
real_series.corr(predicted_series)

In [None]:
# Coefficient of determination (R^2) of the model, evaluated on X and Y.
lm.score(X,Y)

#### 1.5. In-sample and out-of-sample testing:

In [None]:
# Split the dataset: 30% of the rows are held out for testing, and the
# fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=123)
# Show the shape of each of the four pieces.
for part in (X_train, X_test, Y_train, Y_test):
    print(part.shape)

In [None]:
# Refit the model on the training split only, then predict on both splits:
#   predY_in  = in-sample prediction of Y (training rows).
#   predY_out = out-of-sample prediction of Y (test rows).
lm = LinearRegression().fit(X_train, Y_train)
predY_in, predY_out = lm.predict(X_train), lm.predict(X_test)

In [None]:
# Compare the error on the training split with the error on the test split.
# The predictions are predY_in / predY_out from the cell above; the names
# Y_pred_train / Y_pred_test used previously were never defined.
mse_in = metrics.mean_squared_error(Y_train, predY_in)
mse_out = metrics.mean_squared_error(Y_test, predY_out)
print('In-sample MSE is      : ' + str(mse_in))
print('Out-of-sample MSE is  : ' + str(mse_out))
print('-'*50)
# RMSE is the square root of the MSE, so the values computed above are reused.
print('In-sample RMSE is     : ' + str(np.sqrt(mse_in)))
print('Out-of-sample RMSE is : ' + str(np.sqrt(mse_out)))

NOTE: The in-sample errors are a bit smaller, but not by much.

#### 1.6. Residual analysis:

In [None]:
# Calculate the residual on the training split: real Y minus in-sample prediction.
residual = Y_train - predY_in

In [None]:
# Q: Can you check "visually" that the mean = 0 and variance = constant?
# Scatter of the training residuals against the real training response.
plt.title('Residual')
plt.xlabel('Y')
plt.ylabel('Residual')
plt.scatter(Y_train, residual, c='red', s=15, alpha=0.5)
plt.show()

In [None]:
# Q: Are the residuals normally distributed centered around 0?
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay and
# density scaling gives the equivalent histogram-plus-curve picture.
# The residuals are a single column, so they are flattened to 1-D first
# (histplot would otherwise treat the 2-D array as wide-form data).
sns.histplot(residual.ravel(), bins=50, color='green', kde=True, stat='density').set_title("Residual Histogram")
plt.show()

#### 1.7. Given a new set of values for the explanatory variables, predict the response:  
- CRIM     : 0.03
- ZN       : 0.0
- INDUS    : 13.0
- CHAS     : 0.0
- NOX      : 0.4
- RM       : 4.3
- AGE      : 23.5
- DIS      : 1.9
- RAD      : 1.0
- TAX      : 273.0
- PTRATIO  : 18.0 
- B        : 380.0
- LSTAT    : 7.5

In [None]:
# New observation: one value per explanatory variable, in the order listed above.
new_values = [
    0.03,   # CRIM
    0.0,    # ZN
    13.0,   # INDUS
    0.0,    # CHAS
    0.4,    # NOX
    4.3,    # RM
    23.5,   # AGE
    1.9,    # DIS
    1.0,    # RAD
    273.0,  # TAX
    18.0,   # PTRATIO
    380.0,  # B
    7.5,    # LSTAT
]
X_new = np.array([new_values])  # Wrapped in a list so the array is a single row.
Y_pred_new = lm.predict(X_new)
print(np.round(Y_pred_new[0, 0], 3))