# MEST Day 1

## Morning Session
### Linear Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt

%matplotlib inline

In [None]:
# Mount Google Drive under /content/gdrive so the CSV files read below are reachable.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Load the training data, using the 'ID' column as the row index.
train_df = pd.read_csv('/content/gdrive/My Drive/boston/train.csv', index_col='ID')

In [None]:
# Summarise the columns, their dtypes and non-null counts.
train_df.info()

In [None]:
# Preview the first rows of the training frame.
train_df.head()

Can we see a visual relationship between rooms and medv?

In [None]:
# Preview only the two columns being compared: 'rm' and 'medv'.
train_df[['rm', 'medv']].head()

In [None]:
# Pull both columns out as NumPy arrays; later cells reuse _rooms and _medv.
_rooms, _medv = train_df['rm'].values, train_df['medv'].values

# Scatter plot of median value against number of rooms.
plt.scatter(x=_rooms, y=_medv)
plt.ylabel('Median Value')
plt.xlabel('No of Rooms')
plt.show()

Can we find an equation for this line?

In [None]:
# Endpoints of a hand-drawn guess line, running from (min rooms, min medv)
# to (max rooms, max medv).
# The array methods .min()/.max() are used instead of the builtins min()/max():
# they are vectorised, and they keep working when this cell is re-run after a
# later cell has rebound the builtin name `min`.
_r = np.linspace(_rooms.min(), _rooms.max()).reshape(-1, 1)
_m = np.linspace(_medv.min(), _medv.max()).reshape(-1, 1)

plt.scatter(_rooms, _medv)
plt.plot(_r, _m, color='red', linewidth=3)
plt.xlabel('No of Rooms')
plt.ylabel('Median Value')
plt.show()

# Function
* $y = wx + b$
* $y = w_0.x_0 + w_1.x_1$
* $y = w.x$
* $y$ is a predicted variable also called $y_{pred}$
* $w$ is a vector of weights
* $x$ is a vector of predictors

# Cost
* $L = \sqrt{\frac{\sum{(y - y_{pred})^2}}{n}}$ (root mean squared error)

* $y_{pred} = a + wx$
* vectorize the operation
* $y_{pred} = a.w_0 + x.w_1$
* $y_{pred} = \begin{bmatrix} a & x \end{bmatrix} \begin{bmatrix} w_0 \\ w_1 \end{bmatrix}$
* $y_{pred} = x * w$

# Define y

In [None]:
# Target vector: the 'medv' column as a NumPy array.
y = train_df['medv'].values
print(type(y))
print(y.shape)
# Turn the flat vector into a single-column matrix so it lines up with np.dot(x, w).
y = y[:, np.newaxis]
print(y.shape)

# Define x

In [None]:
# Add a column of ones so the intercept can be learned as an ordinary weight.
train_df['constant'] = 1

In [None]:
# Check that the new 'constant' column is present.
train_df.head()

In [None]:
# Design matrix: the constant (intercept) column followed by three predictors.
columns = ['constant', 'rm', 'zn', 'indus']
x = train_df[columns].values
print(type(x))

In [None]:
# (number of rows, number of columns) of the design matrix.
print(x.shape)

In [None]:
# Start every weight at zero: one row per column of x, a single output column.
w = np.zeros(shape=(x.shape[1], 1))
print(type(w), w.shape, w, sep='\n')

In [None]:
# Predictions of the linear model: one dot product per row of x.
y_pred = np.dot(x, w)

In [None]:
# Should match the shape of y so the two can be subtracted element-wise.
print(y_pred.shape)

In [None]:
# Residuals between the targets and the current predictions.
error = y - y_pred
print(error.shape)
squared_error = np.power(error, 2)
# RMSE is the square root of the *mean* squared error, i.e. sqrt(sum / n).
# Dividing after taking the root (sqrt(sum) / n) under-reports the error by a
# factor of sqrt(n).
root_mean_squared_error = sqrt(squared_error.sum() / y_pred.shape[0])

In [None]:
# Error of the all-zero starting weights.
print(root_mean_squared_error)

## Implement Gradient Descent
* learning rate $\alpha$
* gradient

* gradient of the squared error $(y - w.x)^2$ with respect to $w$ is $-2 * error * x$
* gradient update rule is $w = w - gradient * \alpha$

In [None]:
# Per-step history filled in by the training loop (cost and the two tracked weights).
costs, w_0_s, w_1_s = [], [], []

# Gradient-descent hyperparameters: number of iterations and step size.
steps = 20
learning_rate = 1e-3

In [None]:
# Batch gradient descent on the intercept (w[0]) and the second weight (w[1]) only;
# the remaining rows of w are never updated here.
for a in range(steps):

    w_0 = w[0][0]
    w_1 = w[1][0]

    # make prediction
    y_pred = np.dot(x, w)
    error = y - y_pred
    error_squared = np.power(error, 2)
    # cost function is LMS
    LMS = error_squared.sum() / (2 * y.shape[0])

    # record the state *before* this step's update
    costs.append(LMS)
    w_0_s.append(w_0)
    w_1_s.append(w_1)

    # update
    # Each weight moves along the mean of error * (that weight's feature column).
    # Column 0 of x is the constant column of ones, so its term is just error.sum().
    # x[:, [1]] selects the second *column* as an (n, 1) array. x[1] would be the
    # second *row*, whose shape (n_features,) broadcasts against error into an
    # (n, n_features) matrix and yields a meaningless gradient.
    w_0 = w_0 + learning_rate/y.shape[0] * error.sum()
    w_1 = w_1 + learning_rate/y.shape[0] * (error * x[:, [1]]).sum()

    w[0][0] = w_0
    w[1][0] = w_1

In [None]:
# Gather the per-step history into one frame, then plot the cost curve.
cost_df = pd.DataFrame({
    label: pd.Series(history)
    for label, history in (('cost', costs), ('w_0', w_0_s), ('w_1', w_1_s))
})
cost_df['cost'].plot()

In [None]:
# Full per-step table of cost and weights.
print(cost_df)

In [None]:
# Weights left over from the last loop iteration.
print(w_0, w_1)

Make a Prediction

In [None]:
# Two-weight model: the learned intercept and 'rm' coefficient applied to their columns.
_w = np.asarray([w_0, w_1])
_x = train_df[['constant', 'rm']].values
y_pred = np.dot(_x, _w)

In [None]:
# Compare observed values with the model's predictions, row by row.
_p = pd.DataFrame({
    'actual': train_df['medv'].values,
    'predicted': y_pred.reshape(-1),
})
_p.head()

## Matrix Multiplication Refresher

$y = w.x + b = w.x$

$\begin{bmatrix} y_{11} \\ y_{21} \\ y_{31} \\ y_{41} \end{bmatrix} = \begin{bmatrix} x_{11}  & x_{12} \\ x_{21} & x_{22} \\ x_{31} & x_{32} \\  x_{41} & x_{42} \end{bmatrix} \begin{bmatrix} w_{11}  \\ w_{21}  \end{bmatrix} + \begin{bmatrix} b_{11} \\ b_{21} \\ b_{31} \\ b_{41} \end{bmatrix} $

$\begin{bmatrix} y_{11} \\ y_{21} \\ y_{31} \\ y_{41} \end{bmatrix} = \begin{bmatrix} w_{11} * x_{11} + w_{21} * x_{12} \\ w_{11} * x_{21} + w_{21} * x_{22} \\ w_{11} * x_{31} + w_{21} * x_{32} \\ w_{11} * x_{41} + w_{21} * x_{42} \end{bmatrix} + \begin{bmatrix} b_{11} * 1 \\ b_{21} * 1 \\ b_{31} * 1 \\ b_{41} * 1 \end{bmatrix} $

$\begin{bmatrix} y_{11} \\ y_{21} \\ y_{31} \\ y_{41} \end{bmatrix} = \begin{bmatrix} w_{11} * x_{11} + w_{21} * x_{12} + b * 1 \\ w_{11} * x_{21} + w_{21} * x_{22} + b * 1 \\ w_{11} * x_{31} + w_{21} * x_{32} + b * 1 \\ w_{11} * x_{41} + w_{21} * x_{42} + b * 1 \end{bmatrix}$

$\begin{bmatrix} y_{11} \\ y_{21} \\ y_{31} \\ y_{41} \end{bmatrix} = \begin{bmatrix}  x_{11} & x_{12} & 1 \\  x_{21} & x_{22} & 1 \\  x_{31} & x_{32} & 1 \\  x_{41} & x_{42} & 1 \end{bmatrix} \bullet \begin{bmatrix} w_{11} \\ w_{21} \\ w_{31}  \end{bmatrix}$

### Implement SciPy

In [None]:
import pandas as pd
import numpy as np
from scipy.optimize import fmin, minimize

In [None]:
# Target as a single-column matrix of shape (n, 1).
y = train_df['medv'].values.reshape(-1, 1)

In [None]:
# Design matrix for the full model: the intercept column of ones plus the listed predictors.
train_df['constant'] = 1
columns = ['constant', 'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat']
x = train_df[columns].values

In [None]:
# Zero-initialised weights as a column vector, one row per column of x.
w = np.zeros([x.shape[1], 1])

In [None]:
# Show the starting weights.
print(w)

In [None]:
def pred(x, w):
    """Return the linear-model predictions: the dot product of features `x` and weights `w`."""
    predictions = np.dot(x, w)
    return predictions

In [None]:
# Baseline predictions from the all-zero weights.
y_pred = pred(x, w)

In [None]:
def loss(_w):
    """Root-mean-squared error of the linear model with weights `_w`.

    Reads the module-level `x` (features) and `y` (targets, shape (n, 1)).
    `_w` may be a column vector of shape (n_features, 1) or a flat vector of
    shape (n_features,), which is what an optimiser may pass in.
    """
    # Force a column vector so the predictions have the same (n, 1) shape as y.
    # With a flat _w the predictions are (n,), and y - p would silently
    # broadcast to an (n, n) matrix instead of n residuals.
    p = pred(x, np.reshape(_w, (-1, 1)))
    e = y - p
    se = np.power(e, 2)
    # RMSE = sqrt(mean squared error): divide by n before taking the root.
    rmse = np.sqrt(np.sum(se) / y.shape[0])
    return rmse

In [None]:
# Loss of the all-zero starting weights.
l = loss(w)

In [None]:
# Display the starting loss.
l

In [None]:
# Minimise the loss with scipy's fmin, starting from the zero weights, for at most 1000 iterations.
# NOTE(review): this rebinds the builtin name `min`; any cell run afterwards that calls
# min(...) as a function will fail. The cells below read this name, so it is left unchanged here.
min = fmin(loss, w, maxiter=1000)

In [None]:
# Display the weights returned by fmin.
min

In [None]:
# Loss at the optimised weights, for comparison with the starting loss.
loss(min)

In [None]:
# Predictions from the optimised weights.
y_min = pred(x, min)

In [None]:
# Side by side: the targets, the zero-weight baseline predictions (y_pred) and the optimised predictions.
out = pd.DataFrame({'y': y[:,0], 'y_pred': y_pred[:,0], 'y_min': pred(x, min)})

In [None]:
# Preview the first 15 rows of the comparison.
out.head(n=15)

In [None]:
# Same optimisation through the generic minimize() front end.
# minimize documents x0 as a 1-D array of shape (n,), so the (n_features, 1)
# column vector is flattened before it is passed in.
nms = minimize(loss, w.ravel(), method='nelder-mead')

In [None]:
# Shape of the solution vector returned by minimize.
nms.x.shape

In [None]:
# Same comparison as before, using the weights found by minimize (nms.x).
out_2 = pd.DataFrame({'y': y[:,0], 'y_pred': y_pred[:,0], 'y_min': pred(x, nms.x)})

In [None]:
# Preview the comparison.
out_2.head()

## Afternoon Session
### Scikit-Learn

In [None]:
import pandas as pd
import numpy as np
import math

from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# feature selection
from sklearn.feature_selection import RFECV

# pipeline
from sklearn.pipeline import Pipeline

# preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

# metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

# LinearRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# Neighbors
from sklearn.neighbors import KNeighborsRegressor

# Clustering
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# t-SNE
from sklearn.manifold import TSNE

# PCA
from sklearn.decomposition import PCA

In [None]:
# Re-read both CSV files; this replaces train_df, so columns added to it earlier are gone.
train_df = pd.read_csv('/content/gdrive/My Drive/boston/train.csv', index_col='ID')
test_df = pd.read_csv('/content/gdrive/My Drive/boston/test.csv', index_col='ID')

In [None]:
# Summarise the freshly loaded training frame.
train_df.info()

Create a subset of data to work with

In [None]:
# Candidate feature subset; defined here but not used in this cell.
use_cols = ['chas', 'nox', 'rm', 'dis', 'ptratio', 'lstat']
#predictors = train_df.drop('medv', axis=1)
# Start with 'rm' as the only predictor (kept as a one-column DataFrame) and 'medv' as the target.
predictors = train_df[['rm']]
target = train_df['medv']
print(predictors.shape)
print(target.shape)

Create training and validation datasets

In [None]:
# train test split
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.3, random_state=40)

### Implement Linear Regression

In [None]:
# Linear regression estimator with default settings.
lr = LinearRegression()

In [None]:
# Fit on the training rows only.
lr.fit(X_train, y_train)

In [None]:
# Predict the held-out rows.
y_pred = lr.predict(X_test)

In [None]:
# Observed vs. predicted values for the held-out rows.
_preds_df = pd.DataFrame({'observed': y_test, 'predicted': y_pred})
_preds_df.head()

How do we evaluate the accuracy of our model?

In [None]:
# Value of the estimator's .score() and the mean squared error, both on the held-out rows.
print('Score:', lr.score(X_test, y_test))
print('MSE:', mean_squared_error(y_test, y_pred))

### Can we build a better model?
* Let's start by looking at our data again.

In [None]:
# Re-draw the morning-session scatter plot and guess line.
# Relies on _rooms, _medv, _r and _m still being defined from the cells above.
plt.scatter(x=_rooms, y=_medv)
plt.plot(_r, _m, linewidth=3, color='red')
plt.ylabel('Median Value')
plt.xlabel('No of Rooms')
plt.show()

* do the data points form a linear relationship? What if we hypothesize a quadratic relationship?

$y = w_0 + w_1.x_1 + w_2.x_1^2$

In [None]:
# Predictors before adding the engineered feature.
predictors.head()

* Let's engineer a new feature by taking the square of the rooms

In [None]:
# Add the squared room count as a second feature.
# .assign returns a new frame, so nothing is written into a slice of train_df
# (which triggers SettingWithCopyWarning), and re-running the cell is harmless.
predictors = predictors.assign(rm_2=predictors['rm'] ** 2)

In [None]:
# Predictors after adding 'rm_2'.
predictors.head()

### Let's train a new model

In [None]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.3, random_state=40)

# Fit a second model on the current predictors and score it on the held-out rows.
lr_2 = LinearRegression()
lr_2.fit(X_train, y_train)
y_pred = lr_2.predict(X_test)

print('Score: {}'.format(lr_2.score(X_test, y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))

# A notebook only renders a cell's final expression, so the preview goes last;
# in the middle of the cell its result was silently discarded.
_preds_df = pd.DataFrame(dict(observed=y_test, predicted=y_pred))
_preds_df.head()

## Train a new model using more features

In [None]:
# List the available columns before choosing more features.
train_df.info()

In [None]:
# extract y
y = train_df['medv'].values

# engineer new features (optional): here, the cube of 'tax'.
# This adds the column to train_df itself, so later cells see it too.
train_df['new_feature'] = train_df['tax'].map(lambda tax: tax ** 3)

# extract x
feature_names = ['crim', 'tax', 'black', 'new_feature']
x = train_df[feature_names].values

In [None]:
#split the data using train_test_split

In [None]:
#use LinearRegression to `.fit()`

In [None]:
#evaluate using `.score()`

In [None]:
# TODO: plot the data with seaborn here. The cell previously held only the incomplete
# expression `sns.`, which is a SyntaxError and stops "Run All".

In [None]:
# Candidate feature subset; only referenced by the commented-out alternative below.
use_cols = ['chas', 'nox', 'rm', 'dis', 'ptratio', 'lstat']
# Use every column except the target. train_df still carries the 'new_feature'
# column added in the exercise cell above, so it is part of predictors here.
predictors = train_df.drop('medv', axis=1)
#predictors = train_df[use_cols]
target = train_df['medv']
print(predictors.shape)
print(target.shape)

In [None]:
# train test split
# Same 70/30 split and seed as before, now over the wider predictor set.
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.3, random_state=40)

In [None]:
# Two-stage pipeline: scale the features with MinMaxScaler, then fit a linear regression.
# NOTE: `steps` was the integer iteration count in the gradient-descent section;
# it is rebound to a list of (name, estimator) pairs here.
steps = [
    ('scaler', MinMaxScaler()),
    ('model', LinearRegression())
]
pipeline = Pipeline(steps)

In [None]:
# Hyperparameter grid for the pipeline; the 'model__' prefix routes a setting to the
# step named 'model'.
# LinearRegression's `normalize` option was removed in scikit-learn 1.2, and the
# pipeline already scales features with MinMaxScaler, so the grid searches
# `fit_intercept` instead (accepted by both old and new releases).
parameters = {
    'model__fit_intercept': [True, False]
}

In [None]:
# Cross-validated search over the grid, fitted on the training rows.
model = GridSearchCV(pipeline, parameters)
model.fit(X_train, y_train)

In [None]:
# Held-out score and mean squared error of the fitted grid search.
print('Score:', model.score(X_test, y_test))
print('MSE:', mean_squared_error(y_test, model.predict(X_test)))

In [None]:
# All columns except the target, plus the squared room count.
predictors = train_df.drop('medv', axis=1)
predictors['rm_2'] = predictors['rm'].map(lambda rooms: rooms ** 2)
columns = predictors.columns
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.3, random_state=40)

# Fit and evaluate on the held-out rows.
lr_3 = LinearRegression()
lr_3.fit(X_train, y_train)
print('Score:', lr_3.score(X_test, y_test))
print('MSE:', mean_squared_error(y_test, lr_3.predict(X_test)))

coefficients = lr_3.coef_
print(coefficients)

# Rank features by coefficient magnitude (smallest first) and draw them as horizontal bars.
feature_importance = pd.Series(coefficients, index=columns)
ordered_feature_importance = feature_importance.abs().sort_values()
ordered_feature_importance.plot.barh()
plt.show()

In [None]:
# Hand-picked feature subset plus the squared room count.
# .copy() gives predictors its own data, so adding a column below does not write
# into a slice of train_df (which triggers SettingWithCopyWarning).
predictors = train_df[['nox', 'rm', 'chas', 'dis', 'ptratio', 'lstat', 'rad']].copy()
predictors['rm_2'] = predictors['rm'].map(lambda x: x ** 2)
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.3, random_state=40)
columns = predictors.columns
# Fit and evaluate on the held-out rows.
lr_4 = LinearRegression()
lr_4.fit(X_train, y_train)
print('Score: {}'.format(lr_4.score(X_test, y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, lr_4.predict(X_test))))

In [None]:
# Same hand-picked feature subset, this time without a manual squared term.
predictors = train_df[['nox', 'rm', 'chas', 'dis', 'ptratio', 'lstat', 'rad']]
columns = predictors.columns
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.3, random_state=40)

# Scale the features, expand them into polynomial terms up to degree 3, then fit a linear regression.
steps = [
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(3)),
    ('model', LinearRegression()),
]
lr_5 = Pipeline(steps)
lr_5.fit(X_train, y_train)

print('Score:', lr_5.score(X_test, y_test))
print('MSE:', mean_squared_error(y_test, lr_5.predict(X_test)))