# MEST Day 2

## Morning Session
### Bias, Variance, Regularization

In [1]:
import pandas as pd
import numpy as np
import math

from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# feature selection
from sklearn.feature_selection import RFECV

# pipeline
from sklearn.pipeline import Pipeline

# preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

# metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

# LinearRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# Neighbors
from sklearn.neighbors import KNeighborsRegressor

# Clustering
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# t-SNE
from sklearn.manifold import TSNE

# PCA
from sklearn.decomposition import PCA

In [None]:
# Read the train and test CSVs from ./boston, using the ID column as the row index.
train_df = pd.read_csv(
    './boston/train.csv',
    index_col='ID',
)
test_df = pd.read_csv(
    './boston/test.csv',
    index_col='ID',
)

Let's over-engineer a Linear Regression Model

In [None]:
# Deliberately over-engineered model: min-max scale seven predictors, expand
# them into degree-3 polynomial features, then fit a plain LinearRegression.
predictors = train_df[['nox', 'rm', 'chas', 'dis', 'ptratio', 'lstat', 'rad']]
target = train_df['medv']
columns = predictors.columns

# Hold out 30% of the rows; random_state pins the split so reruns match.
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=40
)

steps = [
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(3)),
]
pipe = Pipeline(steps)
pipe.fit(X_train, y_train)

# Transform each split once and reuse the resulting matrices.
X_train_cubic = pipe.transform(X_train)
X_test_cubic = pipe.transform(X_test)

lr = LinearRegression()
lr.fit(X_train_cubic, y_train)
print('Score: {}'.format(lr.score(X_test_cubic, y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, lr.predict(X_test_cubic))))

### Our loss was reducing, and our model quality was improving, so what went wrong?

Let's create two models and compare the magnitude of coefficients

In [None]:
# Baseline for the coefficient comparison: scaling only, no polynomial terms.
s1 = [('scaler', MinMaxScaler())]
p = Pipeline(s1)
p.fit(X_train, y_train)

# Transform each split once and reuse the resulting matrices.
X_train_scaled = p.transform(X_train)
X_test_scaled = p.transform(X_test)

lr_linear = LinearRegression()
lr_linear.fit(X_train_scaled, y_train)
print('Score: {}'.format(lr_linear.score(X_test_scaled, y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, lr_linear.predict(X_test_scaled))))
print(lr_linear.coef_)

In [None]:
# Second model for the comparison: scaling followed by degree-2 polynomial terms.
s2 = [
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(2)),
]
p_2 = Pipeline(s2)
p_2.fit(X_train, y_train)

# Transform each split once and reuse the resulting matrices.
X_train_quad = p_2.transform(X_train)
X_test_quad = p_2.transform(X_test)

lr_quad = LinearRegression()
lr_quad.fit(X_train_quad, y_train)
print('Score: {}'.format(lr_quad.score(X_test_quad, y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, lr_quad.predict(X_test_quad))))
print(lr_quad.coef_)

* Let's look at the coefficients of our cubic equation

In [None]:
# Coefficients of `lr`, the model fit on the degree-3 polynomial features,
# for comparison with the lr_linear and lr_quad coefficients printed earlier.
print(lr.coef_)

* We know that `lr_quad` is more complex than `lr_linear`
* A more complex model is said to have high variance
* A less complex model is said to have high bias
* The longer you train a model, the lower the bias and the higher the variance
* What do we notice?
* What can we do about our model?

1. Get more data to increase model complexity during training
2. Use n-fold cross-validation
3. Simplify our model (Regularization) by reducing the variance

### What is regularization?
* Introducing a penalty to the cost function
* Use Feature Elimination

### L1 Regularization - Lasso Regression
* Sets the coefficient of certain features to zero
* Acts as a feature selection mechanism
* Add a penalty based on size of coefficients (weights)
* Sum of absolutes - L1 Norm $\sum\limits_{j=1}^{n}{\left|w_{j}\right| } $
* Add a regularization term - $\lambda$
* High $\lambda$ is high bias, low variance
* Low $\lambda$ is low bias, high variance

In [None]:
# Lasso (L1) regression on the degree-3 features with a strong penalty (alpha=0.9).
# `Lasso` is already imported in the notebook's top import cell, so it is not
# re-imported here.

# Transform each split once instead of recomputing it for every call below.
X_train_cubic = pipe.transform(X_train)
X_test_cubic = pipe.transform(X_test)

l_1 = Lasso(alpha=0.9)
l_1.fit(X_train_cubic, y_train)
print('Score: {}'.format(l_1.score(X_test_cubic, y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, l_1.predict(X_test_cubic))))
# Inspect which coefficients the L1 penalty has driven to zero.
print(l_1.coef_)

In [None]:
# Lasso again on the same pipeline output, this time with a weaker penalty (alpha=0.1).
X_train_cubic = pipe.transform(X_train)
X_test_cubic = pipe.transform(X_test)

l_2 = Lasso(alpha=0.1)
l_2.fit(X_train_cubic, y_train)
print('Score: {}'.format(l_2.score(X_test_cubic, y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, l_2.predict(X_test_cubic))))
print(l_2.coef_)

### L2 Regularization - Ridge Regression - Tikhonov Regularization
* Forces the magnitude of coefficients close to zero
* Add a penalty based on size of coefficients (weights)
* Sum of squares - L2 Norm $\sum\limits_{j=1}^{n}{w_{j}^2} $
* Add a regularization term - $\lambda$
* High $\lambda$ is high bias, low variance
* Low $\lambda$ is low bias, high variance

In [None]:
# Ridge (L2) regression on the degree-3 features with alpha=0.9.
# `Ridge` is already imported in the notebook's top import cell, so it is not
# re-imported here.

# Transform each split once instead of recomputing it for every call below.
X_train_cubic = pipe.transform(X_train)
X_test_cubic = pipe.transform(X_test)

r_1 = Ridge(alpha=0.9)
r_1.fit(X_train_cubic, y_train)
print('Score: {}'.format(r_1.score(X_test_cubic, y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, r_1.predict(X_test_cubic))))
# Print the coefficients so their magnitudes can be compared with the other models.
print(r_1.coef_)

In [None]:

# Ridge again on the same pipeline output, this time with a weaker penalty (alpha=0.1).
X_train_cubic = pipe.transform(X_train)
X_test_cubic = pipe.transform(X_test)

r_2 = Ridge(alpha=0.1)
r_2.fit(X_train_cubic, y_train)
print('Score: {}'.format(r_2.score(X_test_cubic, y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, r_2.predict(X_test_cubic))))
print(r_2.coef_)

## Afternoon Session
### Logistic Regression
* Output a probability
* Classes are mutually exclusive, so their probabilities sum to 1
* Given two classes a, b $P(a) = 1 - P(b)$
* Loss Function (Log Loss or Cross Entropy): $-\frac{1}{m} \sum\limits_{i=1}^{m}{\left[ y_i \log(\hat{y}_i) + (1-y_i) \log(1-\hat{y}_i) \right]}$
### More Scikit-Learn

In [None]:
# pandas is already imported as `pd` in the notebook's top import cell.

# Column labels passed to read_csv through `names=`.
_headers = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', names=_headers)
# Summarize column dtypes and non-null counts.
df.info()

In [None]:
# Peek at the last rows of the iris frame.
df.tail()

In [None]:
# Count how many rows carry each class label.
df['class'].value_counts()

### Split Data

In [None]:
# Features: every column except the label, as a NumPy array.
predictors = df.drop(['class'], axis=1).values
# Select the label with single brackets so `.values` is 1-D. The previous
# df[['class']] produced an (n, 1) column vector, which scikit-learn
# estimators warn about when it is passed as y.
labels = df['class'].values

# 50/50 split; random_state pins the shuffle so the scores below are
# reproducible (same seed as the regression split earlier in the notebook).
# `train_test_split` is already imported in the top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    predictors, labels, test_size=0.5, random_state=40
)

### Train a Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default-configured logistic regression on the iris training split.
# The fit call is the cell's last expression, so the estimator is displayed.
log_model = LogisticRegression()
log_model.fit(X=X_train, y=y_train)

### Predict

In [None]:
# Per-class probabilities and hard class predictions for the held-out rows.
y_proba = log_model.predict_proba(X_test)
y_pred = log_model.predict(X_test)

In [None]:
# Put true and predicted labels side by side; reshape(-1) flattens y_test
# so both columns are one-dimensional.
observed_labels = y_test.reshape(-1)
labels_df = pd.DataFrame({'observed': observed_labels, 'predicted': y_pred})
labels_df.head(15)

### Evaluate

In [None]:
# Score the iris model on the held-out split.
iris_test_score = log_model.score(X_test, y_test)
print('Score: {}'.format(iris_test_score))

### Import Cars

In [None]:
# Column labels for the car data: six attributes followed by the target `car`.
_headers = [
    'buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety',
    'car',
]
df_cars = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data',
    names=_headers,
)
df_cars.info()

In [None]:
# Peek at the first rows of the raw car data.
df_cars.head()

In [None]:
# Peek at the last rows of the raw car data.
df_cars.tail()

In [None]:
# First attempt: feed the car attributes to LogisticRegression without encoding them.
predictors = df_cars.drop(['car'], axis=1).values
labels = df_cars['car'].values

X_train, X_test, y_train, y_test = train_test_split(predictors, labels, test_size=0.2)

log_model_2 = LogisticRegression()
# The predictors are still raw category strings, so fit() is expected to
# reject them. Catch and print the error instead of letting the cell fail,
# so "Restart & Run All" reaches the encoding section that follows.
try:
    log_model_2.fit(X_train, y_train)
except ValueError as err:
    print('LogisticRegression could not be fit on the raw features: {}'.format(err))

### Convert categorical variables using `pd.get_dummies()`

In [None]:
?pd.get_dummies()

In [None]:
# One-hot encode `buying`; drop_first=True omits the first category's column.
buying_col = df_cars['buying']
buying_df = pd.get_dummies(buying_col, prefix='buying', drop_first=True)

In [None]:
# List the dummy columns generated for `buying` and their dtypes.
buying_df.info()

In [None]:
# Peek at the first rows of the encoded `buying` columns.
buying_df.head()

In [None]:
?pd.concat

In [None]:
# Append the buying_* dummy columns alongside the existing columns.
# NOTE: re-running this cell appends the same columns again.
df_cars = pd.concat([df_cars, buying_df], axis='columns')

In [None]:
# Check the column list after concatenating the buying_* dummies.
df_cars.info()


In [None]:
# Remove the original string column now that its dummy columns are in place.
# Rebind the name rather than mutating in place with inplace=True.
df_cars = df_cars.drop(columns=['buying'])

In [None]:
# Check the column list after dropping the original `buying` column.
df_cars.info()

In [None]:
# List the distinct `maint` values and how often each occurs.
df_cars['maint'].value_counts()

In [None]:
# One-hot encode `maint`; drop_first=True omits the first category's column.
maint_col = df_cars['maint']
maint_df = pd.get_dummies(maint_col, prefix='maint', drop_first=True)
maint_df.info()

In [None]:
# Append the maint_* dummy columns alongside the existing columns.
# NOTE: re-running this cell appends the same columns again.
df_cars = pd.concat([df_cars, maint_df], axis='columns')
df_cars.info()

In [None]:
# Remove the original string column now that its dummy columns are in place.
# Rebind the name rather than mutating in place with inplace=True.
df_cars = df_cars.drop(columns=['maint'])

In [None]:
# One-hot encode `doors`; drop_first=True omits the first category's column.
doors_col = df_cars['doors']
doors_df = pd.get_dummies(doors_col, prefix='doors', drop_first=True)
doors_df.head()

In [None]:
# One-hot encode `persons`; drop_first=True omits the first category's column.
persons_col = df_cars['persons']
persons_df = pd.get_dummies(persons_col, prefix='persons', drop_first=True)
persons_df.head()

In [None]:

# One-hot encode `lug_boot`; drop_first=True omits the first category's column.
lug_boot_col = df_cars['lug_boot']
lug_boot_df = pd.get_dummies(lug_boot_col, prefix='lug_boot', drop_first=True)
lug_boot_df.head()

In [None]:
# One-hot encode `safety`; drop_first=True omits the first category's column.
safety_col = df_cars['safety']
safety_df = pd.get_dummies(safety_col, prefix='safety', drop_first=True)
safety_df.head()

In [None]:
# Append the remaining four groups of dummy columns in one concat.
# NOTE: re-running this cell appends the same columns again.
encoded_parts = [df_cars, doors_df, persons_df, lug_boot_df, safety_df]
df_cars = pd.concat(encoded_parts, axis='columns')
df_cars.head()

In [None]:
# Remove the four original string columns now that their dummy columns are in place.
# Rebind the name rather than mutating in place with inplace=True.
df_cars = df_cars.drop(columns=['doors', 'persons', 'lug_boot', 'safety'])

In [None]:
# Check the final column list after all original attribute columns were dropped.
df_cars.info()

In [None]:
# Peek at the first rows of the fully encoded frame.
df_cars.head()

### Split our data

In [None]:
# Features: every column except the target `car`, as a NumPy array.
predictors = df_cars.drop(['car'], axis=1).values
labels = df_cars['car'].values

# Hold out 10% for testing; random_state pins the shuffle so the accuracy,
# precision/recall and confusion matrix below are reproducible (same seed as
# the regression split earlier in the notebook).
X_train, X_test, y_train, y_test = train_test_split(
    predictors, labels, test_size=0.1, random_state=40
)

In [None]:
# Refit the car classifier, this time on the current training split.
# The fit call is the cell's last expression, so the estimator is displayed.
log_model_2 = LogisticRegression()
log_model_2.fit(X=X_train, y=y_train)

In [None]:
# Score the car model on the held-out split.
cars_test_score = log_model_2.score(X_test, y_test)
print('Score: {}'.format(cars_test_score))

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Compute accuracy explicitly from the predictions with accuracy_score.
cars_pred = log_model_2.predict(X_test)
print('Accuracy: {}'.format(accuracy_score(y_test, cars_pred)))

In [None]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix

In [None]:
# Per-class precision and recall (average=None returns one value per class).
cars_pred = log_model_2.predict(X_test)
per_class_precision = precision_score(y_test, cars_pred, average=None)
per_class_recall = recall_score(y_test, cars_pred, average=None)
print('Precision: {}'.format(per_class_precision))
print('Recall: {}'.format(per_class_recall))

In [None]:
# Class frequencies of the target, for context when reading the per-class metrics.
df_cars['car'].value_counts()

In [None]:
# Confusion matrix of true vs predicted car classes on the held-out split.
cars_confusion = confusion_matrix(y_test, log_model_2.predict(X_test))
print(cars_confusion)