In [3]:
%matplotlib notebook

import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


np.set_printoptions(precision=2)

fruits = pd.read_table('fruit_data_with_colors.txt')

feature_name_fruits = ['height', 'width', 'mass', 'color_score']

X_fruits =fruits[feature_name_fruits]
y_fruits = fruits['fruit_label']
target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']

X_fruits_2d = fruits[['height', 'width']]
y_fruits_2d = fruits['fruit_label']

X_train, X_test, y_train, y_test = train_test_split(X_fruits, y_fruits, random_state=0)

# feature normalization
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

# combined method for above 2 steps
# X_train_scaled = scaler.fit_transform(X_train)

# we must apply the scaling to the test set that we computed for the training set
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

print('Accuracy of K-NN classifier on training set: {:.2f}'.format(knn.score(X_train_scaled, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'.format(knn.score(X_test_scaled, y_test)))

example_fruit = [[5.5, 2.2, 10, 0.70]]
example_fruit_scaled = scaler.transform(example_fruit)

target_names_fruits[knn.predict(example_fruit_scaled)[0]-1]
# knn.predict(example_fruit_scaled)[0]
print('Predicted fruit type for ', example_fruit, ' is ',  target_names_fruits[knn.predict(example_fruit_scaled)[0]-1])

Accuracy of K-NN classifier on training set: 0.95
Accuracy of K-NN classifier on test set: 1.00
Predicted fruit type for  [[5.5, 2.2, 10, 0.7]]  is  mandarin


# Datasets



In [5]:
from matplotlib.colors import ListedColormap

cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF','#000000'])

In [6]:
# synthetic dataset for simple regression

from sklearn.datasets import make_regression

plt.figure()
plt.title('Sample regresssion problem with one input variable')

X_R1, y_R1 = make_regression(n_samples=100, n_features=1,n_informative=1,bias=150,
                            noise=30, random_state=0)

plt.scatter(X_R1, y_R1, marker='o', s=50)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0xef6c9f0>

In [42]:
# synthetic dataset for more complex regression
from sklearn.datasets import make_friedman1

plt.figure()
plt.title('Complex regresssion problem with one input variable')
X_F1, y_F1 = make_friedman1(n_samples=100, n_features=7, random_state=0)

plt.scatter(X_F1[:,4], y_F1, marker='o', s=50)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x395f5f30>

In [7]:
# synthetic dataset for classification (binary) 
from sklearn.datasets import make_classification

plt.figure()
plt.title('Sample binary classification problem with two informative features')

X_C2, y_C2 = make_classification(n_samples=100, n_features=2, n_redundant=0, n_informative=2,
                                n_clusters_per_class=1, flip_y=0.1,
                                class_sep=0.5, random_state=0)

plt.scatter(X_C2[:,0], X_C2[:,1], c=y_C2, marker='o', s=50, cmap=cmap_bold)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0xbd6bd10>

In [83]:
# more difficult synthetic dataset for classification (binary) 
# with classes that are not linearly separable

from sklearn.datasets import make_blobs

X_D2, y_D2 = make_blobs(n_samples=100, n_features=2, centers=8, cluster_std=1.3, random_state=4)

y_D2 = y_D2 % 2

plt.figure()
plt.title('Sample binary classification problem with non-linearly separable classes')
plt.scatter(X_D2[:,0], X_D2[:,1], c=y_D2,
           marker= 'o', s=50, cmap=cmap_bold)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x3d6a9f50>

In [8]:
# Breast cancer dataset for classification
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

(X_cancer, y_cancer) = load_breast_cancer(return_X_y=True)

In [9]:
from adspy_shared_utilities import load_crime_dataset
# Communities and Crime dataset
(X_crime, y_crime) = load_crime_dataset()

# K-Nearest Neighbors

## Classification

In [10]:
from adspy_shared_utilities import plot_two_class_knn

X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)

plot_two_class_knn(X_train, y_train, n_neighbors=1, weights='uniform', X_test=X_test, y_test=y_test)
plot_two_class_knn(X_train, y_train, n_neighbors=3, weights='uniform', X_test=X_test, y_test=y_test)
plot_two_class_knn(X_train, y_train, n_neighbors=13, weights='uniform', X_test=X_test, y_test=y_test)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Regression

In [11]:
from sklearn.neighbors import KNeighborsRegressor

X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state=0)
knnreg = KNeighborsRegressor(n_neighbors=5).fit(X_train,y_train)

print(knnreg.predict(X_test))
print('R squared test score: {:.3f} '.format(knnreg.score(X_test, y_test)))

[231.71 148.36 150.59 150.59  72.15 166.51 141.91 235.57 208.26 102.1
 191.32 134.5  228.32 148.36 159.17 113.47 144.04 199.23 143.19 166.51
 231.71 208.26 128.02 123.14 141.91]
R squared test score: 0.425 


In [12]:
fig, subaxes = plt.subplots(1,2, figsize=(8,4))

X_predict_input = np.linspace(-3, 3, 50).reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5], y_R1[0::5], random_state=0)

for thisaxis, K in zip(subaxes, [1,3]):
    knnreg = KNeighborsRegressor(n_neighbors=K).fit(X_train, y_train)
    y_predict_output = knnreg.predict(X_predict_input)
    
    thisaxis.set_xlim([-2.5, 0.75])
    thisaxis.plot(X_predict_input, y_predict_output, '^', markersize = 10,
                 label='Predicted', alpha=0.8)
    thisaxis.plot(X_train, y_train, 'o', label='True Value', alpha=0.8)
    thisaxis.set_xlabel('Input feature')
    thisaxis.set_ylabel('Target value')
    thisaxis.set_title('KNN regression (K={})'.format(K))
    thisaxis.legend()
    
plt.tight_layout()

<IPython.core.display.Javascript object>

## Regression model complexity as a function of K


In [13]:
# plot k-NN regression on sample dataset for different values of K

fig, subaxes = plt.subplots(5,1, figsize=(5,20))

X_predict_input = np.linspace(-3, 3,500).reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state=0)

for thisaxis, K in zip(subaxes, [1,3,7,15, 55]):
    knnreg = KNeighborsRegressor(n_neighbors=K).fit(X_train, y_train)
    
    y_predict_output = knnreg.predict(X_predict_input)
    train_score = knnreg.score(X_train, y_train)
    test_score = knnreg.score(X_test, y_test)
    
    thisaxis.plot(X_predict_input, y_predict_output)
    thisaxis.plot(X_train, y_train, 'o', alpha=0.9, label='Train')
    thisaxis.plot(X_test, y_test, '^', alpha=0.9, label='Test')
    thisaxis.set_xlabel('Input feature')
    thisaxis.set_ylabel('Target value')
    
    thisaxis.set_title('KNN Regression (K={}) \n\
    Train $R^2 = {:.3f}$, Test $R^2 = {:.3f}$'.format(K, train_score, test_score))
    thisaxis.legend()
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

<IPython.core.display.Javascript object>

## Linear Regression

Least square Linear Regression / Ordinary Linear Regression

- Linear Model gives potentially stable but inaccurate predictions. Generalizes well
- KNN model gives potentially unstable but accurate predictions. Changes based on input data change

In [15]:
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1)

linearreg = LinearRegression().fit(X_train, y_train)

print('Linear model coeff (w) : {}'.format(linearreg.coef_))
print('Linear model interceptor (b) : {:.3f}'.format(linearreg.intercept_))

print('R squared training score : {:.3f}', linearreg.score(X_train, y_train))
print('R squared test score : {:.3f}', linearreg.score(X_test, y_test))



Linear model coeff (w) : [41.2]
Linear model interceptor (b) : 147.207
R squared training score : {:.3f} 0.6513477258602801
R squared test score : {:.3f} 0.6257745846508096


## Linear Regression Example Plot

In [16]:
plt.figure(figsize=(5,4))

plt.scatter(X_R1, y_R1,marker='o', s=50, alpha=0.8)
plt.plot(X_R1, linearreg.coef_ * X_R1 + linearreg.intercept_, 'r-')

plt.title('Least-squares linear regression')
plt.xlabel('Feature value (x)')
plt.ylabel('Target value (y)')

<IPython.core.display.Javascript object>

Text(0,0.5,'Target value (y)')

Linear Regression on Crime data

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, random_state=0)

linreg = LinearRegression().fit(X_train, y_train)

print('Crime Dataset')
print('Linear model intercept : {}'.format(linreg.intercept_))
print('Linear model coeff :\n {}'.format(linreg.coef_))

print('R squared score (training) : {:.3f}'.format(linreg.score(X_train, y_train)))
print('R squared score (test) : {:.3f}'.format(linreg.score(X_test, y_test)))


Crime Dataset
Linear model intercept : 12521.28620735786
Linear model coeff :
 [ 1.62e-03 -1.13e+02  2.13e+01 -3.27e+01 -4.17e+00 -9.50e+00 -2.52e-03
  1.46e+00 -1.51e-02 -1.49e+01  7.63e+01 -1.03e+01 -4.24e+00 -1.15e+01
  2.09e+00  6.13e-03  1.34e-02  5.32e-03 -6.78e+00 -1.52e+01  4.43e+00
  7.34e+00  2.22e+00  1.75e+01 -1.58e+00  1.38e+00  6.28e+00 -1.36e+00
  1.74e+02  1.40e+01  1.39e+02 -3.17e+02 -1.28e+02  7.06e+00 -3.66e+01
  1.99e+00  1.75e+00  1.31e+00 -1.32e+01 -5.14e-03  4.32e+01 -1.16e-03
 -4.27e+00  9.95e+00 -5.65e+00  2.74e-01 -3.17e+01 -4.88e+01  3.60e+01
 -1.99e+01  1.92e+00 -2.19e+01  4.33e+01 -7.01e+01  6.58e+02  8.76e+01
 -3.38e+02 -3.36e+01  3.30e+01  1.01e+01  7.98e+01  2.02e-02 -2.70e-01
  4.06e+01  9.85e+00 -1.63e+00 -4.82e+00 -3.79e+00 -2.93e+01 -7.88e+10
 -1.59e-03  7.88e+10 -7.88e+10  9.94e+07 -5.07e-01 -9.94e+07  9.94e+07
  1.49e+00 -7.10e+00 -9.35e-01 -3.74e+01  5.20e-02  3.67e-01  2.53e+01
  1.40e+00 -4.82e+00 -4.60e+00  4.39e+00]
R squared score (training) 

## Ridge Regression

- #### In general, `regularization` works well especially when you have relatively small number of training data compared to the number of features

- #### `Regularization` becomes less important as the amount training data you have increases

- #### Lasso Regression is another form of regularized linear regression that uses an `L1 Regularization` penalty for training (instead of ridge's L2 penalty')

- #### `L1 penalty` has the effect of setting parameter weights in `w` to zero for the lease influential variables. This is calles _sparse_ solution

- #### The parameter $\alpha$ controls amount of L1 regularization (default = 1.0)

### When to use Lasso vs Ridge regression
>- Many small/medium sized effects : use Ridge
>- Only a few variables with medium/large effect: use Lasso

## Polynomial features with Linear Regression

- $x = (x_0, x_1)$ => $x = (x_0, x_1, x_0^2, x_0x_1, x_1^2)$
- Generate new features consisting of all polynomial combinations of the original 2 features ($x_0$, $x_1$)
- The _degree of polynomial_ specifies how many variables participate at a time in each new feature 

> Beware of polynomial feature exapansion with high degree , as this can lead to complex models that *overfit*

> Addition of many polynomial features often leads to overfitting , so we often use polynomial features in combination with regression that has a regularization penalty, like ridge regression

#### Ridge Regression

In [25]:
from sklearn.linear_model import Ridge

X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, random_state=0)

linRidge = Ridge(alpha = 20.0).fit(X_train, y_train)

print('Crime dataset')
print('Ridge regresssion linear model intercept : {}'.format(linRidge.intercept_))
print('Ridge regresssion linear model coef : {}'.format(linRidge.coef_))
print('R-squared training score : {:.3f}'.format(linRidge.score(X_train, y_train)))
print('R-squared test score : {:.3f}'.format(linRidge.score(X_test, y_test)))

Crime dataset
Ridge regresssion linear model intercept : -3352.4230358462028
Ridge regresssion linear model coef : [ 1.95e-03  2.19e+01  9.56e+00 -3.59e+01  6.36e+00 -1.97e+01 -2.81e-03
  1.66e+00 -6.61e-03 -6.95e+00  1.72e+01 -5.63e+00  8.84e+00  6.79e-01
 -7.34e+00  6.70e-03  9.79e-04  5.01e-03 -4.90e+00 -1.79e+01  9.18e+00
 -1.24e+00  1.22e+00  1.03e+01 -3.78e+00 -3.73e+00  4.75e+00  8.43e+00
  3.09e+01  1.19e+01 -2.05e+00 -3.82e+01  1.85e+01  1.53e+00 -2.20e+01
  2.46e+00  3.29e-01  4.02e+00 -1.13e+01 -4.70e-03  4.27e+01 -1.23e-03
  1.41e+00  9.35e-01 -3.00e+00  1.12e+00 -1.82e+01 -1.55e+01  2.42e+01
 -1.32e+01 -4.20e-01 -3.60e+01  1.30e+01 -2.81e+01  4.39e+01  3.87e+01
 -6.46e+01 -1.64e+01  2.90e+01  4.15e+00  5.34e+01  1.99e-02 -5.47e-01
  1.24e+01  1.04e+01 -1.57e+00  3.16e+00  8.78e+00 -2.95e+01 -2.32e-04
  3.14e-04 -4.14e-04 -1.79e-04 -5.74e-01 -5.18e-01 -4.21e-01  1.53e-01
  1.33e+00  3.85e+00  3.03e+00 -3.78e+01  1.38e-01  3.08e-01  1.57e+01
  3.31e-01  3.36e+00  1.61e-01 -2

#### Ridge regression with feature normalization

In [28]:
from sklearn.linear_model import Ridge

X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, random_state=0)

from sklearn.preprocessing import MinMaxScaler

minmax_scaler = MinMaxScaler()

X_train_scaled = minmax_scaler.fit_transform(X_train)
X_test_scaled = minmax_scaler.transform(X_test)

linRidge = Ridge(alpha = 20.0).fit(X_train_scaled, y_train)

print('Crime dataset')
print('Ridge regresssion linear model intercept : {}'.format(linRidge.intercept_))
print('Ridge regresssion linear model coef : {}'.format(linRidge.coef_))
print('R-squared training score : {:.3f}'.format(linRidge.score(X_train_scaled, y_train)))
print('R-squared test score : {:.3f}'.format(linRidge.score(X_test_scaled, y_test)))

print('Number of non-zero features: {}'.format(np.sum(linRidge.coef_ != 0)))

Crime dataset
Ridge regresssion linear model intercept : 933.3906385044124
Ridge regresssion linear model coef : [  88.69   16.49  -50.3   -82.91  -65.9    -2.28   87.74  150.95   18.88
  -31.06  -43.14 -189.44   -4.53  107.98  -76.53    2.86   34.95   90.14
   52.46  -62.11  115.02    2.67    6.94   -5.67 -101.55  -36.91   -8.71
   29.12  171.26   99.37   75.07  123.64   95.24 -330.61 -442.3  -284.5
 -258.37   17.66 -101.71  110.65  523.14   24.82    4.87  -30.47   -3.52
   50.58   10.85   18.28   44.11   58.34   67.09  -57.94  116.14   53.81
   49.02   -7.62   55.14  -52.09  123.39   77.13   45.5   184.91  -91.36
    1.08  234.09   10.39   94.72  167.92  -25.14   -1.18   14.6    36.77
   53.2   -78.86   -5.9    26.05  115.15   68.74   68.29   16.53  -97.91
  205.2    75.97   61.38  -79.83   67.27   95.67  -11.88]
R-squared training score : 0.615
R-squared test score : 0.599
Number of non-zero features: 88


### Ridge regression with $\alpha$ regularization parameter

In [35]:
print(r'Ridge regression : effect of $\alpha$ regularization parameter')

for alpha in [0, 1, 10, 20, 50, 100, 1000]:
    
    linRidge = Ridge(alpha = alpha).fit(X_train_scaled, y_train)
    r2_train = linRidge.score(X_train_scaled, y_train)
    r2_test = linRidge.score(X_test_scaled, y_test)
    
    num_coeff_bigger = np.sum(abs(linRidge.coef_) > 1.0)
    
    print('Alpha : {}\n number of coef bigger : {} \n R-squared training : {:.2f} \n R-squared test: {:.2f}'.
         format(alpha, num_coeff_bigger, r2_train, r2_test))

Ridge regression : effect of $\alpha$ regularization parameter
Alpha : 0
 number of coef bigger : 88 
 R-squared training : 0.67 
 R-squared test: 0.49
Alpha : 1
 number of coef bigger : 87 
 R-squared training : 0.66 
 R-squared test: 0.56
Alpha : 10
 number of coef bigger : 87 
 R-squared training : 0.63 
 R-squared test: 0.59
Alpha : 20
 number of coef bigger : 88 
 R-squared training : 0.61 
 R-squared test: 0.60
Alpha : 50
 number of coef bigger : 86 
 R-squared training : 0.58 
 R-squared test: 0.58
Alpha : 100
 number of coef bigger : 87 
 R-squared training : 0.55 
 R-squared test: 0.55
Alpha : 1000
 number of coef bigger : 84 
 R-squared training : 0.31 
 R-squared test: 0.30


### Lasso regression

In [38]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime)

minmax_scaler = MinMaxScaler()
X_train_scaled =  minmax_scaler.fit_transform(X_train)
X_test_scaled = minmax_scaler.transform(X_test)

linlasso = Lasso(alpha=2.0, max_iter=10000).fit(X_train_scaled, y_train)

print('Crime dataset')
print('lasso regression intercept : {}'.format(linlasso.intercept_))
print('lasso regression coef : {}'.format(linlasso.coef_))
print('Non zero features : {}'.format(np.sum(linlasso.coef_ !=0)))

print('R-squared training score : {:.3f}'.format(linlasso.score(X_train_scaled, y_train)))
print('R-squared test score : {:.3f}'.format(linlasso.score(X_test_scaled, y_test)))

print('features with non-zero weight (sorted by absolute magnitude):')

for e in sorted (list( zip( list(X_crime), linlasso.coef_)), key=lambda e: -abs(e[1])):

    if e[1] != 0:
        print('\t {}, {:.3f}'.format(e[0], e[1]))


Crime dataset
lasso regression intercept : 1189.2636661408008
lasso regression coef : [    0.       0.      -0.    -104.61    -0.      -0.       0.     137.05
     0.      -0.       0.    -110.54    -0.       0.      -0.       0.
     0.       0.       0.      -0.       0.       0.       0.       0.
  -159.72    -0.      -0.       0.     333.43    -0.       0.       0.
     0.      -0.   -1086.02    -0.      -0.      -0.    -227.21     0.
  1619.59     0.       0.       0.       0.       0.       0.       0.
     0.       0.      -0.       0.      34.16     0.       0.       0.
     0.       0.       0.       0.       0.       0.    -212.95     0.
   239.34     0.      93.89    34.83     0.       0.       0.      31.18
     0.      -0.       0.       0.     100.26     0.       0.       0.
  -132.78     0.       0.     299.72    -0.       0.      35.13     0.  ]
Non zero features : 18
R-squared training score : 0.629
R-squared test score : 0.627
features with non-zero weight (sorted by 

### Lasso regression with regularization parameter $\alpha$

In [41]:
print('Effect of regularization parameter alpha \n\
on number of features kept in final model\n')

for alpha in [0.5, 1, 2, 3, 5, 10, 20,50]:
    linlass = Lasso(alpha=alpha, max_iter=10000).fit(X_train_scaled,y_train)
    r2_train = linlass.score(X_train_scaled, y_train)
    r2_test = linlass.score(X_test_scaled, y_test)
    
    print('Alpha : {} \n Features kept : {} \n r-squared training:{:.2f} \n r-squared test : {:.2f}'
         .format(alpha, np.sum(linlass.coef_ !=0), r2_train, r2_test))
    

Effect of regularization parameter alpha 
on number of features kept in final model

Alpha : 0.5 
 Features kept : 29 
 r-squared training:0.65 
 r-squared test : 0.64
Alpha : 1 
 Features kept : 22 
 r-squared training:0.64 
 r-squared test : 0.63
Alpha : 2 
 Features kept : 18 
 r-squared training:0.63 
 r-squared test : 0.63
Alpha : 3 
 Features kept : 16 
 r-squared training:0.62 
 r-squared test : 0.62
Alpha : 5 
 Features kept : 12 
 r-squared training:0.60 
 r-squared test : 0.61
Alpha : 10 
 Features kept : 8 
 r-squared training:0.57 
 r-squared test : 0.57
Alpha : 20 
 Features kept : 2 
 r-squared training:0.51 
 r-squared test : 0.51
Alpha : 50 
 Features kept : 1 
 r-squared training:0.32 
 r-squared test : 0.31


### Polynomial Regression 

In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures

X_train, X_test, y_train, y_test = train_test_split(X_F1, y_F1, random_state = 0)

linreg = LinearRegression().fit(X_train, y_train)

print('linear model coef : {}'.format(linreg.coef_))
print('linear model intercept : {}'.format(linreg.intercept_))
print('R-squared score (training): {:.3f}'.format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(linreg.score(X_test, y_test)))

print('\nNow we transform the original input data to add \n\
polynomial features upto degree 2 (quadratic) \n')

poly = PolynomialFeatures(degree=2)

X_F1_poly = poly.fit_transform(X_F1)

X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1, random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)

print('(poly degree 2) linear model coef : {}'.format(linreg.coef_))
print('(poly degree 2) linear model intercept : {}'.format(linreg.intercept_))
print('(poly degree 2) R-squared score (training): {:.3f}'.format(linreg.score(X_train, y_train)))
print('(poly degree 2) R-squared score (test): {:.3f}'.format(linreg.score(X_test, y_test)))

print('\nAddition of many polynomial features often leads to\n\
overfitting, so we often use polynomial features in combination\n\
with regression that has a regularization penalty, like ridge\n\
regression.\n')

X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1, random_state = 0)
linridge = Ridge().fit(X_train, y_train)

print('(poly degree 2 + ridge) linear model coef : {}'.format(linridge.coef_))
print('(poly degree 2 + ridge) linear model intercept : {}'.format(linridge.intercept_))
print('(poly degree 2 + ridge) R-squared score (training): {:.3f}'.format(linridge.score(X_train, y_train)))
print('(poly degree 2 + ridge) R-squared score (test): {:.3f}'.format(linridge.score(X_test, y_test)))



# X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1, random_state = 0)
# linlasso = Lasso().fit(X_train, y_train)

# print('(poly degree 2 + lasso) linear model coef : {}'.format(linlasso.coef_))
# print('(poly degree 2 + lasso) linear model intercept : {}'.format(linlasso.intercept_))
# print('(poly degree 2 + lasso) R-squared score (training): {:.3f}'.format(linlasso.score(X_train, y_train)))
# print('(poly degree 2 + lasso) R-squared score (test): {:.3f}'.format(linlasso.score(X_test, y_test)))

linear model coef : [ 4.42  6.    0.53 10.24  6.55 -2.02 -0.32]
linear model intercept : 1.542509197537301
R-squared score (training): 0.722
R-squared score (test): 0.722

Now we transform the original input data to add 
polynomial features upto degree 2 (quadratic) 

(poly degree 2) linear model coef : [ 3.41e-12  1.66e+01  2.67e+01 -2.21e+01  1.24e+01  6.93e+00  1.05e+00
  3.71e+00 -1.34e+01 -5.73e+00  1.62e+00  3.66e+00  5.05e+00 -1.46e+00
  1.95e+00 -1.51e+01  4.87e+00 -2.97e+00 -7.78e+00  5.15e+00 -4.65e+00
  1.84e+01 -2.22e+00  2.17e+00 -1.28e+00  1.88e+00  1.53e-01  5.62e-01
 -8.92e-01 -2.18e+00  1.38e+00 -4.90e+00 -2.24e+00  1.38e+00 -5.52e-01
 -1.09e+00]
(poly degree 2) linear model intercept : -3.2056743989291814
(poly degree 2) R-squared score (training): 0.969
(poly degree 2) R-squared score (test): 0.805

Addition of many polynomial features often leads to
overfitting, so we often use polynomial features in combination
with regression that has a regularization penalty, lik

# Linear Models for Classification

### Logistic regression : Regularization

- L2 regularization is on by default (like ridge regression)
- Parameter C controls the amount of regularization (default 1.0)
- As with the regularized linear regression, it can be important to normalize all features so they are on same scale


>Both for SVM and Logistic Regression 
- higher values of C corresponds to less regularization. 
- With large values of C, logistic regression tries to fit the training data as well as possible
- While with small values of C, the model tries harder to find the coefficients that are closer to 0, even if that model fits the training data a little bit worse
    - 

#### Logistic Regression for bainary classification on fruits datasets using height, width features( +ve: Apple, -ve:other)

In [56]:
from sklearn.linear_model import LogisticRegression
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

fig, subaxes = plt.subplots(1,1, figsize=(7,5))

y_fruits_apple = y_fruits_2d ==1  # make into a binary problem, apple vs everything

X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d.as_matrix(), y_fruits_apple.as_matrix(), random_state=0)

clf = LogisticRegression(C=100).fit(X_train, y_train)

plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None,
                                         None, 'Logistic regression \
for binary classification\nFruit dataset: Apple vs others',
                                         subaxes)

h = 6
w = 8
print('A fruits with height {} and width {} is predicted to be : {}'.format(h,w, ['not an apple', 'apple'][clf.predict([[h,w]])[0]]))

h = 10
w = 7
print('A fruit with height {} and width {} is predicted to be: {}'.format(h,w, ['not an apple', 'an apple'][clf.predict([[h,w]])[0]]))

subaxes.set_xlabel('height')
subaxes.set_ylabel('width')

print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

<IPython.core.display.Javascript object>

A fruits with height 6 and width 8 is predicted to be : apple
A fruit with height 10 and width 7 is predicted to be: not an apple
Accuracy of Logistic regression classifier on training set: 0.77
Accuracy of Logistic regression classifier on test set: 0.73




### Logistic Regression on simple syntethic dataset

In [60]:
from sklearn.linear_model import LogisticRegression
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)

clf = LogisticRegression().fit(X_train, y_train)
fig, subaxes = plt.subplots(1,1, figsize=(7,5))

title = 'Logistic regression, simple synthetic dataset C = {:.3f}'.format(1.0)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None, None, title, subaxes)

print('Accuracy of Logistic regression classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))


<IPython.core.display.Javascript object>

Accuracy of Logistic regression classifier on training set: 0.80
Accuracy of Logistic regression classifier on test set: 0.80


#### Logistic regression regularization: C parameter

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d.as_matrix(), y_fruits_apple.as_matrix(), random_state=0)

fig, subaxes = plt.subplots(3,1, figsize=(4,10))

for this_C, subplot in zip([0.1, 1, 100], subaxes):
    clf = LogisticRegression(C=this_C).fit(X_train, y_train)
    title ='Logistic regression (apple vs rest), C = {:.3f}'.format(this_C)
    plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                             X_test, y_test, title,
                                             subplot)
plt.tight_layout()

<IPython.core.display.Javascript object>

### Application to real dataset

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state=0)

clf = LogisticRegression().fit(X_train, y_train)

print('Breast Cancer Dataset')
print("logistic regression coef : {}".format(clf.coef_))
print('logistic regression intercept : {}'.format(clf.intercept_))
print('Accuracy of Logisitic Regression on training set : {:.3f}'.format(clf.score(X_train, y_train)))
print('Accuracy of Logisitic Regression on test set : {:.3f}'.format(clf.score(X_test, y_test)))

Breast Cancer Dataset
logistic regression coef : [[ 1.72  0.09  0.11 -0.01 -0.13 -0.33 -0.5  -0.27 -0.27 -0.02  0.04  0.99
   0.12 -0.11 -0.01  0.01 -0.03 -0.03 -0.03  0.01  1.36 -0.29 -0.25 -0.02
  -0.22 -1.03 -1.45 -0.53 -0.65 -0.11]]
logistic regression intercept : [0.35]
Accuracy of Logisitic Regression on training set : 0.960
Accuracy of Logisitic Regression on test set : 0.958


# Support Vector Machine

#### Classifier Margin

Defined as the maximum width the decision boundary area can be increased before hitting a data point

Maximum Margin Linear Classifier : Linear Support Vector Machines

The linear classifier with maximum margin is linear  Support Vectory Machine (LSVM)

### Regularization for SVMs : the C parameter

- The strength of regularization is determined by C
- Larger the value of C: less regularization
    - Fit the training data as well as possible
    - Each individual data point is important to classify correctly
- Smaller the value of C: more regularization
    - more tolerant on individual data points

## Linear Models : Pros and Cons

Pros:
- Simple and easy to train
- Fast prediction
- Scales well to very large datasets
- Works well with sparse data
- Reasons for prediction are easy to interpret


Cons:
- For lower dimensional data, other models may have superior generalization performance
- For Classification, data may not be linearly separable

In [66]:
from sklearn.svm import SVC
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state = 0)
fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))

this_C = 1.0

clf = SVC(kernel='linear', C=this_C).fit(X_train, y_train)
title = 'Linear SVC , C={:.3f}'.format(this_C)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None, None, title, subaxes)

<IPython.core.display.Javascript object>

In [73]:
from sklearn.svm import LinearSVC
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state = 0)
fig, subaxes = plt.subplots(1, 2, figsize=(8, 4))

for this_C, subplot in zip([0.00001, 100], subaxes):
    clf = LinearSVC(C=this_C).fit(X_train, y_train)
    title = 'Linear SVC , C={:.5f}'.format(this_C)
    plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None, None, title, subplot)
    
plt.tight_layout()

<IPython.core.display.Javascript object>

### Application to real dataset

In [74]:
from sklearn.svm import LinearSVC

X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)

clf = LinearSVC().fit(X_train, y_train)
print('Breast cancer dataset')
print('Accuracy of Linear SVC classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Linear SVC classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Breast cancer dataset
Accuracy of Linear SVC classifier on training set: 0.78
Accuracy of Linear SVC classifier on test set: 0.80


## Multiclass Classification

#### LinearSVC with M classes generates M one vs rest classifiers.

In [78]:
from sklearn.svm import LinearSVC

X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d, y_fruits_2d, random_state = 0)

clf = LinearSVC(C=5, random_state = 67).fit(X_train, y_train)
print('Coefficients:\n', clf.coef_)
print('Intercepts:\n', clf.intercept_)

Coefficients:
 [[-0.23  0.72]
 [-1.63  1.15]
 [ 0.08  0.31]
 [ 1.26 -1.68]]
Intercepts:
 [-3.32  1.2  -2.75  1.16]


#### Multi-class results on the fruit dataset

In [81]:
plt.figure(figsize=(6,6))
colors = ['r', 'g', 'b', 'y']
cmap_fruits = ListedColormap(['#FF0000', '#00FF00', '#0000FF','#FFFF00'])

plt.scatter(X_fruits_2d[['height']], X_fruits_2d[['width']],
           c=y_fruits_2d, cmap=cmap_fruits, edgecolor = 'black', alpha=.7)

x_0_range = np.linspace(-10, 15)

for w, b, color in zip(clf.coef_, clf.intercept_, ['r', 'g', 'b', 'y']):
    # Since class prediction with a linear model uses the formula y = w_0 x_0 + w_1 x_1 + b, 
    # and the decision boundary is defined as being all points with y = 0, to plot x_1 as a 
    # function of x_0 we just solve w_0 x_0 + w_1 x_1 + b = 0 for x_1:
    plt.plot(x_0_range, -(x_0_range * w[0] + b) / w[1], c=color, alpha=.8)
    
plt.legend(target_names_fruits)
plt.xlabel('height')
plt.ylabel('width')
plt.xlim(-2, 12)
plt.ylim(-2, 15)
plt.show()

<IPython.core.display.Javascript object>

ValueError: c of shape (59,) not acceptable as a color sequence for x with size 59, y with size 59

# Kernalized Support Vector Machine

## Kernalized Support Vector Machine : Pros and Cons

**Pros:**
- can perform well on range of dataset
- Versatile: different kernel function can be specified or custom kernel function can be defined for specific data type
- Works well for low and high dimensional data (including data with hundres, thousands or millions of dimensions, works well with text classification)
    
**Cons:**
- Efficiency(runtime speed and memory usage) decreases as training set size increases (e.g. over 50,000 samples)
- Need careful normalization of input data and parameter tuning
- Does not provide direct probability estimates (but can be estimated using e.g. Platt scaling)
- Difficult to interpret why prediction is made

### Model Complexity

- `kernel` : type of kernel function to be used
    - Default = `rbf` for radial basis function
    - other types include `polynomial`
- kernel parameters
    - `gamma` ( $\gamma$ ) : RBF kernel width
- `C`: regularization parameter
- Typically `C` and `gamma` are tuned at the same time

### Classification

In [84]:
from sklearn.svm import SVC
from adspy_shared_utilities import plot_class_regions_for_classifier

X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)

# The default SVC kernel is radial basis function (RBF)
plot_class_regions_for_classifier(SVC().fit(X_train, y_train),
                                 X_train, y_train, None, None,
                                 'Support Vector Classifier: RBF kernel')

# Compare decision boundries with polynomial kernel, degree = 3
plot_class_regions_for_classifier(SVC(kernel = 'poly', degree = 3)
                                 .fit(X_train, y_train), X_train,
                                 y_train, None, None,
                                 'Support Vector Classifier: Polynomial kernel, degree = 3')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Support Vector Machine with RBF kernel: gamma parameter

In [86]:
from adspy_shared_utilities import plot_class_regions_for_classifier

X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)
fig, subaxes = plt.subplots(3, 1, figsize=(4, 11))

for this_gamma, subplot in zip([0.01, 1.0, 10.0], subaxes):
    clf = SVC(kernel = 'rbf', gamma=this_gamma).fit(X_train, y_train)
    title = 'Support Vector Classifier: \nRBF kernel, gamma = {:.2f}'.format(this_gamma)
    plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                             None, None, title, subplot)
    plt.tight_layout()

<IPython.core.display.Javascript object>

### Support Vector Machine with RBF kernel: using both C and gamma parameter

In [87]:
from sklearn.svm import SVC
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)
fig, subaxes = plt.subplots(3, 4, figsize=(15, 10), dpi=50)

for this_gamma, this_axis in zip([0.01, 1, 5], subaxes):
    
    for this_C, subplot in zip([0.1, 1, 15, 250], this_axis):
        title = 'gamma = {:.2f}, C = {:.2f}'.format(this_gamma, this_C)
        clf = SVC(kernel = 'rbf', gamma = this_gamma,
                 C = this_C).fit(X_train, y_train)
        plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                                 X_test, y_test, title,
                                                 subplot)
        plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

<IPython.core.display.Javascript object>

# Cross Validation

In [88]:
from sklearn.model_selection import cross_val_score

clf = KNeighborsClassifier(n_neighbors = 5)
X = X_fruits_2d.as_matrix()
y = y_fruits_2d.as_matrix()

cv_scores = cross_val_score(clf, X, y)

print('Cross validation scores (3-fold) : ', cv_scores)
print('Mean cross-validation score (3-fold): {:.3f}'.format(np.mean(cv_scores)))


Cross validation scores (3-fold) :  [0.77 0.74 0.83]
Mean cross-validation score (3-fold): 0.781


### A note on performing cross-validation for more advanced scenarios.

In some cases (e.g. when feature values have very different ranges), we've seen the need to scale or normalize the training and test sets before use with a classifier. The proper way to do cross-validation when you need to scale the data is *not* to scale the entire dataset with a single transform, since this will indirectly leak information into the training data about the whole dataset, including the test data (see the lecture on data leakage later in the course).  Instead, scaling/normalizing must be computed and applied for each cross-validation fold separately.  To do this, the easiest way in scikit-learn is to use *pipelines*.  While these are beyond the scope of this course, further information is available in the scikit-learn documentation here:

http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

or the Pipeline section in the recommended textbook: Introduction to Machine Learning with Python by Andreas C. Müller and Sarah

In [90]:
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve

param_range = np.logspace(-3,3,4)

train_scores, test_scores = validation_curve(SVC(), X, y, param_name='gamma',
                                            param_range=param_range, cv=3)
print(train_scores)
print(test_scores)

[[0.49 0.42 0.41]
 [0.84 0.72 0.76]
 [0.92 0.9  0.93]
 [1.   1.   0.98]]
[[0.45 0.32 0.33]
 [0.82 0.68 0.61]
 [0.41 0.84 0.67]
 [0.36 0.21 0.39]]


In [91]:
# This code based on scikit-learn validation_plot example
#  See:  http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html
plt.figure()

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title('Validation Curve with SVM')
plt.xlabel('$\gamma$ (gamma)')
plt.ylabel('Score')
plt.ylim(0.0, 1.1)
lw = 2

plt.semilogx(param_range, train_scores_mean, label='Training score',
            color='darkorange', lw=lw)

plt.fill_between(param_range, train_scores_mean - train_scores_std,
                train_scores_mean + train_scores_std, alpha=0.2,
                color='darkorange', lw=lw)

plt.semilogx(param_range, test_scores_mean, label='Cross-validation score',
            color='navy', lw=lw)

plt.fill_between(param_range, test_scores_mean - test_scores_std,
                test_scores_mean + test_scores_std, alpha=0.2,
                color='navy', lw=lw)

plt.legend(loc='best')
plt.show()



<IPython.core.display.Javascript object>

# Decision Trees

**Pros**
- Easily visualized and interpreted
- No feature normalization or scaling typically needed
- Works well with datasets using a mixture of feature types(continuours, categorical, binary)

**Cons**
- Even after tuning, decision trees can often still overfit
- Usually need an ensemble of trees for better generalization performance

## DecisionTreeClassifier Key Parameters

- `max_depth` : controls maximum depts (number of split points). Most common way to reduce tree complexity and overfitting

- `min_samples_leaf`: threshold for the minimum # of data instances a leaf can have to avoid further splitting

- `max_leaf_nodes`: limits total number of leaves in the tree

- In practice, adjusting only one of these (e.g. `max_depth`) is enough to reduce overfitting


In [95]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from sklearn.model_selection import train_test_split

iris = load_iris()

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=3)
clf = DecisionTreeClassifier().fit(X_train, y_train)

print('Accuracy of DecisionTree classifier on training set : {:.3f} '.format(clf.score(X_train, y_train)))
print('Accuracy of DecisionTree classifier on test set : {:.3f} '.format(clf.score(X_test, y_test)))

Accuracy of DecisionTree classifier on training set : 1.000 
Accuracy of DecisionTree classifier on test set : 0.947 


#### Setting max decision tree depth to help avoid overfitting

In [96]:
clf2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)

print('Accuracy of DecisionTree classifier on training set : {:.3f} '.format(clf2.score(X_train, y_train)))
print('Accuracy of DecisionTree classifier on test set : {:.3f} '.format(clf2.score(X_test, y_test)))

Accuracy of DecisionTree classifier on training set : 0.982 
Accuracy of DecisionTree classifier on test set : 0.974 


#### Visualizing decision trees

In [98]:
# plot_decision_tree(clf, iris.feature_names, iris.target_names)

#### Pre-pruned version (max_depth = 3)

In [100]:
# plot_decision_tree(clf2, iris.feature_names, iris.target_names)

#### Feature importance

In [101]:
from adspy_shared_utilities import plot_feature_importances

plt.figure(figsize=(10,4), dpi=80)
plot_feature_importances(clf, iris.feature_names)
plt.show()

print('Feature importances: {}'.format(clf.feature_importances_))



<IPython.core.display.Javascript object>

Feature importances: [0.   0.04 0.06 0.9 ]


In [102]:
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 0)
fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))

pair_list = [[0,1], [0,2], [0,3], [1,2], [1,3], [2,3]]
tree_max_depth = 4

for pair, axis in zip(pair_list, subaxes):
    X = X_train[:, pair]
    y = y_train
    
    clf = DecisionTreeClassifier(max_depth=tree_max_depth).fit(X, y)
    title = 'Decision Tree, max_depth = {:d}'.format(tree_max_depth)
    plot_class_regions_for_classifier_subplot(clf, X, y, None,
                                             None, title, axis,
                                             iris.target_names)
    
    axis.set_xlabel(iris.feature_names[pair[0]])
    axis.set_ylabel(iris.feature_names[pair[1]])
    
plt.tight_layout()
plt.show()



<IPython.core.display.Javascript object>

### Decision Trees on a real-world dataset

In [104]:
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from adspy_shared_utilities import plot_feature_importances

X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)

clf = DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 8,
                            random_state = 0).fit(X_train, y_train)

# plot_decision_tree(clf, cancer.feature_names, cancer.target_names)

In [105]:
print('Breast cancer dataset: decision tree')
print('Accuracy of DT classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of DT classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

plt.figure(figsize=(10,6),dpi=80)
plot_feature_importances(clf, cancer.feature_names)
plt.tight_layout()

plt.show()

Breast cancer dataset: decision tree
Accuracy of DT classifier on training set: 0.96
Accuracy of DT classifier on test set: 0.94




<IPython.core.display.Javascript object>