# AMMI Day 5

## Morning Session
### Bias, Variance, Regularization

In [0]:
import pandas as pd
import numpy as np
import math

from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# feature selection
from sklearn.feature_selection import RFECV

# pipeline
from sklearn.pipeline import Pipeline

# preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

# metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

# LinearRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# Neighbors
from sklearn.neighbors import KNeighborsRegressor

# Clustering
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# t-SNE
from sklearn.manifold import TSNE

# PCA
from sklearn.decomposition import PCA

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
train_df = pd.read_csv('/content/gdrive/My Drive/boston/train.csv', index_col='ID')
test_df = pd.read_csv('/content/gdrive/My Drive/boston/test.csv', index_col='ID')

Let's over-engineer a Linear Regression Model

In [0]:
predictors = train_df[['nox', 'rm', 'chas', 'dis', 'ptratio', 'lstat', 'rad']]
target = train_df['medv']
steps = [
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(3))
]
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.3, random_state=40)
columns = predictors.columns
pipe = Pipeline(steps)
pipe.fit(X_train, y_train)
lr = LinearRegression()
lr.fit(pipe.transform(X_train), y_train)
print('Score: {}'.format(lr.score(pipe.transform(X_test), y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, lr.predict(pipe.transform(X_test)))))

Score: -0.028930654560475055
MSE: 79.13575117043798


### Our loss was reducing, and our model quality was improving, so what went wrong?

Let's create two models and compare the magnitude of coefficients

In [0]:
s1 = [
    ('scaler', MinMaxScaler())
]
p = Pipeline(s1)
p.fit(X_train, y_train)
lr_linear = LinearRegression()
lr_linear.fit(p.transform(X_train), y_train)
print('Score: {}'.format(lr_linear.score(p.transform(X_test), y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, lr_linear.predict(p.transform(X_test)))))
print(lr_linear.coef_)

Score: 0.6669016946829257
MSE: 25.61881550329229
[-11.55418765  21.13480639   3.24401229 -12.83713063  -8.46146441
 -19.45745781   2.49653134]


In [0]:
s2 = [
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(2))
]
p_2 = Pipeline(s2)
p_2.fit(X_train, y_train)
lr_quad = LinearRegression()
lr_quad.fit(p_2.transform(X_train), y_train)
print('Score: {}'.format(lr_quad.score(p_2.transform(X_test), y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, lr_quad.predict(p_2.transform(X_test)))))
print(lr_quad.coef_)

Score: 0.7916122605636297
MSE: 16.027241701775367
[ 9.39383793e-12 -1.83073187e+01  1.17451512e+01 -1.53436999e+00
 -5.06272783e+01  1.90804330e+00 -3.91666061e+01  2.27296825e+01
  2.90240854e+00  3.89982270e+00 -3.08711196e+00  1.48457410e+00
  2.48816332e+00  5.02068601e+00 -1.73778407e+00  2.20144376e+01
 -6.87448266e-01  2.13829575e+01 -1.95852071e+01 -1.46458397e+01
 -1.35730814e+01 -1.53436999e+00  1.03922323e+01 -2.52208876e-01
  9.51669952e+00  4.92584945e+00  1.99513584e+01  1.18277071e+01
  4.70383707e+01 -2.87877274e+01 -7.62208542e-01 -4.36099692e+00
  9.16827888e+00  3.94543022e+01 -3.79186155e+01 -2.90605465e+00]


* Let's look at the coefficients of our cubic equation

In [0]:
print(lr.coef_)

[ 4.77784883e-10 -3.66909847e+02  2.28295889e+02  3.92195136e+00
 -3.57276990e+02 -9.15272735e+01  2.99157379e+02  8.62142575e+01
  7.75126684e+01  4.46083246e+02 -9.26511068e+00  5.26499727e+02
  2.55771160e+02 -3.62181247e+00  3.96989706e+02 -3.28873706e+02
 -3.95212670e+00  2.80582491e+02 -1.24547139e+02 -8.06305567e+02
 -2.06818852e+02  3.92195136e+00  4.35996484e+00 -5.47704441e+00
 -6.41055769e+00 -1.95480607e+01  3.77425871e+02  9.23231271e+01
  1.50051360e+02  9.98394561e+01  1.98893101e+02 -3.54094345e+02
 -4.95656211e+01  1.90766684e+01 -3.02206780e+02 -1.30601883e+02
  1.24567727e+01 -9.57239846e+01 -2.26245418e+01 -1.07737850e+02
 -1.41349529e+02  2.29299462e+02  1.77563354e+01 -6.64778553e+01
  3.04944425e+01 -6.96566617e+02 -4.34665093e+01 -2.08874430e+02
 -1.28296637e+02 -9.26511068e+00  1.35254434e+01  2.99627119e+01
  2.08058612e+01  5.44941667e+01 -7.22971594e+01 -8.99438188e+01
  1.63595099e+02  6.38764848e+01 -3.60933245e+01  3.31197026e+02
 -8.00426838e+02 -3.58039

* We know that `lr_quad` is more complex than `lr_linear`
* A more complex model is said to have high variance
* A less complex model is said to high bias
* The longer you train a model, the lower the bias and the higher the variance
* What do we notice?
* What can we do about our model?

1. Get more data to increase model complexity during training
2. Use n-fold cross-validation
3. Simplify our model (Regularization) by reducing the variance

### What is regularization?
* Introducing a penalty to the cost function
* Use Feature Elimination

### L1 Regularization - Lasso Regression
* Sets the coefficient of certain features to zero
* Acts as a feature selection mechanism
* Add a penalty based on size of coefficients (weights)
* Sum of absolutes - L1 Norm $\sum\limits_{j=1}^{n}{\left|w_{j}\right| } $
* Add a regularization term - $\lambda$
* High $\lambda$ is high bias, low variance
* Low $\lambda$ is low bias, high variance

In [0]:
from sklearn.linear_model import Lasso

l_1 = Lasso(alpha = 0.9)
l_1.fit(pipe.transform(X_train), y_train)
print('Score: {}'.format(l_1.score(pipe.transform(X_test), y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, l_1.predict(pipe.transform(X_test)))))
print(l_1.coef_)

Score: 0.38238347735192224
MSE: 47.50130364801645
[ 0.         -0.          0.          0.          0.         -0.
 -6.03320734 -0.         -0.         -0.          0.         -0.
 -0.         -0.         -0.          6.93903733  0.          0.
 -0.         -0.         -0.          0.          0.          0.
 -0.          0.          0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.          0.          0.          0.         -0.         -0.
 -0.          0.          0.          0.         -0.          0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.          0.          0.
  0.          0.         -0.          0.          0.          0.
  0.          0.          0.          0.          0.         -0.
  0.         -0.         -0.         -0.         -0.         -0.
 -0.          0.          0.          0.

In [0]:
l_2 = Lasso(alpha = 0.1)
l_2.fit(pipe.transform(X_train), y_train)
print('Score: {}'.format(l_2.score(pipe.transform(X_test), y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, l_2.predict(pipe.transform(X_test)))))
print(l_2.coef_)

Score: 0.744984962825642
MSE: 19.613378644229975
[ 0.00000000e+00 -0.00000000e+00  0.00000000e+00  2.15990160e-15
 -1.13770924e+00 -1.77354639e+00 -1.81950740e+01  0.00000000e+00
 -1.21529197e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  8.38516402e-01  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -1.75239960e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000

### L2 Regularization - Ridge Regression - Tikhonov Regularization
* Forces the magnitude of coefficients close to zero
* Add a penalty based on size of coefficients (weights)
* Sum of squares - L2 Norm $\sum\limits_{j=1}^{n}{w_{j}^2} $
* Add a regularization term - $\lambda$
* High $\lambda$ is high bias, low variance
* Low $\lambda$ is low bias, high variance

In [0]:
from sklearn.linear_model import Ridge

r_1 = Ridge(alpha = 0.9)
r_1.fit(pipe.transform(X_train), y_train)
print('Score: {}'.format(r_1.score(pipe.transform(X_test), y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, r_1.predict(pipe.transform(X_test)))))
print(r_1.coef_)

Score: 0.829556195095051
MSE: 13.108948084807944
[ 0.00000000e+00 -1.23882360e+00  6.16950946e+00  3.98528623e-01
 -5.70677622e+00 -1.12725390e+00 -7.56979284e+00  3.15748981e+00
 -1.15514341e+00 -4.54744391e-01 -8.68895657e-01 -2.98225442e+00
 -1.76191901e+00 -3.47870647e+00  1.15335226e+00  1.13203534e+01
  5.52361555e-02  2.32595476e-01 -2.75560845e+00 -6.01027139e+00
 -1.71120122e+00  3.98528623e-01  2.17345060e-01  1.87065824e-01
  3.06899402e-02  9.26305778e-01 -2.21573704e-01 -2.33698107e+00
 -2.00784926e+00 -3.53754296e+00  6.76171669e-01 -3.50403322e+00
  2.47785220e+00  2.86864260e+00 -6.92934494e+00  2.54006608e+00
 -1.17872148e+00 -8.04031060e-01 -5.13013462e-01 -1.39396170e+00
 -1.51599515e+00 -1.31969103e+00 -4.42708100e-01  1.45001597e+00
 -7.16475350e-01 -1.23901383e+00 -2.77482466e+00 -2.85097519e+00
 -1.18067298e+00 -8.68895657e-01 -5.18282742e-01 -1.60826946e-01
 -5.88913755e-02 -4.46203100e-02 -5.47355094e-01 -2.15401716e+00
 -8.45569319e-01 -1.70157016e+00 -9.56473

In [0]:

r_2 = Ridge(alpha = 0.1)
r_2.fit(pipe.transform(X_train), y_train)
print('Score: {}'.format(r_2.score(pipe.transform(X_test), y_test)))
print('MSE: {}'.format(mean_squared_error(y_test, r_2.predict(pipe.transform(X_test)))))
print(r_2.coef_)

Score: 0.8309751293486335
MSE: 12.99981689358429
[ 0.00000000e+00 -3.00874786e+00  1.13041836e+01  5.76552188e-01
 -1.28952349e+01 -3.69396760e-01 -1.40590005e+01  6.91905042e+00
 -1.86054450e+00 -9.96790508e-01 -1.64384906e+00 -5.17314925e+00
 -1.65313546e+00 -6.05277129e+00  2.79153950e+00  1.53487022e+01
 -1.00612441e+00 -1.14460562e+00 -3.85133667e+00 -9.67965663e+00
 -1.92195947e+00  5.76552188e-01  1.70669096e+00  3.02256745e-01
  1.75549039e+00 -1.88116705e-01  3.36396250e-01 -2.44565934e+00
  2.39351816e+00 -5.89767619e+00  1.73358511e+00 -3.44344356e+00
  4.46036892e+00  8.70008119e+00 -1.29755340e+01  1.69359842e+00
  2.55106833e-01  7.00609933e-01  4.84634868e-01 -2.56716542e+00
 -3.86506665e+00  6.78008651e-01 -3.11770293e+00 -1.76443078e+00
 -5.88956308e-01 -3.95290350e+00 -3.59853475e-03 -2.22447080e+00
  4.30184828e+00 -1.64384906e+00 -8.75316276e-01 -1.12217092e-01
 -6.20018227e-01 -5.87369540e-01  7.98891177e-01 -3.08983702e+00
  1.88965698e+00 -2.31538962e+00  7.98545

## Afternoon Session
### Logistic Regression
* Output a probability
* Probabilities are mutually exclusive
* Given two classes a, b $P(a) = 1 - P(b)$
* Loss Function (Log Loss or Cross Entropy): $\frac{1}{m} \sum\limits_{i=1}^{m}{-y.log(\hat{y}) + (1-y).log(1-\hat{y})}$
### More Scikit-Learn

In [0]:
import pandas as pd

_headers = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', names=_headers)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal_length    150 non-null float64
sepal_width     150 non-null float64
petal_length    150 non-null float64
petal_width     150 non-null float64
class           150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


In [0]:
df.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [0]:
df['class'].value_counts()

Iris-virginica     50
Iris-versicolor    50
Iris-setosa        50
Name: class, dtype: int64

### Split Data

In [0]:
predictors = df.drop(['class'], axis=1).values
labels = df[['class']].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test, = train_test_split(predictors, labels, test_size=0.5)

### Train a Logistic Regression Model

In [0]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()
log_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### Predict

In [0]:
y_pred = log_model.predict(X_test)
y_proba = log_model.predict_proba(X_test)

In [0]:
labels_df = pd.DataFrame(dict(observed=y_test.reshape(-1), predicted=y_pred))
labels_df.head(n=15)

Unnamed: 0,observed,predicted
0,Iris-virginica,Iris-virginica
1,Iris-virginica,Iris-virginica
2,Iris-setosa,Iris-setosa
3,Iris-setosa,Iris-setosa
4,Iris-virginica,Iris-virginica
5,Iris-setosa,Iris-setosa
6,Iris-setosa,Iris-setosa
7,Iris-virginica,Iris-virginica
8,Iris-versicolor,Iris-versicolor
9,Iris-setosa,Iris-setosa


### Evaluate

In [0]:
print('Score: {}'.format(log_model.score(X_test, y_test)))

Score: 0.9333333333333333


### Import Cars

In [0]:
_headers = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'car']
df_cars = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data', names=_headers)
df_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
car         1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


In [0]:
df_cars.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,car
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [0]:
df_cars.tail()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,car
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good
1727,low,low,5more,more,big,high,vgood


In [0]:
predictors = df_cars.drop(['car'], axis=1).values
labels = df_cars['car'].values

X_train, X_test, y_train, y_test = train_test_split(predictors, labels, test_size=0.2)

log_model_2 = LogisticRegression()
log_model_2.fit(X_train, y_train)



ValueError: ignored

### Convert categorical variables using `pd.get_dummies()`

In [0]:
?pd.get_dummies()

In [0]:
buying_df = pd.get_dummies(df_cars['buying'], prefix='buying', drop_first=True)

In [0]:
buying_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 3 columns):
buying_low      1728 non-null uint8
buying_med      1728 non-null uint8
buying_vhigh    1728 non-null uint8
dtypes: uint8(3)
memory usage: 5.1 KB


In [0]:
buying_df.head()

Unnamed: 0,buying_low,buying_med,buying_vhigh
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


In [0]:
?pd.concat

In [0]:
df_cars = pd.concat([df_cars, buying_df], axis=1)

In [0]:
df_cars.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 10 columns):
buying          1728 non-null object
maint           1728 non-null object
doors           1728 non-null object
persons         1728 non-null object
lug_boot        1728 non-null object
safety          1728 non-null object
car             1728 non-null object
buying_low      1728 non-null uint8
buying_med      1728 non-null uint8
buying_vhigh    1728 non-null uint8
dtypes: object(7), uint8(3)
memory usage: 99.6+ KB


In [0]:
df_cars.drop(['buying'], axis=1, inplace=True)

In [0]:
df_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 9 columns):
maint           1728 non-null object
doors           1728 non-null object
persons         1728 non-null object
lug_boot        1728 non-null object
safety          1728 non-null object
car             1728 non-null object
buying_low      1728 non-null uint8
buying_med      1728 non-null uint8
buying_vhigh    1728 non-null uint8
dtypes: object(6), uint8(3)
memory usage: 86.1+ KB


In [0]:
df_cars['maint'].value_counts()

low      432
vhigh    432
med      432
high     432
Name: maint, dtype: int64

In [0]:
maint_df = pd.get_dummies(df_cars['maint'], prefix='maint', drop_first=True)
maint_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 3 columns):
maint_low      1728 non-null uint8
maint_med      1728 non-null uint8
maint_vhigh    1728 non-null uint8
dtypes: uint8(3)
memory usage: 5.1 KB


In [0]:
df_cars = pd.concat([df_cars, maint_df], axis=1)
df_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 12 columns):
maint           1728 non-null object
doors           1728 non-null object
persons         1728 non-null object
lug_boot        1728 non-null object
safety          1728 non-null object
car             1728 non-null object
buying_low      1728 non-null uint8
buying_med      1728 non-null uint8
buying_vhigh    1728 non-null uint8
maint_low       1728 non-null uint8
maint_med       1728 non-null uint8
maint_vhigh     1728 non-null uint8
dtypes: object(6), uint8(6)
memory usage: 91.2+ KB


In [0]:
df_cars.drop(['maint'], axis=1, inplace=True)

In [0]:
doors_df = pd.get_dummies(df_cars['doors'], prefix='doors', drop_first=True)
doors_df.head()

Unnamed: 0,doors_3,doors_4,doors_5more
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


In [0]:
persons_df = pd.get_dummies(df_cars['persons'], prefix='persons', drop_first=True)
persons_df.head()

Unnamed: 0,persons_4,persons_more
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [0]:

lug_boot_df = pd.get_dummies(df_cars['lug_boot'], prefix='lug_boot', drop_first=True)
lug_boot_df.head()

Unnamed: 0,lug_boot_med,lug_boot_small
0,0,1
1,0,1
2,0,1
3,1,0
4,1,0


In [0]:
safety_df = pd.get_dummies(df_cars['safety'], prefix='safety', drop_first=True)
safety_df.head()

Unnamed: 0,safety_low,safety_med
0,1,0
1,0,1
2,0,0
3,1,0
4,0,1


In [0]:
df_cars = pd.concat([df_cars, doors_df, persons_df, lug_boot_df, safety_df], axis=1)
df_cars.head()

Unnamed: 0,doors,persons,lug_boot,safety,car,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small,safety_low,safety_med
0,2,2,small,low,unacc,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0
1,2,2,small,med,unacc,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1
2,2,2,small,high,unacc,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
3,2,2,med,low,unacc,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0
4,2,2,med,med,unacc,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1


In [0]:
df_cars.drop(['doors', 'persons', 'lug_boot', 'safety'], axis=1, inplace=True)

In [0]:
df_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 16 columns):
car               1728 non-null object
buying_low        1728 non-null uint8
buying_med        1728 non-null uint8
buying_vhigh      1728 non-null uint8
maint_low         1728 non-null uint8
maint_med         1728 non-null uint8
maint_vhigh       1728 non-null uint8
doors_3           1728 non-null uint8
doors_4           1728 non-null uint8
doors_5more       1728 non-null uint8
persons_4         1728 non-null uint8
persons_more      1728 non-null uint8
lug_boot_med      1728 non-null uint8
lug_boot_small    1728 non-null uint8
safety_low        1728 non-null uint8
safety_med        1728 non-null uint8
dtypes: object(1), uint8(15)
memory usage: 38.9+ KB


In [0]:
df_cars.head()

Unnamed: 0,car,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small,safety_low,safety_med
0,unacc,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0
1,unacc,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1
2,unacc,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
3,unacc,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0
4,unacc,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1


# split our data

In [0]:
predictors = df_cars.drop(['car'], axis=1).values
labels = df_cars['car'].values

X_train, X_test, y_train, y_test = train_test_split(predictors, labels, test_size=0.1)

In [0]:
log_model_2 = LogisticRegression()
log_model_2.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
print('Score: {}'.format(log_model_2.score(X_test, y_test)))

Score: 0.8670520231213873


In [0]:
from sklearn.metrics import accuracy_score

In [0]:
print('Accuracy: {}'.format(accuracy_score(y_test, log_model_2.predict(X_test))))

Accuracy: 0.8670520231213873


In [0]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix

In [0]:
print('Precision: {}'.format(precision_score(y_test, log_model_2.predict(X_test), average=None)))
print('Recall: {}'.format(recall_score(y_test, log_model_2.predict(X_test), average=None)))

Precision: [0.70454545 0.         0.94444444 0.        ]
Recall: [0.79487179 0.         0.95967742 0.        ]


  'precision', 'predicted', average, warn_for)


In [0]:
df_cars['car'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: car, dtype: int64

In [0]:
print(confusion_matrix(y_test, log_model_2.predict(X_test)))

[[ 31   1   7   0]
 [  5   0   0   0]
 [  3   2 119   0]
 [  5   0   0   0]]
