In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline

# Introduction

In the well-known Credit Approval dataset (part of the famous UCI machine learning dataset repository), the task is to classify loan applications as positive or negative (i.e. application should be granted, or should not) according to their features. 

# 1. Data

Visit <https://archive.ics.uci.edu/ml/datasets/Credit+Approval> to get the  dataset. Read it into your python session, and take a look. Check that all columns are the type you expect. Check for unexpected columns or rows. Drop any rows with missing data. Extract `X` and `y`.

The file is called `crx.data`. Before we run any code, we could look at the data in a text editor. Here is a chunk:

```
b,34.17,1.54,u,g,cc,v,1.54,t,t,01,t,g,00520,50000,+
a,36.00,1,u,g,c,v,2,t,t,11,f,g,00000,456,+
b,25.50,0.375,u,g,m,v,0.25,t,t,03,f,g,00260,15108,+
b,19.42,6.5,u,g,w,h,1.46,t,t,07,f,g,00080,2954,+
b,35.17,25.125,u,g,x,h,1.625,t,t,01,t,g,00515,500,+
b,32.33,7.5,u,g,e,bb,1.585,t,f,0,t,s,00420,0,-
b,34.83,4,u,g,d,bb,12.5,t,f,0,t,g,?,0,-
a,38.58,5,u,g,cc,v,13.5,t,f,0,t,g,00980,0,-
b,44.25,0.5,u,g,m,v,10.75,t,f,0,f,s,00400,0,-
b,44.83,7,y,p,c,v,1.625,f,f,0,f,g,00160,2,-
```

Observations: there are strings, floats and ints. The classification variable is last, with values `+` and `-`.

There is also a data dictionary, `crx.names`. It tells us that columns 1, 4, 5, 6, 7, 9, 10, 12, 13 are categorical (strings). (But note, in that text they count from 1, not 0.) And it explains that there are missing values represented by `?`.

In [None]:
# The file has no header row, so header=None makes pandas label the columns 0..15.
d = pd.read_csv("../data/crx.data", header=None)

In [None]:
d.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [None]:
d.shape

(690, 16)

In [None]:
# No NaNs are reported -- but that only means pandas parsed no value as NaN.
# Per crx.names, missing values in this file are written as '?'.
d.isnull().any()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
dtype: bool

In [None]:
d.dtypes

0      object
1      object
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13     object
14      int64
15     object
dtype: object

Columns 1 and 13 are of type `object`, even though we expected them to be numerical. This is because they contain question marks, i.e. strings. (The other `object` columns are the categorical ones, as expected.) This also explains why we didn't see `True` for `isnull()` above. We'll replace `?` with `np.nan` and then drop all rows containing `nan` values.

In [None]:
# Missing values are encoded as '?': turn them into NaN, then discard every
# row that contains one.
# See https://stackoverflow.com/questions/35682719/drop-rows-with-a-question-mark-value-in-any-column-in-a-pandas-dataframe
d = d.replace("?", np.nan)
d = d.dropna()

In [None]:
d.shape

(653, 16)

In [None]:
d.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [None]:
d.dtypes

0      object
1      object
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13     object
14      int64
15     object
dtype: object

After dropping the `?`, several of these columns have only numeric values, but we have to tell Pandas that:

In [None]:
# 0-based positions of the numerical columns (crx.names lists the categorical
# ones, counting from 1).
numeric_cols = (1, 2, 7, 10, 13, 14)

In [None]:
# Parse every numerical column, then write the parsed values back into d.
parsed = {col: pd.to_numeric(d[col]) for col in numeric_cols}
for col, values in parsed.items():
    d[col] = values

In [None]:
d.dtypes

0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13      int64
14      int64
15     object
dtype: object

In [None]:
# Features: every column except the last one, which holds the class label.
X = d.drop(columns=d.columns[-1])
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0


In [None]:
# Target: the last column, holding '+' / '-'.
label_col = d.columns[-1]
y = d[label_col]
y.head()

0    +
1    +
2    +
3    +
4    +
Name: 15, dtype: object

In [None]:
print(X.shape, y.shape)

(653, 15) (653,)


# 2. Feature engineering

Many of the features are categorical: how can we encode them? Program this to produce a new dataset.

Remember that if we run `OneHotEncoder` on `X`, it will transform every column - even the numerical ones! Instead, we use `ColumnTransformer` to tell it which columns to transform.

In [None]:
# Every feature position that is not numerical is categorical.
categorical_cols = sorted(set(range(15)) - set(numeric_cols))

In [None]:
# One-hot encode only the categorical columns; remainder="passthrough" keeps
# the numerical columns untouched and appends them after the encoded ones.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), categorical_cols)],
    remainder="passthrough",
)

In [None]:
X_ohe = ct.fit_transform(X)

In [None]:
X_ohe.shape

(653, 46)

We now have 46 columns, and this looks right. Observe in the first row below, how the new columns correspond to the pre-OneHotEncoder columns. If we look in the data dictionary, we'll see that the categorical columns have a total of 40 values; and there are 6 numerical columns.

In [None]:
X_ohe[0]

array([  0.  ,   1.  ,   0.  ,   1.  ,   0.  ,   1.  ,   0.  ,   0.  ,
         0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
         0.  ,   0.  ,   0.  ,   0.  ,   1.  ,   0.  ,   0.  ,   0.  ,
         0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   1.  ,   0.  ,   0.  ,
         1.  ,   0.  ,   1.  ,   1.  ,   0.  ,   1.  ,   0.  ,   0.  ,
        30.83,   0.  ,   1.25,   1.  , 202.  ,   0.  ])

# 3. Train-test split

Make an 80-20 train-test split with a fixed random seed of 0 (so that we all have exactly the same split).

In [None]:
# test_size=0.2 gives the 80-20 split; random_state=0 fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X_ohe, y, test_size=0.2, random_state=0)

# 4. Classification

Try several classification algorithms, ideally in a loop. Print out both the training and test performance.

Remember the workflow:

1. import (already done)
2. instantiate
3. fit
4. score
5. predict


In [None]:
# Seed NumPy's global RNG as before, and also pin the forest's own seed:
# random_state=0 makes the fit reproducible no matter what else has drawn
# from the global RNG earlier in the session.
np.random.seed(0)

RF = RandomForestClassifier(random_state=0)
RF.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [None]:
RF.score(X_train, y_train)

1.0

In [None]:
RF.score(X_test, y_test)

0.8702290076335878

In [None]:
RF.predict(X_test[:5])

array(['+', '+', '+', '-', '+'], dtype=object)

In [None]:
LR = LogisticRegression()

In [None]:
LR.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

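The Logistic Regression solver stopped because it hit its iteration limit (`max_iter`, 100 by default) before converging. The likely cause is that the features are on very different scales: the one-hot columns are 0/1, while some of the numerical columns run into the hundreds or more, which makes the optimisation slow. As the warning suggests, we could scale the numerical features (e.g. with `StandardScaler`) or increase `max_iter`. (Note that the default solver, `lbfgs`, is a quasi-Newton method rather than plain gradient descent.)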

In [None]:
# Gaussian Naive Bayes: fit() returns the fitted estimator, so instantiate
# and fit in one step, then report training accuracy.
NB = GaussianNB().fit(X_train, y_train)
NB.score(X_train, y_train)

0.8390804597701149

In [None]:
NB.score(X_test, y_test) # no overfitting!

0.8320610687022901

We can put these in a loop, so let's do that. The point is to observe that all these different algorithms have the same API.

In [None]:
# The same instantiate / fit / score steps work for every classifier.
# random_state=0 pins the forest so repeated runs print the same scores.
clfs = [
    RandomForestClassifier(random_state=0),
    GaussianNB(),
]

for clf in clfs:
    clf.fit(X_train, y_train)
    print(clf.__class__.__name__)  # we can get class name this way
    print("train:", clf.score(X_train, y_train))
    print("test:", clf.score(X_test, y_test))
    

RandomForestClassifier
train: 1.0
test: 0.8625954198473282
GaussianNB
train: 0.8390804597701149
test: 0.8320610687022901


# 5. Feature selection 

Try SelectKBest to choose a subset of features. Notice its effect on both training and test performance.


In [None]:
# Rank the features with the chi^2 statistic, which scores each feature on
# its own against y, and keep the 5 best. The selector is fitted on the
# training data only and then applied to both splits.
sel = SelectKBest(chi2, k=5).fit(X_train, y_train)
X_train_sel, X_test_sel = (sel.transform(part) for part in (X_train, X_test))


In [None]:
# Refit every classifier on the 5 selected features and report its accuracy
# on both splits.
for clf in clfs:
    clf.fit(X_train_sel, y_train)
    print(type(clf).__name__)
    for label, features, target in (
        ("train:", X_train_sel, y_train),
        ("test:", X_test_sel, y_test),
    ):
        print(label, clf.score(features, target))

RandomForestClassifier
train: 0.9961685823754789
test: 0.9083969465648855
GaussianNB
train: 0.7413793103448276
test: 0.8015267175572519


Observations: this has actually improved the RF test performance! It was over-fitting previously, and it still is, but less so. The Gaussian Naive Bayes was not over-fitting, so feature selection made it worse.

# 6. Pipeline

Put everything together in a pipeline.

In [None]:
# A single estimator that one-hot encodes the categorical columns, keeps the
# 5 best features by chi^2, and classifies with a random forest.
pipe = Pipeline(
    steps=[
        (
            "encoder",
            ColumnTransformer(
                transformers=[("encoder", OneHotEncoder(), categorical_cols)],
                remainder="passthrough",  # leave numerical columns alone!
            ),
        ),
        ("feature_selection", SelectKBest(chi2, k=5)),
        ("random_forest", RandomForestClassifier(random_state=0)),
    ]
)

The idea of a pipeline is to put everything together so that we have just one object on which to call fit.

In [None]:
# The pipeline does its own one-hot encoding, so it has to be fitted on the
# raw features. X_train / X_test from section 3 were cut from X_ohe and are
# already encoded, so re-split the raw X with the same seed (same rows end
# up in each split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# A category that occurs only in the test rows would make OneHotEncoder raise
# at transform time; encode such values as all zeros instead.
pipe.set_params(encoder__encoder__handle_unknown="ignore")

pipe.fit(X_train, y_train)

Pipeline(steps=[('encoder',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('encoder', OneHotEncoder(),
                                                  [0, 3, 4, 5, 6, 8, 9, 11,
                                                   12])])),
                ('feature_selection',
                 SelectKBest(k=5, score_func=<function chi2 at 0x1a1f9b58c8>)),
                ('random_forest', RandomForestClassifier(random_state=0))])

In [None]:
pipe.score(X_test, y_test)

0.9007633587786259

# 7. Extra processing/Rough Work


In [None]:
X = pd.DataFrame({"x0": [0.5, 0.3, 0.2], "x1": ["a", "b", "a"], "x2": ["c", "c", "d"]})

In [None]:
X

Unnamed: 0,x0,x1,x2
0,0.5,a,c
1,0.3,b,c
2,0.2,a,d


Here, we'll make two `OneHotEncoder`s in parallel, working on column sets `[1]` and `[2]`. Each runs and produces the new columns it ought to produce, and these are concatenated together with the `passthrough` columns.

In [None]:
pipe = Pipeline([
                 ('encoder', 
                  ColumnTransformer(transformers=[
                      ('encoder 1', OneHotEncoder(), [1]), # [1]: "a", "b", "a"
                      ('encoder 2', OneHotEncoder(), [2]) # [2]: "c", "c", "d"
                  ],
                       remainder='passthrough')
                 ),
                ])
pipe.fit_transform(X) 

array([[1. , 0. , 1. , 0. , 0.5],
       [0. , 1. , 1. , 0. , 0.3],
       [1. , 0. , 0. , 1. , 0.2]])

Think of these columns as:

`x1==a, x1==b, x2==c, x2==d, x0`

Interestingly, `ColumnTransformer` can process the same column twice if we want:

In [None]:
pipe = Pipeline([
                 ('encoder', 
                  ColumnTransformer(transformers=[
                      ('encoder 1', OneHotEncoder(), [1]), # [1]: "a", "b", "a"
                      ('encoder 1 again', OneHotEncoder(), [1])  # [1]: "a", "b", "a"
                  ],
                       remainder='passthrough')
                 ),
                ])
pipe.fit_transform(X) 

array([[1.0, 0.0, 1.0, 0.0, 0.5, 'c'],
       [0.0, 1.0, 0.0, 1.0, 0.3, 'c'],
       [1.0, 0.0, 1.0, 0.0, 0.2, 'd']], dtype=object)

Think of these columns as:

`x1==a, x1==b, x1==a, x1==b, x0, x2`

A final point -- we could ask the same question about `Pipeline` itself -- if we put multiple `ColumnTransformer`s in a `Pipeline`, these are definitely **in sequence**, not parallel. So how to refer to columns?

In [None]:
pipe = Pipeline([
                 ('encoder 0', 
                  ColumnTransformer(transformers=[
                      ('encoder', OneHotEncoder(), [1])], # [1]: "a", "b", "a"
                       remainder='passthrough') 
                 ),
                 ('encoder 1', 
                  ColumnTransformer(transformers=[
                      # what does [2] mean now?? It indexes the OUTPUT of 'encoder 0',
                      # whose columns are (x1==a, x1==b, x0, x2) -- so [2] is the
                      # numerical x0 column, not x2.
                      ('encoder', OneHotEncoder(), [2])],
                       remainder='passthrough') 
                 )
                ])
pipe.fit_transform(X) 

array([[0.0, 0.0, 1.0, 1.0, 0.0, 'c'],
       [0.0, 1.0, 0.0, 0.0, 1.0, 'c'],
       [1.0, 0.0, 0.0, 1.0, 0.0, 'd']], dtype=object)

As we can see, the `[2]` in `encoder 1` now does not refer to the `"c", "c", "d"` column. We now have a mess -- the numerical column has been one-hot encoded, which we didn't want.

The solution here is probably to try to do any transformations that require us knowing the column indices all in one go, in a single `ColumnTransformer`, early in the `Pipeline`. In practice this is not going to be a problem.