# Playground for Advanced Topics Slides

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [1]:
# Stamp the run so readers can tell when the outputs below were produced.
import datetime

run_started_at = datetime.datetime.now()
print(run_started_at)

2020-11-18 08:58:50.501448


In [2]:
# Show the installed scikit-learn version so the results below can be tied to it.
import sklearn
sklearn.__version__

'0.23.1'

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV, RFE

# Display the value of every top-level expression in a cell, not only the last
# one. Later cells rely on this to show a mid-cell `.shape` alongside the
# printed classification report.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load Data

In [4]:
# Load the German Credit data and encode the target as 1 = Good, 0 = Bad.
df = pd.read_csv('../data/GermanCredit.csv')
df['Class'] = df['Class'].map({'Good': 1, 'Bad': 0})

# Series.map leaves NaN for any label that is missing from the dict; fail here
# instead of letting a NaN target reach the classifiers below.
assert df['Class'].notna().all(), "Unexpected label in 'Class' (expected 'Good' or 'Bad')"

df.head()

# Features are every column except the target; y is kept as a one-column DataFrame.
X = df.drop(['Class'], axis=1)
y = df[['Class']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Unnamed: 0,Duration,Amount,InstallmentRatePercentage,ResidenceDuration,Age,NumberExistingCredits,NumberPeopleMaintenance,Telephone,ForeignWorker,Class,...,OtherInstallmentPlans.Bank,OtherInstallmentPlans.Stores,OtherInstallmentPlans.None,Housing.Rent,Housing.Own,Housing.ForFree,Job.UnemployedUnskilled,Job.UnskilledResident,Job.SkilledEmployee,Job.Management.SelfEmp.HighlyQualified
0,6,1169,4,4,67,2,1,0,1,1,...,0,0,1,0,1,0,0,0,1,0
1,48,5951,2,2,22,1,1,1,1,0,...,0,0,1,0,1,0,0,0,1,0
2,12,2096,2,3,49,1,2,1,1,1,...,0,0,1,0,1,0,0,1,0,0
3,42,7882,2,4,45,1,2,1,1,1,...,0,0,1,0,0,1,0,0,1,0
4,24,4870,3,4,53,2,2,1,1,0,...,0,0,1,0,0,1,0,0,1,0


In [5]:
# Sanity check: (rows, columns) of the training feature matrix.
X_train.shape

(800, 61)

In [6]:
# Baseline: a decision tree trained on all features and scored on the held-out split.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.46      0.49      0.48        59
           1       0.78      0.76      0.77       141

    accuracy                           0.68       200
   macro avg       0.62      0.63      0.62       200
weighted avg       0.69      0.68      0.68       200



# Feature Selection

## Variance Threshold (Filter Method, Unsupervised)

In [7]:
from sklearn.feature_selection import VarianceThreshold

# Learn the per-feature variances from the training split only, then apply the
# same column mask to both splits.
sel = VarianceThreshold(threshold=0.1).fit(X_train)

X_train_new, X_test_new = sel.transform(X_train), sel.transform(X_test)

X_train_new.shape

# Retrain the same tree on the reduced feature set and score it on the test split.
clf = DecisionTreeClassifier(random_state=0).fit(X_train_new, y_train)
y_pred = clf.predict(X_test_new)
print(classification_report(y_true=y_test, y_pred=y_pred))

(800, 35)

              precision    recall  f1-score   support

           0       0.53      0.58      0.55        59
           1       0.82      0.79      0.80       141

    accuracy                           0.73       200
   macro avg       0.67      0.68      0.68       200
weighted avg       0.73      0.72      0.73       200



In [8]:
# Per-feature variances computed when the VarianceThreshold selector was fitted
# on X_train; these are the values compared against the 0.1 threshold.
# NOTE: run this right after the VarianceThreshold cell. `sel` is rebound to
# other selectors further down, and those do not expose `variances_`.
sel.variances_

array([1.39235975e+02, 8.38891826e+06, 1.25877500e+00, 1.22011094e+00,
       1.30061548e+02, 3.34623437e-01, 1.23085937e-01, 2.43600000e-01,
       3.37750000e-02, 1.95943750e-01, 1.95943750e-01, 5.74984375e-02,
       2.40735937e-01, 4.18359375e-02, 4.52437500e-02, 2.49375000e-01,
       8.29234375e-02, 2.06943750e-01, 1.81093750e-01, 9.39750000e-02,
       1.41923437e-01, 2.02693750e-01, 1.35609375e-02, 2.31859375e-02,
       4.41109375e-02, 0.00000000e+00, 8.67343750e-03, 8.89984375e-02,
       1.23437500e-02, 2.41443750e-01, 9.19937500e-02, 6.51000000e-02,
       4.63734375e-02, 1.51560938e-01, 1.39443750e-01, 2.23593750e-01,
       1.44375000e-01, 1.88743750e-01, 6.29437500e-02, 4.06937500e-02,
       2.16693750e-01, 2.47500000e-01, 8.19000000e-02, 0.00000000e+00,
       9.00000000e-02, 4.06937500e-02, 5.41937500e-02, 1.99375000e-01,
       1.79775000e-01, 2.23185937e-01, 1.30110937e-01, 1.17685938e-01,
       4.18359375e-02, 1.47600000e-01, 1.44375000e-01, 2.03235937e-01,
      

## Univariate Feature Selection (Filter Method, Supervised)

In [9]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 10 features with the highest chi-squared score against the target.
# The scores are computed from the training split only.
# NOTE(review): chi2 expects non-negative feature values; confirm this still
# holds if new columns are added to the data.
sel = SelectKBest(chi2, k=10)
sel = sel.fit(X_train, y_train)

X_train_new = sel.transform(X_train)
X_test_new = sel.transform(X_test)

X_train_new.shape

# Assign the result of fit(): with ast_node_interactivity = "all", a bare
# `clf.fit(...)` expression echoes the estimator repr into the cell output.
# This also matches the `clf = clf.fit(...)` form used in the earlier cells.
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train_new, y_train)
y_pred = clf.predict(X_test_new)
print(classification_report(y_test, y_pred))

(800, 10)

DecisionTreeClassifier(random_state=0)

              precision    recall  f1-score   support

           0       0.58      0.51      0.54        59
           1       0.80      0.84      0.82       141

    accuracy                           0.74       200
   macro avg       0.69      0.68      0.68       200
weighted avg       0.74      0.74      0.74       200



## Recursive Feature Elimination (Wrapper Method)

In [10]:
# Wrapper method: repeatedly fit a decision tree and discard features until
# only 10 remain. The selector is fitted on the training split only.
clf = DecisionTreeClassifier(random_state=0)
sel = RFE(estimator=clf, n_features_to_select=10)
sel = sel.fit(X_train, y_train)

X_train_new = sel.transform(X_train)
X_test_new = sel.transform(X_test)

X_train_new.shape

# Train a fresh tree on the selected features. Assign the result of fit():
# with ast_node_interactivity = "all", a bare `clf.fit(...)` expression echoes
# the estimator repr into the cell output. This also matches the earlier cells.
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train_new, y_train)
y_pred = clf.predict(X_test_new)
print(classification_report(y_test, y_pred))

(800, 10)

DecisionTreeClassifier(random_state=0)

              precision    recall  f1-score   support

           0       0.45      0.51      0.48        59
           1       0.78      0.74      0.76       141

    accuracy                           0.68       200
   macro avg       0.62      0.63      0.62       200
weighted avg       0.69      0.68      0.68       200



In [11]:
# Elimination ranking for each column of X_train from the RFE selector fitted
# above: rank 1 marks the 10 selected features, and larger ranks were
# eliminated earlier.
sel.ranking_

array([ 1,  1,  1,  1,  1,  2, 15,  4, 45, 28, 33, 20,  1, 13, 11,  1, 31,
       46,  1,  5,  9, 10, 40, 23,  7, 51, 50, 37, 26,  1,  8, 25, 43, 38,
       19, 18, 21, 44, 32, 27, 35, 41, 14, 47, 29, 30, 36, 42, 16,  6, 17,
       52, 34,  1, 12, 22, 39, 48, 24,  3, 49])