# Caruana Replication

In [1]:
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import math
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

## Define Algorithms

### Algorithm 1: Logistic Regression

In [2]:
def log_reg(X_train, X_test, y_train, y_test):
    # Logistic Regression binary classification
    C_list = [1e-4, 1e-3, 1e-2, 1e-1,1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8]
    scoring = ['accuracy', 'f1', 'roc_auc']
    parameters = {'C': C_list, 'penalty': ['none', 'l2']}
    classifier = LogisticRegression(n_jobs = -1)
    clf = GridSearchCV(classifier, parameters, return_train_score = True, n_jobs = -1, scoring = scoring, refit = False)
    clf.fit(X_train, y_train)
    
    # Find best parameters for each performance metric
    best_acc = 0
    best_C_acc = 0
    best_penalty_acc = ''
    for i in range(len(clf.cv_results_['mean_test_accuracy'])):
        if clf.cv_results_['mean_test_accuracy'][i] > best_acc:
            best_C_acc = clf.cv_results_['param_C'][i]
            best_penalty_acc = clf.cv_results_['param_penalty'][i]
        
    best_f1 = 0
    best_C_f1 = 0
    best_penalty_f1 = ''
    for j in range(len(clf.cv_results_['mean_test_f1'])):
        if clf.cv_results_['mean_test_f1'][j] > best_f1:
            best_C_f1 = clf.cv_results_['param_C'][j]
            best_penalty_f1 = clf.cv_results_['param_penalty'][j]
        
    best_auc = 0
    best_C_auc = 0
    best_penalty_auc = ''
    for k in range(len(clf.cv_results_['mean_test_roc_auc'])):
        if clf.cv_results_['mean_test_roc_auc'][k] > best_auc:
            best_C_auc = clf.cv_results_['param_C'][k]
            best_penalty_auc = clf.cv_results_['param_penalty'][k]
    print("Found Best Parameters!")
    # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
    best_model_acc = LogisticRegression(C=best_C_acc, n_jobs = -1, penalty=best_penalty_acc)
    best_model_f1 = LogisticRegression(C=best_C_f1, n_jobs = -1, penalty = best_penalty_f1)
    best_model_auc = LogisticRegression(C=best_C_auc, n_jobs = -1, penalty = best_penalty_auc)
    best_model_acc.fit(X_train, y_train)
    best_model_f1.fit(X_train, y_train)
    best_model_auc.fit(X_train, y_train)
    
    # Find and store accuracy, FSC, and AUC of the 3 models from previous line on test set
    y_pred1 = best_model_acc.predict(X_test)
    y_pred2 = best_model_f1.predict(X_test)
    y_pred3 = best_model_auc.predict(X_test)
    
    metrics_acc = accuracy_score(y_test, y_pred1)
    metrics_f1 = f1_score(y_test, y_pred2)
    metrics_auc = roc_auc_score(y_test, y_pred3)
    
    return metrics_acc, metrics_f1, metrics_auc,

### Algorithm 2: K-Nearest Neighbors

In [3]:
def knn(X_train, X_test, y_train, y_test):
    # K-Neighbors binary classification
    k_list = [i for i in range(1,106,4)]
    scoring = ['accuracy', 'f1', 'roc_auc']
    parameters = {'n_neighbors': k_list}
    classifier = KNeighborsClassifier(n_jobs = -1)
    clf = GridSearchCV(classifier, parameters, return_train_score = True, n_jobs = -1, scoring = scoring, refit = False)
    clf.fit(X_train, y_train)
    
    # Find best parameters for each performance metric
    best_acc = 0
    best_k_acc = 0
    for i in range(len(clf.cv_results_['mean_test_accuracy'])):
        if clf.cv_results_['mean_test_accuracy'][i] > best_acc:
            best_k_acc = clf.cv_results_['param_n_neighbors'][i]
        
    best_f1 = 0
    best_k_f1 = 0
    for j in range(len(clf.cv_results_['mean_test_f1'])):
        if clf.cv_results_['mean_test_f1'][j] > best_f1:
            best_k_f1 = clf.cv_results_['param_n_neighbors'][j]
        
    best_auc = 0
    best_k_auc = 0
    for k in range(len(clf.cv_results_['mean_test_roc_auc'])):
        if clf.cv_results_['mean_test_roc_auc'][k] > best_auc:
            best_k_auc = clf.cv_results_['param_n_neighbors'][k]
    
    print("Found Best Parameters!") 
    
    # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
    best_model_acc = KNeighborsClassifier(n_neighbors=best_k_acc, n_jobs = -1)
    best_model_f1 = KNeighborsClassifier(n_neighbors=best_k_f1, n_jobs = -1)
    best_model_auc = KNeighborsClassifier(n_neighbors=best_k_auc, n_jobs = -1)
    best_model_acc.fit(X_train, y_train)
    best_model_f1.fit(X_train, y_train)
    best_model_auc.fit(X_train, y_train)
    
    # Find and store accuracy, FSC, and AUC of the 3 models from previous line on test set
    y_pred1 = best_model_acc.predict(X_test)
    y_pred2 = best_model_f1.predict(X_test)
    y_pred3 = best_model_auc.predict(X_test)
    
    metrics_acc = accuracy_score(y_test, y_pred1)
    metrics_f1 = f1_score(y_test, y_pred2)
    metrics_auc = roc_auc_score(y_test, y_pred3)
    return metrics_acc, metrics_f1, metrics_auc

### Algorithm 3: Random Forest

In [4]:
def rand_for(X_train, X_test, y_train, y_test):
    # Random Forest binary classification
    feat_list = [1, 2, 4, 6, 8, 12, 16, 20]
    n_trees = [1024]
    scoring = ['accuracy', 'f1', 'roc_auc']
    parameters = {'n_estimators': n_trees, 'max_features': feat_list}
    classifier = RandomForestClassifier(n_jobs = -1)
    clf = GridSearchCV(classifier, parameters, return_train_score = True, n_jobs = -1, scoring = scoring, refit = False)
    clf.fit(X_train, y_train)
    
    # Find best parameters for each performance metric
    best_acc = 0
    best_feat_acc = 0
    for i in range(len(clf.cv_results_['mean_test_accuracy'])):
        if clf.cv_results_['mean_test_accuracy'][i] > best_acc:
            best_feat_acc = clf.cv_results_['param_max_features'][i]
        
    best_f1 = 0
    best_feat_f1 = 0
    for j in range(len(clf.cv_results_['mean_test_f1'])):
        if clf.cv_results_['mean_test_f1'][j] > best_f1:
            best_feat_f1 = clf.cv_results_['param_max_features'][j]
        
    best_auc = 0
    best_feat_auc = 0
    for k in range(len(clf.cv_results_['mean_test_roc_auc'])):
        if clf.cv_results_['mean_test_roc_auc'][k] > best_auc:
            best_feat_auc = clf.cv_results_['param_max_features'][k]
            
    print("Found Best Parameters!")
    
    # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
    best_model_acc = RandomForestClassifier(n_estimators=1024, n_jobs = -1, max_features = best_feat_acc)
    best_model_f1 = RandomForestClassifier(n_estimators=1024, n_jobs = -1, max_features = best_feat_f1)
    best_model_auc = RandomForestClassifier(n_estimators=1024, n_jobs = -1, max_features = best_feat_auc)
    best_model_acc.fit(X_train, y_train)
    best_model_f1.fit(X_train, y_train)
    best_model_auc.fit(X_train, y_train)
    
    # Find and store accuracy, FSC, and AUC of the 3 models from previous line on test set
    y_pred1 = best_model_acc.predict(X_test)
    y_pred2 = best_model_f1.predict(X_test)
    y_pred3 = best_model_auc.predict(X_test)
    
    metrics_acc = accuracy_score(y_test, y_pred1)
    metrics_f1 = f1_score(y_test, y_pred2)
    metrics_auc = roc_auc_score(y_test, y_pred3)
    return metrics_acc, metrics_f1, metrics_auc

In [5]:
def draw_heatmap(training_accuracy, C_list, label):
    # training_accuracy: A NumPy array with the shape (len(C_list))
    # C_list: List of C(s).
    plt.figure(figsize = (2,4))
    ax = sns.heatmap(training_accuracy, annot=True, fmt='.3f', 
                     xticklabels=[], yticklabels=C_list)
    ax.collections[0].colorbar.set_label("accuracy")
    ax.set(ylabel='$C$')
    sns.set_style("whitegrid", {'axes.grid' : False})
    plt.title(label + 'accuracy w.r.t $C$')
    plt.show()

## Adult Classification Problem

### Import Data

In [6]:
cols1 = ['age', 'workclass', 'fnlwgt','education', 'education-num', 'marital-status', 
        'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
        'hours-per-week', 'native-country', 'class']
features = ['age', 'workclass', 'fnlwgt','education', 'education-num', 'marital-status', 
        'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
        'hours-per-week', 'native-country']
data1_1 = pd.read_csv('./ADULT/adult.data', header = None, names = cols1)
data1_2 = pd.read_csv('./ADULT/adult.test', header = None, names = cols1)
frames = [data1_1, data1_2]
data1 = pd.concat(frames)
data1 = data1.reset_index()

In [7]:
data1.shape

(48843, 16)

### Clean and Preprocess Data

In [8]:
# Types in each column
print(data1.dtypes)

# Drop any nulls, shown as question marks
for i in range(len(data1)):
    for col in cols1:
        if data1[col][i] == ' ?':
            data1.drop(i, inplace = True)
            break

index               int64
age                object
workclass          object
fnlwgt            float64
education          object
education-num     float64
marital-status     object
occupation         object
relationship       object
race               object
sex                object
capital-gain      float64
capital-loss      float64
hours-per-week    float64
native-country     object
class              object
dtype: object


In [9]:
data1.shape

(45223, 16)

In [10]:
# Preprocess last column
def transform_income(income):
    if income == ' <=50K':
        return 0
    else:
        return 1

In [11]:
data1['class'] = data1['class'].apply(transform_income)

In [12]:
data1.head(10)

Unnamed: 0,index,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,0
1,1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,0
2,2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,0
3,3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,0
4,4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,0
5,5,37,Private,284582.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,0
6,6,49,Private,160187.0,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,16.0,Jamaica,0
7,7,52,Self-emp-not-inc,209642.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,45.0,United-States,1
8,8,31,Private,45781.0,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084.0,0.0,50.0,United-States,1
9,9,42,Private,159449.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178.0,0.0,40.0,United-States,1


In [13]:
data1 = data1.dropna()

In [14]:
data1.shape

(45222, 16)

In [15]:
X1 = data1[features]
y1 = data1['class']

In [16]:
X1.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba


In [17]:
features

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country']

In [18]:
X1 = pd.get_dummies(X1, columns=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race',
 'sex','native-country'])

In [19]:
# Types in each column
print(X1.dtypes)
print(y1.dtypes)

age                                 object
fnlwgt                             float64
education-num                      float64
capital-gain                       float64
capital-loss                       float64
                                    ...   
native-country_ Thailand             uint8
native-country_ Trinadad&Tobago      uint8
native-country_ United-States        uint8
native-country_ Vietnam              uint8
native-country_ Yugoslavia           uint8
Length: 104, dtype: object
int64


In [20]:
X1.shape

(45222, 104)

In [21]:
y1.shape

(45222,)

### Classification: Dataset 1

In [22]:
LRD1 = np.zeros((5, 3))
KNND1 = np.zeros((5, 3))
RDD1 = np.zeros((5, 3))
for trial in range(5):
    X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, train_size = 5000)
    metrics_acc_log, metrics_f1_log, metrics_auc_log = log_reg(X1_train, X1_test, y1_train, y1_test)
    metrics_acc_knn, metrics_f1_knn, metrics_auc_knn = knn(X1_train, X1_test, y1_train, y1_test)
    metrics_acc_rand, metrics_f1_rand, metrics_auc_rand = rand_for(X1_train, X1_test, y1_train, y1_test)
    LRD1[trial][0] = metrics_acc_log
    LRD1[trial][1] = metrics_f1_log
    LRD1[trial][2] = metrics_auc_log
    KNND1[trial][0] = metrics_acc_knn
    KNND1[trial][1] = metrics_f1_knn
    KNND1[trial][2] = metrics_auc_knn
    RDD1[trial][0] = metrics_acc_rand
    RDD1[trial][1] = metrics_f1_rand
    RDD1[trial][2] = metrics_auc_rand
LRD1 = pd.DataFrame(LRD1, columns=['accuracy', 'f1', 'roc_auc'])
KNND1 = pd.DataFrame(KNND1, columns=['accuracy', 'f1', 'roc_auc'])
RDD1 = pd.DataFrame(RDD1, columns=['accuracy', 'f1', 'roc_auc'])

AD1 = [[LRD1], [KNND1], [RDD1]]
AD1 = pd.DataFrame(AD1, columns = ['ADULT'], index = ['LR', 'KNN', 'RAND_FOR'])

Found Best Parameters!
Found Best Parameters!




Found Best Parameters!
Found Best Parameters!




Found Best Parameters!




Found Best Parameters!
Found Best Parameters!
Found Best Parameters!




Found Best Parameters!
Found Best Parameters!
Found Best Parameters!




Found Best Parameters!
Found Best Parameters!
Found Best Parameters!




Found Best Parameters!


In [23]:
LRD1

Unnamed: 0,accuracy,f1,roc_auc
0,0.558948,0.300859,0.559334
1,0.560564,0.301577,0.559408
2,0.560713,0.302861,0.559906
3,0.554324,0.427175,0.55407
4,0.561459,0.302724,0.560101


In [24]:
KNND1

Unnamed: 0,accuracy,f1,roc_auc
0,0.516011,0.361172,0.516265
1,0.520785,0.468026,0.520479
2,0.524961,0.406228,0.524526
3,0.52347,0.404215,0.523242
4,0.522227,0.452242,0.521764


In [25]:
RDD1

Unnamed: 0,accuracy,f1,roc_auc
0,0.637487,0.605393,0.636901
1,0.631172,0.606376,0.62999
2,0.63515,0.607681,0.635857
3,0.633161,0.599853,0.633389
4,0.633832,0.605095,0.63331


## Cover Type Classification Problem

### Import Data

In [26]:
cols2 = ['elevation', 'aspect', 'slope', 'horizontal_distance_to_hydrology', 'vertical_distance_to_hydrology',
       'horizontal_distance_to_roadways', 'hillshade_9am', 'hillshade_noon', 'hillshade_3pm', 
       'horizontal_distance_to_fire_points', 'wilderness_area1', 'wilderness_area2','wilderness_area3','wilderness_area4',
        'soil_type1','soil_type2','soil_type3','soil_type4','soil_type5','soil_type6','soil_type7','soil_type8',
        'soil_type9','soil_type10','soil_type11','soil_type12','soil_type13','soil_type14','soil_type15','soil_type16','soil_type17',
        'soil_type18', 'soil_type19', 'soil_type20', 'soil_type21', 'soil_type22', 'soil_type23', 'soil_type24', 'soil_type25', 'soil_type26',
        'soil_type27', 'soil_type28', 'soil_type29', 'soil_type30', 'soil_type31', 'soil_type32', 'soil_type33', 'soil_type34', 'soil_type35', 
        'soil_type36', 'soil_type37', 'soil_type38', 'soil_type39', 'soil_type40', 'cover_type']
data2 = pd.read_csv('./COVTYPE/covtype.data', header = None, names = cols2)

In [27]:
data2.head()

Unnamed: 0,elevation,aspect,slope,horizontal_distance_to_hydrology,vertical_distance_to_hydrology,horizontal_distance_to_roadways,hillshade_9am,hillshade_noon,hillshade_3pm,horizontal_distance_to_fire_points,...,soil_type32,soil_type33,soil_type34,soil_type35,soil_type36,soil_type37,soil_type38,soil_type39,soil_type40,cover_type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [28]:
data2.shape

(581012, 55)

### Clean and Preprocess Data

In [29]:
# Types in each column
print(data2.dtypes)

elevation                             int64
aspect                                int64
slope                                 int64
horizontal_distance_to_hydrology      int64
vertical_distance_to_hydrology        int64
horizontal_distance_to_roadways       int64
hillshade_9am                         int64
hillshade_noon                        int64
hillshade_3pm                         int64
horizontal_distance_to_fire_points    int64
wilderness_area1                      int64
wilderness_area2                      int64
wilderness_area3                      int64
wilderness_area4                      int64
soil_type1                            int64
soil_type2                            int64
soil_type3                            int64
soil_type4                            int64
soil_type5                            int64
soil_type6                            int64
soil_type7                            int64
soil_type8                            int64
soil_type9                      

In [30]:
# Check if there are any null variables
data2.isnull().values.any()

False

In [31]:
# Method in Caruana Paper: Largest class(7) is positive and everything else is negative
def transform_type(covtype):
    if covtype == 7:
        return 1
    else:
        return 0

In [32]:
data2['cover_type'] = data2['cover_type'].apply(transform_type)

In [33]:
data2.head()

Unnamed: 0,elevation,aspect,slope,horizontal_distance_to_hydrology,vertical_distance_to_hydrology,horizontal_distance_to_roadways,hillshade_9am,hillshade_noon,hillshade_3pm,horizontal_distance_to_fire_points,...,soil_type32,soil_type33,soil_type34,soil_type35,soil_type36,soil_type37,soil_type38,soil_type39,soil_type40,cover_type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,0
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,0
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,0
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,0
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Only picked a subset of dataset to match size in Caruana paper
X2 = data2.iloc[:30000, :-1]
y2 = data2.iloc[:30000, -1]

### Classification: Dataset 2

In [35]:
LRD2 = np.zeros((5, 3))
KNND2 = np.zeros((5, 3))
RDD2 = np.zeros((5, 3))
for trial in range(5):
    X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, train_size = 5000)
    metrics_acc_log, metrics_f1_log, metrics_auc_log = log_reg(X2_train, X2_test, y2_train, y2_test)
    metrics_acc_knn, metrics_f1_knn, metrics_auc_knn = knn(X2_train, X2_test, y2_train, y2_test)
    metrics_acc_rand, metrics_f1_rand, metrics_auc_rand = rand_for(X2_train, X2_test, y2_train, y2_test)
    LRD2[trial][0] = metrics_acc_log
    LRD2[trial][1] = metrics_f1_log
    LRD2[trial][2] = metrics_auc_log
    KNND2[trial][0] = metrics_acc_knn
    KNND2[trial][1] = metrics_f1_knn
    KNND2[trial][2] = metrics_auc_knn
    RDD2[trial][0] = metrics_acc_rand
    RDD2[trial][1] = metrics_f1_rand
    RDD2[trial][2] = metrics_auc_rand
LRD2 = pd.DataFrame(LRD2, columns=['accuracy', 'f1', 'roc_auc'])
KNND2 = pd.DataFrame(KNND2, columns=['accuracy', 'f1', 'roc_auc'])
RDD2 = pd.DataFrame(RDD2, columns=['accuracy', 'f1', 'roc_auc'])

AD2 = [[LRD2], [KNND2], [RDD2]]
AD2 = pd.DataFrame(AD2, columns = ['COVTYPE'], index = ['LR', 'KNN', 'RAND_FOR'])

Found Best Parameters!




Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!


In [36]:
LRD2

Unnamed: 0,accuracy,f1,roc_auc
0,0.9646,0.736529,0.834986
1,0.96924,0.781348,0.874848
2,0.96832,0.772152,0.868204
3,0.96912,0.77662,0.868069
4,0.96776,0.769714,0.86712


In [37]:
KNND2

Unnamed: 0,accuracy,f1,roc_auc
0,0.9388,0.373977,0.622427
1,0.94232,0.43273,0.648831
2,0.94196,0.389053,0.626604
3,0.941,0.406916,0.637284
4,0.9412,0.425781,0.646944


In [38]:
RDD2

Unnamed: 0,accuracy,f1,roc_auc
0,0.98632,0.907346,0.953789
1,0.98444,0.892916,0.94952
2,0.98648,0.908152,0.96367
3,0.98464,0.895368,0.954262
4,0.98464,0.894014,0.948066


## Letter Classification Problem

### Import Data

In [39]:
cols3 = ['lettr', 'x-box', 'y-box', 'width', 'high', 'onpix', 'x-bar', 
         'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy',
        'y-ege', 'yegvx']
data3 = pd.read_csv('./LETTER/letter-recognition.data', header = None, names = cols3)

In [40]:
data3.head()

Unnamed: 0,lettr,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


### Clean and Preprocess Data

In [41]:
# Types in each column
print(data3.dtypes)

# Check if there are any null variables
data3.isnull().values.any()

lettr    object
x-box     int64
y-box     int64
width     int64
high      int64
onpix     int64
x-bar     int64
y-bar     int64
x2bar     int64
y2bar     int64
xybar     int64
x2ybr     int64
xy2br     int64
x-ege     int64
xegvy     int64
y-ege     int64
yegvx     int64
dtype: object


False

In [42]:
# Method in Caruana Paper: A-M is positive and everything else is negative
def transform_letter(letter):
    positive = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M']
    if letter in positive:
        return 1
    else:
        return 0

In [43]:
data3['lettr'] = data3['lettr'].apply(transform_letter)

In [44]:
data3.head()

Unnamed: 0,lettr,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,0,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,1,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,1,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,0,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,1,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [45]:
X3 = data3.iloc[:, 1:]
y3 = data3.iloc[:, 0]

In [46]:
X3.head()

Unnamed: 0,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [47]:
X3.shape

(20000, 16)

In [48]:
y3.head()

0    0
1    1
2    1
3    0
4    1
Name: lettr, dtype: int64

In [49]:
y3.shape

(20000,)

In [50]:
X3 = X3.dropna()

In [51]:
X3.shape

(20000, 16)

### Classification: Dataset 3

In [52]:
LRD3 = np.zeros((5, 3))
KNND3 = np.zeros((5, 3))
RDD3 = np.zeros((5, 3))
for trial in range(5):
    X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, train_size = 5000)
    metrics_acc_log, metrics_f1_log, metrics_auc_log = log_reg(X3_train, X3_test, y3_train, y3_test)
    metrics_acc_knn, metrics_f1_knn, metrics_auc_knn = knn(X3_train, X3_test, y3_train, y3_test)
    metrics_acc_rand, metrics_f1_rand, metrics_auc_rand = rand_for(X3_train, X3_test, y3_train, y3_test)
    LRD3[trial][0] = metrics_acc_log
    LRD3[trial][1] = metrics_f1_log
    LRD3[trial][2] = metrics_auc_log
    KNND3[trial][0] = metrics_acc_knn
    KNND3[trial][1] = metrics_f1_knn
    KNND3[trial][2] = metrics_auc_knn
    RDD3[trial][0] = metrics_acc_rand
    RDD3[trial][1] = metrics_f1_rand
    RDD3[trial][2] = metrics_auc_rand
LRD3 = pd.DataFrame(LRD3, columns=['accuracy', 'f1', 'roc_auc'])
KNND3 = pd.DataFrame(KNND3, columns=['accuracy', 'f1', 'roc_auc'])
RDD3 = pd.DataFrame(RDD3, columns=['accuracy', 'f1', 'roc_auc'])

AD3 = [[LRD3], [KNND3], [RDD3]]
AD3 = pd.DataFrame(AD3, columns = ['LETTER'], index = ['LR', 'KNN', 'RAND_FOR'])

Found Best Parameters!
Found Best Parameters!


 0.91894725        nan]
 0.98274894        nan]


Found Best Parameters!
Found Best Parameters!
Found Best Parameters!


 0.91774069        nan]
 0.98221043        nan]


Found Best Parameters!
Found Best Parameters!
Found Best Parameters!


 0.93462462        nan]
 0.98643778        nan]


Found Best Parameters!
Found Best Parameters!
Found Best Parameters!


 0.92449575        nan]
 0.98439961        nan]


Found Best Parameters!
Found Best Parameters!
Found Best Parameters!


 0.92356739        nan]
 0.98584008        nan]


Found Best Parameters!


In [53]:
LRD3

Unnamed: 0,accuracy,f1,roc_auc
0,0.702067,0.671325,0.699173
1,0.6938,0.66364,0.691088
2,0.698267,0.672266,0.696408
3,0.6916,0.658699,0.688224
4,0.699267,0.668236,0.696144


In [54]:
KNND3

Unnamed: 0,accuracy,f1,roc_auc
0,0.809733,0.788561,0.80706
1,0.799467,0.773494,0.795408
2,0.814667,0.795888,0.812543
3,0.798267,0.769008,0.793283
4,0.806667,0.782869,0.80294


In [55]:
RDD3

Unnamed: 0,accuracy,f1,roc_auc
0,0.931933,0.925057,0.930352
1,0.933,0.926951,0.932429
2,0.948267,0.942512,0.946897
3,0.940267,0.932671,0.937482
4,0.938933,0.933871,0.936909


## MUSH Classification Problem

### Import Data

In [56]:
cols4 = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 
         'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 
        'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 
        'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 
        'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 
        'habitat']
new_cols = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 
         'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 
            'stalk-surface-above-ring', 'stalk-surface-below-ring', 
        'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 
        'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 
        'habitat']
data4 = pd.read_csv('./MUSH/mushroom.data', header = None, names = cols4)

In [57]:
data4.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


### Clean and Preprocess Data

In [58]:
# Find null value in dataframe
data4['stalk-root'].unique()

array(['e', 'c', 'b', 'r', '?'], dtype=object)

In [59]:
# See how many rows with nulls are in dataset
null_rows = 0
for i in range(len(data4)):
    for col in cols4:
        if data4[col][i] == '?':
            null_rows += 1
            break

In [60]:
null_rows

2480

There are a significant number of nulls in the dataset, so instead of dropping the instances, we can note that the nulls are all in column 11, so we can drop a feature, since we have a large amount of features.

In [61]:
data4 = data4.drop(['stalk-root'], axis=1)

In [62]:
# Check nulls again
null_rows = 0
for i in range(len(data4)):
    for col in new_cols:
        if data4[col][i] == '?':
            null_rows += 1
            break

In [63]:
null_rows

0

In [64]:
data4.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
       'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
       'population', 'habitat'],
      dtype='object')

In [65]:
data4 = pd.get_dummies(data4, columns=['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
       'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
       'population', 'habitat'])

In [66]:
data4.shape

(8124, 113)

In [67]:
def transform_class(mushroom):
    if mushroom == 'e':
        return 1
    else:
        return 0

In [68]:
data4['class'] = data4['class'].apply(transform_class)

In [69]:
data4.head()

Unnamed: 0,class,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [70]:
X4 = data4.iloc[:, 1:]
y4 = data4.iloc[:, 0]

### Classification: Dataset 4

In [71]:
LRD4 = np.zeros((5, 3))
KNND4 = np.zeros((5, 3))
RDD4 = np.zeros((5, 3))
for trial in range(5):
    X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, train_size = 5000)
    metrics_acc_log, metrics_f1_log, metrics_auc_log = log_reg(X4_train, X4_test, y4_train, y4_test)
    metrics_acc_knn, metrics_f1_knn, metrics_auc_knn = knn(X4_train, X4_test, y4_train, y4_test)
    metrics_acc_rand, metrics_f1_rand, metrics_auc_rand = rand_for(X4_train, X4_test, y4_train, y4_test)
    LRD4[trial][0] = metrics_acc_log
    LRD4[trial][1] = metrics_f1_log
    LRD4[trial][2] = metrics_auc_log
    KNND4[trial][0] = metrics_acc_knn
    KNND4[trial][1] = metrics_f1_knn
    KNND4[trial][2] = metrics_auc_knn
    RDD4[trial][0] = metrics_acc_rand
    RDD4[trial][1] = metrics_f1_rand
    RDD4[trial][2] = metrics_auc_rand
LRD4 = pd.DataFrame(LRD4, columns=['accuracy', 'f1', 'roc_auc'])
KNND4 = pd.DataFrame(KNND4, columns=['accuracy', 'f1', 'roc_auc'])
RDD4 = pd.DataFrame(RDD4, columns=['accuracy', 'f1', 'roc_auc'])

AD4 = [[LRD4], [KNND4], [RDD4]]
AD4 = pd.DataFrame(AD4, columns = ['MUSH'], index = ['LR', 'KNN', 'RAND_FOR'])



Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!
Found Best Parameters!


In [72]:
LRD4

Unnamed: 0,accuracy,f1,roc_auc
0,1.0,1.0,1.0
1,1.0,1.0,1.0
2,1.0,1.0,1.0
3,1.0,1.0,1.0
4,1.0,1.0,1.0


In [73]:
KNND4

Unnamed: 0,accuracy,f1,roc_auc
0,0.987516,0.987824,0.987445
1,0.986556,0.986692,0.986535
2,0.987836,0.988228,0.987889
3,0.980474,0.981074,0.980649
4,0.987516,0.987591,0.987507


In [74]:
RDD4

Unnamed: 0,accuracy,f1,roc_auc
0,1.0,1.0,1.0
1,1.0,1.0,1.0
2,1.0,1.0,1.0
3,1.0,1.0,1.0
4,1.0,1.0,1.0


## Results

In [75]:
results = [AD1, AD2, AD3, AD4]

In [82]:
results

[                                                      ADULT
 LR           accuracy        f1   roc_auc
 0  0.558948  0...
 KNN          accuracy        f1   roc_auc
 0  0.516011  0...
 RAND_FOR     accuracy        f1   roc_auc
 0  0.637487  0...,
                                                     COVTYPE
 LR           accuracy        f1   roc_auc
 0   0.96460  0...
 KNN          accuracy        f1   roc_auc
 0   0.93880  0...
 RAND_FOR     accuracy        f1   roc_auc
 0   0.98632  0...,
                                                      LETTER
 LR           accuracy        f1   roc_auc
 0  0.702067  0...
 KNN          accuracy        f1   roc_auc
 0  0.809733  0...
 RAND_FOR     accuracy        f1   roc_auc
 0  0.931933  0...,
                                                        MUSH
 LR           accuracy   f1  roc_auc
 0       1.0  1.0    ...
 KNN          accuracy        f1   roc_auc
 0  0.987516  0...
 RAND_FOR     accuracy   f1  roc_auc
 0       1.0  1.0    ...]

In [97]:
# An example
results[1]['COVTYPE']['KNN']

Unnamed: 0,accuracy,f1,roc_auc
0,0.9388,0.373977,0.622427
1,0.94232,0.43273,0.648831
2,0.94196,0.389053,0.626604
3,0.941,0.406916,0.637284
4,0.9412,0.425781,0.646944


In [110]:
algorithms = ['LR', 'KNN', 'RAND_FOR']
datasets = ['ADULT', 'COVTYPE', 'LETTER', 'MUSH']
metrics = ['accuracy', 'f1', 'roc_auc']

In [133]:
# Get metrics each algo/data combo averaged across 5 trials (algorithms in rows, data sets in columns)
results1 = np.zeros((3, 4))
for col, data in enumerate(datasets):
    for row, algo in enumerate(algorithms):
        sums = [sum_ for sum_ in results[col][data][algo].sum()]
        avg = sum(sums)/15
        results1[row][col] = avg

In [134]:
results1

array([[0.48160142, 0.86590865, 0.68601344, 1.        ],
       [0.48704081, 0.66105509, 0.79665704, 0.98608878],
       [0.62430979, 0.94624152, 0.93583534, 1.        ]])

In [135]:
results1 = pd.DataFrame(results1, columns = datasets, index = algorithms)

In [137]:
results1

Unnamed: 0,ADULT,COVTYPE,LETTER,MUSH
LR,0.481601,0.865909,0.686013,1.0
KNN,0.487041,0.661055,0.796657,0.986089
RAND_FOR,0.62431,0.946242,0.935835,1.0


In [158]:
# Get accuracy, FSC, and AUC averaged across all data sets (algorithms in rows, metrics in columns)
acc = 0
f1 = 0
roc = 0
results2 = np.zeros((3, 3))

for row, algo in enumerate(algorithms):
    for i, data in enumerate(datasets):
        sums = [sum_ for sum_ in results[i][data][algo].sum()]
        acc += sums[0]
        f1 += sums[1]
        roc += sums[2]
    results2[row][0] = acc/20
    results2[row][1] = f1/20
    results2[row][2] = roc/20
    acc = 0
    f1 = 0
    roc = 0

In [159]:
results2

array([[0.80600236, 0.69028612, 0.77885415],
       [0.81357156, 0.64807847, 0.73648125],
       [0.8894861 , 0.85916285, 0.88114104]])

In [160]:
results2 = pd.DataFrame(results2, columns = metrics, index = algorithms)

In [161]:
results2

Unnamed: 0,accuracy,f1,roc_auc
LR,0.806002,0.690286,0.778854
KNN,0.813572,0.648078,0.736481
RAND_FOR,0.889486,0.859163,0.881141


In [170]:
#results2.sum(axis=1)
sum_metrics = [sum_/3 for sum_ in results2.sum(axis=1)]
results2['avg'] = sum_metrics

In [171]:
results2

Unnamed: 0,accuracy,f1,roc_auc,avg
LR,0.806002,0.690286,0.778854,0.758381
KNN,0.813572,0.648078,0.736481,0.73271
RAND_FOR,0.889486,0.859163,0.881141,0.876597


In [None]:
results1.to_csv('results_over_datasets'index=False)