# Caruana Replication

In [26]:
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import math
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

## Define Algorithms

### Algorithm 1: Logistic Regression

In [2]:
def log_reg(X_train, X_test, y_train, y_test):
    # Logistic Regression binary classification
    C_list = [1e-4, 1e-3, 1e-2, 1e-1,1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8]
    scoring = ['accuracy', 'f1', 'roc_auc']
    param_grid = [{'classifier__C': C_list, 'classifier__penalty': ['l2']}, 
                  {'classifier__C': [None], 'classifier__penalty': ['none']}]
    pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', LogisticRegression())])
    clf = GridSearchCV(pipe, param_grid, return_train_score = True, 
                       n_jobs = -1, scoring = scoring, refit = False, cv=StratifiedKFold(n_splits=5))
    clf.fit(X_train, y_train)
    
    results = clf.cv_results_
    
    # Find best parameters and optimal for each performance metric
    best_C_acc = clf.cv_results_['params'][np.argmin(results['rank_test_accuracy'])]['classifier__C']
    best_penalty_acc = clf.cv_results_['params'][np.argmin(results['rank_test_accuracy'])]['classifier__penalty']
    
    best_C_f1 = clf.cv_results_['params'][np.argmin(results['rank_test_f1'])]['classifier__C']
    best_penalty_f1 = clf.cv_results_['params'][np.argmin(results['rank_test_f1'])]['classifier__penalty']

    best_C_roc = clf.cv_results_['params'][np.argmin(results['rank_test_roc_auc'])]['classifier__C']
    best_penalty_roc = clf.cv_results_['params'][np.argmin(results['rank_test_roc_auc'])]['classifier__penalty']

    print("Found Best Parameters!")
    
    # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
    best_model_acc = LogisticRegression(C=best_C_acc, n_jobs = -1, penalty=best_penalty_acc)
    best_model_f1 = LogisticRegression(C=best_C_f1, n_jobs = -1, penalty = best_penalty_f1)
    best_model_roc = LogisticRegression(C=best_C_roc, n_jobs = -1, penalty = best_penalty_roc)
    best_model_acc.fit(X_train, y_train)
    best_model_f1.fit(X_train, y_train)
    best_model_roc.fit(X_train, y_train)
    
    # Find and store accuracy, FSC, and AUC of the 3 models from previous line on test set
    y_pred1 = best_model_acc.predict(X_test)
    y_pred2 = best_model_f1.predict(X_test)
    y_pred3 = best_model_roc.predict(X_test)
    
    metrics_acc = accuracy_score(y_test, y_pred1)
    metrics_f1 = f1_score(y_test, y_pred2)
    metrics_auc = roc_auc_score(y_test, y_pred3)
    
    return metrics_acc, metrics_f1, metrics_auc, results

### Algorithm 2: KNN

In [3]:
def knn(X_train, X_test, y_train, y_test):
    # KNN binary classification
    k_list = [i for i in range(1,106,4)]
    weights = ['uniform', 'distance']
    param_grid = [{'classifier__n_neighbors': k_list, 'classifier__weights': weights}]
    scoring = ['accuracy', 'f1', 'roc_auc']
    pipe = Pipeline([('classifier', KNeighborsClassifier())])
    clf = GridSearchCV(pipe, param_grid, return_train_score = True, 
                       n_jobs = -1, scoring = scoring, refit = False, cv=StratifiedKFold(n_splits=5))
    clf.fit(X_train, y_train)
    
    results = clf.cv_results_
    
    # Find best parameters for each performance metric
    best_k_acc = results['params'][np.argmin(results['rank_test_accuracy'])]['classifier__n_neighbors']
    best_weight_acc = results['params'][np.argmin(results['rank_test_accuracy'])]['classifier__weights']
    
    best_k_f1 = results['params'][np.argmin(results['rank_test_f1'])]['classifier__n_neighbors']
    best_weight_f1 = results['params'][np.argmin(results['rank_test_f1'])]['classifier__weights']
    
    best_k_roc = results['params'][np.argmin(results['rank_test_roc_auc'])]['classifier__n_neighbors']
    best_weight_roc = results['params'][np.argmin(results['rank_test_roc_auc'])]['classifier__weights']
    
    print("Found Best Parameters!") 
    
    # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
    best_model_acc = KNeighborsClassifier(n_neighbors = best_k_acc, weights = best_weight_acc)
    best_model_f1 = KNeighborsClassifier(n_neighbors = best_k_f1, weights = best_weight_f1)
    best_model_roc = KNeighborsClassifier(n_neighbors = best_k_roc, weights = best_weight_roc)
    best_model_acc.fit(X_train, y_train)
    best_model_f1.fit(X_train, y_train)
    best_model_roc.fit(X_train, y_train)
    
    # Find and store accuracy, FSC, and AUC of the 3 models from previous line on test set
    y_pred1 = best_model_acc.predict(X_test)
    y_pred2 = best_model_f1.predict(X_test)
    y_pred3 = best_model_roc.predict(X_test)
    
    metrics_acc = accuracy_score(y_test, y_pred1)
    metrics_f1 = f1_score(y_test, y_pred2)
    metrics_auc = roc_auc_score(y_test, y_pred3)
    return metrics_acc, metrics_f1, metrics_auc, results

### Algorithm 3: Random Forest

In [4]:
def rand_for(X_train, X_test, y_train, y_test):
    # Random Forest binary classification
    feat_list = [1, 2, 4, 6, 8, 12, 16, 20]
    n_trees = [1024]
    scoring = ['accuracy', 'f1', 'roc_auc']
    param_grid = {'classifier__n_estimators': n_trees, 'classifier__max_features': feat_list}
    pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', RandomForestClassifier())])
    clf = GridSearchCV(pipe, param_grid, return_train_score = True, 
                       n_jobs = -1, scoring = scoring, refit = False, cv=StratifiedKFold(n_splits=5))
    clf.fit(X_train, y_train)
    
    results = clf.cv_results_
    
    # Find best parameters for each performance metric
    best_est_acc = results['params'][np.argmin(results['rank_test_accuracy'])]['classifier__n_estimators']
    best_feat_acc = results['params'][np.argmin(results['rank_test_accuracy'])]['classifier__max_features']
    
    best_est_f1 = results['params'][np.argmin(results['rank_test_f1'])]['classifier__n_estimators']
    best_feat_f1 = results['params'][np.argmin(results['rank_test_f1'])]['classifier__max_features']
    
    best_est_roc = results['params'][np.argmin(results['rank_test_roc_auc'])]['classifier__n_estimators']
    best_feat_roc = results['params'][np.argmin(results['rank_test_roc_auc'])]['classifier__max_features']
            
    print("Found Best Parameters!")
    
    # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
    best_model_acc = RandomForestClassifier(n_estimators=best_est_acc, n_jobs = -1, max_features = best_feat_acc)
    best_model_f1 = RandomForestClassifier(n_estimators=best_est_f1, n_jobs = -1, max_features = best_feat_f1)
    best_model_auc = RandomForestClassifier(n_estimators=best_est_roc, n_jobs = -1, max_features = best_feat_roc)
    best_model_acc.fit(X_train, y_train)
    best_model_f1.fit(X_train, y_train)
    best_model_auc.fit(X_train, y_train)
    
    # Find and store accuracy, FSC, and AUC of the 3 models from previous line on test set
    y_pred1 = best_model_acc.predict(X_test)
    y_pred2 = best_model_f1.predict(X_test)
    y_pred3 = best_model_auc.predict(X_test)
    
    metrics_acc = accuracy_score(y_test, y_pred1)
    metrics_f1 = f1_score(y_test, y_pred2)
    metrics_auc = roc_auc_score(y_test, y_pred3)
    return metrics_acc, metrics_f1, metrics_auc, results

In [5]:
def draw_heatmap(training_accuracy, C_list, label):
    # training_accuracy: A NumPy array with the shape (len(C_list))
    # C_list: List of C(s).
    plt.figure(figsize = (2,4))
    ax = sns.heatmap(training_accuracy, annot=True, fmt='.3f', 
                     xticklabels=[], yticklabels=C_list)
    ax.collections[0].colorbar.set_label("accuracy")
    ax.set(ylabel='$C$')
    sns.set_style("whitegrid", {'axes.grid' : False})
    plt.title(label + 'accuracy w.r.t $C$')
    plt.show()

## Adult Classification Problem

### Import Data

In [6]:
cols1 = ['age', 'workclass', 'fnlwgt','education', 'education-num', 'marital-status', 
        'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
        'hours-per-week', 'native-country', 'class']
features = ['age', 'workclass', 'fnlwgt','education', 'education-num', 'marital-status', 
        'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
        'hours-per-week', 'native-country']
data1_1 = pd.read_csv('./ADULT/adult.data', header = None, names = cols1)
data1_2 = pd.read_csv('./ADULT/adult.test', header = None, names = cols1)
frames = [data1_1, data1_2]
data1 = pd.concat(frames)
data1 = data1.reset_index()

In [7]:
data1.shape

(48843, 16)

### Clean and Preprocess Data

In [8]:
# Types in each column
print(data1.dtypes)

# Drop any nulls, shown as question marks
for i in range(len(data1)):
    for col in cols1:
        if data1[col][i] == ' ?':
            data1.drop(i, inplace = True)
            break

index               int64
age                object
workclass          object
fnlwgt            float64
education          object
education-num     float64
marital-status     object
occupation         object
relationship       object
race               object
sex                object
capital-gain      float64
capital-loss      float64
hours-per-week    float64
native-country     object
class              object
dtype: object


In [9]:
data1.shape

(45223, 16)

In [10]:
# Preprocess last column
def transform_income(income):
    if income == ' <=50K':
        return 0
    else:
        return 1

In [11]:
data1['class'] = data1['class'].apply(transform_income)

In [12]:
data1.head(10)

Unnamed: 0,index,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,0
1,1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,0
2,2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,0
3,3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,0
4,4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,0
5,5,37,Private,284582.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,0
6,6,49,Private,160187.0,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,16.0,Jamaica,0
7,7,52,Self-emp-not-inc,209642.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,45.0,United-States,1
8,8,31,Private,45781.0,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084.0,0.0,50.0,United-States,1
9,9,42,Private,159449.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178.0,0.0,40.0,United-States,1


In [13]:
data1 = data1.dropna()

In [14]:
data1.shape

(45222, 16)

In [15]:
X1 = data1[features]
y1 = data1['class']

In [16]:
X1.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba


In [17]:
features

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country']

In [18]:
X1 = pd.get_dummies(X1, columns=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race',
 'sex','native-country'])

In [19]:
# Types in each column
print(X1.dtypes)
print(y1.dtypes)

age                                 object
fnlwgt                             float64
education-num                      float64
capital-gain                       float64
capital-loss                       float64
                                    ...   
native-country_ Thailand             uint8
native-country_ Trinadad&Tobago      uint8
native-country_ United-States        uint8
native-country_ Vietnam              uint8
native-country_ Yugoslavia           uint8
Length: 104, dtype: object
int64


In [20]:
X1.shape

(45222, 104)

In [21]:
y1.shape

(45222,)

### Classification: Dataset 1

In [None]:
%%time
LRD1 = np.zeros((5, 3))
KNND1 = np.zeros((5, 3))
RDF1 = np.zeros((5, 3))
for trial in range(5):
    X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, train_size = 5000)
    metrics_acc_log, metrics_f1_log, metrics_auc_log, results_log_d1 = log_reg(X1_train, X1_test, y1_train, y1_test)
    metrics_acc_knn, metrics_f1_knn, metrics_auc_knn, results_knn_d1 = knn(X1_train, X1_test, y1_train, y1_test)
    metrics_acc_rf, metrics_f1_rf, metrics_auc_rf, results_rf_d1 = rand_for(X1_train, X1_test, y1_train, y1_test)
    LRD1[trial][0] = metrics_acc_log
    LRD1[trial][1] = metrics_f1_log
    LRD1[trial][2] = metrics_auc_log
    KNND1[trial][0] = metrics_acc_knn
    KNND1[trial][1] = metrics_f1_knn
    KNND1[trial][2] = metrics_auc_knn
    RDF1[trial][0] = metrics_acc_rf
    RDF1[trial][1] = metrics_f1_rf
    RDF1[trial][2] = metrics_auc_rf
LRD1 = pd.DataFrame(LRD1, columns=['accuracy', 'f1', 'roc_auc'])
KNND1 = pd.DataFrame(KNND1, columns=['accuracy', 'f1', 'roc_auc'])
RDF1 = pd.DataFrame(RDF1, columns=['accuracy', 'f1', 'roc_auc'])

AD1 = [[LRD1], [KNND1], [RDF1]]
AD1 = pd.DataFrame(AD1, columns = ['ADULT'], index = ['LR', 'KNN', 'RAND_FOR'])

 0.6486 0.6486 0.6486    nan]
 0.66605 0.66605 0.66605 0.66605     nan]
 0.64634399 0.64634399 0.64634399 0.64634399 0.64634399 0.64634399
 0.64634399        nan]
 0.66172058 0.66172058 0.66168604 0.66168604 0.66168604 0.66168604
 0.66168604        nan]
 0.69665024 0.69663264 0.69664624 0.69664224 0.69664224 0.69664304
 0.69664304        nan]
 0.72351942 0.72351732 0.72351882 0.72351882 0.72351897 0.72351942
 0.72351947        nan]


Found Best Parameters!
Found Best Parameters!


In [None]:
LRD1

In [None]:
KNND1

In [None]:
RDD1

## Cover Type Classification Problem

### Import Data

In [None]:
cols2 = ['elevation', 'aspect', 'slope', 'horizontal_distance_to_hydrology', 'vertical_distance_to_hydrology',
       'horizontal_distance_to_roadways', 'hillshade_9am', 'hillshade_noon', 'hillshade_3pm', 
       'horizontal_distance_to_fire_points', 'wilderness_area1', 'wilderness_area2','wilderness_area3','wilderness_area4',
        'soil_type1','soil_type2','soil_type3','soil_type4','soil_type5','soil_type6','soil_type7','soil_type8',
        'soil_type9','soil_type10','soil_type11','soil_type12','soil_type13','soil_type14','soil_type15','soil_type16','soil_type17',
        'soil_type18', 'soil_type19', 'soil_type20', 'soil_type21', 'soil_type22', 'soil_type23', 'soil_type24', 'soil_type25', 'soil_type26',
        'soil_type27', 'soil_type28', 'soil_type29', 'soil_type30', 'soil_type31', 'soil_type32', 'soil_type33', 'soil_type34', 'soil_type35', 
        'soil_type36', 'soil_type37', 'soil_type38', 'soil_type39', 'soil_type40', 'cover_type']
data2 = pd.read_csv('./COVTYPE/covtype.data', header = None, names = cols2)

In [None]:
data2.head()

In [None]:
data2.shape

### Clean and Preprocess Data

In [None]:
# Types in each column
print(data2.dtypes)

In [None]:
# Check if there are any null variables
data2.isnull().values.any()

In [None]:
# Method in Caruana Paper: Largest class(7) is positive and everything else is negative
def transform_type(covtype):
    if covtype == 7:
        return 1
    else:
        return 0

In [None]:
data2['cover_type'] = data2['cover_type'].apply(transform_type)

In [None]:
data2.head()

In [None]:
# Only picked a subset of dataset to match size in Caruana paper
X2 = data2.iloc[:30000, :-1]
y2 = data2.iloc[:30000, -1]

### Classification: Dataset 2

In [None]:
%%time
LRD2 = np.zeros((5, 3))
KNND2 = np.zeros((5, 3))
RDF2 = np.zeros((5, 3))
for trial in range(5):
    X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, train_size = 5000)
    metrics_acc_log, metrics_f1_log, metrics_auc_log, results_log_d2 = log_reg(X2_train, X2_test, y2_train, y2_test)
    metrics_acc_knn, metrics_f1_knn, metrics_auc_knn, results_knn_d2 = knn(X2_train, X2_test, y2_train, y2_test)
    metrics_acc_rf, metrics_f1_rf, metrics_auc_rf, results_rf_d2 = rand_for(X2_train, X2_test, y2_train, y2_test)
    LRD2[trial][0] = metrics_acc_log
    LRD2[trial][1] = metrics_f1_log
    LRD2[trial][2] = metrics_auc_log
    KNND2[trial][0] = metrics_acc_knn
    KNND2[trial][1] = metrics_f1_knn
    KNND2[trial][2] = metrics_auc_knn
    RDF2[trial][0] = metrics_acc_rf
    RDF2[trial][1] = metrics_f1_rf
    RDF2[trial][2] = metrics_auc_rf
LRD2 = pd.DataFrame(LRD2, columns=['accuracy', 'f1', 'roc_auc'])
KNND2 = pd.DataFrame(KNND2, columns=['accuracy', 'f1', 'roc_auc'])
RDF2 = pd.DataFrame(RDF2, columns=['accuracy', 'f1', 'roc_auc'])

AD2 = [[LRD2], [KNND2], [RDF2]]
AD2 = pd.DataFrame(AD2, columns = ['COVTYPE'], index = ['LR', 'KNN', 'RAND_FOR'])

In [None]:
LRD2

In [None]:
KNND2

In [None]:
RDD2

## Letter Classification Problem

### Import Data

In [None]:
cols3 = ['lettr', 'x-box', 'y-box', 'width', 'high', 'onpix', 'x-bar', 
         'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy',
        'y-ege', 'yegvx']
data3 = pd.read_csv('./LETTER/letter-recognition.data', header = None, names = cols3)

In [None]:
data3.head()

### Clean and Preprocess Data

In [None]:
# Types in each column
print(data3.dtypes)

# Check if there are any null variables
data3.isnull().values.any()

In [None]:
# Method in Caruana Paper: A-M is positive and everything else is negative
def transform_letter(letter):
    positive = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M']
    if letter in positive:
        return 1
    else:
        return 0

In [None]:
data3['lettr'] = data3['lettr'].apply(transform_letter)

In [None]:
data3.head()

In [None]:
X3 = data3.iloc[:, 1:]
y3 = data3.iloc[:, 0]

In [None]:
X3.head()

In [None]:
X3.shape

In [None]:
y3.head()

In [None]:
y3.shape

In [None]:
X3 = X3.dropna()

In [None]:
X3.shape

### Classification: Dataset 3

In [None]:
%%time
LRD3 = np.zeros((5, 3))
KNND3 = np.zeros((5, 3))
RDF3 = np.zeros((5, 3))
for trial in range(5):
    X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, train_size = 5000)
    metrics_acc_log, metrics_f1_log, metrics_auc_log, results_log_d3 = log_reg(X3_train, X3_test, y3_train, y3_test)
    metrics_acc_knn, metrics_f1_knn, metrics_auc_knn, results_knn_d3 = knn(X3_train, X3_test, y3_train, y3_test)
    metrics_acc_rf, metrics_f1_rf, metrics_auc_rf, results_rf_d3 = rand_for(X3_train, X3_test, y3_train, y3_test)
    LRD3[trial][0] = metrics_acc_log
    LRD3[trial][1] = metrics_f1_log
    LRD3[trial][2] = metrics_auc_log
    KNND3[trial][0] = metrics_acc_knn
    KNND3[trial][1] = metrics_f1_knn
    KNND3[trial][2] = metrics_auc_knn
    RDF3[trial][0] = metrics_acc_rf
    RDF3[trial][1] = metrics_f1_rf
    RDF3[trial][2] = metrics_auc_rf
LRD3 = pd.DataFrame(LRD3, columns=['accuracy', 'f1', 'roc_auc'])
KNND3 = pd.DataFrame(KNND3, columns=['accuracy', 'f1', 'roc_auc'])
RDF3 = pd.DataFrame(RDF3, columns=['accuracy', 'f1', 'roc_auc'])

AD3 = [[LRD3], [KNND3], [RDF3]]
AD3 = pd.DataFrame(AD3, columns = ['LETTER'], index = ['LR', 'KNN', 'RAND_FOR'])

In [None]:
LRD3

In [None]:
KNND3

In [None]:
RDD3

## MUSH Classification Problem

### Import Data

In [None]:
cols4 = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 
         'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 
        'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 
        'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 
        'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 
        'habitat']
new_cols = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 
         'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 
            'stalk-surface-above-ring', 'stalk-surface-below-ring', 
        'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 
        'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 
        'habitat']
data4 = pd.read_csv('./MUSH/mushroom.data', header = None, names = cols4)

In [None]:
data4.head()

### Clean and Preprocess Data

In [None]:
# Find null value in dataframe
data4['stalk-root'].unique()

In [None]:
# See how many rows with nulls are in dataset
null_rows = 0
for i in range(len(data4)):
    for col in cols4:
        if data4[col][i] == '?':
            null_rows += 1
            break

In [None]:
null_rows

There are a significant number of nulls in the dataset, so instead of dropping the instances, we can note that the nulls are all in column 11, so we can drop a feature, since we have a large amount of features.

In [None]:
data4 = data4.drop(['stalk-root'], axis=1)

In [None]:
# Check nulls again
null_rows = 0
for i in range(len(data4)):
    for col in new_cols:
        if data4[col][i] == '?':
            null_rows += 1
            break

In [None]:
null_rows

In [None]:
data4.columns

In [None]:
data4 = pd.get_dummies(data4, columns=['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
       'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
       'population', 'habitat'])

In [None]:
data4.shape

In [None]:
def transform_class(mushroom):
    if mushroom == 'e':
        return 1
    else:
        return 0

In [None]:
data4['class'] = data4['class'].apply(transform_class)

In [None]:
data4.head()

In [None]:
X4 = data4.iloc[:, 1:]
y4 = data4.iloc[:, 0]

### Classification: Dataset 4

In [None]:
%%time
LRD4 = np.zeros((5, 3))
KNND4 = np.zeros((5, 3))
RDF4 = np.zeros((5, 3))
for trial in range(5):
    X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, train_size = 5000)
    metrics_acc_log, metrics_f1_log, metrics_auc_log, results_log_d4 = log_reg(X4_train, X4_test, y4_train, y4_test)
    metrics_acc_knn, metrics_f1_knn, metrics_auc_knn, results_knn_d4 = knn(X4_train, X4_test, y4_train, y4_test)
    metrics_acc_rf, metrics_f1_rf, metrics_auc_rf, results_rf_d4 = rand_for(X4_train, X4_test, y4_train, y4_test)
    LRD4[trial][0] = metrics_acc_log
    LRD4[trial][1] = metrics_f1_log
    LRD4[trial][2] = metrics_auc_log
    KNND4[trial][0] = metrics_acc_knn
    KNND4[trial][1] = metrics_f1_knn
    KNND4[trial][2] = metrics_auc_knn
    RDF4[trial][0] = metrics_acc_rf
    RDF4[trial][1] = metrics_f1_rf
    RDF4[trial][2] = metrics_auc_rf
LRD4 = pd.DataFrame(LRD4, columns=['accuracy', 'f1', 'roc_auc'])
KNND4 = pd.DataFrame(KNND4, columns=['accuracy', 'f1', 'roc_auc'])
RDF4 = pd.DataFrame(RDF4, columns=['accuracy', 'f1', 'roc_auc'])

AD4 = [[LRD4], [KNND4], [RDF4]]
AD4 = pd.DataFrame(AD4, columns = ['MUSH'], index = ['LR', 'KNN', 'RAND_FOR'])

In [None]:
LRD4

In [None]:
KNND4

In [None]:
RDD4

## Collect Necessary Tables

In [None]:
results_log_d1 = pd.DataFrame(results_log_d1)
results_knn_d1 = pd.DataFrame(results_knn_d1)
results_rf_d1 = pd.DataFrame(results_rf_d1)
results_log_d2 = pd.DataFrame(results_log_d2)
results_knn_d2 = pd.DataFrame(results_knn_d2)
results_rf_d2 = pd.DataFrame(results_rf_d2)
results_log_d3 = pd.DataFrame(results_log_d3)
results_knn_d3 = pd.DataFrame(results_knn_d3)
results_rf_d3 = pd.DataFrame(results_rf_d3)
results_log_d4 = pd.DataFrame(results_log_d4)
results_knn_d4 = pd.DataFrame(results_knn_d4)
results_rf_d4 = pd.DataFrame(results_rf_d4)

In [None]:
results_log_d1.to_csv('results/results_log_d1.csv'index=False)
results_knn_d1.to_csv('results/results_knn_d1.csv'index=False)
results_rf_d1.to_csv('results/results_rf_d1.csv'index=False)
results_log_d2.to_csv('results/result_log_d2.csv'index=False)
results_knn_d2.to_csv('results/results_knn_d2.csv'index=False)
results_rf_d2.to_csv('results/results_rf_d2.csv'index=False)
results_log_d3.to_csv('results/results_log_d3.csv'index=False)
results_knn_d3.to_csv('results/results_knn_d3.csv'index=False)
results_rf_d3.to_csv('results/results_rf_d3.csv'index=False)
results_log_d4.to_csv('results/results_log_d4.csv'index=False)
results_knn_d4.to_csv('results/results_knn_d4.csv'index=False)
results_rf_d4.to_csv('results/results_rf_d4.csv'index=False)

## Results

In [None]:
results = [AD1, AD2, AD3, AD4]

In [None]:
results

In [None]:
# An example
results[1]['COVTYPE']['KNN']

In [None]:
algorithms = ['LR', 'KNN', 'RAND_FOR']
datasets = ['ADULT', 'COVTYPE', 'LETTER', 'MUSH']
metrics = ['accuracy', 'f1', 'roc_auc']

In [None]:
# Get metrics each algo/data combo averaged across 5 trials (algorithms in rows, data sets in columns)
results1 = np.zeros((3, 4))
for col, data in enumerate(datasets):
    for row, algo in enumerate(algorithms):
        sums = [sum_ for sum_ in results[col][data][algo].sum()]
        avg = sum(sums)/15
        results1[row][col] = avg

In [None]:
results1

In [None]:
results1 = pd.DataFrame(results1, columns = datasets, index = algorithms)

In [None]:
results1

In [None]:
# Get accuracy, FSC, and AUC averaged across all data sets (algorithms in rows, metrics in columns)
acc = 0
f1 = 0
roc = 0
results2 = np.zeros((3, 3))

for row, algo in enumerate(algorithms):
    for i, data in enumerate(datasets):
        sums = [sum_ for sum_ in results[i][data][algo].sum()]
        acc += sums[0]
        f1 += sums[1]
        roc += sums[2]
    results2[row][0] = acc/20
    results2[row][1] = f1/20
    results2[row][2] = roc/20
    acc = 0
    f1 = 0
    roc = 0

In [None]:
results2

In [None]:
results2 = pd.DataFrame(results2, columns = metrics, index = algorithms)

In [None]:
results2

In [None]:
#results2.sum(axis=1)
sum_metrics = [sum_/3 for sum_ in results2.sum(axis=1)]
results2['avg'] = sum_metrics

In [None]:
results2

In [None]:
results1.to_csv('tables/results_over_datasets'index=False)
results2.to_csv('tables/results_over_metrics'index=False)