In [1]:
# Import 
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


In [2]:
# Load in datasets
adult_df = pd.read_csv('adult.data')
cov_df = pd.read_csv('covtype.data')
bean_df = pd.read_csv('Dry_Bean_Dataset.csv')
occupancy_df = pd.read_csv('datatraining.csv')

In [3]:
def standard_scaler(dataset, columns):
    ss = StandardScaler()

    dataset = pd.DataFrame(ss.fit_transform(dataset), columns=dataset.columns)

    dataset.iloc[:, -1] = (dataset.iloc[:, -1] > 0).astype(int)
    
    return dataset

In [4]:
occupancy_df = occupancy_df.drop(columns='date')

In [5]:
occupancy_df['Occupancy'].value_counts()

0    6414
1    1729
Name: Occupancy, dtype: int64

In [6]:
occupancy_df = standard_scaler(occupancy_df, occupancy_df.columns)

In [7]:
occupancy_df['Occupancy'].loc[occupancy_df['Occupancy']==0] = -1
occupancy_df['Occupancy'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


-1    6414
 1    1729
Name: Occupancy, dtype: int64

In [8]:
occupancy_df.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,2.51847,0.278526,1.573763,0.364948,1.091757,1
1,2.488967,0.277713,1.591735,0.341881,1.080555,1
2,2.488967,0.273645,1.573763,0.34029,1.075888,1
3,2.488967,0.265508,1.573763,0.323587,1.066555,1
4,2.439796,0.265508,1.573763,0.311655,1.049523,1


In [9]:
bean_df.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715.0,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172.0,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,SEKER
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690.0,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724.0,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417.0,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,SEKER


In [10]:
# Clean bean_df
condition = (bean_df['Class']=='SIRA') | (bean_df['Class']=='HOROZ')

print(len(bean_df))
bean_df = bean_df.dropna()
print(len(bean_df))

bean_df['Class'].loc[condition] = 1
bean_df['Class'].loc[~condition] = -1

bean_df['Class'].value_counts()

13612
13611


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


-1    9047
 1    4564
Name: Class, dtype: int64

In [11]:
bean_df['Class'] = bean_df['Class'].astype(int)

In [12]:
bean_df.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715.0,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,-1
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172.0,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,-1
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690.0,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,-1
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724.0,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,-1
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417.0,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,-1


In [13]:
# Before cleaning adult_df

# Change name of columns to what UCI has on their page
adult_columns = ['age',
            'workclass',
            'fnlwgt',
            'education',
            'education-num',
            'marital-status',
            'occupation',
            'relationship',
            'race',
            'sex',
            'capital-gain',
            'capital-loss',
            'hours-per-week',
            'native-country',
            'above-50k']

adult_df.columns = adult_columns
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,above-50k
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [14]:
# Clean adult_df

# Only keep jobs with a count of over 600
counts = adult_df['education'].value_counts()
adult_df = adult_df[adult_df['education'].isin(counts[counts > 600].index)]

In [15]:
# Only keep marital status with a count of over 1000
counts = adult_df['marital-status'].value_counts()
adult_df = adult_df[adult_df['marital-status'].isin(counts[counts > 1000].index)]

In [16]:
# Drop ? and never worked
adult_df['occupation'].drop(adult_df['occupation'].loc[adult_df['occupation']=='?'])
adult_df['occupation'].drop(adult_df['occupation'].loc[adult_df['occupation']=='Never-worked'])

# Keep remaining jobs with count of over 2000
counts = adult_df['occupation'].value_counts()
adult_df = adult_df[adult_df['occupation'].isin(counts[counts > 2000].index)]

In [17]:
adult_df.dropna(inplace=True)

holder = [] # Keep everything

# Label encoding
for col in range(1, adult_df.shape[1]):
    
    if adult_df.iloc[:, col].dtype != int:
        holder.append(col)
        uniq = adult_df.iloc[:, col].unique()

        adult_df.iloc[:, col] = adult_df.iloc[:, col].replace(uniq, np.arange(len(uniq)))

holder = holder[:-1] # Keep everything except above-50k  

object_cols = adult_df.iloc[:, holder].astype(object)
adult_enc = pd.get_dummies(object_cols, prefix=np.array(adult_columns)[holder])


In [18]:
adult_enc['above-50k'] = adult_df['above-50k']

In [19]:
adult_df = adult_enc

In [20]:
adult_df.head()

Unnamed: 0,workclass_0,workclass_1,workclass_2,workclass_3,workclass_4,workclass_5,workclass_6,education_0,education_1,education_2,...,native-country_32,native-country_33,native-country_34,native-country_35,native-country_36,native-country_37,native-country_38,native-country_39,native-country_40,above-50k
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
7,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
adult_df['above-50k'].loc[adult_df['above-50k']==0] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [22]:
adult_df['above-50k'].value_counts()

-1    14298
 1     5486
Name: above-50k, dtype: int64

In [23]:
adult_df

Unnamed: 0,workclass_0,workclass_1,workclass_2,workclass_3,workclass_4,workclass_5,workclass_6,education_0,education_1,education_2,...,native-country_32,native-country_33,native-country_34,native-country_35,native-country_36,native-country_37,native-country_38,native-country_39,native-country_40,above-50k
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,-1
3,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,-1
4,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,-1
6,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
7,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32549,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
32551,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
32553,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
32558,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,-1


In [24]:
cov_df.head()

Unnamed: 0,2596,51,3,258,0,510,221,232,148,6279,...,0.34,0.35,0.36,0.37,0.38,0.39,0.40,0.41,0.42,5
0,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
1,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
2,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
3,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
4,2579,132,6,300,-15,67,230,237,140,6031,...,0,0,0,0,0,0,0,0,0,2


In [25]:
# Clean covtype
cov_df.dropna(inplace=True)
cov_df['5'].loc[cov_df['5']<7] = -1
cov_df['5'].loc[cov_df['5']==7] = 1

print(cov_df['5'].value_counts())

-1    560501
 1     20510
Name: 5, dtype: int64


In [26]:
cov_df = standard_scaler(cov_df, cov_df.columns)

In [27]:
cov_df['5'].loc[cov_df['5']==0] = -1
cov_df['5'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


-1    560501
 1     20510
Name: 5, dtype: int64

In [28]:
cov_df.head()

Unnamed: 0,2596,51,3,258,0,510,221,232,148,6279,...,0.34,0.35,0.36,0.37,0.38,0.39,0.40,0.41,0.42,5
0,-1.319238,-0.890481,-1.616367,-0.270188,-0.899198,-1.257108,0.293388,0.590899,0.221342,3.205535,...,-0.315238,-0.290284,-0.05273,-0.057143,-0.014313,-0.022653,-0.165956,-0.156014,-0.123654,-1
1,-0.554909,-0.148838,-0.681566,-0.00672,0.318741,0.53221,0.816364,0.742654,-0.196691,3.126996,...,-0.315238,-0.290284,-0.05273,-0.057143,-0.014313,-0.022653,-0.165956,-0.156014,-0.123654,-1
2,-0.62277,-0.00587,0.52032,-0.129044,1.227907,0.47449,0.965785,0.742654,-0.536342,3.194963,...,-0.315238,-0.290284,-0.05273,-0.057143,-0.014313,-0.022653,-0.165956,-0.156014,-0.123654,-1
3,-1.30138,-0.988771,-1.616367,-0.54777,-0.813428,-1.256467,0.293388,0.540314,0.195215,3.165511,...,-0.315238,-0.290284,-0.05273,-0.057143,-0.014313,-0.022653,-0.165956,-0.156014,-0.123654,-1
4,-1.358526,-0.211386,-1.082195,0.143834,-1.053585,-1.464259,0.666942,0.692069,-0.066056,3.05903,...,-0.315238,-0.290284,-0.05273,-0.057143,-0.014313,-0.022653,-0.165956,-0.156014,-0.123654,-1


In [29]:
# Get training and test data
def sample_func(dataset):
    samples = dataset.sample(n=5000, replace=True)
    total = samples.shape[0]

    X_train = samples.iloc[:, :-1]
    y_train = samples.iloc[:, -1]
    
    n = [i for i in dataset.index if i not in samples.index]
    test_space = dataset.loc[n]
    X_test = test_space.iloc[:, :-1]
    y_test = test_space.iloc[:, -1]
    
    return X_train, y_train, X_test, y_test

In [30]:
# Store training and test data
X_train_list = []
y_train_list = []
X_test_list = []
y_test_list = []

# List of datasets
datasets = [cov_df, adult_df, bean_df, occupancy_df]

# Loop through each dataset
for data in datasets:

# Enough data for 5 trials
    for i in range(5):
        X_train, y_train, X_test, y_test = sample_func(data)
        X_train_list.append(X_train)
        y_train_list.append(y_train)
        X_test_list.append(X_test)
        y_test_list.append(y_test)

In [31]:
# Save best metrics and models
best_models_l = []
best_acc_l = []
best_roc_auc_l = []
best_f1_l = []

# What we want to use for our scoring
scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro']

# Loop through 
for i in range(len(X_train_list)):
    
    search_space = [{'solver': ['saga'],
                     'penalty': ['l1', 'l2'],
                     'C': [0.00000001, 0.0000001, 0.000001, 0.00001,
                          0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},
                    {'solver': ['lbfgs'],
                     'penalty': ['l2'],
                     'C': [0.00000001, 0.0000001, 0.000001, 0.00001,
                          0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},
                    {'solver': ['lbfgs', 'saga'],
                     'penalty': ['none']},
                    ]

    # Create grid search 
    clf = GridSearchCV(LogisticRegression(max_iter=10000), search_space, n_jobs=-1, cv=StratifiedKFold(n_splits=5), 
                       scoring=['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False,
                       verbose=0, return_train_score=True)
    
    # Fit grid search
    best_model = clf.fit(X_train_list[i], y_train_list[i])
    
    best_models_l.append(best_model)
    best_acc_l.append(best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])])
    best_roc_auc_l.append(best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])])
    best_f1_l.append(best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])])
 

In [32]:
results_acc = pd.DataFrame(best_acc_l)
results_roc_auc = pd.DataFrame(best_roc_auc_l)
results_f1 = pd.DataFrame(best_f1_l)

# print(results_acc)
# print(results_roc_auc)
# print(results_f1)

In [33]:
models_lr1 = []
models_lr2 = []
models_lr3 = []

# Get new models with best parameters found from grid search
for i in range(20):
    
    # Logistic Regression
    new_clf_lr1 = LogisticRegression(max_iter=10000, solver=results_acc.iloc[i, 2],
                      penalty=results_acc.iloc[i, 1], 
                      C=results_acc.iloc[i, 0])
    new_clf_lr2 = LogisticRegression(max_iter=10000, solver=results_roc_auc.iloc[i, 2],
                      penalty=results_roc_auc.iloc[i, 1], 
                      C=results_roc_auc.iloc[i, 0])
    new_clf_lr3 = LogisticRegression(max_iter=10000, solver=results_f1.iloc[i, 2],
                      penalty=results_f1.iloc[i, 1], 
                      C=results_f1.iloc[i, 0])
    
    # Fit models and store in respective lists
    res1 = new_clf_lr1.fit(X_train_list[i], y_train_list[i])
    res2 = new_clf_lr2.fit(X_train_list[i], y_train_list[i])
    res3 = new_clf_lr3.fit(X_train_list[i], y_train_list[i])
    
    # Append list
    models_lr1.append(res1)
    models_lr2.append(res2)
    models_lr3.append(res3)
    


In [35]:
pred_train_l_acc = []
pred_train_l_roc = []
pred_train_l_f1 = []

pred_test_l_acc = []
pred_test_l_roc = []
pred_test_l_f1 = []

accuracy_l = []

for i in range(20):
    
    # Predict train list
    y_pred = models_lr1[i].predict(X_train_list[i])
    pred_train_l_acc.append(accuracy_score(y_train_list[i], y_pred))
    
    y_pred = models_lr2[i].predict(X_train_list[i])
    pred_train_l_roc.append(roc_auc_score(y_train_list[i], y_pred))
    
    y_pred = models_lr3[i].predict(X_train_list[i])
    pred_train_l_f1.append(f1_score(y_train_list[i], y_pred))
    
    # Predict test list
    y_pred = models_lr1[i].predict(X_test_list[i])
    pred_test_l_acc.append(accuracy_score(y_test_list[i], y_pred))
    
    y_pred = models_lr2[i].predict(X_test_list[i])
    pred_test_l_roc.append(roc_auc_score(y_test_list[i], y_pred))
    
    y_pred = models_lr3[i].predict(X_test_list[i])
    pred_test_l_f1.append(f1_score(y_test_list[i], y_pred))
    
    # Get accuracy
    accuracy_l.append(models_lr1[i].score(X_test_list[i],y_test_list[i]))
    accuracy_l.append(models_lr2[i].score(X_test_list[i],y_test_list[i])) 
    accuracy_l.append(models_lr3[i].score(X_test_list[i],y_test_list[i]))

In [36]:
# Save best metrics and models
best_models_k = []
best_acc_k = []
best_roc_auc_k = []
best_f1_k = []

# What we want to use for our scoring
scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro']

# K values/parameters for grid search
k_list = np.arange(1, 109, 4)
parameters = {'n_neighbors': k_list}

for i in range(len(X_train_list)):
    
    # Create grid search 
    clf = GridSearchCV(KNeighborsClassifier(), parameters, n_jobs=-1, cv=StratifiedKFold(n_splits=5), 
                       scoring=['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False,
                       verbose=0, return_train_score=True)

    # Fit grid search
    best_model = clf.fit(X_train_list[i], y_train_list[i])

    # Append lists
    best_models_k.append(best_model)
    best_acc_k.append(best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])])
    best_roc_auc_k.append(best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])])
    best_f1_k.append(best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])])



In [37]:
results_acck = pd.DataFrame(best_acc_k)
results_roc_auck = pd.DataFrame(best_roc_auc_k)
results_f1k = pd.DataFrame(best_f1_k)

# print(results_acck)
# print(results_roc_auck)
# print(results_f1k)

In [38]:
models_k1 = []
models_k2 = []
models_k3 = []


# Get new models with best parameters found from grid search
for i in range(20):
    
    # KNN
    new_clf_k1 = KNeighborsClassifier(n_neighbors=results_acck.iloc[i,0])
    new_clf_k2 = KNeighborsClassifier(n_neighbors=results_roc_auck.iloc[i,0])
    new_clf_k3 = KNeighborsClassifier(n_neighbors=results_f1k.iloc[i,0])
    
    # Fit models and store in respective lists
    res1 = new_clf_k1.fit(X_train_list[i], y_train_list[i])
    res2 = new_clf_k2.fit(X_train_list[i], y_train_list[i])
    res3 = new_clf_k3.fit(X_train_list[i], y_train_list[i])
    
    # Append lists
    models_k1.append(res1)
    models_k2.append(res2)
    models_k3.append(res3)

In [76]:
pred_train_k_acc = []
pred_train_k_roc = []
pred_train_k_f1 = []

pred_test_k_acc = []
pred_test_k_roc = []
pred_test_k_f1 = []

accuracy_k = []

for i in range(20):
    
    # Predict train list
    y_pred = models_k1[i].predict(X_train_list[i])
    pred_train_k_acc.append(accuracy_score(y_train_list[i], y_pred))
    
    y_pred = models_k2[i].predict(X_train_list[i])
    pred_train_k_roc.append(roc_auc_score(y_train_list[i], y_pred))
    
    y_pred = models_k3[i].predict(X_train_list[i])
    pred_train_k_f1.append(f1_score(y_train_list[i], y_pred))
    
    # Predict test list
    y_pred = models_k1[i].predict(X_test_list[i])
    pred_test_k_acc.append(accuracy_score(y_test_list[i], y_pred))
              
    y_pred = models_k2[i].predict(X_test_list[i])
    pred_test_k_roc.append(roc_auc_score(y_test_list[i], y_pred))
              
    y_pred = models_k3[i].predict(X_test_list[i])
    pred_test_k_f1.append(f1_score(y_test_list[i], y_pred))
    
    # Get accuracy
    accuracy_k.append(models_k1[i].score(X_test_list[i],y_test_list[i]))
    accuracy_k.append(models_k2[i].score(X_test_list[i],y_test_list[i])) 
    accuracy_k.append(models_k3[i].score(X_test_list[i],y_test_list[i]))

In [61]:
# Save best metrics and models
best_models_r = []
best_acc_r = []
best_roc_auc_r = []
best_f1_r = []

# What we want to use for our scoring
scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro']

# Parameters for grid search
param_grid = [{'n_estimators': [1024],
                   'max_features':[1,2,4,6,8,12,16,20]}]

for i in range(len(X_train_list)):

    # Create grid search 
    clf = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=-1, cv=StratifiedKFold(n_splits=5), 
                       scoring=['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False,
                       verbose=0, return_train_score=True)

    # Fit grid search
    best_model = clf.fit(X_train_list[i], y_train_list[i])

    # Append lists
    best_models_r.append(best_model)
    best_acc_r.append(best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])])
    best_roc_auc_r.append(best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])])
    best_f1_r.append(best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1_micro'])])

In [62]:
results_accr = pd.DataFrame(best_acc_r)
results_roc_aucr = pd.DataFrame(best_roc_auc_r)
results_f1r = pd.DataFrame(best_f1_r)

# print(results_accr)
# print(results_roc_aucr)
# print(results_f1r)

In [63]:
models_r1 = []
models_r2 = []
models_r3 = []

# Get new models with best parameters found from grid search
for i in range(20):
    
    # Random Forest
    new_clf_r1 = RandomForestClassifier(n_estimators=results_accr.iloc[i, 0], max_features=len(X_train_list[i].columns))
    new_clf_r2 = RandomForestClassifier(n_estimators=results_roc_aucr.iloc[i, 0], max_features=len(X_train_list[i].columns))
    new_clf_r3 = RandomForestClassifier(n_estimators=results_f1r.iloc[i, 0], max_features=len(X_train_list[i].columns))

    res_1 = new_clf_r1.fit(X_train_list[i], y_train_list[i])
    res_2 = new_clf_r2.fit(X_train_list[i], y_train_list[i])
    res_3 = new_clf_r3.fit(X_train_list[i], y_train_list[i])
    
    
    models_r1.append(res_1)
    models_r2.append(res_2)
    models_r3.append(res_3)

In [64]:
pred_train_r_acc = []
pred_train_r_roc = []
pred_train_r_f1 = []

pred_test_r_acc = []
pred_test_r_roc = []
pred_test_r_f1 = []

accuracy_r = []

for i in range(20):
    
    # Predict train list
    y_pred = models_r1[i].predict(X_train_list[i])
    pred_train_r_acc.append(accuracy_score(y_train_list[i], y_pred))
    
    y_pred = models_r2[i].predict(X_train_list[i])
    pred_train_r_roc.append(roc_auc_score(y_train_list[i], y_pred))
    
    y_pred = models_r3[i].predict(X_train_list[i])
    pred_train_r_f1.append(f1_score(y_train_list[i], y_pred))
    
    # Predict test list
    y_pred = models_r1[i].predict(X_test_list[i])
    pred_test_r_acc.append(accuracy_score(y_test_list[i], y_pred))
    
    y_pred = models_r2[i].predict(X_test_list[i])
    pred_test_r_roc.append(roc_auc_score(y_test_list[i], y_pred))
    
    y_pred = models_r3[i].predict(X_test_list[i])
    pred_test_r_f1.append(f1_score(y_test_list[i], y_pred))
    
    # Get accuracy
    accuracy_r.append(models_r1[i].score(X_test_list[i],y_test_list[i]))
    accuracy_r.append(models_r2[i].score(X_test_list[i],y_test_list[i])) 
    accuracy_r.append(models_r3[i].score(X_test_list[i],y_test_list[i]))

In [66]:
table1 = pd.DataFrame(index=['COV_DF', 'ADULT_DF', 'BEAN_DF', 'OCCUPANCY_DF'],
                     columns=['# ATTR', 'TRAIN SIZE', 'TEST SIZE', '%POZ'])

# Number of attributes
table1.iloc[0,0] = len(cov_df.columns)
table1.iloc[1,0] = len(adult_df.columns)
table1.iloc[2,0] = len(bean_df.columns)
table1.iloc[3,0] = len(occupancy_df.columns)

# Train size
table1.iloc[:,1] = 5000

# Average test size
cov_sum = 0
adult_sum = 0
occ_sum = 0
bean_sum = 0

for i in range(0, 5):
    cov_sum += len(X_test_list[i])
    
for i in range(5, 10):
    adult_sum += len(X_test_list[i])
    
    
for i in range(10, 15):
    bean_sum += len(X_test_list[i])
    
for i in range(15, 20):
    occ_sum += len(X_test_list[i])
    
# Test size
table1.iloc[0,2] = int(cov_sum / 5)
table1.iloc[1,2] = int(adult_sum / 5)
table1.iloc[2,2] = int(bean_sum / 5)
table1.iloc[3,2] = int(occ_sum / 5)

table1.loc['COV_DF', '%POZ'] = round(len(cov_df[cov_df['5']==1]) / len(cov_df['5']), 2)
table1.loc['ADULT_DF', '%POZ'] = round(len(adult_df[adult_df['above-50k']==1]) / len(adult_df['above-50k']), 2)
table1.loc['BEAN_DF', '%POZ'] = round(len(bean_df[bean_df['Class']==1]) / len(bean_df['Class']), 2)
table1.loc['OCCUPANCY_DF', '%POZ'] = round(len(occupancy_df[occupancy_df['Occupancy']==1]) / len(occupancy_df['Occupancy']), 2)



table1

Unnamed: 0,# ATTR,TRAIN SIZE,TEST SIZE,%POZ
COV_DF,55,5000,576033,0.04
ADULT_DF,80,5000,15352,0.28
BEAN_DF,17,5000,9415,0.34
OCCUPANCY_DF,6,5000,4414,0.21


In [68]:
def lists_mean(lst1, lst2, lst3):
     return (sum(lst1) + sum(lst2) + sum(lst3)) / (len(lst1) + len(lst2) + len(lst3))

In [70]:
def list_mean(lst):
    return sum(lst) / len(lst)

In [77]:
table2 = pd.DataFrame(index=['KNN', 'LOGREG', 'RF'],
                                 columns=['ACC', 'ROC_AUC', 'F1', 'MEAN'])

table2['ACC']['KNN'] = round(list_mean(pred_test_k_acc), 3)
table2['ROC_AUC']['KNN'] = round(list_mean(pred_test_k_roc), 3)
table2['F1']['KNN'] = round(list_mean(pred_test_k_f1), 3)
table2['MEAN']['KNN'] = round((table2['ACC']['KNN'] + table2['ROC_AUC']['KNN'] + table2['F1']['KNN'])/3, 3)

table2['ACC']['LOGREG'] = round(list_mean(pred_test_l_acc), 3)
table2['ROC_AUC']['LOGREG'] = round(list_mean(pred_test_l_roc), 3)
table2['F1']['LOGREG'] = round(list_mean(pred_test_l_f1), 3)
table2['MEAN']['LOGREG'] = round((table2['ACC']['LOGREG'] + table2['ROC_AUC']['LOGREG'] + table2['F1']['LOGREG'])/3, 3)

table2['ACC']['RF'] = round(list_mean(pred_test_r_acc), 3)
table2['ROC_AUC']['RF'] = round(list_mean(pred_test_r_roc), 3)
table2['F1']['RF'] = round(list_mean(pred_test_r_f1), 3)
table2['MEAN']['RF'] = round((table2['ACC']['RF'] + table2['ROC_AUC']['RF'] + table2['F1']['RF'])/3, 3)

table2

Unnamed: 0,ACC,ROC_AUC,F1,MEAN
KNN,0.906,0.802,0.76,0.823
LOGREG,0.886,0.801,0.703,0.797
RF,0.92,0.841,0.785,0.849


In [79]:
table3 = pd.DataFrame(index=['KNN', 'LOGREG', 'RF'],
                                 columns=['COV_DF', 'ADULT_DF', 'BEAN_DF', 'OCCUPANCY_DF', 'MEAN'])

# Get cov_df data and put in table
cov_k = lists_mean(pred_test_k_acc[0:5], pred_test_k_roc[0:5], pred_test_k_f1[0:5])
cov_lr = lists_mean(pred_test_l_acc[0:5], pred_test_l_roc[0:5], pred_test_l_f1[0:5])
cov_rf = lists_mean(pred_test_r_acc[0:5], pred_test_r_roc[0:5], pred_test_r_f1[0:5])

table3['COV_DF']['KNN'] = round(cov_k, 3)
table3['COV_DF']['LOGREG'] = round(cov_lr, 3)
table3['COV_DF']['RF'] = round(cov_rf, 3)

# Get adult_df data and put in table
adult_k = lists_mean(pred_test_k_acc[5:10], pred_test_k_roc[5:10], pred_test_k_f1[5:10])
adult_lr = lists_mean(pred_test_l_acc[5:10], pred_test_l_roc[5:10], pred_test_l_f1[5:10])
adult_rf = lists_mean(pred_test_r_acc[5:10], pred_test_r_roc[5:10], pred_test_r_f1[5:10])

table3['ADULT_DF']['KNN'] = round(adult_k, 3)
table3['ADULT_DF']['LOGREG'] = round(adult_lr, 3)
table3['ADULT_DF']['RF'] = round(adult_rf, 3)

# Get bean_df data and put in table
bean_k = lists_mean(pred_test_k_acc[10:15], pred_test_k_roc[10:15], pred_test_k_f1[10:15])
bean_lr = lists_mean(pred_test_l_acc[10:15], pred_test_l_roc[10:15], pred_test_l_f1[10:15])
bean_rf = lists_mean(pred_test_r_acc[10:15], pred_test_r_roc[10:15], pred_test_r_f1[10:15])

table3['BEAN_DF']['KNN'] = round(bean_k, 3)
table3['BEAN_DF']['LOGREG'] = round(bean_lr, 3)
table3['BEAN_DF']['RF'] = round(bean_rf, 3)

# Get occupancy_df data and put in table
occ_k = lists_mean(pred_test_k_acc[15:20], pred_test_k_roc[15:20], pred_test_k_f1[15:20])
occ_lr = lists_mean(pred_test_l_acc[15:20], pred_test_l_roc[15:20], pred_test_l_f1[15:20])
occ_rf = lists_mean(pred_test_r_acc[15:20], pred_test_r_roc[15:20], pred_test_r_f1[15:20])

table3['OCCUPANCY_DF']['KNN'] = round(occ_k, 3)
table3['OCCUPANCY_DF']['LOGREG'] = round(occ_lr, 3)
table3['OCCUPANCY_DF']['RF'] = round(occ_rf, 3)

# Mean column
table3['MEAN']['KNN'] = round((cov_k + adult_k + bean_k + occ_k)/4, 3)
table3['MEAN']['LOGREG'] = round((cov_lr + adult_lr + bean_lr + occ_lr)/4, 3)
table3['MEAN']['RF'] = round((cov_rf + adult_rf + bean_rf + occ_rf)/4, 3)

table3

Unnamed: 0,COV_DF,ADULT_DF,BEAN_DF,OCCUPANCY_DF,MEAN
KNN,0.759,0.716,0.827,0.988,0.823
LOGREG,0.796,0.722,0.686,0.983,0.797
RF,0.803,0.693,0.916,0.981,0.849


In [80]:
appendix1 = pd.DataFrame(index=['COV_KNN', 'ADULT_KNN','BEAN_KNN','OCC_KNN',
                                'COV_LOGREG', 'ADULT_LOGREG', 'BEAN_LOGREG', 'OCC_LOGREG',
                                'COV_RF', 'ADULT_RF', 'BEAN_RF', 'OCC_RF'],
                                 columns=['ACC', 'ROC_AUC', 'F1'])

cov_k_acc = list_mean(pred_train_k_acc[0:5])
cov_k_roc = list_mean(pred_train_k_roc[0:5])
cov_k_f1 = list_mean(pred_train_k_f1[0:5])

appendix1['ACC']['COV_KNN'] = round(cov_k_acc, 3)
appendix1['ROC_AUC']['COV_KNN'] = round(cov_k_roc, 3)
appendix1['F1']['COV_KNN'] = round(cov_k_f1, 3)

adult_k_acc = list_mean(pred_train_k_acc[5:10])
adult_k_roc = list_mean(pred_train_k_roc[5:10])
adult_k_f1 = list_mean(pred_train_k_f1[5:10])

appendix1['ACC']['ADULT_KNN'] = round(adult_k_acc, 3)
appendix1['ROC_AUC']['ADULT_KNN'] = round(adult_k_roc, 3)
appendix1['F1']['ADULT_KNN'] = round(adult_k_f1, 3)

bean_k_acc = list_mean(pred_train_k_acc[10:15])
bean_k_roc = list_mean(pred_train_k_roc[10:15])
bean_k_f1 = list_mean(pred_train_k_f1[10:15])

appendix1['ACC']['BEAN_KNN'] = round(bean_k_acc, 3)
appendix1['ROC_AUC']['BEAN_KNN'] = round(bean_k_roc, 3)
appendix1['F1']['BEAN_KNN'] = round(bean_k_f1, 3)

occ_k_acc = list_mean(pred_train_k_acc[15:20])
occ_k_roc = list_mean(pred_train_k_roc[15:20])
occ_k_f1 = list_mean(pred_train_k_f1[15:20])

appendix1['ACC']['OCC_KNN'] = round(occ_k_acc, 3)
appendix1['ROC_AUC']['OCC_KNN'] = round(occ_k_roc, 3)
appendix1['F1']['OCC_KNN'] = round(occ_k_f1, 3)


In [81]:
cov_l_acc = list_mean(pred_train_l_acc[0:5])
cov_l_roc = list_mean(pred_train_l_roc[0:5])
cov_l_f1 = list_mean(pred_train_l_f1[0:5])

appendix1['ACC']['COV_LOGREG'] = round(cov_l_acc, 3)
appendix1['ROC_AUC']['COV_LOGREG'] = round(cov_l_roc, 3)
appendix1['F1']['COV_LOGREG'] = round(cov_l_f1, 3)

adult_l_acc = list_mean(pred_train_l_acc[5:10])
adult_l_roc = list_mean(pred_train_l_roc[5:10])
adult_l_f1 = list_mean(pred_train_l_f1[5:10])

appendix1['ACC']['ADULT_LOGREG'] = round(adult_l_acc, 3)
appendix1['ROC_AUC']['ADULT_LOGREG'] = round(adult_l_roc, 3)
appendix1['F1']['ADULT_LOGREG'] = round(adult_l_f1, 3)

bean_l_acc = list_mean(pred_train_l_acc[10:15])
bean_l_roc = list_mean(pred_train_l_roc[10:15])
bean_l_f1 = list_mean(pred_train_l_f1[10:15])

appendix1['ACC']['BEAN_LOGREG'] = round(bean_l_acc, 3)
appendix1['ROC_AUC']['BEAN_LOGREG'] = round(bean_l_roc, 3)
appendix1['F1']['BEAN_LOGREG'] = round(bean_l_f1, 3)

occ_l_acc = list_mean(pred_train_l_acc[15:20])
occ_l_roc = list_mean(pred_train_l_roc[15:20])
occ_l_f1 = list_mean(pred_train_l_f1[15:20])

appendix1['ACC']['OCC_LOGREG'] = round(occ_l_acc, 3)
appendix1['ROC_AUC']['OCC_LOGREG'] = round(occ_l_roc, 3)
appendix1['F1']['OCC_LOGREG'] = round(occ_l_f1, 3)



In [82]:
cov_r_acc = list_mean(pred_train_r_acc[0:5])
cov_r_roc = list_mean(pred_train_r_roc[0:5])
cov_r_f1 = list_mean(pred_train_r_f1[0:5])

appendix1['ACC']['COV_RF'] = round(cov_r_acc, 3)
appendix1['ROC_AUC']['COV_RF'] = round(cov_r_roc, 3)
appendix1['F1']['COV_RF'] = round(cov_r_f1, 3)

adult_r_acc = list_mean(pred_train_r_acc[5:10])
adult_r_roc = list_mean(pred_train_r_roc[5:10])
adult_r_f1 = list_mean(pred_train_r_f1[5:10])

appendix1['ACC']['ADULT_RF'] = round(adult_r_acc, 3)
appendix1['ROC_AUC']['ADULT_RF'] = round(adult_r_roc, 3)
appendix1['F1']['ADULT_RF'] = round(adult_r_f1, 3)

bean_r_acc = list_mean(pred_train_r_acc[10:15])
bean_r_roc = list_mean(pred_train_r_roc[10:15])
bean_r_f1 = list_mean(pred_train_r_f1[10:15])

appendix1['ACC']['BEAN_RF'] = round(adult_r_acc, 3)
appendix1['ROC_AUC']['BEAN_RF'] = round(adult_r_roc, 3)
appendix1['F1']['BEAN_RF'] = round(adult_r_f1, 3)

occ_r_acc = list_mean(pred_train_r_acc[15:20])
occ_r_roc = list_mean(pred_train_r_roc[15:20])
occ_r_f1 = list_mean(pred_train_r_f1[15:20])

appendix1['ACC']['OCC_RF'] = round(occ_r_acc, 3)
appendix1['ROC_AUC']['OCC_RF'] = round(occ_r_roc, 3)
appendix1['F1']['OCC_RF'] = round(occ_r_f1, 3)

appendix1

Unnamed: 0,ACC,ROC_AUC,F1
COV_KNN,0.993,0.662,0.892
ADULT_KNN,0.814,0.746,0.635
BEAN_KNN,1.0,0.873,1.0
OCC_KNN,1.0,0.992,1.0
COV_LOGREG,0.98,0.799,0.679
ADULT_LOGREG,0.816,0.743,0.633
BEAN_LOGREG,0.771,0.699,0.575
OCC_LOGREG,0.988,0.986,0.972
COV_RF,0.999,0.936,0.989
ADULT_RF,0.845,0.799,0.704


In [114]:
app1 = pd.DataFrame(index=['KNN', 'LOGREG', 'RF'],
                                 columns=['ACC', 'ROC_AUC', 'F1', 'MEAN'])

app1['ACC']['KNN'] = round(list_mean(pred_train_k_acc), 3)
app1['ROC_AUC']['KNN'] = round(list_mean(pred_train_k_roc), 3)
app1['F1']['KNN'] = round(list_mean(pred_train_k_f1), 3)
app1['MEAN']['KNN'] = round((app1['ACC']['KNN'] + app1['ROC_AUC']['KNN'] + app1['F1']['KNN'])/3, 3)

app1['ACC']['LOGREG'] = round(list_mean(pred_train_l_acc), 3)
app1['ROC_AUC']['LOGREG'] = round(list_mean(pred_train_l_roc), 3)
app1['F1']['LOGREG'] = round(list_mean(pred_train_l_f1), 3)
app1['MEAN']['LOGREG'] = round((app1['ACC']['LOGREG'] + app1['ROC_AUC']['LOGREG'] + app1['F1']['LOGREG'])/3, 3)

app1['ACC']['RF'] = round(list_mean(pred_train_r_acc), 3)
app1['ROC_AUC']['RF'] = round(list_mean(pred_train_r_roc), 3)
app1['F1']['RF'] = round(list_mean(pred_train_r_f1), 3)
app1['MEAN']['RF'] = round((app1['ACC']['RF'] + app1['ROC_AUC']['RF'] + app1['F1']['RF'])/3, 3)

app1

Unnamed: 0,ACC,ROC_AUC,F1,MEAN
KNN,0.952,0.818,0.882,0.884
LOGREG,0.889,0.807,0.715,0.804
RF,0.958,0.927,0.917,0.934


In [83]:
appendix2 = pd.DataFrame(index=['COV', 'ADULT', 'BEAN', 'OCCUPANCY'],
                        columns=['KNN', 'LOGREG', 'RF'])

    

appendix2['KNN']['COV'] = (round(accuracy_k[0],3), round(accuracy_k[1],3),
                          round(accuracy_k[2],3), round(accuracy_k[3],3),
                          round(accuracy_k[4],3))

appendix2['KNN']['ADULT'] = (round(accuracy_k[5],3), round(accuracy_k[6],3),
                          round(accuracy_k[7],3), round(accuracy_k[8],3),
                          round(accuracy_k[9],3))

appendix2['KNN']['BEAN'] = (round(accuracy_k[10],3), round(accuracy_k[11],3),
                          round(accuracy_k[12],3), round(accuracy_k[13],3),
                          round(accuracy_k[14],3))

appendix2['KNN']['OCCUPANCY'] = (round(accuracy_k[15],3), round(accuracy_k[16],3),
                          round(accuracy_k[17],3), round(accuracy_k[18],3),
                          round(accuracy_k[19],3))

appendix2['LOGREG']['COV'] = (round(accuracy_l[0],3), round(accuracy_l[1],3),
                          round(accuracy_l[2],3), round(accuracy_l[3],3),
                          round(accuracy_l[4],3))

appendix2['LOGREG']['ADULT'] = (round(accuracy_l[5],3), round(accuracy_l[6],3),
                          round(accuracy_l[7],3), round(accuracy_l[8],3),
                          round(accuracy_l[9],3))

appendix2['LOGREG']['BEAN'] = (round(accuracy_l[10],3), round(accuracy_l[11],3),
                          round(accuracy_l[12],3), round(accuracy_l[13],3),
                          round(accuracy_l[14],3))

appendix2['LOGREG']['OCCUPANCY'] = (round(accuracy_l[15],3), round(accuracy_l[16],3),
                          round(accuracy_l[17],3), round(accuracy_l[18],3),
                          round(accuracy_l[19],3))

appendix2['RF']['COV'] = (round(accuracy_r[0],3), round(accuracy_r[1],3),
                          round(accuracy_r[2],3), round(accuracy_r[3],3),
                          round(accuracy_r[4],3))

appendix2['RF']['ADULT'] = (round(accuracy_r[5],3), round(accuracy_r[6],3),
                          round(accuracy_r[7],3), round(accuracy_r[8],3),
                          round(accuracy_r[9],3))

appendix2['RF']['BEAN'] = (round(accuracy_r[10],3), round(accuracy_r[11],3),
                          round(accuracy_r[12],3), round(accuracy_r[13],3),
                          round(accuracy_r[14],3))

appendix2['RF']['OCCUPANCY'] = (round(accuracy_r[15],3), round(accuracy_r[16],3),
                          round(accuracy_r[17],3), round(accuracy_r[18],3),
                          round(accuracy_r[19],3))
appendix2

Unnamed: 0,KNN,LOGREG,RF
COV,"(0.976, 0.969, 0.976, 0.979, 0.972)","(0.976, 0.976, 0.976, 0.976, 0.977)","(0.98, 0.978, 0.98, 0.978, 0.976)"
ADULT,"(0.979, 0.975, 0.97, 0.975, 0.978)","(0.976, 0.978, 0.978, 0.978, 0.977)","(0.979, 0.98, 0.977, 0.979, 0.98)"
BEAN,"(0.969, 0.978, 0.98, 0.969, 0.98)","(0.977, 0.977, 0.977, 0.977, 0.977)","(0.977, 0.981, 0.98, 0.978, 0.98)"
OCCUPANCY,"(0.804, 0.804, 0.804, 0.801, 0.801)","(0.811, 0.811, 0.811, 0.806, 0.806)","(0.787, 0.784, 0.783, 0.783, 0.793)"


In [84]:
from scipy import stats

In [88]:
appendix3_1 = pd.DataFrame(index=['RF_ACC', 'RF_ROC_AUC', 'RF_F1'],
                        columns=['Pval-KNN', 'Pval-LOGREG'])

rf_knn_acc = stats.ttest_rel(pred_test_r_acc, pred_test_k_acc)
rf_knn_roc = stats.ttest_rel(pred_test_r_roc, pred_test_k_roc)
rf_knn_f1 = stats.ttest_rel(pred_test_r_f1, pred_test_k_f1)

rf_lr_acc = stats.ttest_rel(pred_test_r_acc, pred_test_l_acc)
rf_lr_roc = stats.ttest_rel(pred_test_r_roc, pred_test_l_acc)
rf_lr_f1 = stats.ttest_rel(pred_test_r_f1, pred_test_l_acc)

appendix3_1['Pval-KNN']['RF_ACC'] = "{:.3f}".format(float(rf_knn_acc.pvalue))
appendix3_1['Pval-KNN']['RF_ROC_AUC'] = "{:.3f}".format(float(rf_knn_roc.pvalue))
appendix3_1['Pval-KNN']['RF_F1'] = "{:.3f}".format(float(rf_knn_f1.pvalue))

appendix3_1['Pval-LOGREG']['RF_ACC'] = "{:.3f}".format(float(rf_lr_acc.pvalue))
appendix3_1['Pval-LOGREG']['RF_ROC_AUC'] = "{:.3f}".format(float(rf_lr_roc.pvalue))
appendix3_1['Pval-LOGREG']['RF_F1'] = "{:.3f}".format(float(rf_lr_f1.pvalue))

appendix3_1

Unnamed: 0,Pval-KNN,Pval-LOGREG
RF_ACC,0.118,0.057
RF_ROC_AUC,0.008,0.151
RF_F1,0.094,0.018


In [90]:
# Combine all scores into their own lists for doing ttest
all_rf = pred_test_r_acc + pred_test_r_roc + pred_test_r_f1
all_k = pred_test_k_acc + pred_test_k_roc + pred_test_k_f1
all_lr = pred_test_l_acc + pred_test_l_roc + pred_test_l_f1

In [91]:
appendix3_2 = pd.DataFrame(index=['COV_KNN', 'COV_LR', 
                                  'ADULT_KNN', 'ADULT_LR',
                                 'BEAN_KNN', 'BEAN_LR',
                                 'OCC_KNN', 'OCC_LR'],
                        columns=['RF'])

covk = stats.ttest_rel(all_rf[0:5], all_k[0:5])
covlr = stats.ttest_rel(all_rf[0:5], all_lr[0:5])
adultk = stats.ttest_rel(all_rf[5:10], all_k[5:10])
adultlr = stats.ttest_rel(all_rf[5:10], all_lr[5:10])
beank = stats.ttest_rel(all_rf[10:15], all_k[10:15])
beanlr = stats.ttest_rel(all_rf[10:15], all_lr[10:15])
occk = stats.ttest_rel(all_rf[15:20], all_k[15:20])
occlr = stats.ttest_rel(all_rf[15:20], all_lr[15:20])

appendix3_2['RF'][0] = "{:.8f}".format(float(covk.pvalue))
appendix3_2['RF'][1] = "{:.8f}".format(float(covlr.pvalue))
appendix3_2['RF'][2] = "{:.8f}".format(float(adultk.pvalue))
appendix3_2['RF'][3] = "{:.8f}".format(float(adultlr.pvalue))
appendix3_2['RF'][4] = "{:.8f}".format(float(beank.pvalue))
appendix3_2['RF'][5] = "{:.8f}".format(float(beanlr.pvalue))
appendix3_2['RF'][6] = "{:.8f}".format(float(occk.pvalue))
appendix3_2['RF'][7] = "{:.8f}".format(float(occlr.pvalue))

appendix3_2

Unnamed: 0,RF
COV_KNN,0.14946392
COV_LR,0.00280276
ADULT_KNN,6.693e-05
ADULT_LR,7.476e-05
BEAN_KNN,8.602e-05
BEAN_LR,5.94e-06
OCC_KNN,0.01743704
OCC_LR,0.85500661


In [95]:
appendix3_3 = pd.DataFrame(index=['COV_RF', 'COV_LR', 
                                  'ADULT_RF', 'ADULT_LR',
                                 'BEAN_RF', 'BEAN_LR',
                                 'OCC_RF', 'OCC_LR'],
                        columns=['KNN'])

covrf = stats.ttest_rel(all_k[0:5], all_rf[0:5])
covlr = stats.ttest_rel(all_k[0:5], all_lr[0:5])
adultrf = stats.ttest_rel(all_k[5:10], all_rf[5:10])
adultlr = stats.ttest_rel(all_k[5:10], all_lr[5:10])
beanrf = stats.ttest_rel(all_k[10:15], all_rf[10:15])
beanlr = stats.ttest_rel(all_k[10:15], all_lr[10:15])
occrf = stats.ttest_rel(all_k[15:20], all_rf[15:20])
occlr = stats.ttest_rel(all_k[15:20], all_lr[15:20])

appendix3_3['KNN'][0] = "{:.8f}".format(float(covrf.pvalue))
appendix3_3['KNN'][1] = "{:.8f}".format(float(covlr.pvalue))
appendix3_3['KNN'][2] = "{:.8f}".format(float(adultrf.pvalue))
appendix3_3['KNN'][3] = "{:.8f}".format(float(adultlr.pvalue))
appendix3_3['KNN'][4] = "{:.8f}".format(float(beanrf.pvalue))
appendix3_3['KNN'][5] = "{:.8f}".format(float(beanlr.pvalue))
appendix3_3['KNN'][6] = "{:.8f}".format(float(occrf.pvalue))
appendix3_3['KNN'][7] = "{:.8f}".format(float(occlr.pvalue))

appendix3_3

Unnamed: 0,KNN
COV_RF,0.14946392
COV_LR,0.53168982
ADULT_RF,6.693e-05
ADULT_LR,0.00691084
BEAN_RF,8.602e-05
BEAN_LR,1.6e-07
OCC_RF,0.01743704
OCC_LR,0.00254499


In [96]:
appendix3_4 = pd.DataFrame(index=['COV_RF', 'COV_KNN', 
                                  'ADULT_RF', 'ADULT_KNN',
                                 'BEAN_RF', 'BEAN_KNN',
                                 'OCC_RF', 'OCC_KNN'],
                        columns=['LR'])

covrf = stats.ttest_rel(all_lr[0:5], all_rf[0:5])
covk = stats.ttest_rel(all_lr[0:5], all_k[0:5])
adultrf = stats.ttest_rel(all_lr[5:10], all_rf[5:10])
adultk = stats.ttest_rel(all_lr[5:10], all_k[5:10])
beanrf = stats.ttest_rel(all_lr[10:15], all_rf[10:15])
beank = stats.ttest_rel(all_lr[10:15], all_k[10:15])
occrf = stats.ttest_rel(all_lr[15:20], all_rf[15:20])
occk = stats.ttest_rel(all_lr[15:20], all_k[15:20])

appendix3_4['LR'][0] = "{:.8f}".format(float(covrf.pvalue))
appendix3_4['LR'][1] = "{:.8f}".format(float(covlr.pvalue))
appendix3_4['LR'][2] = "{:.8f}".format(float(adultrf.pvalue))
appendix3_4['LR'][3] = "{:.8f}".format(float(adultlr.pvalue))
appendix3_4['LR'][4] = "{:.8f}".format(float(beanrf.pvalue))
appendix3_4['LR'][5] = "{:.8f}".format(float(beanlr.pvalue))
appendix3_4['LR'][6] = "{:.8f}".format(float(occrf.pvalue))
appendix3_4['LR'][7] = "{:.8f}".format(float(occlr.pvalue))

appendix3_4

Unnamed: 0,LR
COV_RF,0.00280276
COV_KNN,0.53168982
ADULT_RF,7.476e-05
ADULT_KNN,0.00691084
BEAN_RF,5.94e-06
BEAN_KNN,1.6e-07
OCC_RF,0.85500661
OCC_KNN,0.00254499
