In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import *
from sklearn.metrics import *

from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.decomposition import *

import xgboost as xgb
import lightgbm as lgb

# import tensorflow as tf

import os
import re
import ast
from autokeras import StructuredDataClassifier

In [None]:
# Selects which numbered sub-directory of the input data is used.
# Values noted by the author: 13 14 15 16 17
length_text = 13
_input_root = '/kaggle/input/shorty/short'
directory_dataframes = f'{_input_root}/subsamples/{length_text}/'
directory_features = f'{_input_root}/features/{length_text}/'


def sorted_alphanumeric(data):
    """Return the items of ``data`` as a new list in natural order.

    Runs of ASCII digits are compared as integers and every other fragment is
    compared case-insensitively, so ``'f2'`` sorts before ``'f10'``.
    """
    def natural_key(name):
        # re.split with a capturing group keeps the digit runs in the result.
        fragments = []
        for chunk in re.split('([0-9]+)', name):
            fragments.append(int(chunk) if chunk.isdigit() else chunk.lower())
        return fragments

    ordered = list(data)
    ordered.sort(key=natural_key)
    return ordered

def get_sample_df(directory=directory_dataframes):
    """Read every regular file in ``directory`` as a CSV.

    Entries are visited in natural (alphanumeric) order; sub-directories are
    skipped.

    Returns
    -------
    (list of pandas.DataFrame, list of str)
        The loaded frames and, in the same order, the file names they came from.
    """
    frames, names = [], []
    for entry in sorted_alphanumeric(os.listdir(directory)):
        path = os.path.join(directory, entry)
        if not os.path.isfile(path):
            continue
        frames.append(pd.read_csv(path))
        names.append(entry)

    return frames, names

def get_features(regex_str, directory=None):
    """Load the per-sample feature lists from a matching text file.

    Every regular file in ``directory`` whose *file name* matches
    ``regex_str`` (a regular expression applied with ``re.match``, i.e.
    anchored at the start of the name) is read. If several files match, the
    one returned last by ``os.listdir`` wins.

    Each line of the file is turned into a list of names: a ``;`` is inserted
    after every ``y`` and after every ``)``, ``4`` or ``5`` that is followed
    by a space, and the line is then split on ``'; '``. A ``y`` that is not
    followed by a space (for example at the end of the line) therefore keeps
    its ``;`` in the returned token; callers are expected to strip it.

    Parameters
    ----------
    regex_str : str
        Regular expression matched against each file name.
    directory : str, optional
        Directory to scan. Defaults to the module-level ``directory_features``,
        looked up at call time.

    Returns
    -------
    list of list of str
        One list of tokens per line of the selected file.

    Raises
    ------
    FileNotFoundError
        If no file name in ``directory`` matches ``regex_str``.
    """
    if directory is None:
        # Late binding: pick up the current config value, not the one that
        # existed when this function was defined.
        directory = directory_features

    # Match on the file name so that a caller-supplied directory is honoured.
    pattern = re.compile(regex_str)

    raw_lines = None
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        if not (pattern.match(filename) and os.path.isfile(path)):
            continue
        with open(path) as handle:
            raw_lines = handle.read().splitlines()

    if raw_lines is None:
        raise FileNotFoundError(
            'No feature file matching {!r} found in {!r}'.format(regex_str, directory)
        )

    feat_list = []
    for line in raw_lines:
        # The text file stores each list as one string; insert ';' as a
        # separator and split the string back into a list.
        marked = (line.replace('y', 'y;')
                      .replace(') ', '); ')
                      .replace('4 ', '4; ')
                      .replace('5 ', '5; '))
        feat_list.append(marked.split('; '))

    return feat_list

# Load all sample CSVs once; frames and file names share the same order.
_loaded_samples = get_sample_df(directory=directory_dataframes)
list_sample_dataframes = _loaded_samples[0]
filename_sample_list = _loaded_samples[1]

In [None]:
def model_train_predict(regex_str, dataframes=list_sample_dataframes, params=None):
    """Fit an AutoKeras ``StructuredDataClassifier`` per sample and score it.

    Each dataframe is paired positionally with one feature list returned by
    ``get_features(regex_str)``. The selected columns are split 80/20
    (stratified on ``conversion_class``, ``random_state=42``), a 5-trial
    search is fitted on the training part, and accuracy, F1 and ROC AUC are
    computed on the held-out part. A summary of the three metrics is printed.

    Parameters
    ----------
    regex_str : str
        Pattern passed to ``get_features`` to locate the feature-list file.
    dataframes : iterable of pandas.DataFrame
        Samples to evaluate; each must contain a ``conversion_class`` column.
    params : any, optional
        Never read; kept only so existing callers keep working.

    Returns
    -------
    (list, list, list)
        Accuracy, F1 score and ROC AUC per evaluated sample, in input order.
        All three are empty when nothing could be evaluated.
    """
    feat_list = get_features(regex_str)
    dataframes = list(dataframes)

    # zip() stops at the shorter input; report a mismatch instead of silently
    # dropping samples or feature lists.
    if len(dataframes) != len(feat_list):
        print('Warning: {} dataframes but {} feature lists; only the first {} pairs are evaluated'.format(
            len(dataframes), len(feat_list), min(len(dataframes), len(feat_list))))

    accuracy_list = []
    f1_score_list = []
    auc_list = []

    for i, (sample, feat) in enumerate(zip(dataframes, feat_list)):
        print(i)

        # get_features leaves the ';' separator on a trailing 'y'; remove it
        # from the last name so it matches the dataframe column.
        feat[-1] = feat[-1].replace('y;', 'y')
        x = sample[feat]
        x = x.rename(columns=lambda a: re.sub('[^A-Za-z0-9_]+', '', a))

        # NOTE(review): no seed is passed, so the search is presumably not
        # reproducible between runs - confirm and consider passing one.
        model = StructuredDataClassifier(max_trials=5, overwrite=True, column_names=feat)

        y = sample['conversion_class']
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=42, stratify=y)
        model.fit(x_train, y_train)

        # Flatten to 1-D so the sklearn metrics get a plain label vector.
        y_pred = np.asarray(model.predict(x_test)).ravel()
        # NOTE(review): some AutoKeras versions appear to return labels as
        # strings - confirm. If so, align them with the dtype of y_test.
        if y_pred.dtype.kind in 'USO' and y_pred.dtype != y_test.dtype:
            try:
                y_pred = y_pred.astype(y_test.dtype)
            except (TypeError, ValueError):
                # Leave as-is; the metric calls below will report the mismatch.
                pass

        _, acc = model.evaluate(x_test, y_test, verbose=0)
        print('Accuracy: %.3f' % acc)

        accuracy_list.append(accuracy_score(y_test, y_pred))
        f1_score_list.append(f1_score(y_test, y_pred))
        # NOTE(review): AUC is computed from y_pred (predicted labels), not
        # from scores/probabilities - confirm this is intended.
        auc_list.append(roc_auc_score(y_test, y_pred))

    if not accuracy_list:
        # max() below would raise on empty lists.
        print('No samples were evaluated; nothing to summarise')
        return accuracy_list, f1_score_list, auc_list

    metrics = [
        ('Accuracy', accuracy_list),
        ('F1 Score', f1_score_list),
        ('AUC', auc_list),
    ]

    for label, values in metrics:
        print('Average {}'.format(label), np.mean(values))

    for label, values in metrics:
        print('Max {}'.format(label), max(values))

    print(accuracy_list)
    print(auc_list)

    # Index of the first sample reaching the maximum of each metric.
    best_index = {label: values.index(max(values)) for label, values in metrics}

    for label, _ in metrics:
        print('Best Sample Index based on Max {}'.format(label), best_index[label])

    for label, _ in metrics:
        print('Best Features based on Max {}'.format(label), feat_list[best_index[label]])

    return accuracy_list, f1_score_list, auc_list


## MI

In [None]:
# One search per 'mi_feat_list_<N>' feature file, N = 10, 20, 30, 50, 75, 90.
# Each call returns (accuracy list, F1 list, AUC list).
(accuracy_list_nas_10_mi,
 f1_score_list_nas_10_mi,
 auc_list_nas_10_mi) = model_train_predict('mi_feat_list_10')

(accuracy_list_nas_20_mi,
 f1_score_list_nas_20_mi,
 auc_list_nas_20_mi) = model_train_predict('mi_feat_list_20')

(accuracy_list_nas_30_mi,
 f1_score_list_nas_30_mi,
 auc_list_nas_30_mi) = model_train_predict('mi_feat_list_30')

(accuracy_list_nas_50_mi,
 f1_score_list_nas_50_mi,
 auc_list_nas_50_mi) = model_train_predict('mi_feat_list_50')

(accuracy_list_nas_75_mi,
 f1_score_list_nas_75_mi,
 auc_list_nas_75_mi) = model_train_predict('mi_feat_list_75')

(accuracy_list_nas_90_mi,
 f1_score_list_nas_90_mi,
 auc_list_nas_90_mi) = model_train_predict('mi_feat_list_90')

## mRMR

In [None]:
# One search per 'mrmr_feat_list_<N>' feature file, N = 10, 20, 30, 50, 75, 90.
# Each call returns (accuracy list, F1 list, AUC list).
(accuracy_list_nas_10_mrmr,
 f1_score_list_nas_10_mrmr,
 auc_list_nas_10_mrmr) = model_train_predict('mrmr_feat_list_10')

(accuracy_list_nas_20_mrmr,
 f1_score_list_nas_20_mrmr,
 auc_list_nas_20_mrmr) = model_train_predict('mrmr_feat_list_20')

(accuracy_list_nas_30_mrmr,
 f1_score_list_nas_30_mrmr,
 auc_list_nas_30_mrmr) = model_train_predict('mrmr_feat_list_30')

(accuracy_list_nas_50_mrmr,
 f1_score_list_nas_50_mrmr,
 auc_list_nas_50_mrmr) = model_train_predict('mrmr_feat_list_50')

(accuracy_list_nas_75_mrmr,
 f1_score_list_nas_75_mrmr,
 auc_list_nas_75_mrmr) = model_train_predict('mrmr_feat_list_75')

(accuracy_list_nas_90_mrmr,
 f1_score_list_nas_90_mrmr,
 auc_list_nas_90_mrmr) = model_train_predict('mrmr_feat_list_90')

## MI and mRMR

In [None]:
# One search per 'mi_mrmr_feat_list_<N>' feature file, N = 10, 20, 30, 50, 75, 90.
# The pattern is a regular expression, so the trailing '*' makes the last
# digit optional/repeatable rather than acting as a shell wildcard.
(accuracy_list_nas_10_mi_mrmr,
 f1_score_list_nas_10_mi_mrmr,
 auc_list_nas_10_mi_mrmr) = model_train_predict('mi_mrmr_feat_list_10*')

(accuracy_list_nas_20_mi_mrmr,
 f1_score_list_nas_20_mi_mrmr,
 auc_list_nas_20_mi_mrmr) = model_train_predict('mi_mrmr_feat_list_20*')

(accuracy_list_nas_30_mi_mrmr,
 f1_score_list_nas_30_mi_mrmr,
 auc_list_nas_30_mi_mrmr) = model_train_predict('mi_mrmr_feat_list_30*')

(accuracy_list_nas_50_mi_mrmr,
 f1_score_list_nas_50_mi_mrmr,
 auc_list_nas_50_mi_mrmr) = model_train_predict('mi_mrmr_feat_list_50*')

(accuracy_list_nas_75_mi_mrmr,
 f1_score_list_nas_75_mi_mrmr,
 auc_list_nas_75_mi_mrmr) = model_train_predict('mi_mrmr_feat_list_75*')

(accuracy_list_nas_90_mi_mrmr,
 f1_score_list_nas_90_mi_mrmr,
 auc_list_nas_90_mi_mrmr) = model_train_predict('mi_mrmr_feat_list_90*')

## Results

In [None]:
# Concatenate the per-run metric lists in the order 10, 20, 30, 50, 75, 90,
# so each overall list holds all results of one run before those of the next.

# Accuracy
overall_accuracy_list_nas_mi = [
    *accuracy_list_nas_10_mi, *accuracy_list_nas_20_mi, *accuracy_list_nas_30_mi,
    *accuracy_list_nas_50_mi, *accuracy_list_nas_75_mi, *accuracy_list_nas_90_mi,
]
overall_accuracy_list_nas_mrmr = [
    *accuracy_list_nas_10_mrmr, *accuracy_list_nas_20_mrmr, *accuracy_list_nas_30_mrmr,
    *accuracy_list_nas_50_mrmr, *accuracy_list_nas_75_mrmr, *accuracy_list_nas_90_mrmr,
]
overall_accuracy_list_nas_mi_mrmr = [
    *accuracy_list_nas_10_mi_mrmr, *accuracy_list_nas_20_mi_mrmr, *accuracy_list_nas_30_mi_mrmr,
    *accuracy_list_nas_50_mi_mrmr, *accuracy_list_nas_75_mi_mrmr, *accuracy_list_nas_90_mi_mrmr,
]

# F1 score
overall_f1_score_list_nas_mi = [
    *f1_score_list_nas_10_mi, *f1_score_list_nas_20_mi, *f1_score_list_nas_30_mi,
    *f1_score_list_nas_50_mi, *f1_score_list_nas_75_mi, *f1_score_list_nas_90_mi,
]
overall_f1_score_list_nas_mrmr = [
    *f1_score_list_nas_10_mrmr, *f1_score_list_nas_20_mrmr, *f1_score_list_nas_30_mrmr,
    *f1_score_list_nas_50_mrmr, *f1_score_list_nas_75_mrmr, *f1_score_list_nas_90_mrmr,
]
overall_f1_score_list_nas_mi_mrmr = [
    *f1_score_list_nas_10_mi_mrmr, *f1_score_list_nas_20_mi_mrmr, *f1_score_list_nas_30_mi_mrmr,
    *f1_score_list_nas_50_mi_mrmr, *f1_score_list_nas_75_mi_mrmr, *f1_score_list_nas_90_mi_mrmr,
]

# AUC
overall_auc_list_nas_mi = [
    *auc_list_nas_10_mi, *auc_list_nas_20_mi, *auc_list_nas_30_mi,
    *auc_list_nas_50_mi, *auc_list_nas_75_mi, *auc_list_nas_90_mi,
]
overall_auc_list_nas_mrmr = [
    *auc_list_nas_10_mrmr, *auc_list_nas_20_mrmr, *auc_list_nas_30_mrmr,
    *auc_list_nas_50_mrmr, *auc_list_nas_75_mrmr, *auc_list_nas_90_mrmr,
]
overall_auc_list_nas_mi_mrmr = [
    *auc_list_nas_10_mi_mrmr, *auc_list_nas_20_mi_mrmr, *auc_list_nas_30_mi_mrmr,
    *auc_list_nas_50_mi_mrmr, *auc_list_nas_75_mi_mrmr, *auc_list_nas_90_mi_mrmr,
]

In [None]:
# The overall_* metric lists are grouped by percentile: all samples of the
# '10' run come first, then all samples of the '20' run, and so on. The label
# columns must follow the same layout, so each percentile label is repeated
# once per sample while the sample names cycle once per percentile.
percentile_labels = ['10', '20', '30', '50', '75', '90']
n_samples = len(list_sample_dataframes)
n_rows = n_samples * len(percentile_labels)

models = ['nas'] * n_rows
percentiles = [label for label in percentile_labels for _ in range(n_samples)]
# Slice back to the original file names first so that re-running this cell
# does not keep multiplying the list.
filename_sample_list = filename_sample_list[:n_samples] * len(percentile_labels)

# Sanity check: all four lengths must agree before building the results frame.
print(len(models))
print(len(percentiles))
print(len(filename_sample_list))
print(len(overall_accuracy_list_nas_mi_mrmr))

In [None]:
# Collect the label columns and the nine metric columns into one table and
# write it to a CSV whose name carries the current length_text value.
results_dictionary = dict(
    samples=filename_sample_list,
    models=models,
    percentiles=percentiles,
    mi_accuracy=overall_accuracy_list_nas_mi,
    mi_f1_score=overall_f1_score_list_nas_mi,
    mi_auc=overall_auc_list_nas_mi,
    mrmr_accuracy=overall_accuracy_list_nas_mrmr,
    mrmr_f1_score=overall_f1_score_list_nas_mrmr,
    mrmr_auc=overall_auc_list_nas_mrmr,
    mi_mrmr_accuracy=overall_accuracy_list_nas_mi_mrmr,
    mi_mrmr_f1_score=overall_f1_score_list_nas_mi_mrmr,
    mi_mrmr_auc=overall_auc_list_nas_mi_mrmr,
)
results_df = pd.DataFrame(results_dictionary)

output_csv_name = 'overall_results_20_nas_{}.csv'.format(length_text)
results_df.to_csv(output_csv_name, index=False)