In [129]:
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.model_selection import *
from sklearn.metrics import *

from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.decomposition import *

# import torch
# import torch.nn as nn
# from tab_transformer_pytorch import TabTransformer

from tabtransformertf.utils.preprocessing import df_to_dataset, build_categorical_prep
from tabtransformertf.models.tabtransformer import TabTransformer
from tabtransformertf.models.fttransformer import FTTransformerEncoder, FTTransformer

import warnings
warnings.filterwarnings('ignore')
import os
import re
import ast

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... paths used by this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

In [121]:
# Folder scanned by get_sample_df: every regular file in it is read as a CSV.
directory_dataframes = '/content/drive/MyDrive/subsamples/'
# Folder scanned by get_features for the text file holding the feature lists.
directory_features = '/content/drive/MyDrive/features/'

def get_sample_df(directory=directory_dataframes):
    """Load every regular file found in ``directory`` as a CSV.

    Parameters
    ----------
    directory : str
        Folder to scan. Entries are visited in the order ``os.listdir``
        returns them, which is arbitrary (not guaranteed to be sorted).

    Returns
    -------
    tuple(list, list)
        ``(list_dataframes, filename_list)``: the DataFrames produced by
        ``pd.read_csv`` and the names of the files they came from.
        Position ``i`` in one list corresponds to position ``i`` in the other.
    """
    list_dataframes = []
    filename_list = []
    for filename in os.listdir(directory):
        print(filename)
        f = os.path.join(directory, filename)
        # Record the name only for entries that are actually loaded; appending
        # it before this check would let a sub-directory shift filename_list
        # out of step with list_dataframes.
        if os.path.isfile(f):
            list_dataframes.append(pd.read_csv(f))
            filename_list.append(filename)

    return list_dataframes, filename_list

def get_features(regex_str, directory=directory_features):
    """Read the feature lists stored in the file whose name matches ``regex_str``.

    Parameters
    ----------
    regex_str : str
        Regular expression (``re`` syntax, not a shell glob) matched against
        the start of each file name in ``directory``.
    directory : str
        Folder that holds the feature-list text files.

    Returns
    -------
    list of list of str
        One inner list per line of the matched file, each holding the pieces
        obtained by splitting that line on ``'; '`` after separators have
        been inserted. If several files match, the one visited last by
        ``os.listdir`` wins.

        Because a ``';'`` is inserted after every ``'y'``, a line that ends in
        ``'y'`` leaves a trailing ``';'`` on its last item; callers strip it.

    Raises
    ------
    FileNotFoundError
        If no file name in ``directory`` matches ``regex_str``.
    """
    # Match on the bare file name so that a non-default ``directory`` is
    # honoured instead of being compared against a hard-coded path prefix.
    regex = re.compile(regex_str)

    raw_lines = None
    for filename in os.listdir(directory):
        if not regex.match(filename):
            continue
        # The file is only read, so open it read-only and close it promptly.
        with open(os.path.join(directory, filename), "r") as feature_file:
            raw_lines = feature_file.read().splitlines()

    if raw_lines is None:
        raise FileNotFoundError(
            "No file in {!r} matches the pattern {!r}".format(directory, regex_str)
        )

    feat_list = []
    for line in raw_lines:
        # The text file stores each list as a single string, so insert ';'
        # markers and split on them to get the list back.
        # NOTE(review): these replacements presumably rely on every feature
        # name ending in 'y', ')', '4' or '5' -- confirm against the files.
        marked = line.replace('y','y;').replace(') ','); ').replace('4 ', '4; ').replace('5 ', '5; ')
        feat_list.append(marked.split('; '))

    return feat_list

# Load all sample CSVs once; list_sample_dataframes is the default input of
# model_train_predict_ft, filename_sample_list holds the listed file names.
list_sample_dataframes, filename_sample_list = get_sample_df(directory_dataframes)

In [186]:
def model_train_predict_ft(regex_str, dataframes=list_sample_dataframes):
    """Train one FT-Transformer per sample and score it on a held-out split.

    The feature lists are loaded with ``get_features(regex_str)`` and paired
    positionally (``zip``) with ``dataframes``; if the two differ in length
    the surplus entries on the longer side are ignored. For each pair the
    sample is split 80/20 (stratified on the label, ``random_state=42``), a
    new FT-Transformer with linear numerical embeddings is trained on the
    training part, and the test part is scored.

    Parameters
    ----------
    regex_str : str
        Pattern forwarded to ``get_features`` to pick the feature-list file.
    dataframes : list of pandas.DataFrame
        Samples to train on. Each must contain the selected feature columns
        and the ``conversion_class`` label column.

    Returns
    -------
    tuple(list, list, list)
        ``(accuracy_list, f1_score_list, auc_list)`` with one entry per
        trained model, each rounded to 4 decimals. Accuracy and F1 use a 0.5
        threshold on the model output; AUC is the ROC AUC of the raw output.
    """
    # Settings shared by every model trained in this call.
    LABEL = 'conversion_class'
    LEARNING_RATE = 0.0001
    WEIGHT_DECAY = 0.0001
    NUM_EPOCHS = 10

    feat_list = get_features(regex_str)
    accuracy_list = []
    f1_score_list = []
    auc_list = []

    for sample, feat in zip(dataframes, feat_list):
        # A new model is built on every iteration; reset Keras' global state
        # so memory does not keep growing over the many models trained here.
        tf.keras.backend.clear_session()

        # get_features leaves a trailing ';' on the last item of a line that
        # ends in 'y'; remove it so the name matches the column.
        feat[-1] = feat[-1].replace('y;', 'y')

        CATEGORICAL_FEATURES = []
        NUMERIC_FEATURES = feat
        FEATURES = NUMERIC_FEATURES + CATEGORICAL_FEATURES

        train_data, test_data = train_test_split(sample, test_size=0.2, random_state=42, stratify=sample[LABEL])

        train_data[CATEGORICAL_FEATURES] = train_data[CATEGORICAL_FEATURES].astype(str)
        test_data[CATEGORICAL_FEATURES] = test_data[CATEGORICAL_FEATURES].astype(str)

        train_data[NUMERIC_FEATURES] = train_data[NUMERIC_FEATURES].astype(float)
        test_data[NUMERIC_FEATURES] = test_data[NUMERIC_FEATURES].astype(float)

        # Transform to TF datasets; the test set is not shuffled so that the
        # predictions stay aligned with test_data[LABEL].
        train_dataset = df_to_dataset(train_data[FEATURES + [LABEL]], LABEL, batch_size=32)
        test_dataset = df_to_dataset(test_data[FEATURES + [LABEL]], LABEL, shuffle=False, batch_size=32)

        ft_linear_encoder = FTTransformerEncoder(
            numerical_features = NUMERIC_FEATURES,  # list of numeric features
            categorical_features = CATEGORICAL_FEATURES,  # list of categorical features
            numerical_data = train_data[NUMERIC_FEATURES],  # pandas dataframe of numeric features
            categorical_data = train_data[CATEGORICAL_FEATURES],  # pandas dataframe of categorical features
            # categorical_lookup=category_prep_layers,  # dictionary of categorical lookup layers
            # numerical_embeddings=None,  # None for linear embeddings
            numerical_embedding_type='linear',  # Numerical embedding type
            embedding_dim=16,  # Embedding dimension (for categorical, numerical, and contextual)
            depth=3,  # Number of Transformer Blocks (layers)
            heads=6,  # Number of attention heads in a Transformer Block
            attn_dropout=0.2,  # Dropout for attention layers
            ff_dropout=0.2,  # Dropout in Dense layers
            # use_column_embedding=True,  # Fixed column embeddings
            explainable=True  # Whether we want to output attention importances or not
        )

        # Pass the encoder to the model
        ft_linear_transformer = FTTransformer(
            encoder=ft_linear_encoder,  # Encoder from above
            out_dim=1,  # Number of outputs in final layer
            out_activation='sigmoid',  # Activation function for final layer
            # final_layer_size=32,  # Pre-final layer, takes CLS contextual embeddings as input
        )

        # Define optimizer
        optimizer = tfa.optimizers.AdamW(
                learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY
            )

        # Two outputs, second output is importances, so we don't calculate loss for it
        ft_linear_transformer.compile(
            optimizer = optimizer,
            loss = {"output": tf.keras.losses.BinaryCrossentropy(), "importances": None},
            metrics= {"output": [tf.keras.metrics.AUC(name="PR AUC", curve='PR')], "importances": None},
        )

        # Training
        ft_linear_transformer.fit(
            train_dataset,
            epochs=NUM_EPOCHS,
            verbose=0
        )

        # Score the held-out split: flatten the 'output' head once and reuse
        # it for the threshold-free AUC and the 0.5-thresholded metrics.
        linear_test_preds = ft_linear_transformer.predict(test_dataset)
        test_scores = linear_test_preds['output'].ravel()
        test_labels_pred = test_scores > 0.5

        auc_list.append(np.round(roc_auc_score(test_data[LABEL], test_scores), 4))
        f1_score_list.append(np.round(f1_score(test_data[LABEL], test_labels_pred), 4))
        accuracy_list.append(np.round(accuracy_score(test_data[LABEL], test_labels_pred), 4))

    return accuracy_list, f1_score_list, auc_list


In [None]:
# Each call trains one FT-Transformer per sample using the feature-list file
# whose name matches the given pattern, and returns (accuracy, F1, AUC) lists.
# NOTE(review): the patterns are regular expressions, not globs, so the
# trailing '0*' / '5*' means "zero or more of that digit" -- confirm that no
# other file names share these prefixes.
# The number in each name is presumably the share of features kept -- confirm.

# Feature lists from the mi_feat_list_* files.
accuracy_list_10_mi, f1_score_list_10_mi, auc_list_10_mi = model_train_predict_ft('mi_feat_list_10*')
accuracy_list_20_mi, f1_score_list_20_mi, auc_list_20_mi = model_train_predict_ft('mi_feat_list_20*')
accuracy_list_30_mi, f1_score_list_30_mi, auc_list_30_mi = model_train_predict_ft('mi_feat_list_30*')
accuracy_list_50_mi, f1_score_list_50_mi, auc_list_50_mi = model_train_predict_ft('mi_feat_list_50*')
accuracy_list_75_mi, f1_score_list_75_mi, auc_list_75_mi = model_train_predict_ft('mi_feat_list_75*')
accuracy_list_90_mi, f1_score_list_90_mi, auc_list_90_mi = model_train_predict_ft('mi_feat_list_90*')


# Feature lists from the mrmr_feat_list_* files.
accuracy_list_10_mrmr, f1_score_list_10_mrmr, auc_list_10_mrmr = model_train_predict_ft('mrmr_feat_list_10*')
accuracy_list_20_mrmr, f1_score_list_20_mrmr, auc_list_20_mrmr = model_train_predict_ft('mrmr_feat_list_20*')
accuracy_list_30_mrmr, f1_score_list_30_mrmr, auc_list_30_mrmr = model_train_predict_ft('mrmr_feat_list_30*')
accuracy_list_50_mrmr, f1_score_list_50_mrmr, auc_list_50_mrmr = model_train_predict_ft('mrmr_feat_list_50*')
accuracy_list_75_mrmr, f1_score_list_75_mrmr, auc_list_75_mrmr = model_train_predict_ft('mrmr_feat_list_75*')
accuracy_list_90_mrmr, f1_score_list_90_mrmr, auc_list_90_mrmr = model_train_predict_ft('mrmr_feat_list_90*')


# Feature lists from the mi_mrmr_feat_list_* files.
accuracy_list_10_mi_mrmr, f1_score_list_10_mi_mrmr, auc_list_10_mi_mrmr = model_train_predict_ft('mi_mrmr_feat_list_10*')
accuracy_list_20_mi_mrmr, f1_score_list_20_mi_mrmr, auc_list_20_mi_mrmr = model_train_predict_ft('mi_mrmr_feat_list_20*')
accuracy_list_30_mi_mrmr, f1_score_list_30_mi_mrmr, auc_list_30_mi_mrmr = model_train_predict_ft('mi_mrmr_feat_list_30*')
accuracy_list_50_mi_mrmr, f1_score_list_50_mi_mrmr, auc_list_50_mi_mrmr = model_train_predict_ft('mi_mrmr_feat_list_50*')
accuracy_list_75_mi_mrmr, f1_score_list_75_mi_mrmr, auc_list_75_mi_mrmr = model_train_predict_ft('mi_mrmr_feat_list_75*')
accuracy_list_90_mi_mrmr, f1_score_list_90_mi_mrmr, auc_list_90_mi_mrmr = model_train_predict_ft('mi_mrmr_feat_list_90*')
