# Diabetes readmission prediction

Import libraries for later use:

In [43]:
import pandas as pd
import numpy as np
import pickle as pkl
import os
import subprocess
from operator import itemgetter
from helper import EstimatorSelectionHelper
from sklearn.metrics import f1_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif, SelectKBest, f_classif
from sklearn.preprocessing import RobustScaler

In [3]:
EMB_SIZE = 512  # Size of universal sentence encoder embeddings 
useDummy = True  # Whether to use 1hot encoding or ordinal

To avoid confusion we will only present the results achieved with 1hot encoding, which after experimentation seemed to offer better results. We also tried a couple of embeding sizes without noticing significant differences in performance. These among other experimentation insights will be listed at the end of the notebook.

Importing dataframes:

In [12]:
train_df = pd.read_csv('project2_data/10k_diabetes/diab_train.csv')
valid_df = pd.read_csv('project2_data/10k_diabetes/diab_validation.csv')
test_df = pd.read_csv('project2_data/10k_diabetes/diab_test.csv')

## Preprocessing

The following columns provide no information or contain too many nans:

In [13]:
to_remove = ['Unnamed: 0', 'weight', 'payer_code', 'discharge_disposition_id', 'admission_source_id',
             'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult']

In [14]:
# Values mapped to nan
NaN_values = ['?', 'nan', 'Not Available', 'Not Mapped', 'None']

# Prediction
pred_columns = ['readmitted']

# All integer columns
int_columns = ['time_in_hospital', 'num_lab_procedures',
               'num_procedures', 'num_medications', 'number_outpatient',
               'number_emergency', 'number_inpatient', 'number_diagnoses',
               ]

# All categorical columns
cat_columns = ['race', 'gender', 'age', 'metformin', 'repaglinide', 'nateglinide',
               'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
               'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
               'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
               'insulin', 'glyburide.metformin', 'glipizide.metformin',
               'glimepiride.pioglitazone', 'metformin.rosiglitazone',
               'metformin.pioglitazone', 'change', 'diabetesMed'
               ]


# All string columns
str_columns = ['diag_1_desc', 'diag_2_desc', 'diag_3_desc']

In [15]:
# create dictionary mapping each category to a number
def create_maps(df, cat_columns, nan_categories):
    maps = {}
    for column in cat_columns:
        maps[column] = {category: i for i,
                        category in enumerate(df[column].unique())}

    for key in maps.keys():
        for category in maps[key]:
            if category in nan_categories:
                maps[key][category] = np.NaN
    return maps


def fix_integers(df, int_columns):
    for column in int_columns:
        df[column] = df[column].where(
            lambda x: [str(i).isdigit() for i in x], np.NaN).astype(np.float)
    return df


# encode categories into numbers using create_maps output
def fix_categories(df, cat_columns, cat_map):
    for column in cat_columns:
        df[column] = df[column].replace(cat_map[column])
        df[column] = df[column].astype('float64')
    return df


# map different nan strings to nan value
def set_NaN(df, cat_columns, nan_values):
    nan_map = {key: np.NaN for key in nan_values}
    for column in cat_columns:
        df[column] = df[column].replace(nan_map)
    return df


def create_missing_columns(df, unique_columns):
    for column in unique_columns:
        if column not in df.columns:
            df[column] = 0
    return df


def remove_columns(df, to_remove):
    return df.drop(columns=to_remove)

In [16]:
# remove uninteresting features
train_df = remove_columns(train_df, to_remove)
valid_df = remove_columns(valid_df, to_remove)
test_df = remove_columns(test_df, to_remove)

# Fix the Integers - Set all the unknown to 0
train_df = fix_integers(train_df, int_columns)
valid_df = fix_integers(valid_df, int_columns)
test_df = fix_integers(test_df, int_columns)

# Find category map
# category_map = create_maps(pd.concat((train_df, valid_df, test_df)), cat_columns, NaN_values)

Hard coded result of create_maps:

In [17]:
category_map = {'race': {'AfricanAmerican': 0, 'Caucasian': 1, 'Asian': 2, 'Other': 3, 'Hispanic': 4, '?': np.NaN},
                'gender': {'Male': 0, 'Female': 1},
                'age': {'[0-10)': 0, '[10-20)': 1, '[20-30)': 2, '[30-40)': 3, '[40-50)': 4, '[50-60)': 5, '[60-70)': 6, '[70-80)': 7, '[80-90)': 8, '[90-100)': 9},
                'weight': {'?': np.NaN, '[0-25)': 1, '[25-50)': 2, '[50-75)': 3, '[75-100)': 4, '[100-125)': 5, '[125-150)': 6, '[150-175)': 7},
                'max_glu_serum': {'None': np.NaN, 'Norm': 1, '>200': 2, '>300': 3},
                'A1Cresult': {'None': 0, '>8': 1, 'Norm': 2, '>7': 3},
                'metformin': {'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3},
                'repaglinide': {'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3},
                'nateglinide': {'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3},
                'chlorpropamide': {'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3},
                'glimepiride': {'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3},
                'acetohexamide': {'No': 0},
                'glipizide': {'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3},
                'glyburide': {'No': 0, 'Steady': 1, 'Down': 2, 'Up': 3},
                'tolbutamide': {'No': 0, 'Steady': 1},
                'pioglitazone': {'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3},
                'rosiglitazone': {'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3},
                'acarbose': {'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3},
                'miglitol': {'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3},
                'troglitazone': {'No': 0},
                'tolazamide': {'No': 0, 'Steady': 1},
                'examide': {'No': 0},
                'citoglipton': {'No': 0},
                'insulin': {'No': 0, 'Down': 1, 'Up': 2, 'Steady': 3},
                'glyburide.metformin': {'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3},
                'glipizide.metformin': {'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3},
                'glimepiride.pioglitazone': {'No': 0},
                'metformin.rosiglitazone': {'No': 0},
                'metformin.pioglitazone': {'No': 0},
                'change': {'No': 0, 'Ch': 1},
                'diabetesMed': {'No': 0, 'Yes': 1},
                'admission_type_id': {'Emergency': 0, 'Elective': 1, 'Urgent': 2, 'Newborn': 3, 'Not Available': np.NaN,
                                      'Not Mapped': np.NaN},
                'medical_specialty': {'?': np.NaN, 'Family/GeneralPractice': 1, 'Emergency/Trauma': 2, 'InternalMedicine': 3,
                                      'Psychiatry': 4, 'Surgery-Neuro': 5, 'Surgery-General': 6, 'Cardiology': 7,
                                      'Orthopedics': 8, 'Orthopedics-Reconstructive': 9, 'Nephrology': 10,
                                      'Pediatrics-Pulmonology': 11, 'Gastroenterology': 12,
                                      'Surgery-Cardiovascular/Thoracic': 13, 'Osteopath': 14,
                                      'PhysicalMedicineandRehabilitation': 15, 'Hematology/Oncology': 16,
                                      'Surgery-Vascular': 17, 'Pediatrics-Endocrinology': 18, 'Oncology': 19,
                                      'ObstetricsandGynecology': 20, 'Urology': 21, 'Neurology': 22, 'Pulmonology': 23,
                                      'Surgery-Cardiovascular': 24, 'Radiologist': 25, 'OutreachServices': 26,
                                      'Surgery-Plastic': 27, 'Endocrinology': 28, 'Ophthalmology': 29,
                                      'Obsterics&Gynecology-GynecologicOnco': 30, 'Radiology': 31,
                                      'Surgery-Thoracic': 32, 'Pediatrics': 33, 'Psychology': 34, 'Otolaryngology': 35,
                                      'InfectiousDiseases': 36, 'Pediatrics-CriticalCare': 37, 'Gynecology': 38,
                                      'Pediatrics-Hematology-Oncology': 39, 'Surgeon': 40, 'Podiatry': 41,
                                      'Obstetrics': 42, 'Anesthesiology-Pediatric': 43, 'Hospitalist': 44,
                                      'Hematology': 45, 'Pathology': 46, 'Surgery-Pediatric': 47,
                                      'Cardiology-Pediatric': 48, 'Surgery-Colon&Rectal': 49, 'PhysicianNotFound': 50,
                                      'Surgery-PlasticwithinHeadandNeck': 51, 'Pediatrics-EmergencyMedicine': 52}
                }

In [18]:
if useDummy:
    # Set all missing values to NaN
    train_df = set_NaN(train_df, cat_columns, NaN_values)
    valid_df = set_NaN(valid_df, cat_columns, NaN_values)
    test_df = set_NaN(test_df, cat_columns, NaN_values)

    # Create Dummy Variables for all the categories
    train_cat_dummies = pd.get_dummies(train_df[cat_columns])
    valid_cat_dummies = pd.get_dummies(valid_df[cat_columns])
    test_cat_dummies = pd.get_dummies(test_df[cat_columns])

    # Some dummy-variable get excluded because the category doesn't exist in a dataframe.
    # In here we make it so that all 3 dataframes have the same columns
    unique_columns = np.unique(np.concatenate(
        (train_cat_dummies.columns,
         valid_cat_dummies.columns,
         test_cat_dummies.columns)))

    train_cat_dummies = create_missing_columns(train_cat_dummies, unique_columns)
    valid_cat_dummies = create_missing_columns(valid_cat_dummies, unique_columns)
    test_cat_dummies = create_missing_columns(test_cat_dummies, unique_columns)

    # Merge dummy variables and integer variables to one dataframe
    train_data = pd.concat(
        [train_cat_dummies, train_df[int_columns]], axis=1)
    valid_data = pd.concat(
        [valid_cat_dummies, valid_df[int_columns]], axis=1)
    test_data = pd.concat(
        [test_cat_dummies, test_df[int_columns]], axis=1)
else:
    # Fix categories - Apply category map to integers
    train_data = fix_categories(train_df, cat_columns, category_map)[cat_columns + int_columns]
    valid_data = fix_categories(valid_df, cat_columns, category_map)[cat_columns + int_columns]
    test_data = fix_categories(test_df, cat_columns, category_map)[cat_columns + int_columns]

Get y_values:

In [19]:
train_y = train_df[pred_columns[0]].values
valid_y = valid_df[pred_columns[0]].values
test_y = test_df[pred_columns[0]].values

Fill missing values with median:

In [20]:
train_data = train_data.fillna(train_data.median())
valid_data = valid_data.fillna(train_data.median())
test_data = test_data.fillna(train_data.median())

## Model grid and evaluation function

SVC was commented out since it takes too long and the scores were not that great.

In [22]:
models = {
    'AdaBoostClassifier': AdaBoostClassifier(base_estimator=LogisticRegression(solver='liblinear', class_weight='balanced')),
    # 'SVC': SVC(class_weight='balanced', gamma='auto'),
    'LogisticRegression': LogisticRegression(solver='liblinear', class_weight='balanced'),
    'GaussianNB': GaussianNB(),
    'BernoulliNB': BernoulliNB(),
    'RandomForest': RandomForestClassifier(class_weight='balanced')
}

params = {
    'AdaBoostClassifier':  {'n_estimators': [8, 16, 32, 64, 128, 256]},
    # 'SVC': [
    #    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
    #     {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100]},
    # ],
    'LogisticRegression': {'C': [0.1, 1, 10, 50, 100]},
    'GaussianNB': {},
    'BernoulliNB': {},
    'RandomForest': {'n_estimators': [16, 32, 100]},
}

The following models prepend standardization and feature selection to the pipeline: 

In [24]:
models2 = {
    'AdaBoostClassifier': Pipeline([('scaler', RobustScaler()),
                                    ('k_best', SelectKBest()),
                                    ('classifier', AdaBoostClassifier(base_estimator=LogisticRegression(solver='liblinear',
                                                                                                        class_weight='balanced')))]),
    'LogisticRegression': Pipeline([('scaler', RobustScaler()),
                                    ('k_best', SelectKBest()),
                                    ('classifier', LogisticRegression(solver='liblinear', class_weight='balanced'))]),

    'GaussianNB': Pipeline([('scaler', RobustScaler()),
                            ('k_best', SelectKBest()),
                            ('classifier', GaussianNB())]),

    'BernoulliNB': Pipeline([('scaler', RobustScaler()),
                            ('k_best', SelectKBest()),
                            ('classifier', BernoulliNB())]),

    'RandomForest': Pipeline([('scaler', RobustScaler()),
                              ('k_best', SelectKBest()),
                              ('classifier', RandomForestClassifier(class_weight='balanced'))])
}

params2 = {
    'AdaBoostClassifier':  {'k_best__k': [10, 20, 'all'],
                            'k_best__score_func': [f_classif, mutual_info_classif],
                            'classifier__n_estimators': [8, 16, 32, 64, 128, 256]},

    'LogisticRegression':  {'k_best__k': [10, 20, 'all'],
                            'k_best__score_func': [f_classif, mutual_info_classif],
                            'classifier__C': [0.1, 1, 10, 50, 100]},

    'GaussianNB':  {'k_best__k': [10, 20, 'all'],
                    'k_best__score_func': [f_classif, mutual_info_classif]},

    'BernoulliNB':  {'k_best__k': [10, 20, 'all'],
                     'k_best__score_func': [f_classif, mutual_info_classif]},

    'RandomForest':  {'k_best__k': [10, 20, 'all'],
                      'k_best__score_func': [f_classif, mutual_info_classif],
                      'classifier__n_estimators': [16, 32, 100]}
}

Definition of evaluation function. It performs 5x cross validation optimizing for f1 score. One can inspect the values of the grid search by checking the summary returned. 

In [27]:
def evaluate(X_train, train_y, X_test, test_y, models, params):
    helper = EstimatorSelectionHelper(models, params)
    helper.fit(X_train, train_y, X_test, test_y,
               scoring=make_scorer(f1_score), n_jobs=-1)
    summary = helper.score_summary(sort_by='mean_score')
    # summary.to_pickle('summary')

    print('\nF1 test scores:')
    sortedKeysAndValues = sorted(helper.test_f1_scores.items(), key=lambda kv: -kv[1])
    for k, v in sortedKeysAndValues:
        print(k + ': ' + str(v))

    print('\nAUROC test scores:')
    sortedKeysAndValues = sorted(helper.test_auroc_scores.items(), key=lambda kv: -kv[1])
    for k, v in sortedKeysAndValues:
        print(k + ': ' + str(v))

    return summary

## Ignoring text features

We wanted to have a baseline score for the task when not using any information coming from the diagnosis columns

In [28]:
X_train = train_data
X_valid = valid_data
X_test = test_data

X_train = np.concatenate((X_train, X_valid))
y_train = np.concatenate((train_y, valid_y))
y_test = test_y

summary = evaluate(X_train, y_train, X_test, y_test, models, params)
# print(summary[['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']])

Running GridSearchCV for AdaBoostClassifier.
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   34.6s finished


Running GridSearchCV for LogisticRegression.
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    7.0s finished


Running GridSearchCV for GaussianNB.
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Running GridSearchCV for BernoulliNB.
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Running GridSearchCV for RandomForest.
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    3.8s finished



F1 test scores:
AdaBoostClassifier: 0.5862785862785863
BernoulliNB: 0.5697373288938217
GaussianNB: 0.5656855707106964
LogisticRegression: 0.5459518599562363
RandomForest: 0.3800829875518672

AUROC test scores:
AdaBoostClassifier: 0.6366059273824088
LogisticRegression: 0.628508981341502
RandomForest: 0.6056641010666028
BernoulliNB: 0.5880488031668984
GaussianNB: 0.497675915294452


Hard coded results for models2:

F1 test scores:  
GaussianNB: 0.5656855707106964  
BernoulliNB: 0.5655105973025047  
AdaBoostClassifier: 0.5654496883348175  
LogisticRegression: 0.5220012055455092  
RandomForest: 0.420249653259362  
  
AUROC test scores:  
LogisticRegression: 0.6376099486914812  
BernoulliNB: 0.6219144105788951  
AdaBoostClassifier: 0.6130119490028219  
RandomForest: 0.5808404316560292  
GaussianNB: 0.497675915294452  

Hard coded results for models without dummy variables:

F1 test scores:  
LogisticRegression: 0.5248484848484849  
AdaBoostClassifier: 0.5136417556346381  
RandomForest: 0.43959469992205763  
BernoulliNB: 0.41346906812842593  
GaussianNB: 0.28185328185328185  
  
AUROC test scores:  
RandomForest: 0.6465468875861803  
LogisticRegression: 0.6419718518812602  
BernoulliNB: 0.6195015206587048  
GaussianNB: 0.618141756107448  
AdaBoostClassifier: 0.614639696348852  

Hard coded results for models2 without dummy variables:

F1 test scores:  
GaussianNB: 0.5682551056968829  
AdaBoostClassifier: 0.5411230856494612  
LogisticRegression: 0.5177725118483413  
RandomForest: 0.437094682230869  
BernoulliNB: 0.4362264150943397  
  
AUROC test scores:  
AdaBoostClassifier: 0.6467929302691007  
BernoulliNB: 0.6371612211657304  
LogisticRegression: 0.6353804154203464  
GaussianNB: 0.6253856497041741  
RandomForest: 0.5750936895014476  

## Google's Universal Sentence Encoder

In this section we will use a pretrained model to obtain sentence embeddings. The module comes in two flavors, one using a deep averaging network (DAN) encoder and a more powerful one with a Transformer encoder. We tried both and kept the first one, since it is lighter and the results were almost identical.

We defined three ways to combine the three sentence vectors obtained for each diagnosis: 
1. Average
2. Weighted average: we thought it might be a good idea to place a higher weight in the first diagnosis, since their order is not arbitrary
3. Concatenation

In [29]:
def flatten_diag(embs):
    diag_features = []
    embs = [np.zeros(EMB_SIZE) if np.isnan(x).all() else x for x in embs]
    for i in range(len(embs)):
        if i % 3 == 0:
            flattened = np.array(embs[i:i+3]).reshape(-1)
            diag_features.append(flattened)
    return diag_features


def average_diag(embs, weighted=False):
    diag_features = []
    weights = np.array([3, 2, 1])
    for i in range(len(embs)):
        if i % 3 == 0:
            aux = embs[i:i+3]
            nonans = []
            for j in range(3):
                if not np.isnan(aux[j]).all():
                    nonans.append(j)
            if weighted:
                averaged = np.dot(itemgetter(*nonans)(weights), itemgetter(*nonans)(aux))\
                           / np.sum(itemgetter(*nonans)(weights))
            else:
                averaged = np.average(itemgetter(*nonans)(aux), axis=0)
            diag_features.append(averaged)

    return diag_features


def avoid_bug(array):
    converted = np.zeros((array.shape[0], EMB_SIZE))
    for i in range(array.shape[0]):
        converted[i] = array[i]

    return converted

In [44]:
size_train = train_df.shape[0]
size_valid = valid_df.shape[0]
size_test = test_df.shape[0]

Be patient please, first execution takes a while for the model to be downloaded (~1GB). Have a look at the instructions inside 'preprocessing_uni.py' if facing problems executing the next cell

In [45]:
if not os.path.isfile('project2_data/10k_diabetes/uni_emb.pkl'):
    p = subprocess.run(["python", "preprocessing_uni.py"], stdout=subprocess.PIPE)
    print(p)

with open('project2_data/10k_diabetes/uni_emb.pkl', 'rb') as input_file:
    sentence_embs = pkl.load(input_file)

embs = np.array(average_diag(sentence_embs, weighted=False))
embs = avoid_bug(embs)

embs_train_unweighted = embs[:size_train]
embs_valid_unweighted = embs[size_train:(size_train + size_valid)]
embs_test_unweighted = embs[(size_train + size_valid):]

Now we evaluate the performance of the models using only the text representations

In [46]:
X_train = embs_train_unweighted
X_valid = embs_valid_unweighted
X_test = embs_test_unweighted

X_train = np.concatenate((X_train, X_valid))
y_train = np.concatenate((train_y, valid_y))
y_test = test_y

summary = evaluate(X_train, y_train, X_test, y_test, models, params)
# print(summary[['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']])

Running GridSearchCV for AdaBoostClassifier.
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.0min finished


Running GridSearchCV for LogisticRegression.
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   16.7s finished


Running GridSearchCV for GaussianNB.
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.5s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


Running GridSearchCV for BernoulliNB.
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.4s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


Running GridSearchCV for RandomForest.
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   29.5s finished



F1 test scores:
AdaBoostClassifier: 0.5322503583373148
GaussianNB: 0.49640685461580975
LogisticRegression: 0.4954545454545455
BernoulliNB: 0.472663139329806
RandomForest: 0.3408071748878924

AUROC test scores:
LogisticRegression: 0.578699181215921
GaussianNB: 0.5638854266463703
BernoulliNB: 0.5638613970000554
AdaBoostClassifier: 0.5635834889165868
RandomForest: 0.5183842465817828


Hard coded results for models2+unweighted:

F1 test scores:  
LogisticRegression: 0.505849582172702  
AdaBoostClassifier: 0.5042492917847025  
GaussianNB: 0.48763657274295574  
BernoulliNB: 0.44458052663808934  
RandomForest: 0.319634703196347  
  
AUROC test scores:  
LogisticRegression: 0.5771670300715352  
AdaBoostClassifier: 0.5767470336446392  
GaussianNB: 0.5674402471501362  
BernoulliNB: 0.5494587583359365  
RandomForest: 0.5154155404946555   

Hard coded results for models+weighted:

F1 test scores:  
LogisticRegression: 0.49520586576424136  
AdaBoostClassifier: 0.49229074889867847  
GaussianNB: 0.48763657274295574  
BernoulliNB: 0.4520123839009288  
RandomForest: 0.3214013709063214  
  
AUROC test scores:  
LogisticRegression: 0.5732679587651269  
GaussianNB: 0.5674402471501362  
BernoulliNB: 0.5607411996644207  
AdaBoostClassifier: 0.5565986975931697  
RandomForest: 0.5139748064829897  

Hard coded results for models2+weighted:

F1 test scores:  
LogisticRegression: 0.505849582172702  
AdaBoostClassifier: 0.5042492917847025  
GaussianNB: 0.48763657274295574  
BernoulliNB: 0.44458052663808934  
RandomForest: 0.319634703196347  
  
AUROC test scores:  
LogisticRegression: 0.5771670300715352  
AdaBoostClassifier: 0.5767470336446392  
GaussianNB: 0.5674402471501362  
BernoulliNB: 0.5494587583359365  
RandomForest: 0.5154155404946555  

Hard coded results for models+concatenation:

F1 test scores:  
AdaBoostClassifier: 0.5018607123870281  
LogisticRegression: 0.4999999999999999  
GaussianNB: 0.49914529914529915  
BernoulliNB: 0.4936268829663963  
RandomForest: 0.31145038167938927  
  
AUROC test scores:  
LogisticRegression: 0.5795255920957091  
BernoulliNB: 0.5757915940118121  
GaussianNB: 0.5708138005393087  
AdaBoostClassifier: 0.5635594592702718  
RandomForest: 0.5182442477728174  

We can see the best method is unweighted averaging, so we will use this in combination with the rest of the features.

In [47]:
X_train = np.concatenate((embs_train_unweighted, train_data), axis=1)
X_valid = np.concatenate((embs_valid_unweighted, valid_data), axis=1)
X_test = np.concatenate((embs_test_unweighted, test_data), axis=1)

X_train = np.concatenate((X_train, X_valid))
y_train = np.concatenate((train_y, valid_y))
y_test = test_y

summary = evaluate(X_train, y_train, X_test, y_test, models, params)
# print(summary[['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']])

Running GridSearchCV for AdaBoostClassifier.
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.9min finished


Running GridSearchCV for LogisticRegression.
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.9min finished


Running GridSearchCV for GaussianNB.
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.6s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished


Running GridSearchCV for BernoulliNB.
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.5s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


Running GridSearchCV for RandomForest.
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   24.5s finished



F1 test scores:
AdaBoostClassifier: 0.5870563674321504
GaussianNB: 0.5662953647143371
BernoulliNB: 0.5430656934306569
LogisticRegression: 0.3202911737943585
RandomForest: 0.30173124484748554

AUROC test scores:
AdaBoostClassifier: 0.6368942831381882
LogisticRegression: 0.6274642141104173
BernoulliNB: 0.5802370785800778
RandomForest: 0.5357806657465749
GaussianNB: 0.49891083016159415


Hard coded results for models+weighted:

F1 test scores:  
AdaBoostClassifier: 0.5867112411199331  
GaussianNB: 0.5656855707106964  
BernoulliNB: 0.5355566454144188  
LogisticRegression: 0.31444241316270566  
RandomForest: 0.305532617671346  
  
AUROC test scores:  
AdaBoostClassifier: 0.637052042990082  
LogisticRegression: 0.6276366007035462  
BernoulliNB: 0.5803964055828181  
RandomForest: 0.5330961363463028  
GaussianNB: 0.497675915294452  

#### Conclusions for USE

- We have seen using only text features has some predictive power, but not enough to improve the score achieved with only non-textual data
- Weighted method decreases f1 score but mantains auroc. Perhaps the used values for the weights are not optimal
- Something similar occurs when using concatenation method
- Standardization+feature selection was not useful. The best models from the grid used all the features and others using less features performed similarly 
- Using the more powerful version of USE didn't make significant improvements (slightly decreased std)
- Words are very specific to medical domain, fine tuning might offer significant improvements

## Word2vec + RNN