In [1]:
import os 
import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score, train_test_split, LeaveOneOut, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression

# Loading Data 

In [2]:
# Step one directory up so that the relative 'data/...' paths used by the
# following cells resolve from the new working directory.
# NOTE(review): not idempotent - every re-run of this cell climbs one more level.
os.chdir(os.pardir)

In [3]:
# Load the modelling table from data/modelData/modelData.parquet
# (path is relative to the current working directory).
model_data_path = os.path.join('data', 'modelData', 'modelData.parquet')
df = pd.read_parquet(model_data_path)

# Settings

In [4]:
# Keep only the rows with BIN == 32, drop the BIN and ID columns so the
# per-column loops further down do not treat them as features, and renumber
# the rows from 0.
# Written as a single chain that returns a new frame: calling
# drop(..., inplace=True) on the result of a boolean filter mutates a derived
# copy and may trigger pandas' SettingWithCopyWarning.
df = (
    df.loc[df["BIN"] == 32]
      .drop(columns = ["BIN", "ID"])
      .reset_index(drop = True)
)

In [5]:
# Shared L2-penalised logistic regression; the validation loops below refit
# this same estimator for every (mask, feature, split) combination.
# The 'sag' solver shuffles the samples while optimising, so random_state is
# fixed to make the fitted model - and therefore the reported ACC/AUC -
# reproducible across kernel restarts.
logreg = LogisticRegression(
                solver       = 'sag',
                penalty      = 'l2',
                max_iter     = 10000,  # generous iteration cap for the iterative solver
                n_jobs       = -1,
                random_state = 0
                )

# Stratified cross-validation

In [6]:
# Configuration and result accumulators for the stratified k-fold run.
n_Mask = 8  # the loop below visits MASK values 1..n_Mask

skf = StratifiedKFold(n_splits = 5)

# Five independent lists; each receives one entry per (mask, feature, fold).
groupFeat_lst, feat_lst, mask_lst, auc_lst, acc_lst = ([] for _ in range(5))

In [7]:
# Univariate screening with stratified k-fold cross-validation: for every mask
# and every feature column, fit the shared `logreg` on the training folds and
# record accuracy and ROC AUC on the held-out fold.
for mask in np.arange(1, n_Mask + 1):  # For each mask

    # Rows of the current mask only, re-indexed 0..n-1; MASK is constant here.
    aux = df[df['MASK'] == mask].copy(deep=True)
    aux.reset_index(drop = True, inplace = True)
    aux.drop(columns = 'MASK', inplace = True)

    y = aux['SEVERE']
    X = aux.drop(columns = 'SEVERE')

    for feature in range(X.shape[1]):  # For each feature

        # Column names are split on '_': token 0 is stored as FEATURE, token 1 as GROUP.
        string_lst = X.columns[feature].split('_')

        # scikit-learn expects a 2-D design matrix even for a single feature;
        # reshape once per feature instead of once per fold.
        x_col = X.iloc[:, feature].values.reshape(-1, 1)

        for train_index, test_index in skf.split(X, y):

            X_train, X_test = x_col[train_index], x_col[test_index]
            # split() yields positional indices, so select the labels by position.
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            logreg.fit(X_train, y_train)

            # ROC AUC measures ranking quality and needs continuous scores;
            # hard class predictions collapse the curve to one operating point.
            y_score = logreg.decision_function(X_test)
            roc_auc = roc_auc_score(y_test, y_score)
            acc = logreg.score(X_test, y_test)

            groupFeat_lst.append(string_lst[1])
            feat_lst.append(string_lst[0])
            mask_lst.append(mask)
            acc_lst.append(acc)
            auc_lst.append(roc_auc)

# Convert the accumulators to arrays for the DataFrame assembly in the next cell.
groupFeat_lst = np.array(groupFeat_lst)
feat_lst = np.array(feat_lst)
mask_lst = np.array(mask_lst)
auc_lst  = np.array(auc_lst)
acc_lst  = np.array(acc_lst)

In [8]:
# Assemble the per-fold results into one frame, one row per (mask, feature, fold).
# Built column by column so every array keeps its own dtype; stacking text and
# numeric arrays with np.vstack coerces everything to strings first and then
# requires casting the metrics back.
kfold_df = pd.DataFrame({
    'GROUP':   groupFeat_lst,
    'FEATURE': feat_lst,
    'MASK':    mask_lst,
    'ACC':     acc_lst,
    'AUC':     auc_lst,
}).astype({'MASK': int, 'ACC': float, 'AUC': float})

In [9]:
# Renumber the rows 0..n-1 (kept from the original cell, done before the display).
kfold_df = kfold_df.reset_index(drop = True)

# Average ACC/AUC over the folds of each (feature, group, mask) and show the 15
# best combinations by mean AUC. This has to be the last expression of the
# cell: Jupyter only renders the value of the final statement.
kfold_df.groupby(by = ['FEATURE','GROUP','MASK']).mean().nlargest(15, 'AUC')

In [10]:
# Persist the per-fold results; the file name embeds the number of folds.
kfold_path = os.path.join('data', 'validationModelData', f'kfold{skf.get_n_splits()}.parquet')
kfold_df.to_parquet(kfold_path)

# Repeated Random Test-Train Splits

In [11]:
# Configuration and result accumulators for the repeated random split run.
n_Mask = 8    # the loop below visits MASK values 1..n_Mask
n_iter = 100  # number of random train/test splits per (mask, feature)

# Five independent lists; each receives one entry per (mask, feature, split).
groupFeat_lst, feat_lst, mask_lst, auc_lst, acc_lst = ([] for _ in range(5))

In [12]:
# Univariate screening with repeated stratified random splits: for every mask
# and every feature column, draw n_iter 80/20 train/test splits, fit the shared
# `logreg` on the training part and record accuracy and ROC AUC on the test part.
for mask in np.arange(1, n_Mask + 1):  # For each mask

    # Rows of the current mask only, re-indexed 0..n-1; MASK is constant here.
    aux = df[df['MASK'] == mask].copy(deep=True)
    aux.reset_index(drop = True, inplace = True)
    aux.drop(columns = 'MASK', inplace = True)

    y = aux['SEVERE']
    X = aux.drop(columns = 'SEVERE')

    for feature in range(X.shape[1]):  # For each feature

        # Column names are split on '_': token 0 is stored as FEATURE, token 1 as GROUP.
        string_lst = X.columns[feature].split('_')

        # scikit-learn expects a 2-D design matrix even for a single feature;
        # reshape once per feature instead of on every iteration.
        x_col = X.iloc[:, feature].values.reshape(-1, 1)

        for i in range(n_iter):

            # random_state = i makes split number i reproducible across runs.
            X_train, X_test, y_train, y_test = train_test_split(x_col, y, test_size = 0.2,
                                                                stratify = y, random_state = i)

            logreg.fit(X_train, y_train)

            # ROC AUC measures ranking quality and needs continuous scores;
            # hard class predictions collapse the curve to one operating point.
            y_score = logreg.decision_function(X_test)
            roc_auc = roc_auc_score(y_test, y_score)
            acc = logreg.score(X_test, y_test)

            groupFeat_lst.append(string_lst[1])
            feat_lst.append(string_lst[0])
            mask_lst.append(mask)
            acc_lst.append(acc)
            auc_lst.append(roc_auc)

# Convert the accumulators to arrays for the DataFrame assembly in the next cell.
groupFeat_lst = np.array(groupFeat_lst)
feat_lst = np.array(feat_lst)
mask_lst = np.array(mask_lst)
auc_lst  = np.array(auc_lst)
acc_lst  = np.array(acc_lst)



In [13]:
# Assemble the per-split results into one frame, one row per (mask, feature, split).
# Built column by column so every array keeps its own dtype; stacking text and
# numeric arrays with np.vstack coerces everything to strings first and then
# requires casting the metrics back.
randomTestTrainSplit_df = pd.DataFrame({
    'GROUP':   groupFeat_lst,
    'FEATURE': feat_lst,
    'MASK':    mask_lst,
    'ACC':     acc_lst,
    'AUC':     auc_lst,
}).astype({'MASK': int, 'ACC': float, 'AUC': float})

In [14]:
# Renumber the rows 0..n-1 (kept from the original cell, done before the display).
randomTestTrainSplit_df = randomTestTrainSplit_df.reset_index(drop = True)

# Average ACC/AUC over the splits of each (feature, group, mask) and show the 10
# best combinations by mean AUC. This has to be the last expression of the
# cell: Jupyter only renders the value of the final statement.
randomTestTrainSplit_df.groupby(by = ['FEATURE','GROUP','MASK']).mean().nlargest(10, 'AUC')

In [15]:
# Persist the per-split results; the file name embeds the number of iterations.
random_split_path = os.path.join('data', 'validationModelData', f'randomTestTrainSplit{n_iter}.parquet')
randomTestTrainSplit_df.to_parquet(random_split_path)

# Bootstrapping

In [16]:
from sklearn.utils import resample

In [17]:
# Configuration and result accumulators for the bootstrap run.
n_Mask = 8   # the loop below visits MASK values 1..n_Mask
n_iter = 50  # number of bootstrap resamples per (mask, feature)

# Five independent lists; each receives one entry per (mask, feature, resample).
groupFeat_lst, feat_lst, mask_lst, auc_lst, acc_lst = ([] for _ in range(5))

In [18]:
# Univariate screening with the bootstrap: for every mask and every feature
# column, draw n_iter resamples (with replacement, same size as the data), fit
# the shared `logreg` on each resample and score it on the out-of-bag rows.
for mask in np.arange(1, n_Mask + 1):  # For each mask

    # Rows of the current mask only, re-indexed 0..n-1; MASK is constant here.
    aux = df[df['MASK'] == mask].copy(deep=True)
    aux.reset_index(drop = True, inplace = True)
    aux.drop(columns = 'MASK', inplace = True)

    # Select the target by name instead of assuming it is the first column,
    # in line with the k-fold and random-split sections.
    y = aux['SEVERE'].values.astype(int)
    X = aux.drop(columns = 'SEVERE')

    all_idx = np.arange(len(aux))

    for feature in range(X.shape[1]):  # For each feature

        # Column names are split on '_': token 0 is stored as FEATURE, token 1 as GROUP.
        string_lst = X.columns[feature].split('_')

        # scikit-learn expects a 2-D design matrix even for a single feature;
        # built once per feature because it does not change between resamples.
        x_col = X.iloc[:, feature].values.reshape(-1, 1)

        for i in range(n_iter):

            # Sample row positions with replacement; random_state = i makes
            # resample number i reproducible across runs.
            train_idx = resample(all_idx, n_samples = len(aux), random_state = i)
            # Out-of-bag rows are the positions that were never drawn. They are
            # identified by position, not by value: comparing row values would
            # also discard unsampled rows that merely equal a sampled row.
            test_idx = np.setdiff1d(all_idx, train_idx)

            logreg.fit(x_col[train_idx], y[train_idx])

            y_predict = logreg.predict(x_col[test_idx])
            # ROC AUC measures ranking quality and needs continuous scores;
            # hard class predictions collapse the curve to one operating point.
            y_score = logreg.decision_function(x_col[test_idx])
            roc_auc = roc_auc_score(y[test_idx], y_score)
            acc = accuracy_score(y[test_idx], y_predict)

            groupFeat_lst.append(string_lst[1])
            feat_lst.append(string_lst[0])
            mask_lst.append(mask)
            acc_lst.append(acc)
            auc_lst.append(roc_auc)


# Convert the accumulators to arrays for the DataFrame assembly in the next cell.
groupFeat_lst = np.array(groupFeat_lst)
feat_lst = np.array(feat_lst)
mask_lst = np.array(mask_lst)
auc_lst  = np.array(auc_lst)
acc_lst  = np.array(acc_lst)



In [19]:
# Assemble the per-resample results into one frame, one row per (mask, feature, resample).
# Built column by column so every array keeps its own dtype; stacking text and
# numeric arrays with np.vstack coerces everything to strings first and then
# requires casting the metrics back.
bootstrapping_df = pd.DataFrame({
    'GROUP':   groupFeat_lst,
    'FEATURE': feat_lst,
    'MASK':    mask_lst,
    'ACC':     acc_lst,
    'AUC':     auc_lst,
}).astype({'MASK': int, 'ACC': float, 'AUC': float})

In [20]:
# Renumber the rows 0..n-1 (kept from the original cell, done before the display).
bootstrapping_df = bootstrapping_df.reset_index(drop = True)

# Average ACC/AUC over the resamples of each (feature, group, mask) and show the
# 10 best combinations by mean AUC. This has to be the last expression of the
# cell: Jupyter only renders the value of the final statement.
bootstrapping_df.groupby(by = ['FEATURE','GROUP','MASK']).mean().nlargest(10, 'AUC')

In [21]:
# Persist the per-resample results; the file name embeds the number of iterations.
bootstrap_path = os.path.join('data', 'validationModelData', f'bootstrapping{n_iter}.parquet')
bootstrapping_df.to_parquet(bootstrap_path)