In [1]:
import pandas as pd
import numpy as np
import random
import os
import warnings

from sklearn.model_selection import StratifiedKFold
from lazypredict.Supervised import LazyClassifier

warnings.simplefilter("ignore", UserWarning)
# warnings.simplefilter("ignore", DataConversionWarning)
warnings.simplefilter("ignore", FutureWarning)



# Load Data

In [2]:
BASEDIR = '../data'

metadata = pd.read_csv(
    os.path.join(BASEDIR, 'metadata', 'metadata.csv')
)

tax_genus = pd.read_csv(
    os.path.join(BASEDIR, 'taxonomy', 'taxonomy_relabd.genus.csv'),
)

pt_ra_1e_1 = pd.read_csv(
    os.path.join(BASEDIR, 'phylotypes', 'phylotype_relabd.1e_1.csv'),
)

alpha = pd.read_csv(
    os.path.join(BASEDIR, 'alpha_diversity', 'alpha_diversity.csv'),
)

cst = pd.read_csv(
    os.path.join(BASEDIR, 'community_state_types', 'cst_valencia.csv'),
)

sv_counts = pd.read_csv(
    os.path.join(BASEDIR, 'sv_counts', 'sp_sv_long.csv'),
)

In [3]:
race_meta = pd.read_csv(
    os.path.join(BASEDIR, 'metadata', 'metadata_normalized.csv')
)

meta = metadata[['specimen', 'participant_id', 'collect_wk']]

index = np.array(meta.specimen)

## sv_counts for features

In [4]:
def sv_to_num(sv):
    try:
        num = int(sv.split('___')[1])
    except:
        num = None
    return num

def sv_for_features(df, index, cols):
    df['num_sv'] = df.sv.map(lambda x: sv_to_num(x))
    
    num_features = 53129
   
    features = []
    for specimen in index:
        feature = np.zeros(num_features)
        new_df = df[df['specimen'] == specimen]
        sv = np.array(new_df['num_sv'])
        fract = np.array(new_df['fract'])
        for i in range(len(sv)):
            if sv[i] == None:
                continue
            else:
                feature[sv[i]-1] = fract[i]
        features.append(feature)
    df_features = pd.DataFrame(features, index=index, columns = cols)
    df_features.index.name = 'specimen'
    return df_features

index = np.array(metadata.specimen)
sv_cols = np.load('../sv_cols.npy') # sorted sv array

new_sv = sv_for_features(sv_counts, index, sv_cols)

## Merge all df

In [5]:
def merge_df(A):
    df = A[0]
    for i in range(1, len(A)):
        df = pd.merge(df, A[i], on='specimen')
    return df

In [6]:
meta = metadata[['specimen', 'participant_id', 'collect_wk']]
race_meta = race_meta[['Race: American Indian or Alaska Native', 'Race: Asian',
                       'Race: Black or African American',
                       'Race: Native Hawaiian or Other Pacific Islander', 'Race: Unknown',
                       'Race: White', 'specimen']]

df = merge_df([meta, tax_genus, pt_ra_1e_1, alpha, cst, new_sv, race_meta])
df = df[df['collect_wk'] <= 32]
# df = df.set_index('specimen')
df = df.drop(columns = ['CST', 'subCST', 'specimen', 'collect_wk'])

dup = df.duplicated('participant_id', keep=False)
duplicated = df[dup]
dup_mean = duplicated.groupby(['participant_id'], as_index=False).aggregate(np.mean)
df = df.drop_duplicates(['participant_id'], keep=False)
df = pd.concat([df, dup_mean])
df = df.set_index('participant_id')

In [7]:
y_df = metadata[['specimen', 'was_preterm', 'participant_id']]
y_df = y_df.drop_duplicates(['participant_id'], keep='first')
y_df = y_df.set_index('participant_id')
y_df = y_df.drop(columns = ['specimen'])

y_df = y_df.loc[df.index]

# Load selected features, alph, cst and race one hot

In [8]:
race_features = np.array(race_meta.columns[:6])
alpha_cst_features = np.load('alpha_cst_features.npy')
selected_features = np.load('selected_features.npy')

In [9]:
all_features = np.concatenate((alpha_cst_features, selected_features))

# Model selection using lazyclassifier
## When model selection, we don't use race features

In [10]:
kf = StratifiedKFold(n_splits=5, random_state = 1)
models = []
for train_idx, val_idx in kf.split(df.index, y_df):
    train_id = np.array(df.index)[train_idx]
    val_id = np.array(df.index)[val_idx]

    train = df.loc[train_id]
    val = df.loc[val_id]

    X_train = train[all_features]
    X_val = val[all_features]

    y_train = np.array(y_df.loc[train_id])
    y_val = np.array(y_df.loc[val_id])
    
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    model, predictions = clf.fit(X_train, X_val, y_train, y_val)
    models.append(model)

100%|██████████| 29/29 [01:58<00:00,  4.09s/it]
100%|██████████| 29/29 [02:07<00:00,  4.39s/it]
100%|██████████| 29/29 [02:07<00:00,  4.41s/it]
100%|██████████| 29/29 [02:09<00:00,  4.45s/it]
100%|██████████| 29/29 [00:36<00:00,  1.25s/it]


In [11]:
models[0]

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BernoulliNB,0.95,0.93,0.93,0.95,0.1
NearestCentroid,0.92,0.9,0.9,0.92,0.09
QuadraticDiscriminantAnalysis,0.9,0.9,0.9,0.9,1.9
PassiveAggressiveClassifier,0.92,0.9,0.9,0.92,0.15
LogisticRegression,0.92,0.89,0.89,0.92,0.3
SVC,0.89,0.88,0.88,0.89,0.7
CalibratedClassifierCV,0.9,0.87,0.87,0.9,3.3
GaussianNB,0.89,0.87,0.87,0.89,0.08
SGDClassifier,0.87,0.87,0.87,0.87,0.16
LinearSVC,0.88,0.86,0.86,0.88,1.0


In [12]:
models[1]

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CalibratedClassifierCV,0.77,0.7,0.7,0.76,2.57
PassiveAggressiveClassifier,0.78,0.69,0.69,0.76,0.15
NearestCentroid,0.77,0.68,0.68,0.75,0.11
LinearSVC,0.77,0.67,0.67,0.75,0.75
LogisticRegression,0.76,0.66,0.66,0.73,0.29
SVC,0.67,0.65,0.65,0.68,0.68
Perceptron,0.7,0.63,0.63,0.69,0.12
RidgeClassifierCV,0.74,0.62,0.62,0.7,2.46
RidgeClassifier,0.74,0.62,0.62,0.7,0.27
NuSVC,0.64,0.61,0.61,0.65,1.03


In [13]:
models[2]

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SVC,0.8,0.78,0.78,0.8,0.71
GaussianNB,0.81,0.76,0.76,0.81,0.09
Perceptron,0.81,0.76,0.76,0.8,0.16
LinearSVC,0.82,0.75,0.75,0.81,1.04
CalibratedClassifierCV,0.81,0.75,0.75,0.8,3.68
PassiveAggressiveClassifier,0.81,0.75,0.75,0.8,0.13
NearestCentroid,0.79,0.74,0.74,0.79,0.09
LogisticRegression,0.81,0.74,0.74,0.8,0.28
QuadraticDiscriminantAnalysis,0.74,0.73,0.73,0.75,1.8
NuSVC,0.74,0.7,0.7,0.74,0.99


In [14]:
models[3]

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
QuadraticDiscriminantAnalysis,0.84,0.78,0.78,0.83,1.48
PassiveAggressiveClassifier,0.82,0.72,0.72,0.8,0.13
SVC,0.78,0.72,0.72,0.77,0.79
LinearSVC,0.81,0.71,0.71,0.79,0.89
CalibratedClassifierCV,0.81,0.7,0.7,0.78,3.3
NuSVC,0.73,0.68,0.68,0.73,1.03
RidgeClassifier,0.79,0.67,0.67,0.75,0.26
RidgeClassifierCV,0.77,0.63,0.63,0.72,2.57
LinearDiscriminantAnalysis,0.75,0.63,0.63,0.7,2.52
SGDClassifier,0.72,0.58,0.58,0.66,0.21


In [15]:
models[4]

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Perceptron,0.76,0.62,0.62,0.7,0.13
GaussianNB,0.74,0.6,0.6,0.68,0.1
CalibratedClassifierCV,0.73,0.59,0.59,0.67,3.51
LinearSVC,0.73,0.59,0.59,0.67,0.98
PassiveAggressiveClassifier,0.73,0.58,0.58,0.66,0.14
SGDClassifier,0.69,0.58,0.58,0.65,0.19
RidgeClassifierCV,0.71,0.56,0.56,0.64,1.79
LogisticRegression,0.72,0.56,0.56,0.63,0.29
SVC,0.67,0.55,0.55,0.63,0.76
DecisionTreeClassifier,0.71,0.55,0.55,0.62,0.21


# Model training
## Simply add race feature for selected model training

In [16]:
all_features = np.concatenate((selected_features, alpha_cst_features, race_features))

In [17]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(1) # Seed 고정

In [18]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, balanced_accuracy_score

def cal_mat(preds, labels):
    
    result = {}
    result['acc'] = accuracy_score(labels, preds)
    result['AUC'] = roc_auc_score(labels, preds)
    result['macro f1'] = f1_score(labels, preds, average='macro')
    result['precision'] = precision_score(labels, preds)
    result['recall'] = recall_score(labels, preds)
    result['bal_acc'] = balanced_accuracy_score(labels, preds)
    
    return result

In [19]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import VotingClassifier
from joblib import dump, load

seed = 1

kf = StratifiedKFold(n_splits=5, random_state = seed)
i = 1

indexes = []
pred_labels = []
pred_probability = []

acc_results = []
auc_results = []
for train_idx, val_idx in kf.split(y_df.index, y_df):
    train_id = np.array(y_df.index)[train_idx]
    val_id = np.array(y_df.index)[val_idx]

    train = df.loc[train_id]
    val = df.loc[val_id]

    X_train = train[all_features]
    X_val = val[all_features]

    y_train = np.array(y_df.loc[train_id])
    y_val = np.array(y_df.loc[val_id])

    clf1 = make_pipeline(StandardScaler(), QuadraticDiscriminantAnalysis())
    
    clf3 = make_pipeline(StandardScaler(), CalibratedClassifierCV())

    lsvc = make_pipeline(StandardScaler(), LinearSVC())
    clf4 = CalibratedClassifierCV(lsvc, cv=10, method='isotonic')

    clf6 = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True))

    pac = make_pipeline(StandardScaler(), PassiveAggressiveClassifier())
    clf7 = CalibratedClassifierCV(pac, cv=10, method='isotonic')
    
    lgr = make_pipeline(StandardScaler(), LogisticRegression())
    clf8 = CalibratedClassifierCV(lgr, cv=10, method='isotonic')


    eclf = VotingClassifier(estimators = [
                                          ('qda', clf1), 
                                          ('cbc', clf3), 
                                          ('lsvc', clf4),
                                          ('svc', clf6),
                                          ('pac', clf7),
                                          ('lgr', clf8),
                                                            ],
                                            voting='soft')

    eclf = eclf.fit(np.array(X_train), y_train)

    preds = eclf.predict(np.array(X_val))
    probability = eclf.predict_proba(np.array(X_val))

    result = cal_mat(preds, y_val)
    print(result)
    acc_results.append(result['acc'])
    auc_results.append(result['AUC'])
    dump(eclf, f'Models/model_{i}.save')
    i += 1
    
    indexes.append(val_id)
    pred_labels.append(preds)
    pred_probability.append(probability[:,1])
    
print('Average acc:', sum(acc_results) / 5)
print('Average AUC:', sum(auc_results) / 5)
print('______________________________________________________________')

{'acc': 0.9176954732510288, 'AUC': 0.9055944055944056, 'macro f1': 0.9055944055944056, 'precision': 0.8717948717948718, 'recall': 0.8717948717948718, 'bal_acc': 0.9055944055944056}
{'acc': 0.7242798353909465, 'AUC': 0.6617715617715618, 'macro f1': 0.6680666272503006, 'precision': 0.5846153846153846, 'recall': 0.48717948717948717, 'bal_acc': 0.6617715617715617}
{'acc': 0.8189300411522634, 'AUC': 0.7821678321678323, 'macro f1': 0.7878571428571429, 'precision': 0.7361111111111112, 'recall': 0.6794871794871795, 'bal_acc': 0.7821678321678323}
{'acc': 0.8353909465020576, 'AUC': 0.7435897435897436, 'macro f1': 0.7735321528424977, 'precision': 1.0, 'recall': 0.48717948717948717, 'bal_acc': 0.7435897435897436}
{'acc': 0.7355371900826446, 'AUC': 0.5948051948051948, 'macro f1': 0.584192439862543, 'precision': 0.8421052631578947, 'recall': 0.2077922077922078, 'bal_acc': 0.5948051948051948}
Average acc: 0.8063666972757882
Average AUC: 0.7375857475857476
_____________________________________________