In [1]:
import pandas as pd
import numpy as np
import random
import os
import warnings

from sklearn.model_selection import StratifiedKFold
from lazypredict.Supervised import LazyClassifier

warnings.simplefilter("ignore", UserWarning)
# warnings.simplefilter("ignore", DataConversionWarning)
warnings.simplefilter("ignore", FutureWarning)



# Load Data

In [2]:
BASEDIR = '../data'

metadata = pd.read_csv(
    os.path.join(BASEDIR, 'metadata', 'metadata.csv')
)

tax_genus = pd.read_csv(
    os.path.join(BASEDIR, 'taxonomy', 'taxonomy_relabd.genus.csv'),
)

pt_ra_1e_1 = pd.read_csv(
    os.path.join(BASEDIR, 'phylotypes', 'phylotype_relabd.1e_1.csv'),
)

alpha = pd.read_csv(
    os.path.join(BASEDIR, 'alpha_diversity', 'alpha_diversity.csv'),
)

cst = pd.read_csv(
    os.path.join(BASEDIR, 'community_state_types', 'cst_valencia.csv'),
)

sv_counts = pd.read_csv(
    os.path.join(BASEDIR, 'sv_counts', 'sp_sv_long.csv'),
)

In [3]:
race_meta = pd.read_csv(
    os.path.join(BASEDIR, 'metadata', 'metadata_normalized.csv')
)

meta = metadata[['specimen', 'participant_id', 'collect_wk']]

index = np.array(meta.specimen)

## sv_counts for features

In [4]:
def sv_to_num(sv):
    try:
        num = int(sv.split('___')[1])
    except:
        num = None
    return num

def sv_for_features(df, index, cols):
    df['num_sv'] = df.sv.map(lambda x: sv_to_num(x))
    
    num_features = 53129
   
    features = []
    for specimen in index:
        feature = np.zeros(num_features)
        new_df = df[df['specimen'] == specimen]
        sv = np.array(new_df['num_sv'])
        fract = np.array(new_df['fract'])
        for i in range(len(sv)):
            if sv[i] == None:
                continue
            else:
                feature[sv[i]-1] = fract[i]
        features.append(feature)
    df_features = pd.DataFrame(features, index=index, columns = cols)
    df_features.index.name = 'specimen'
    return df_features

index = np.array(metadata.specimen)
sv_cols = np.load('../sv_cols.npy') # sorted sv array

new_sv = sv_for_features(sv_counts, index, sv_cols)

## Merge all df

In [5]:
def merge_df(A):
    df = A[0]
    for i in range(1, len(A)):
        df = pd.merge(df, A[i], on='specimen')
    return df

In [6]:
meta = metadata[['specimen', 'participant_id', 'collect_wk']]
race_meta = race_meta[['Race: American Indian or Alaska Native', 'Race: Asian',
                       'Race: Black or African American',
                       'Race: Native Hawaiian or Other Pacific Islander', 'Race: Unknown',
                       'Race: White', 'specimen']]

df = merge_df([meta, tax_genus, pt_ra_1e_1, alpha, cst, new_sv, race_meta])
df = df[df['collect_wk'] <= 28]
# df = df.set_index('specimen')
df = df.drop(columns = ['CST', 'subCST', 'specimen', 'collect_wk'])

dup = df.duplicated('participant_id', keep=False)
duplicated = df[dup]
dup_mean = duplicated.groupby(['participant_id'], as_index=False).aggregate(np.mean)
df = df.drop_duplicates(['participant_id'], keep=False)
df = pd.concat([df, dup_mean])
df = df.set_index('participant_id')

In [7]:
# Remove preterm case for remaining early_preterm and term
y_df = metadata[['specimen', 'was_early_preterm', 'was_preterm', 'participant_id', 'collect_wk']]
y_df = y_df[y_df['collect_wk']<=28]
y_df = y_df[(y_df['was_early_preterm']) ==  (y_df['was_preterm'])]
y_df = y_df.drop_duplicates(['participant_id'], keep='first')
y_df = y_df.set_index('participant_id')

y_df = y_df.drop(columns = ['specimen', 'collect_wk'])
y_df = y_df.sort_values(by=['participant_id'], ascending=[True]) 
y_df = y_df.drop(columns = ['was_preterm'])

# Load selected features, alph, cst and race one hot

In [8]:
race_features = np.array(race_meta.columns[:6])
alpha_cst_features = np.load('alpha_cst_features.npy')
selected_features = np.load('selected_features.npy')

# Model selection using lazyclassifier
## When model selection, we don't use race features

In [9]:
all_features = np.concatenate((alpha_cst_features, selected_features))

In [10]:
kf = StratifiedKFold(n_splits=5, random_state = 1)
models = []
for train_idx, val_idx in kf.split(y_df.index, y_df):
    train_id = np.array(y_df.index)[train_idx]
    val_id = np.array(y_df.index)[val_idx]

    train = df.loc[train_id]
    val = df.loc[val_id]

    X_train = train[all_features]
    X_val = val[all_features]

    y_train = np.array(y_df.loc[train_id])
    y_val = np.array(y_df.loc[val_id])
    
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    model, predictions = clf.fit(X_train, X_val, y_train, y_val)
    models.append(model)

100%|██████████| 29/29 [00:06<00:00,  4.16it/s]
100%|██████████| 29/29 [00:07<00:00,  3.75it/s]
100%|██████████| 29/29 [00:08<00:00,  3.30it/s]
100%|██████████| 29/29 [00:08<00:00,  3.59it/s]
100%|██████████| 29/29 [00:07<00:00,  3.77it/s]


In [11]:
models[0]

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BernoulliNB,0.94,0.91,0.91,0.94,0.08
ExtraTreeClassifier,0.94,0.89,0.89,0.94,0.05
NearestCentroid,0.9,0.84,0.84,0.9,0.08
RidgeClassifierCV,0.89,0.83,0.83,0.89,0.23
ExtraTreesClassifier,0.86,0.83,0.83,0.87,0.39
LogisticRegression,0.88,0.82,0.82,0.88,0.1
LinearSVC,0.81,0.81,0.81,0.83,0.3
PassiveAggressiveClassifier,0.78,0.8,0.8,0.8,0.09
RidgeClassifier,0.85,0.8,0.8,0.86,0.1
CalibratedClassifierCV,0.79,0.8,0.8,0.81,1.2


In [12]:
models[1]

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SVC,0.85,0.85,0.85,0.86,0.2
CalibratedClassifierCV,0.89,0.85,0.85,0.89,0.82
PassiveAggressiveClassifier,0.86,0.85,0.85,0.87,0.08
LogisticRegression,0.94,0.84,0.84,0.93,0.09
LinearSVC,0.87,0.84,0.84,0.88,0.21
NearestCentroid,0.88,0.82,0.82,0.88,0.08
ExtraTreeClassifier,0.84,0.81,0.81,0.85,0.05
RidgeClassifierCV,0.92,0.8,0.8,0.92,0.33
ExtraTreesClassifier,0.84,0.79,0.79,0.85,0.39
RidgeClassifier,0.92,0.78,0.78,0.91,0.13


In [13]:
models[2]

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SVC,0.91,0.85,0.85,0.91,0.28
QuadraticDiscriminantAnalysis,0.86,0.8,0.8,0.87,0.3
PassiveAggressiveClassifier,0.89,0.7,0.7,0.87,0.09
CalibratedClassifierCV,0.88,0.64,0.64,0.84,1.64
LinearSVC,0.86,0.62,0.62,0.83,0.46
ExtraTreeClassifier,0.84,0.61,0.61,0.81,0.05
BernoulliNB,0.86,0.59,0.59,0.82,0.08
DecisionTreeClassifier,0.82,0.58,0.58,0.79,0.19
GaussianNB,0.82,0.58,0.58,0.79,0.06
BaggingClassifier,0.83,0.58,0.58,0.8,0.64


In [14]:
models[3]

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SVC,0.88,0.79,0.79,0.88,0.27
Perceptron,0.91,0.73,0.73,0.89,0.1
SGDClassifier,0.87,0.71,0.71,0.86,0.11
DecisionTreeClassifier,0.86,0.66,0.66,0.84,0.16
GaussianNB,0.83,0.65,0.65,0.82,0.06
PassiveAggressiveClassifier,0.87,0.65,0.65,0.85,0.09
RidgeClassifierCV,0.87,0.62,0.62,0.84,0.35
QuadraticDiscriminantAnalysis,0.82,0.62,0.62,0.81,0.32
LinearSVC,0.87,0.61,0.61,0.83,0.44
CalibratedClassifierCV,0.87,0.61,0.61,0.83,1.37


In [15]:
models[4]

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SVC,0.87,0.74,0.74,0.87,0.25
LinearSVC,0.87,0.65,0.65,0.84,0.44
PassiveAggressiveClassifier,0.86,0.64,0.64,0.84,0.09
CalibratedClassifierCV,0.86,0.63,0.63,0.83,1.36
QuadraticDiscriminantAnalysis,0.83,0.61,0.61,0.81,0.39
SGDClassifier,0.86,0.6,0.6,0.82,0.1
Perceptron,0.86,0.6,0.6,0.82,0.1
LogisticRegression,0.84,0.55,0.55,0.78,0.2
ExtraTreeClassifier,0.83,0.54,0.54,0.78,0.05
GaussianNB,0.81,0.54,0.54,0.77,0.06


# Model training
## Simply add race feature for selected model training

In [16]:
all_features = np.concatenate((selected_features, alpha_cst_features, race_features))

In [17]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(1) # Seed 고정

In [18]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, balanced_accuracy_score

def cal_mat(preds, labels):
    
    result = {}
    result['acc'] = accuracy_score(labels, preds)
    result['AUC'] = roc_auc_score(labels, preds)
    result['macro f1'] = f1_score(labels, preds, average='macro')
    result['precision'] = precision_score(labels, preds)
    result['recall'] = recall_score(labels, preds)
    result['bal_acc'] = balanced_accuracy_score(labels, preds)
    
    return result

In [19]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.ensemble import VotingClassifier
from joblib import dump, load
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=5, random_state = 1)
i = 1

indexes = []
pred_labels = []
pred_probability = []

seed = 1
acc_results = []
auc_results = []
for train_idx, val_idx in kf.split(y_df.index, y_df):
    train_id = np.array(y_df.index)[train_idx]
    val_id = np.array(y_df.index)[val_idx]

    train = df.loc[train_id]
    val = df.loc[val_id]

    X_train = train[all_features]
    X_val = val[all_features]

    y_train = np.array(y_df.loc[train_id])
    y_val = np.array(y_df.loc[val_id])

    clf1 = make_pipeline(StandardScaler(), QuadraticDiscriminantAnalysis())
    
    clf3 = make_pipeline(StandardScaler(), CalibratedClassifierCV())
    
    lsvc = make_pipeline(StandardScaler(), LinearSVC())
    
    clf4 = CalibratedClassifierCV(lsvc, cv=10, method='isotonic')
    
    clf6 = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True))
    
    pac = make_pipeline(StandardScaler(), PassiveAggressiveClassifier())
    clf7 = CalibratedClassifierCV(pac, cv=10, method='isotonic')
    
    eclf = VotingClassifier(estimators = [
                                          ('qda', clf1), 
                                          ('cbc', clf3), 
                                          ('lsvc', clf4),
                                          ('svc', clf6),
                                          ('pac', clf7),
                                                            ],
                                            voting='soft')

    eclf = eclf.fit(np.array(X_train), y_train)

    preds = eclf.predict(np.array(X_val))
    probability = eclf.predict_proba(np.array(X_val))

    result = cal_mat(preds, y_val)
    print(result)
    acc_results.append(result['acc'])
    auc_results.append(result['AUC'])
    dump(eclf, f'Models/models_{i}.save')
    i += 1
    
    indexes.append(val_id)
    pred_labels.append(preds)
    pred_probability.append(probability[:,1])
    
print('Average acc:', sum(acc_results) / 5)
print('Average AUC:', sum(auc_results) / 5)
print('______________________________________________________________')

{'acc': 0.671875, 'AUC': 0.7778730703259005, 'macro f1': 0.6263783783783784, 'precision': 0.33695652173913043, 'recall': 0.9393939393939394, 'bal_acc': 0.7778730703259005}
{'acc': 0.8385416666666666, 'AUC': 0.8424814179531162, 'macro f1': 0.769650528271218, 'precision': 0.5185185185185185, 'recall': 0.8484848484848485, 'bal_acc': 0.8424814179531162}
{'acc': 0.8958333333333334, 'AUC': 0.7570040022870211, 'macro f1': 0.7909407665505226, 'precision': 0.782608695652174, 'recall': 0.5454545454545454, 'bal_acc': 0.7570040022870211}
{'acc': 0.8795811518324608, 'AUC': 0.6531053459119497, 'macro f1': 0.6986348357000753, 'precision': 0.9090909090909091, 'recall': 0.3125, 'bal_acc': 0.6531053459119497}
{'acc': 0.8795811518324608, 'AUC': 0.687476026083621, 'macro f1': 0.7307715879144451, 'precision': 0.8125, 'recall': 0.3939393939393939, 'bal_acc': 0.687476026083621}
Average acc: 0.8330824607329843
Average AUC: 0.7435879725123217
______________________________________________________________
