In [1]:
import pandas as pd
import numpy as np

import os

from mrmr import mrmr_classif

# Load Data

In [2]:
BASEDIR = '../data'

# Resolve every input file relative to BASEDIR first, then read them in one place.
metadata_path = os.path.join(BASEDIR, 'metadata', 'metadata.csv')
tax_genus_path = os.path.join(BASEDIR, 'taxonomy', 'taxonomy_relabd.genus.csv')
pt_ra_1e_1_path = os.path.join(BASEDIR, 'phylotypes', 'phylotype_relabd.1e_1.csv')
sv_counts_path = os.path.join(BASEDIR, 'sv_counts', 'sp_sv_long.csv')

metadata = pd.read_csv(metadata_path)
tax_genus = pd.read_csv(tax_genus_path)
pt_ra_1e_1 = pd.read_csv(pt_ra_1e_1_path)
sv_counts = pd.read_csv(sv_counts_path)

# Slim view of the metadata holding only the keys that get joined onto the
# feature tables later in the notebook.
meta_columns = ['specimen', 'participant_id', 'collect_wk']
meta = metadata[meta_columns]

## sv_counts for features

In [3]:
def sv_to_num(sv):
    """Return the integer that follows the first ``'___'`` in an sv label.

    The label is split on ``'___'`` and the second piece is parsed with
    ``int``, e.g. ``'combinedsv___33594'`` -> ``33594``.

    Parameters
    ----------
    sv : str
        Label to parse. Non-string values (``None``, ``NaN``, ...) are
        tolerated.

    Returns
    -------
    int or None
        The parsed number, or ``None`` when ``sv`` is not a string, has no
        ``'___'`` separator, or the piece after it is not an integer.
    """
    try:
        return int(sv.split('___')[1])
    # Only the failures a malformed label can cause are swallowed, so that
    # KeyboardInterrupt / SystemExit still propagate.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

def sv_for_features(df, index, cols):
    """Pivot long-format sv rows into a dense specimen-by-sv feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with the columns ``specimen``, ``sv`` and ``fract``. It is
        only read; no helper column is added to the caller's frame.
    index : sequence
        Specimen ids. The result has one row per entry, in this order;
        specimens with no rows in ``df`` get an all-zero row.
    cols : sequence
        Column labels of the result; its length sets the number of features.
        The number parsed from each ``sv`` label by ``sv_to_num`` picks the
        column by position (number ``k`` -> position ``k - 1``).
        NOTE(review): this assumes ``cols`` is ordered so that position
        ``k - 1`` really is sv number ``k`` -- confirm against how the
        column array was generated.

    Returns
    -------
    pandas.DataFrame
        Float table of shape ``(len(index), len(cols))`` whose index is named
        ``'specimen'``. Cells hold ``fract``; everything else is 0.

    Raises
    ------
    IndexError
        If a parsed sv number lies outside ``1..len(cols)``.
    """
    num_features = len(cols)

    # Parse sv numbers into a local array. Labels that cannot be parsed come
    # back as None; coerce them to NaN so they can be masked out explicitly
    # (an `== None` test does not catch NaN).
    sv_nums = pd.to_numeric(df['sv'].map(sv_to_num), errors='coerce').to_numpy(dtype=float)
    usable = ~np.isnan(sv_nums)
    positions = sv_nums[usable].astype(int) - 1

    # Reject out-of-range numbers up front; a 0 would otherwise wrap around
    # to the last column through negative indexing.
    if positions.size and (positions.min() < 0 or positions.max() >= num_features):
        raise IndexError(
            'sv number out of range: expected values in 1..%d' % num_features
        )

    parsed = pd.DataFrame({
        'specimen': df['specimen'].to_numpy()[usable],
        'pos': positions,
        'fract': df['fract'].to_numpy()[usable],
    })

    # Split the long table by specimen once instead of rescanning it for
    # every requested specimen.
    rows_by_specimen = {
        specimen: (group['pos'].to_numpy(), group['fract'].to_numpy())
        for specimen, group in parsed.groupby('specimen', sort=False)
    }

    features = np.zeros((len(index), num_features))
    for row, specimen in enumerate(index):
        hit = rows_by_specimen.get(specimen)
        if hit is None:
            continue
        pos, fract = hit
        # If an sv repeats within a specimen the last row wins, as in a
        # sequential element-by-element assignment.
        features[row, pos] = fract

    df_features = pd.DataFrame(features, index=index, columns=cols)
    df_features.index.name = 'specimen'
    return df_features

# One output row per specimen listed in the metadata table.
index = np.array(metadata['specimen'])

# Column labels for the sv feature table (noted by the author as the sorted sv array).
sv_cols = np.load('../sv_cols.npy')

new_sv = sv_for_features(df=sv_counts, index=index, cols=sv_cols)

## y_labels

In [4]:
# Build the label table: one `was_early_preterm` value per participant.
label_source_cols = ['specimen', 'was_early_preterm', 'was_preterm', 'participant_id', 'collect_wk']
y_df = (
    metadata[label_source_cols]
    # Only specimens collected at week 28 or earlier are considered.
    .loc[lambda d: d['collect_wk'] <= 28]
    # Keep rows where both flags agree; rows with was_preterm set but
    # was_early_preterm unset (or vice versa) are discarded.
    .loc[lambda d: d['was_early_preterm'] == d['was_preterm']]
    # One row per participant: the first remaining specimen supplies the label.
    .drop_duplicates(subset=['participant_id'], keep='first')
    .set_index('participant_id')
    .sort_values(by=['participant_id'], ascending=[True])
    .drop(columns=['specimen', 'collect_wk', 'was_preterm'])
)

In [5]:
# Echo the label table built in the previous cell (index: participant_id,
# single column: was_early_preterm).
y_df

Unnamed: 0_level_0,was_early_preterm
participant_id,Unnamed: 1_level_1
A00004,False
A00005,False
A00009,False
A00010,False
A00011,False
...,...
J00111,False
J00112,False
J00113,False
J00115,False


# MRMR feature selection

In [6]:
from mrmr import mrmr_classif
def get_mrmr(df, num = 300, y = None):
    """Select ``num`` features from a per-specimen table with ``mrmr_classif``.

    The table is joined to the notebook-level ``meta`` frame on ``specimen``,
    restricted to rows with ``collect_wk <= 28``, averaged per
    ``participant_id``, aligned to the label index and passed to
    ``mrmr_classif``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table keyed by ``specimen`` (as a column or as a named index
        level); every other column is treated as a feature.
    num : int, default 300
        Number of features to select (``K`` of ``mrmr_classif``).
    y : pandas.Series or single-column pandas.DataFrame, optional
        Labels indexed by ``participant_id``. When omitted, the
        notebook-level ``y_df`` is used.

    Returns
    -------
    The value returned by ``mrmr_classif`` (the selected feature names).

    Raises
    ------
    ValueError
        If the labels are a DataFrame with more than one column.
    KeyError
        If a labelled participant has no row left after filtering.
    """
    labels = y_df if y is None else y
    if isinstance(labels, pd.DataFrame):
        # mrmr_classif documents `y` as a Series, so unwrap a one-column frame.
        if labels.shape[1] != 1:
            raise ValueError('labels must be a Series or a single-column DataFrame')
        labels = labels.iloc[:, 0]

    merged = pd.merge(meta, df, on='specimen')
    merged = merged[merged['collect_wk'] <= 28] # For early preterm

    # Average all specimens of a participant. The non-numeric `specimen`
    # column is dropped *before* the mean (recent pandas raises on it), and a
    # single group-wise mean covers participants with one specimen as well,
    # so no separate duplicate handling is needed.
    per_participant = (
        merged
        .drop(columns=['specimen', 'collect_wk'])
        .groupby('participant_id')
        .mean()
    )

    # Same participants, same order as the labels.
    per_participant = per_participant.loc[labels.index]

    selected_features = mrmr_classif(X=per_participant, y=labels, K=num)

    return selected_features

In [7]:
# Run feature selection on the genus-level taxonomy table with the default
# number of features; the bare name on the last line echoes the returned list.
tax_selected_features = get_mrmr(tax_genus)
tax_selected_features

100%|██████████| 300/300 [01:42<00:00,  2.93it/s]


['Prevotella',
 'Rhodanobacter',
 'Yokenella',
 'Micrococcaceae/Parachlamydiaceae',
 'Serratia',
 'Bacteroidales/Fusobacteriales',
 'Vallitalea/Corynebacterium',
 'Cyanobacteria',
 'Pseudoramibacter',
 'Cryptobacterium',
 'Desulfatiglans',
 'Oxalobacteraceae',
 'Psychrobacter',
 'Mycoplasmataceae/Hungateiclostridiaceae',
 'Methylobacteriaceae',
 'Prolinoborus/Acinetobacter',
 'Ciceribacter',
 'Flavobacterium/Oligella',
 'Adhaeribacter',
 'Deinococci/Acidobacteriia',
 'Phyllobacterium',
 'Geodermatophilus',
 'Scardovia',
 'Tidjanibacter',
 'Lacrimispora',
 'Mobiluncus',
 'Cellvibrionaceae',
 'Escherichia/Shigella',
 'Methylorubrum',
 'Peptostreptococcus',
 'Corynebacteriaceae/Lachnospiraceae',
 'Schwartzia <firmicutes>',
 'Clostridiaceae',
 'Butyricicoccus',
 'Actinotignum',
 'Ureaplasma',
 'Craurococcus',
 'Peptococcus',
 'Streptococcus',
 'Massilistercora',
 'Sphingomonas',
 'Arcanobacterium',
 'Dermabacter',
 'Chelatococcus',
 'Sneathia',
 'cellular organisms',
 'Pseudolabrys',
 'Tis

In [8]:
# Run feature selection on the phylotype table (phylotype_relabd.1e_1.csv) with
# the default number of features; the last line echoes the returned list.
pt_selected_features = get_mrmr(pt_ra_1e_1)
pt_selected_features

100%|██████████| 300/300 [06:36<00:00,  1.32s/it]


['pt__00019',
 'pt__00355',
 'pt__03332',
 'pt__00920',
 'pt__04487',
 'pt__01037',
 'pt__00838',
 'pt__04290',
 'pt__00885',
 'pt__08054',
 'pt__06816',
 'pt__01489',
 'pt__09410',
 'pt__04289',
 'pt__00201',
 'pt__00618',
 'pt__01628',
 'pt__09016',
 'pt__02337',
 'pt__05267',
 'pt__08818',
 'pt__01545',
 'pt__02805',
 'pt__01300',
 'pt__03835',
 'pt__01000',
 'pt__05213',
 'pt__01936',
 'pt__04520',
 'pt__07723',
 'pt__07850',
 'pt__01559',
 'pt__09701',
 'pt__03748',
 'pt__04104',
 'pt__02601',
 'pt__01806',
 'pt__03457',
 'pt__04794',
 'pt__07390',
 'pt__05033',
 'pt__03531',
 'pt__09441',
 'pt__01216',
 'pt__03320',
 'pt__04090',
 'pt__05899',
 'pt__08778',
 'pt__03325',
 'pt__04927',
 'pt__00590',
 'pt__02874',
 'pt__07564',
 'pt__02319',
 'pt__02612',
 'pt__09096',
 'pt__03677',
 'pt__07986',
 'pt__04730',
 'pt__06612',
 'pt__07864',
 'pt__08557',
 'pt__03082',
 'pt__04194',
 'pt__07841',
 'pt__01349',
 'pt__00844',
 'pt__03932',
 'pt__09140',
 'pt__00403',
 'pt__01824',
 'pt__

In [9]:
# Run feature selection on the dense sv feature table built by sv_for_features.
# This is the slowest of the three runs (the recorded progress bar shows 35:02).
sv_selected_features = get_mrmr(new_sv)
sv_selected_features

100%|██████████| 300/300 [35:02<00:00,  7.01s/it]


['combinedsv___33594',
 'combinedsv___35302',
 'combinedsv___35089',
 'combinedsv___36067',
 'combinedsv___35394',
 'combinedsv___45017',
 'combinedsv___34288',
 'combinedsv___34777',
 'combinedsv___33463',
 'combinedsv___32420',
 'combinedsv___1716',
 'combinedsv___20452',
 'combinedsv___20132',
 'combinedsv___6026',
 'combinedsv___23667',
 'combinedsv___6310',
 'combinedsv___26244',
 'combinedsv___6909',
 'combinedsv___20611',
 'combinedsv___44825',
 'combinedsv___7597',
 'combinedsv___33040',
 'combinedsv___34183',
 'combinedsv___35610',
 'combinedsv___39305',
 'combinedsv___44169',
 'combinedsv___32254',
 'combinedsv___24841',
 'combinedsv___40919',
 'combinedsv___39578',
 'combinedsv___20735',
 'combinedsv___10199',
 'combinedsv___46915',
 'combinedsv___9040',
 'combinedsv___46952',
 'combinedsv___24325',
 'combinedsv___43898',
 'combinedsv___31466',
 'combinedsv___0004',
 'combinedsv___3508',
 'combinedsv___35434',
 'combinedsv___33226',
 'combinedsv___35397',
 'combinedsv___1150

In [10]:
# Concatenate the three selections (taxonomy, phylotype, sv -- in that order)
# into a single array and write it next to the notebook.
all_features = np.array([
    *tax_selected_features,
    *pt_selected_features,
    *sv_selected_features,
])
np.save('selected_features.npy', all_features)