In [1]:
import pandas as pd
import numpy as np

## 1. Zeller G 2014

In [2]:
zeller_raw = pd.read_csv('data/raw/ZellerG_2014.tsv',sep='\t').T
zeller_raw.columns = [col.split('|')[-1].split('s__')[-1] for col in zeller_raw.columns]

print(zeller_raw.shape)
zeller_raw.head(2)

(156, 652)


Unnamed: 0,Bifidobacterium_longum,Alistipes_putredinis,Faecalibacterium_prausnitzii,Eubacterium_sp_CAG_180,Fusicatenibacter_saccharivorans,Anaerostipes_hadrus,Coprococcus_comes,Bacteroides_vulgatus,Ruminococcus_bromii,Eubacterium_rectale,...,Robinsoniella_sp_RHS,Synergistes_jonesii,Lactobacillus_crustorum,Lactobacillus_farciminis,Firmicutes_bacterium_CAG_555,Clostridium_ventriculi,Barnesiella_sp_An22,Klebsiella_quasivariicola,Actinobaculum_massiliense,Atopobium_minutum
CCIS00146684ST-4-0,9.64987,8.62537,7.88807,7.83075,6.80929,6.49145,5.28827,5.23567,4.87884,4.20644,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CCIS00281083ST-3-0,0.0,1.19239,1.83716,0.0,0.15119,0.4594,1.00224,4.86169,0.0,0.13938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
zeller_metadata = pd.read_csv('data/raw/ZellerG_2014_metadata.tsv',sep='\t')
zeller_metadata.head(2)

Unnamed: 0,study_name,subject_id,body_site,study_condition,disease,age,age_category,gender,country,non_westernized,...,number_bases,minimum_read_length,median_read_length,NCBI_accession,curator,BMI,disease_subtype,tnm,fobt,ajcc
CCIS00146684ST-4-0,ZellerG_2014,FR-726,stool,control,healthy,72,senior,female,FRA,no,...,6512170482,45,91,ERR480457;ERR480456;ERR480455;ERR480454;ERR478...,Paolo_Manghi,25.0,,,no,
CCIS00281083ST-3-0,ZellerG_2014,FR-060,stool,control,healthy,53,adult,male,FRA,no,...,3298640533,45,69,ERR480459;ERR480458;ERR478962;ERR478963,Paolo_Manghi,32.0,,,no,


In [4]:
zeller_metadata_control = zeller_metadata[zeller_metadata['study_condition']=='control']
zeller_metadata_control['disease_subtype'].value_counts()

Series([], Name: disease_subtype, dtype: int64)

In [5]:
zeller_label = zeller_metadata[['study_condition','study_name']]

zeller_df = zeller_raw.join(zeller_label)
print(zeller_df.shape)
zeller_df.head(2)

(156, 654)


Unnamed: 0,Bifidobacterium_longum,Alistipes_putredinis,Faecalibacterium_prausnitzii,Eubacterium_sp_CAG_180,Fusicatenibacter_saccharivorans,Anaerostipes_hadrus,Coprococcus_comes,Bacteroides_vulgatus,Ruminococcus_bromii,Eubacterium_rectale,...,Lactobacillus_crustorum,Lactobacillus_farciminis,Firmicutes_bacterium_CAG_555,Clostridium_ventriculi,Barnesiella_sp_An22,Klebsiella_quasivariicola,Actinobaculum_massiliense,Atopobium_minutum,study_condition,study_name
CCIS00146684ST-4-0,9.64987,8.62537,7.88807,7.83075,6.80929,6.49145,5.28827,5.23567,4.87884,4.20644,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,control,ZellerG_2014
CCIS00281083ST-3-0,0.0,1.19239,1.83716,0.0,0.15119,0.4594,1.00224,4.86169,0.0,0.13938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,control,ZellerG_2014


In [6]:
zeller_df['study_condition'].value_counts()

control    61
CRC        53
adenoma    42
Name: study_condition, dtype: int64

In [7]:
zeller_crc = zeller_df[(zeller_df['study_condition']=='CRC') | (zeller_df['study_condition']=='control')]

print(zeller_crc['study_condition'].value_counts())

zeller_crc.head(2)

control    61
CRC        53
Name: study_condition, dtype: int64


Unnamed: 0,Bifidobacterium_longum,Alistipes_putredinis,Faecalibacterium_prausnitzii,Eubacterium_sp_CAG_180,Fusicatenibacter_saccharivorans,Anaerostipes_hadrus,Coprococcus_comes,Bacteroides_vulgatus,Ruminococcus_bromii,Eubacterium_rectale,...,Lactobacillus_crustorum,Lactobacillus_farciminis,Firmicutes_bacterium_CAG_555,Clostridium_ventriculi,Barnesiella_sp_An22,Klebsiella_quasivariicola,Actinobaculum_massiliense,Atopobium_minutum,study_condition,study_name
CCIS00146684ST-4-0,9.64987,8.62537,7.88807,7.83075,6.80929,6.49145,5.28827,5.23567,4.87884,4.20644,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,control,ZellerG_2014
CCIS00281083ST-3-0,0.0,1.19239,1.83716,0.0,0.15119,0.4594,1.00224,4.86169,0.0,0.13938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,control,ZellerG_2014


# 2. Wirbel 2018

In [8]:
wirbel_raw = pd.read_csv('data/raw/WirbelJ_2018.tsv',sep='\t').T
wirbel_raw.columns = [col.split('|')[-1].split('s__')[-1] for col in wirbel_raw.columns]

print(wirbel_raw.shape)
wirbel_raw.head(2)

(125, 537)


Unnamed: 0,Bacteroides_plebeius,Faecalibacterium_prausnitzii,Bacteroides_uniformis,Alistipes_putredinis,Bacteroides_vulgatus,Roseburia_intestinalis,Roseburia_inulinivorans,Fusicatenibacter_saccharivorans,Collinsella_aerofaciens,Eubacterium_rectale,...,Mitsuokella_sp_oral_taxon_131,Propionibacterium_acidifaciens,Negativicoccus_succinicivorans,Desulfovibrio_sp_MES5,Lactococcus_piscium,Butyricicoccus_pullicaecorum,Streptococcus_agalactiae,Tropheryma_whipplei,Cutibacterium_acnes,Actinomyces_sp_oral_taxon_414
CCMD10032470ST-11-0,27.51801,19.70982,6.72206,5.27369,3.7856,3.73574,2.71004,2.70134,2.62207,2.5242,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CCMD10191450ST-11-0,0.0,36.1266,0.75146,4.96234,0.18463,0.51501,1.06791,0.93409,2.37133,4.71091,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
wirbel_metadata = pd.read_csv('data/raw/WirbelJ_2018_metadata.tsv',sep='\t')
wirbel_metadata.head(2)

Unnamed: 0,study_name,subject_id,body_site,study_condition,disease,age,age_category,gender,country,non_westernized,...,PMID,number_reads,number_bases,minimum_read_length,median_read_length,NCBI_accession,curator,BMI,tnm,ajcc
CCMD10032470ST-11-0,WirbelJ_2018,CCMD10032470ST-11-0,stool,control,healthy,45,adult,male,DEU,no,...,30936547,37708359,4639213592,45,139,ERR2726404,Jacob_Wirbel;Paolo_Manghi,30.7,,
CCMD10191450ST-11-0,WirbelJ_2018,CCMD10191450ST-11-0,stool,control,healthy,62,adult,female,DEU,no,...,30936547,34952407,4351849639,45,140,ERR2726405,Jacob_Wirbel;Paolo_Manghi,28.5,,


In [10]:
wirbel_metadata_control = wirbel_metadata[wirbel_metadata['study_condition']=='control']
wirbel_metadata_control['disease'].value_counts()

healthy    65
Name: disease, dtype: int64

In [11]:
wirbel_label = wirbel_metadata[['study_condition','study_name']]

# wirbel_label['disease'] = ['CRC' if 'CRC' in label.split(';') else 'adenoma' if 'adenoma' in label.split(';') else 'healthy' for label in wirbel_label['disease']]

wirbel_df = wirbel_raw.join(wirbel_label)
print(wirbel_df['study_condition'].value_counts())

print(wirbel_df.shape)
wirbel_df.head(2)

control    65
CRC        60
Name: study_condition, dtype: int64
(125, 539)


Unnamed: 0,Bacteroides_plebeius,Faecalibacterium_prausnitzii,Bacteroides_uniformis,Alistipes_putredinis,Bacteroides_vulgatus,Roseburia_intestinalis,Roseburia_inulinivorans,Fusicatenibacter_saccharivorans,Collinsella_aerofaciens,Eubacterium_rectale,...,Negativicoccus_succinicivorans,Desulfovibrio_sp_MES5,Lactococcus_piscium,Butyricicoccus_pullicaecorum,Streptococcus_agalactiae,Tropheryma_whipplei,Cutibacterium_acnes,Actinomyces_sp_oral_taxon_414,study_condition,study_name
CCMD10032470ST-11-0,27.51801,19.70982,6.72206,5.27369,3.7856,3.73574,2.71004,2.70134,2.62207,2.5242,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,control,WirbelJ_2018
CCMD10191450ST-11-0,0.0,36.1266,0.75146,4.96234,0.18463,0.51501,1.06791,0.93409,2.37133,4.71091,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,control,WirbelJ_2018


In [12]:
wirbel_crc = wirbel_df[(wirbel_df['study_condition']=='CRC') | (wirbel_df['study_condition']=='control')]

print(wirbel_crc['study_condition'].value_counts())

wirbel_crc.head(2)

control    65
CRC        60
Name: study_condition, dtype: int64


Unnamed: 0,Bacteroides_plebeius,Faecalibacterium_prausnitzii,Bacteroides_uniformis,Alistipes_putredinis,Bacteroides_vulgatus,Roseburia_intestinalis,Roseburia_inulinivorans,Fusicatenibacter_saccharivorans,Collinsella_aerofaciens,Eubacterium_rectale,...,Negativicoccus_succinicivorans,Desulfovibrio_sp_MES5,Lactococcus_piscium,Butyricicoccus_pullicaecorum,Streptococcus_agalactiae,Tropheryma_whipplei,Cutibacterium_acnes,Actinomyces_sp_oral_taxon_414,study_condition,study_name
CCMD10032470ST-11-0,27.51801,19.70982,6.72206,5.27369,3.7856,3.73574,2.71004,2.70134,2.62207,2.5242,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,control,WirbelJ_2018
CCMD10191450ST-11-0,0.0,36.1266,0.75146,4.96234,0.18463,0.51501,1.06791,0.93409,2.37133,4.71091,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,control,WirbelJ_2018


## 3. Vogtmann E 2016

In [13]:
vogtmann_raw = pd.read_csv('data/raw/VogtmannE_2016.tsv',sep='\t').T
vogtmann_raw.columns = [col.split('|')[-1].split('s__')[-1] for col in vogtmann_raw.columns]


print(vogtmann_raw.shape)
vogtmann_raw.head(2)

(110, 540)


Unnamed: 0,Bacteroides_dorei,Bacteroides_uniformis,Eubacterium_eligens,Roseburia_inulinivorans,Lachnospira_pectinoschiza,Ruminococcus_bromii,Faecalibacterium_prausnitzii,Bacteroides_ovatus,Roseburia_faecis,Bacteroides_cellulosilyticus,...,Acinetobacter_pittii,Dialister_sp_CAG_357,Weissella_viridescens,Desulfovibrio_fairfieldensis,Fusobacterium_naviforme,Anaerococcus_lactolyticus,Campylobacter_hominis,Porphyromonas_sp_HMSC065F10,Haemophilus_haemolyticus,Faecalicatena_orotica
MMRS11288076ST-27-0-0,19.65406,8.42983,7.90381,7.73506,7.39937,4.67849,4.66826,3.86307,3.43927,3.13954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MMRS11664448ST-27-0-0,0.0,6.04275,0.30436,4.13622,1.47024,1.97378,4.55172,0.36159,3.03204,1.2744,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
vogtmann_metadata = pd.read_csv('data/raw/VogtmannE_2016_metadata.tsv',sep='\t')
vogtmann_metadata.head(2)

Unnamed: 0,study_name,subject_id,body_site,study_condition,disease,age,age_category,gender,country,non_westernized,sequencing_platform,DNA_extraction_kit,PMID,number_reads,number_bases,minimum_read_length,median_read_length,NCBI_accession,curator,BMI
MMRS11288076ST-27-0-0,VogtmannE_2016,MMRS11288076ST-27-0-0,stool,control,healthy,65.0,adult,male,USA,no,IlluminaHiSeq,Gnome,27171425,67924141,5793273464,1,89,ERR1293500;ERR1293499;ERR1293498;ERR1293497;ER...,Paolo_Manghi,23.490755
MMRS11664448ST-27-0-0,VogtmannE_2016,MMRS11664448ST-27-0-0,stool,,healthy,,adult,,USA,no,IlluminaHiSeq,Gnome,27171425,85820299,7389001007,45,90,ERR1293861;ERR1293860;ERR1293859;ERR1293858;ER...,Paolo_Manghi,


In [15]:
vogtmann_label = vogtmann_metadata[['study_condition','study_name']]

# vogtmann_label['disease'] = ['CRC' if 'CRC' in label.split(';') else 'adenoma' if 'adenoma' in label.split(';') else 'healthy' for label in vogtmann_label['disease']]

vogtmann_df = vogtmann_raw.join(vogtmann_label)
print(vogtmann_df['study_condition'].value_counts())

print(vogtmann_df.shape)
vogtmann_df.head(2)

control    52
CRC        52
Name: study_condition, dtype: int64
(110, 542)


Unnamed: 0,Bacteroides_dorei,Bacteroides_uniformis,Eubacterium_eligens,Roseburia_inulinivorans,Lachnospira_pectinoschiza,Ruminococcus_bromii,Faecalibacterium_prausnitzii,Bacteroides_ovatus,Roseburia_faecis,Bacteroides_cellulosilyticus,...,Weissella_viridescens,Desulfovibrio_fairfieldensis,Fusobacterium_naviforme,Anaerococcus_lactolyticus,Campylobacter_hominis,Porphyromonas_sp_HMSC065F10,Haemophilus_haemolyticus,Faecalicatena_orotica,study_condition,study_name
MMRS11288076ST-27-0-0,19.65406,8.42983,7.90381,7.73506,7.39937,4.67849,4.66826,3.86307,3.43927,3.13954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,control,VogtmannE_2016
MMRS11664448ST-27-0-0,0.0,6.04275,0.30436,4.13622,1.47024,1.97378,4.55172,0.36159,3.03204,1.2744,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,VogtmannE_2016


In [16]:
vogtmann_crc = vogtmann_df[(vogtmann_df['study_condition']=='CRC') | (vogtmann_df['study_condition']=='control')]

print(vogtmann_crc['study_condition'].value_counts())

vogtmann_crc.head(2)

control    52
CRC        52
Name: study_condition, dtype: int64


Unnamed: 0,Bacteroides_dorei,Bacteroides_uniformis,Eubacterium_eligens,Roseburia_inulinivorans,Lachnospira_pectinoschiza,Ruminococcus_bromii,Faecalibacterium_prausnitzii,Bacteroides_ovatus,Roseburia_faecis,Bacteroides_cellulosilyticus,...,Weissella_viridescens,Desulfovibrio_fairfieldensis,Fusobacterium_naviforme,Anaerococcus_lactolyticus,Campylobacter_hominis,Porphyromonas_sp_HMSC065F10,Haemophilus_haemolyticus,Faecalicatena_orotica,study_condition,study_name
MMRS11288076ST-27-0-0,19.65406,8.42983,7.90381,7.73506,7.39937,4.67849,4.66826,3.86307,3.43927,3.13954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,control,VogtmannE_2016
MMRS11932626ST-27-0-0,13.81281,3.5224,0.58275,1.24115,0.0833,0.57827,7.37884,0.0,4.53814,0.29066,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,control,VogtmannE_2016


## 4. YuJ_2015

In [17]:
yu_raw = pd.read_csv('data/raw/YuJ_2015.tsv',sep='\t').T
yu_raw.columns = [col.split('|')[-1].split('s__')[-1] for col in yu_raw.columns]

print(yu_raw.shape)
yu_raw.head(2)

(128, 575)


Unnamed: 0,Akkermansia_muciniphila,Escherichia_coli,Faecalibacterium_prausnitzii,Bacteroides_vulgatus,Anaerostipes_hadrus,Alistipes_putredinis,Collinsella_aerofaciens,Phascolarctobacterium_succinatutens,Clostridium_sp_CAG_413,Desulfovibrio_piger,...,Bifidobacterium_animalis,Anaerococcus_tetradius,Lactococcus_plantarum,Bifidobacterium_mongoliense,Corynebacterium_variabile,Sodalis_praecaptivus,Blautia_sp_An249,Lactobacillus_reuteri,Bacteroides_fluxus,Enterococcus_durans
SZAXPI003409-8,25.51215,20.176,3.70796,3.34119,2.61438,2.52846,2.40854,2.24518,1.88855,1.66218,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SZAXPI003410-3,10.95862,0.34416,5.15137,0.07968,0.19203,0.0,0.01517,0.0,0.0,0.73417,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
yu_metadata = pd.read_csv('data/raw/YuJ_2015_metadata.tsv',sep='\t')
yu_metadata.head(2)

Unnamed: 0,study_name,subject_id,body_site,study_condition,disease,age,age_category,gender,country,non_westernized,...,ldl,smoker,ever_smoker,disease_location,cholesterol,creatinine,fasting_glucose,ajcc,ALT,eGFR
SZAXPI003409-8,YuJ_2015,SZAXPI003409-8,stool,CRC,CRC;T2D,64,adult,male,CHN,no,...,61.872,yes,yes,rectum,135.345,70.0,129.6,iii,25.0,88.74
SZAXPI003410-3,YuJ_2015,SZAXPI003410-3,stool,CRC,CRC,73,senior,male,CHN,no,...,139.212,yes,yes,rectum,220.419,74.0,99.0,i,17.0,63.65


In [19]:
yu_metadata['ajcc'].value_counts().sum()

74

In [20]:
yu_metadata_control =yu_metadata[yu_metadata['study_condition']=='control']
yu_metadata_control['disease'].value_counts()

healthy    38
T2D        16
Name: disease, dtype: int64

In [21]:
yu_label = yu_metadata[['study_condition','study_name']]

# yu_label['disease'] = ['CRC' if 'CRC' in label.split(';') else 'adenoma' if 'adenoma' in label.split(';') else 'healthy' for label in yu_label['disease']]

yu_df = yu_raw.join(yu_label)
print(yu_df['study_condition'].value_counts())

print(yu_df.shape)
yu_df.head(2)

CRC        74
control    54
Name: study_condition, dtype: int64
(128, 577)


Unnamed: 0,Akkermansia_muciniphila,Escherichia_coli,Faecalibacterium_prausnitzii,Bacteroides_vulgatus,Anaerostipes_hadrus,Alistipes_putredinis,Collinsella_aerofaciens,Phascolarctobacterium_succinatutens,Clostridium_sp_CAG_413,Desulfovibrio_piger,...,Lactococcus_plantarum,Bifidobacterium_mongoliense,Corynebacterium_variabile,Sodalis_praecaptivus,Blautia_sp_An249,Lactobacillus_reuteri,Bacteroides_fluxus,Enterococcus_durans,study_condition,study_name
SZAXPI003409-8,25.51215,20.176,3.70796,3.34119,2.61438,2.52846,2.40854,2.24518,1.88855,1.66218,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CRC,YuJ_2015
SZAXPI003410-3,10.95862,0.34416,5.15137,0.07968,0.19203,0.0,0.01517,0.0,0.0,0.73417,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CRC,YuJ_2015


In [22]:
yu_crc = yu_df[(yu_df['study_condition']=='CRC') | (yu_df['study_condition']=='control')]

print(yu_crc['study_condition'].value_counts())

yu_crc.head(2)

CRC        74
control    54
Name: study_condition, dtype: int64


Unnamed: 0,Akkermansia_muciniphila,Escherichia_coli,Faecalibacterium_prausnitzii,Bacteroides_vulgatus,Anaerostipes_hadrus,Alistipes_putredinis,Collinsella_aerofaciens,Phascolarctobacterium_succinatutens,Clostridium_sp_CAG_413,Desulfovibrio_piger,...,Lactococcus_plantarum,Bifidobacterium_mongoliense,Corynebacterium_variabile,Sodalis_praecaptivus,Blautia_sp_An249,Lactobacillus_reuteri,Bacteroides_fluxus,Enterococcus_durans,study_condition,study_name
SZAXPI003409-8,25.51215,20.176,3.70796,3.34119,2.61438,2.52846,2.40854,2.24518,1.88855,1.66218,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CRC,YuJ_2015
SZAXPI003410-3,10.95862,0.34416,5.15137,0.07968,0.19203,0.0,0.01517,0.0,0.0,0.73417,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CRC,YuJ_2015


## 5. YachidaS_2019 (Stages I-IV)

In [23]:
yachida_raw = pd.read_csv('data/raw/YachidaS_2019.tsv',sep='\t').T
yachida_raw.columns = [col.split('|')[-1].split('s__')[-1] for col in yachida_raw.columns]

print(yachida_raw.shape)
yachida_raw.head(2)

(616, 718)


Unnamed: 0,Bacteroides_plebeius,Bacteroides_dorei,Faecalibacterium_prausnitzii,Eubacterium_eligens,Bacteroides_ovatus,Parabacteroides_distasonis,Ruminococcus_gnavus,Phascolarctobacterium_faecium,Bacteroides_uniformis,Bifidobacterium_longum,...,Firmicutes_bacterium_CAG_534,Actinomyces_europaeus,Anaerobiospirillum_thomasii,Fusobacterium_naviforme,Bacteroides_pectinophilus,Methanosphaera_sp_SHI1033,Prevotella_sp_CAG_755,Enterobacter_soli,Hungatella_effluvii,Bacteroides_caecimuris
SAMD00114718,54.94898,9.74689,4.36704,3.8581,3.39297,2.74964,2.26875,1.78037,1.62034,1.58755,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SAMD00114719,11.23749,0.48509,4.32637,0.38532,0.61081,0.18787,0.07518,0.0,1.53612,0.46743,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
yachida_metadata = pd.read_csv('data/raw/YachidaS_2019_metadata.tsv',sep='\t')
print(yachida_metadata.shape)
yachida_metadata.head(2)

(616, 22)


Unnamed: 0,study_name,subject_id,body_site,study_condition,disease,age,age_category,gender,country,non_westernized,...,number_reads,number_bases,minimum_read_length,median_read_length,curator,BMI,disease_location,ajcc,brinkman_index,alcohol_numeric
SAMD00114718,YachidaS_2019,sub_10219,stool,control,healthy,68,senior,male,JPN,no,...,84641274,11889864466,50,150,Paolo_Manghi,21.383942,,,1290.0,0.0
SAMD00114719,YachidaS_2019,sub_10226,stool,control,healthy,60,adult,male,JPN,no,...,47470646,6605044144,50,150,Paolo_Manghi,22.229062,,,100.0,933.0


In [25]:
yachida_healthy_id = pd.read_csv('data/other/yachida_healthy_normal.csv')[['Subject_ID']]
yachida_healthy_id.columns = ['subject_id']
yachida_healthy_id['subject_id'] = ['sub_' + str(sampleID) for sampleID in yachida_healthy_id['subject_id']]
yachida_healthy_id.index = yachida_healthy_id['subject_id']


yachida_metadata_reset = yachida_metadata.reset_index()
yachida_metadata_reset.index = yachida_metadata_reset['subject_id']
yachida_metadata_reset = yachida_metadata_reset[['index']]

yachida_healthy_id_final = yachida_metadata_reset.join(yachida_healthy_id,how='inner')
yachida_healthy_id_final.index = yachida_healthy_id_final['index']
yachida_healthy_id_final.drop('index',axis=1,inplace=True)

yachida_crc_id = yachida_metadata[(yachida_metadata['ajcc']=='i') | (yachida_metadata['ajcc']=='ii') | (yachida_metadata['ajcc']=='iii') | (yachida_metadata['ajcc']=='iv')]
yachida_crc_id = yachida_crc_id[['subject_id']]

yachida_id_selected = pd.concat([yachida_crc_id,yachida_healthy_id_final])
yachida_id_selected

Unnamed: 0,subject_id
SAMD00114722,sub_10232
SAMD00114723,sub_10236
SAMD00114724,sub_10238
SAMD00114726,sub_10330
SAMD00114727,sub_10332
...,...
SAMD00165029,sub_12276
SAMD00165030,sub_12340
SAMD00165031,sub_12370
SAMD00165032,sub_12389


In [26]:
yachida_label = yachida_metadata[['ajcc','study_condition','study_name','subject_id']]
yachida_label = yachida_label.merge(yachida_id_selected,left_index=True,right_index=True,how='right')

yachida_label

Unnamed: 0,ajcc,study_condition,study_name,subject_id_x,subject_id_y
SAMD00114722,iii,CRC,YachidaS_2019,sub_10232,sub_10232
SAMD00114723,i,CRC,YachidaS_2019,sub_10236,sub_10236
SAMD00114724,iv,CRC,YachidaS_2019,sub_10238,sub_10238
SAMD00114726,ii,CRC,YachidaS_2019,sub_10330,sub_10330
SAMD00114727,iv,CRC,YachidaS_2019,sub_10332,sub_10332
...,...,...,...,...,...
SAMD00165029,,control,YachidaS_2019,sub_12276,sub_12276
SAMD00165030,,control,YachidaS_2019,sub_12340,sub_12340
SAMD00165031,,control,YachidaS_2019,sub_12370,sub_12370
SAMD00165032,,control,YachidaS_2019,sub_12389,sub_12389


In [27]:
yachida_df = yachida_raw.join(yachida_label,how='inner').drop(['subject_id_x','subject_id_y'],axis=1)
print(yachida_df['study_condition'].value_counts())

print(yachida_df.shape)
yachida_df.head(2)

CRC        185
control    146
Name: study_condition, dtype: int64
(331, 721)


Unnamed: 0,Bacteroides_plebeius,Bacteroides_dorei,Faecalibacterium_prausnitzii,Eubacterium_eligens,Bacteroides_ovatus,Parabacteroides_distasonis,Ruminococcus_gnavus,Phascolarctobacterium_faecium,Bacteroides_uniformis,Bifidobacterium_longum,...,Fusobacterium_naviforme,Bacteroides_pectinophilus,Methanosphaera_sp_SHI1033,Prevotella_sp_CAG_755,Enterobacter_soli,Hungatella_effluvii,Bacteroides_caecimuris,ajcc,study_condition,study_name
SAMD00114722,0.01312,4.04432,0.61176,0.95398,0.98491,4.4137,8.04042,0.0,3.29386,0.66588,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,iii,CRC,YachidaS_2019
SAMD00114723,0.0002,0.0,0.74673,0.0,0.0011,0.95861,1.48201,0.49569,0.01172,0.64581,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,i,CRC,YachidaS_2019


In [28]:
yachida_crc = yachida_df[(yachida_df['study_condition']=='control') | (yachida_df['ajcc']=='i') | (yachida_df['ajcc']=='ii') | (yachida_df['ajcc']=='iii') | (yachida_df['ajcc']=='iv')]


print(yachida_crc['study_condition'].value_counts())
yachida_crc = yachida_crc.drop('ajcc',axis=1)
yachida_crc.head(2)

CRC        185
control    146
Name: study_condition, dtype: int64


Unnamed: 0,Bacteroides_plebeius,Bacteroides_dorei,Faecalibacterium_prausnitzii,Eubacterium_eligens,Bacteroides_ovatus,Parabacteroides_distasonis,Ruminococcus_gnavus,Phascolarctobacterium_faecium,Bacteroides_uniformis,Bifidobacterium_longum,...,Anaerobiospirillum_thomasii,Fusobacterium_naviforme,Bacteroides_pectinophilus,Methanosphaera_sp_SHI1033,Prevotella_sp_CAG_755,Enterobacter_soli,Hungatella_effluvii,Bacteroides_caecimuris,study_condition,study_name
SAMD00114722,0.01312,4.04432,0.61176,0.95398,0.98491,4.4137,8.04042,0.0,3.29386,0.66588,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CRC,YachidaS_2019
SAMD00114723,0.0002,0.0,0.74673,0.0,0.0011,0.95861,1.48201,0.49569,0.01172,0.64581,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CRC,YachidaS_2019


## Intersecting Features

In [29]:
len(list(set(yachida_crc.columns) & set(zeller_crc.columns) &  set(wirbel_crc.columns) & set(vogtmann_crc.columns) & set(yu_crc.columns) )) - 1

396

# CONCAT ALL DATA

In [30]:
all_df = [yachida_crc, zeller_crc, wirbel_crc, yu_crc, vogtmann_crc]  # vogtmann_crc, ,feng_crc

df_all = pd.concat(all_df)
df_concat_ab = df_all.drop(['study_condition','study_name'],axis=1)

# normalize
df_concat_ab = df_concat_ab.div(df_concat_ab.sum(axis=1), axis=0)

df_concat_label = df_all[['study_condition','study_name']]
df_concat = pd.concat([df_concat_ab, df_concat_label],axis=1)

# convert nan to 0
df_concat = df_concat.fillna(0)

# Extract only species name for the columns
df_concat.columns = [col.split('|')[-1].split('s__')[-1] for col in df_concat.columns]

# Encode label
df_concat['CRC'] = [1 if label=='CRC' else 0 for label in df_concat['study_condition']]
df_concat = df_concat.drop('study_condition',axis=1)

df_concat.head(2)

Unnamed: 0,Bacteroides_plebeius,Bacteroides_dorei,Faecalibacterium_prausnitzii,Eubacterium_eligens,Bacteroides_ovatus,Parabacteroides_distasonis,Ruminococcus_gnavus,Phascolarctobacterium_faecium,Bacteroides_uniformis,Bifidobacterium_longum,...,Prevotella_bergensis,Treponema_sp_OMZ_838,Selenomonas_sp_FOBRC6,Selenomonas_sp_oral_taxon_138,Pseudomonas_helleri,Barnesiella_viscericola,Coprobacter_sp,Acinetobacter_pittii,study_name,CRC
SAMD00114722,0.000131,0.040444,0.006118,0.00954,0.009849,0.044138,0.080406,0.0,0.03294,0.006659,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,YachidaS_2019,1
SAMD00114723,2e-06,0.0,0.007468,0.0,1.1e-05,0.009587,0.014821,0.004957,0.000117,0.006459,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,YachidaS_2019,1


In [31]:
df_concat.shape

(802, 867)

In [32]:
df_concat['CRC'].value_counts()

1    424
0    378
Name: CRC, dtype: int64

In [33]:
df_concat['study_name'].value_counts()

YachidaS_2019     331
YuJ_2015          128
WirbelJ_2018      125
ZellerG_2014      114
VogtmannE_2016    104
Name: study_name, dtype: int64

In [34]:
df_concat.to_csv('data/bacteria_relative_abundance_concat.csv')