<h2> Set-up Env <h2>

In [4]:
# importing the required modules 
import pandas as pd
import glob
import xmltodict
import elementpath
from xml.etree import ElementTree as et
import numpy as np
import yaml

In [2]:
#################################################################
#aim data
aim = '3'

#read in and set dir from yml
yml_path = '/Users/slsevilla/Google Drive/MyDocuments_Current/Education/George Mason University/Dissertation/Analysis/config_all.yml'
with open(yml_path) as file:
    dir_list = yaml.load(file, Loader=yaml.FullLoader)

aim_search = 'aim_dir' + aim
proj_dir = dir_list[aim_search].replace('\\','')
data_dir = dir_list[aim_search + '_q'] + dir_list['data_dir']

manifest_dir = dir_list[aim_search].replace('\\','') + dir_list['manifest_dir']
notebook_dir = dir_list[aim_search] + dir_list['notebook_dir']
img_dir = dir_list[aim_search] + dir_list['img_dir']
stats_dir = dir_list[aim_search] + dir_list['stats_dir']
code_dir = dir_list['analysis_dir'] + dir_list['code_dir']
#################################################################
#updates to dir_list
stats_dir = stats_dir.replace('\\','')

#################################################################


<h2> XML Parsing <h2>

<h3> Include only samples <h3>

In [3]:
def xml_clean_blanks(xtree_in, xroot_in,filename_in):
    for meta_search in xroot_in.findall('BioSample'):
        a = meta_search.find('Description')
        b = a.find('Title').text
        #c = b.get('db') #if there's a third level
        
        if "BLANK" in b or "Blank" in b or "blank" in b:
            xroot_in.remove(meta_search)
    xtree_in.write(manifest_dir + filename_in)

In [4]:
def xml_clean_subsets(xtree_in, xroot_in,filename_in,subset_in,key_in):
    search_text = "./Attributes/Attribute/[@attribute_name='" + subset_in + "']" #example "./BioSample/Attributes/Attribute/[@attribute_name='acid_reflux']"
    
    for meta_search in xroot_in.findall('BioSample'):
        for meta_search2 in meta_search.findall(search_text):
            if meta_search2.text not in key_in:
                xroot_in.remove(meta_search)    
    xtree_in.write(manifest_dir + filename_in)

In [5]:
#define xml file
xml_file = 'biosample.xml'

#Read in file
xtree = et.parse(manifest_dir+xml_file)
xroot = xtree.getroot()

#Remove blanks
xml_clean_blanks(xtree,xroot,"biosample_samples.xml")

<h3> Create cohorts <h3>

Mental illness defines health status
- no illness is healthy (l1 = "no" to question of mental illness, l2 = "no" to other dx of mental illness)
- yes defines disease

In [6]:
#identify who does not have mental illness
xml_clean_subsets(xtree,xroot,"biosample_samples_h1.xml",'mental_illness',"No")

#remove those with other disorders
mental_list = ["alzheimers","depression_bipolar_schizophrenia","epilepsy_or_seizure_disorder"]
for disease in mental_list:
    xml_clean_subsets(xtree,xroot,"biosample_samples_h2.xml",disease,
                      ["I do not have this condition"])
del xtree, xroot

In [7]:
#read back in the samples xml file
xml_file = 'biosample_samples.xml'
xtree = et.parse(manifest_dir+xml_file)
xroot = xtree.getroot()

In [8]:
#identify who does have a mental illness
xml_clean_subsets(xtree,xroot,"biosample_samples_u1.xml",'mental_illness',"Yes")
del xtree, xroot

<h2> Metadata <h2>

<h3> determine which categories to include <h3>

In [9]:
def meta_inclusion(filename):
    lines = pd.read_csv(glob.glob(filename)[0],sep='\t',index_col=0)
    return lines

In [10]:
#Read in file for what metadata to include
meta_cat_df = meta_inclusion(manifest_dir+"metadata_cat_inclusion.txt")
drop_list = ["N","c"]
meta_list = meta_cat_df[~meta_cat_df.Inclusion.isin(drop_list)].index.tolist()
meta_list

['age_cat',
 'alzheimers',
 'bmi_cat',
 'body_product',
 'body_site',
 'depression_bipolar_schizophrenia',
 'epilepsy_or_seizure_disorder',
 'ibd',
 'ibd_diagnosis',
 'ibd_diagnosis_refined',
 'mental_illness',
 'mental_illness_type_bipolar_disorder',
 'mental_illness_type_depression',
 'mental_illness_type_ptsd_posttraumatic_stress_disorder',
 'mental_illness_type_schizophrenia',
 'mental_illness_type_substance_abuse',
 'race',
 'sample_name',
 'sex',
 'smoking_frequency']

<h3> create cohort dfs <h3>

In [11]:
def xml_df(xroot_in):
    id_list=[]
    df=pd.DataFrame()
    meta_sorted = []
    for node in xroot_in:
        s_id = node.attrib.get("accession")
        id_list.append(s_id)
        
        meta_tmp=[]
        for meta_select in meta_list:
            search_text = "./Attributes/Attribute/[@attribute_name='" + meta_select + "']"
            
            meta_val = "NA"
            for search_in in node.findall(search_text):
                meta_val = search_in.text
            
            meta_tmp.append(meta_val)
            meta_sorted.append(meta_select)
            
        if len(df)==0:
            df=pd.DataFrame(columns=meta_sorted)

        df.loc[len(df)] = meta_tmp
    
    df["s_ids"]=id_list
    return df

In [12]:
def metacount_all(df_in):
    for i in df_in.columns:
        display(df_in[i].value_counts().rename_axis(i).to_frame('Number of samples'))

In [13]:
#Create df of healthy metadata
xml_file = 'biosample_samples_h2.xml'
xtree = et.parse(manifest_dir+xml_file)
xroot = xtree.getroot()

df_meta_healthy = xml_df(xroot)
df_meta_healthy["status"]="healthy"
del xtree,xroot

In [14]:
metacount_all(df_meta_healthy)

Unnamed: 0_level_0,Number of samples
age_cat,Unnamed: 1_level_1
,2315
Unspecified,237
40s,77
50s,73
60s,55
30s,54
20s,45
70+,21
child,18
teen,11


Unnamed: 0_level_0,Number of samples
alzheimers,Unnamed: 1_level_1
I do not have this condition,1920
,986


Unnamed: 0_level_0,Number of samples
bmi_cat,Unnamed: 1_level_1
,2315
Unspecified,229
Normal,190
Overweight,103
Obese,41
Underweight,28


Unnamed: 0_level_0,Number of samples
body_product,Unnamed: 1_level_1
,2518
UBERON:feces,148
UBERON:sebum,119
UBERON:mucus,61
UBERON:saliva,60


Unnamed: 0_level_0,Number of samples
body_site,Unnamed: 1_level_1
,2518
UBERON:feces,148
UBERON:nostril,61
UBERON:skin of head,60
UBERON:tongue,60
UBERON:skin of axilla,59


Unnamed: 0_level_0,Number of samples
depression_bipolar_schizophrenia,Unnamed: 1_level_1
,2905
I do not have this condition,1


Unnamed: 0_level_0,Number of samples
epilepsy_or_seizure_disorder,Unnamed: 1_level_1
,2638
I do not have this condition,268


Unnamed: 0_level_0,Number of samples
ibd,Unnamed: 1_level_1
I do not have this condition,2543
,174
"Diagnosed by a medical professional (doctor, physician assistant)",110
Self-diagnosed,52
Not provided,15
Unspecified,8
Diagnosed by an alternative medicine practitioner,4


Unnamed: 0_level_0,Number of samples
ibd_diagnosis,Unnamed: 1_level_1
,2316
Unspecified,573
Crohn's disease,13
Ulcerative colitis,4


Unnamed: 0_level_0,Number of samples
ibd_diagnosis_refined,Unnamed: 1_level_1
,2316
Unspecified,573
Colonic Crohn's Disease,11
Ulcerative colitis,4
Ileal and Colonic Crohn's Disease,1
Ileal Crohn's Disease,1


Unnamed: 0_level_0,Number of samples
mental_illness,Unnamed: 1_level_1
,2316
No,590


Unnamed: 0_level_0,Number of samples
mental_illness_type_bipolar_disorder,Unnamed: 1_level_1
,2583
No,323


Unnamed: 0_level_0,Number of samples
mental_illness_type_depression,Unnamed: 1_level_1
,2583
No,323


Unnamed: 0_level_0,Number of samples
mental_illness_type_ptsd_posttraumatic_stress_disorder,Unnamed: 1_level_1
,2583
No,322
Yes,1


Unnamed: 0_level_0,Number of samples
mental_illness_type_schizophrenia,Unnamed: 1_level_1
,2906


Unnamed: 0_level_0,Number of samples
mental_illness_type_substance_abuse,Unnamed: 1_level_1
,2583
No,323


Unnamed: 0_level_0,Number of samples
race,Unnamed: 1_level_1
Caucasian,2537
Asian or Pacific Islander,158
,97
Other,71
Hispanic,32
African American,6
Unspecified,4
Not provided,1


Unnamed: 0_level_0,Number of samples
sample_name,Unnamed: 1_level_1
,2906


Unnamed: 0_level_0,Number of samples
sex,Unnamed: 1_level_1
,2315
female,421
male,169
unspecified,1


Unnamed: 0_level_0,Number of samples
smoking_frequency,Unnamed: 1_level_1
,2315
Never,566
Rarely (a few times/month),13
Daily,7
Unspecified,2
Regularly (3-5 times/week),2
Occasionally (1-2 times/week),1


Unnamed: 0_level_0,Number of samples
s_ids,Unnamed: 1_level_1
SAMEA4036915,1
SAMEA3633276,1
SAMEA6674519,1
SAMEA6675696,1
SAMEA3613321,1
...,...
SAMEA3633527,1
SAMEA6674508,1
SAMEA3633516,1
SAMEA3633998,1


Unnamed: 0_level_0,Number of samples
status,Unnamed: 1_level_1
healthy,2906


In [15]:
#Create df of ibd metadata
xml_file = 'biosample_samples_u1.xml'
xtree = et.parse(manifest_dir+xml_file)
xroot = xtree.getroot()

df_meta_unhealthy = xml_df(xroot)
df_meta_unhealthy["status"]="unhealthy"
del xtree,xroot

In [16]:
metacount_all(df_meta_unhealthy)

Unnamed: 0_level_0,Number of samples
age_cat,Unnamed: 1_level_1
,3679
50s,162
40s,161
30s,161
60s,133
20s,101
Unspecified,27
70+,24
teen,13
child,4


Unnamed: 0_level_0,Number of samples
alzheimers,Unnamed: 1_level_1
I do not have this condition,2401
Not provided,1276
,681
Unspecified,93
Unknown,9
Self-diagnosed,3
"Diagnosed by a medical professional (doctor, physician assistant)",2
Diagnosed by an alternative medicine practitioner,1


Unnamed: 0_level_0,Number of samples
bmi_cat,Unnamed: 1_level_1
,3679
Normal,431
Overweight,160
Obese,148
Underweight,28
Unspecified,19
Unknown,1


Unnamed: 0_level_0,Number of samples
body_product,Unnamed: 1_level_1
,3655
UBERON:feces,638
UBERON:sebum,121
UBERON:saliva,50
UBERON:mucus,2


Unnamed: 0_level_0,Number of samples
body_site,Unnamed: 1_level_1
,3655
UBERON:feces,638
UBERON:skin of axilla,59
UBERON:tongue,50
UBERON:skin of head,31
UBERON:skin of hand,29
UBERON:skin of trunk,2
UBERON:vaginal introitus,2


Unnamed: 0_level_0,Number of samples
depression_bipolar_schizophrenia,Unnamed: 1_level_1
,3737
Unspecified,719
Unknown,9
I do not have this condition,1


Unnamed: 0_level_0,Number of samples
epilepsy_or_seizure_disorder,Unnamed: 1_level_1
,3697
I do not have this condition,744
"Diagnosed by a medical professional (doctor, physician assistant)",14
Unknown,9
Unspecified,2


Unnamed: 0_level_0,Number of samples
ibd,Unnamed: 1_level_1
I do not have this condition,3845
,174
Not provided,142
"Diagnosed by a medical professional (doctor, physician assistant)",117
Unspecified,115
Self-diagnosed,59
Diagnosed by an alternative medicine practitioner,10
Unknown,4


Unnamed: 0_level_0,Number of samples
ibd_diagnosis,Unnamed: 1_level_1
,3689
Unspecified,761
Ulcerative colitis,9
Crohn's disease,7


Unnamed: 0_level_0,Number of samples
ibd_diagnosis_refined,Unnamed: 1_level_1
,3689
Unspecified,753
Ulcerative colitis,9
Microcolitis,8
Ileal Crohn's Disease,4
Ileal and Colonic Crohn's Disease,3


Unnamed: 0_level_0,Number of samples
mental_illness,Unnamed: 1_level_1
,3689
Yes,777


Unnamed: 0_level_0,Number of samples
mental_illness_type_bipolar_disorder,Unnamed: 1_level_1
,3733
No,666
Yes,67


Unnamed: 0_level_0,Number of samples
mental_illness_type_depression,Unnamed: 1_level_1
,3733
Yes,637
No,96


Unnamed: 0_level_0,Number of samples
mental_illness_type_ptsd_posttraumatic_stress_disorder,Unnamed: 1_level_1
,3733
No,645
Yes,88


Unnamed: 0_level_0,Number of samples
mental_illness_type_schizophrenia,Unnamed: 1_level_1
,3751
No,686
Yes,29


Unnamed: 0_level_0,Number of samples
mental_illness_type_substance_abuse,Unnamed: 1_level_1
,3733
No,669
Yes,64


Unnamed: 0_level_0,Number of samples
race,Unnamed: 1_level_1
Caucasian,3893
Asian or Pacific Islander,143
Other,102
,97
Unspecified,88
Hispanic,82
Not provided,46
African American,15


Unnamed: 0_level_0,Number of samples
sample_name,Unnamed: 1_level_1
,4466


Unnamed: 0_level_0,Number of samples
sex,Unnamed: 1_level_1
,3679
female,526
male,258
unspecified,2
other,1


Unnamed: 0_level_0,Number of samples
smoking_frequency,Unnamed: 1_level_1
,3679
Never,678
Rarely (a few times/month),55
Daily,32
Occasionally (1-2 times/week),11
Regularly (3-5 times/week),7
Unspecified,4


Unnamed: 0_level_0,Number of samples
s_ids,Unnamed: 1_level_1
SAMEA4036915,1
SAMEA3613112,1
SAMEA4036611,1
SAMEA3613412,1
SAMEA104604230,1
...,...
SAMEA3613525,1
SAMEA3612272,1
SAMEA3611508,1
SAMEA104701922,1


Unnamed: 0_level_0,Number of samples
status,Unnamed: 1_level_1
unhealthy,4466


In [17]:
#merge df's
df_meta_allsubj = df_meta_unhealthy
df_meta_allsubj = df_meta_allsubj.append(df_meta_healthy)
df_meta_allsubj.shape[0]

7372

<h3> filter cohort <h3>

Filtering process
1) remove samples with no data for categories (age, bmi, race, sex)

2) remove samples with non-western dx (ibd, mental illness)

3) remove age categories (child, teen, 70+)

4) remove bmi categories (underweight)

In [18]:
#determine which cols should be used as filters
remove_list = ["age_cat","bmi_cat","ibd","mental_illness","race","sex"]
#not used: "alzheimers","epilepsy_or_seizure_disorder"

#terms to remove
terms_list = ["Unknown","Unspecified","NA","Not provided", "unspecified","other",
              "Self-diagnosed","Diagnosed by an alternative medicine practitioner","child","teen", "Underweight", "70+"]
df_clean = df_meta_allsubj

print("Start:",df_clean.shape[0])

for colnames in remove_list:
    for terms in terms_list:
        df_clean = df_clean.drop(df_clean[(df_clean[colnames] == terms)].index)
    print(colnames,df_clean.shape[0])

Start: 7372
age_cat 473
bmi_cat 436
ibd 395
mental_illness 390
race 386
sex 386


In [19]:
#combine "overweight/obese" into one cat
df_clean_merge = df_clean.copy()
df_clean_merge["bmi_cat"].replace({"Overweight": "overweight/obese", "Obese": "overweight/obese"}, inplace=True)

In [20]:
metacount_all(df_clean_merge)

Unnamed: 0_level_0,Number of samples
age_cat,Unnamed: 1_level_1
40s,106
50s,88
30s,75
60s,73
20s,44


Unnamed: 0_level_0,Number of samples
alzheimers,Unnamed: 1_level_1
I do not have this condition,314
,68
Unspecified,4


Unnamed: 0_level_0,Number of samples
bmi_cat,Unnamed: 1_level_1
Normal,209
overweight/obese,177


Unnamed: 0_level_0,Number of samples
body_product,Unnamed: 1_level_1
UBERON:feces,224
,151
UBERON:sebum,5
UBERON:saliva,4
UBERON:mucus,2


Unnamed: 0_level_0,Number of samples
body_site,Unnamed: 1_level_1
UBERON:feces,224
,151
UBERON:skin of head,4
UBERON:tongue,4
UBERON:vaginal introitus,2
UBERON:skin of hand,1


Unnamed: 0_level_0,Number of samples
depression_bipolar_schizophrenia,Unnamed: 1_level_1
,219
Unspecified,167


Unnamed: 0_level_0,Number of samples
epilepsy_or_seizure_disorder,Unnamed: 1_level_1
I do not have this condition,313
,68
"Diagnosed by a medical professional (doctor, physician assistant)",5


Unnamed: 0_level_0,Number of samples
ibd,Unnamed: 1_level_1
I do not have this condition,370
"Diagnosed by a medical professional (doctor, physician assistant)",16


Unnamed: 0_level_0,Number of samples
ibd_diagnosis,Unnamed: 1_level_1
Unspecified,372
Crohn's disease,9
Ulcerative colitis,5


Unnamed: 0_level_0,Number of samples
ibd_diagnosis_refined,Unnamed: 1_level_1
Unspecified,371
Colonic Crohn's Disease,5
Ulcerative colitis,5
Ileal Crohn's Disease,3
Microcolitis,1
Ileal and Colonic Crohn's Disease,1


Unnamed: 0_level_0,Number of samples
mental_illness,Unnamed: 1_level_1
No,193
Yes,193


Unnamed: 0_level_0,Number of samples
mental_illness_type_bipolar_disorder,Unnamed: 1_level_1
No,220
,155
Yes,11


Unnamed: 0_level_0,Number of samples
mental_illness_type_depression,Unnamed: 1_level_1
,155
Yes,153
No,78


Unnamed: 0_level_0,Number of samples
mental_illness_type_ptsd_posttraumatic_stress_disorder,Unnamed: 1_level_1
No,208
,155
Yes,23


Unnamed: 0_level_0,Number of samples
mental_illness_type_schizophrenia,Unnamed: 1_level_1
,223
No,159
Yes,4


Unnamed: 0_level_0,Number of samples
mental_illness_type_substance_abuse,Unnamed: 1_level_1
No,221
,155
Yes,10


Unnamed: 0_level_0,Number of samples
race,Unnamed: 1_level_1
Caucasian,326
Asian or Pacific Islander,32
Other,18
Hispanic,8
African American,2


Unnamed: 0_level_0,Number of samples
sample_name,Unnamed: 1_level_1
,386


Unnamed: 0_level_0,Number of samples
sex,Unnamed: 1_level_1
female,226
male,160


Unnamed: 0_level_0,Number of samples
smoking_frequency,Unnamed: 1_level_1
Never,347
Rarely (a few times/month),17
Daily,11
Occasionally (1-2 times/week),6
Regularly (3-5 times/week),4
Unspecified,1


Unnamed: 0_level_0,Number of samples
s_ids,Unnamed: 1_level_1
SAMEA6674294,1
SAMEA6675722,1
SAMEA6675927,1
SAMEA6675793,1
SAMEA6675709,1
...,...
SAMEA6674792,1
SAMEA4670614,1
SAMEA6675735,1
SAMEA4788346,1


Unnamed: 0_level_0,Number of samples
status,Unnamed: 1_level_1
healthy,193
unhealthy,193


<h3> evaluate and filter <h3>

In [21]:
#generate numbers by cats
df_grouped = df_clean_merge.groupby(by=['age_cat',"sex","bmi_cat","race","status"]).size()
df_grouped.to_csv(stats_dir + "cohort_vals.csv")
df_grouped=pd.read_csv(stats_dir + "cohort_vals.csv")
df_grouped.rename(columns={'0': 'count'}, inplace=True)

In [22]:
def count_dfvals(df_in,unique_age,unique_sex,unique_bmi,unique_race,unique_status):
    try:
        count_val = df_in.loc[(df_in["age_cat"]==unique_age) & (df_in["sex"]==unique_sex) 
                              & (df_in["bmi_cat"]==unique_bmi) & (df_in["race"]==unique_race)
                              & (df_in["status"]==unique_status), 'count'].values[0]
        #print(unique_age,unique_sex,unique_bmi,unique_race,unique_status,":",count_val)
    except:
        #print(unique_age,unique_sex,unique_bmi,unique_race,unique_status,": failed")
        count_val = 0
    return count_val

In [23]:
#determine counts by age,sex,bmi,race and then ID into either healthy or unhealthy categories
col_list = ["age_cat","sex","bmi_cat","race","healthy","unhealthy"]
df_counts = pd.DataFrame(columns=col_list)

#iterate through age, sex, bmi and race
for unique_age in df_clean_merge.age_cat.unique():
    for unique_sex in df_clean_merge.sex.unique():
        for unique_bmi in df_clean_merge.bmi_cat.unique():
            for unique_race in df_clean_merge.race.unique():
                
                #count all rows that match the criteria, and are healthy or unhealthy
                count_health = count_dfvals(df_grouped,unique_age,unique_sex,unique_bmi,unique_race,"healthy")
                count_unhealth = count_dfvals(df_grouped,unique_age,unique_sex,unique_bmi,unique_race,"unhealthy")
                df_list = [unique_age,unique_sex,unique_bmi,unique_race,count_health,count_unhealth]
                df_counts.loc[len(df_counts)] = df_list

In [24]:
#drop any 0 cat combos, determine the min number between healthy and unhealthy
df_counts = df_counts[(df_counts.healthy != 0) & (df_counts.unhealthy != 0)]
df_counts["min"] = df_counts[['healthy','unhealthy']].min(axis=1)
df_counts.to_csv(stats_dir + "cohort_mins.csv")
print("Total cohort:", 2*df_counts['min'].sum())

Total cohort: 252.0


In [25]:
def merge_final(df_in,status_in,rows_in,df_out):
    df_tmp = df_in.drop(df_in[(df_in['status'] == status_in)].index)
    df_tmp = df_tmp.iloc[:(rows_in)]
    
    df_out = df_out.append(df_tmp)    
    return (df_out)

In [26]:
#create sample ID list that matches criterion
remove_list = ["age_cat","sex","race", 'bmi_cat']
col_list = list(df_clean_merge.columns)
col_list.insert(0,"index")
df_sids = pd.DataFrame(columns=col_list)

for index, row in df_counts.iterrows():
    df_tmp = df_clean_merge.copy()
    df_tmp.set_index('s_ids',inplace=True)
    rows_to_add = int(row['min'])

    #create subset df specific to matching criteria
    #age, sex, race
    for colnames in remove_list:
        df_tmp.drop(df_tmp[(df_tmp[colnames] != row[colnames])].index,inplace=True)
    
    #create healthy and unhealthy, then merge
    df_sids = merge_final(df_tmp,"healthy",rows_to_add,df_sids)    
    df_sids = merge_final(df_tmp,"unhealthy",rows_to_add,df_sids)

#write dataset
df_sids.to_csv(manifest_dir + "cohort_meta_l1.csv")

In [27]:
#Confirmation
print("Total cohort:", 2*df_counts['min'].sum())
print("Total df:", df_sids.shape[0])

Total cohort: 252.0
Total df: 252


<h3> evaluate and filter <h3>

Filtering included
1) removing 20 year olds - there numbers were too low

2) removing non-caucasians - not enough in cohort

3) errors - said "yes" for mental health issue, but no for all dx

In [28]:
#remove problematic samples
df_subset = df_sids.copy()
print(df_subset.shape)

#20 y.o. individuals
df_subset.drop(df_subset[(df_subset['age_cat'] == "20s")].index,inplace=True)
print(df_subset.shape)

#non-caucasian individuals
df_subset.drop(df_subset[(df_subset['race'] != "Caucasian")].index,inplace=True)
print(df_subset.shape)

#Yes = mental illness, but No/NA for all subcategories
remove_list = ["No", "NA"]

for option in remove_list:
    df_subset.drop(df_subset[(df_subset['mental_illness'] == "Yes") &
                             (df_subset['mental_illness_type_bipolar_disorder'] == option) & 
                             (df_subset['mental_illness_type_depression'] == option) & 
                             (df_subset['mental_illness_type_bipolar_disorder'] == option) & 
                             (df_subset['mental_illness_type_ptsd_posttraumatic_stress_disorder'] == option) & 
                             (df_subset['mental_illness_type_schizophrenia'] == option) &
                             (df_subset['mental_illness_type_substance_abuse'] == option)].index,inplace=True)
print(df_subset.shape)

(252, 23)
(238, 23)
(224, 23)
(202, 23)


In [29]:
#set index
df_subset["s_ids"]=df_subset.index
df_subset.drop(columns="index",inplace=True)

#create sample ID list that matches criterion
remove_list = ["age_cat","sex","race", 'bmi_cat']
df_counts=pd.read_csv(stats_dir + "cohort_mins_manuallyedited.csv")

#re-write prev df, prepare for cohort
col_list = list(df_subset.columns)
col_list.insert(0,"index")
df_sids = pd.DataFrame(columns=col_list)

for index, row in df_counts.iterrows():
    df_tmp = df_subset.copy()
    df_tmp.set_index('s_ids',inplace=True)
    rows_to_add = int(row['new_min'])

    #create subset df specific to matching criteria
    #age, sex, race
    for colnames in remove_list:
        df_tmp.drop(df_tmp[(df_tmp[colnames] != row[colnames])].index,inplace=True)
    
    #create healthy and unhealthy, then merge
    df_sids = merge_final(df_tmp,"healthy",rows_to_add,df_sids)    
    df_sids = merge_final(df_tmp,"unhealthy",rows_to_add,df_sids)

#write dataset
df_sids.to_csv(manifest_dir + "cohort_meta_l2.csv")

<h3> Include download information <h3>

In [30]:
#read in filereport and select cohort values
df_filereport=pd.read_csv(manifest_dir + "filereport_read_run_PRJEB11419_tsv.txt",sep='\t')
df_downloads = df_filereport[df_filereport.sample_accession.isin(list(df_sids.index))]
df_downloads

Unnamed: 0,study_accession,sample_accession,experiment_accession,run_accession,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp
24262,PRJEB11419,SAMEA4786600,ERX2710930,ERR2696537,408170,human gut metagenome,ftp.sra.ebi.ac.uk/vol1/fastq/ERR269/007/ERR269...,ftp.sra.ebi.ac.uk/vol1/run/ERR269/ERR2696537/1...,ftp.sra.ebi.ac.uk/vol1/err/ERR269/007/ERR2696537
24306,PRJEB11419,SAMEA4786644,ERX2710974,ERR2696581,408170,human gut metagenome,ftp.sra.ebi.ac.uk/vol1/fastq/ERR269/001/ERR269...,ftp.sra.ebi.ac.uk/vol1/run/ERR269/ERR2696581/1...,ftp.sra.ebi.ac.uk/vol1/err/ERR269/001/ERR2696581
24393,PRJEB11419,SAMEA4786731,ERX2711061,ERR2696668,408170,human gut metagenome,ftp.sra.ebi.ac.uk/vol1/fastq/ERR269/008/ERR269...,ftp.sra.ebi.ac.uk/vol1/run/ERR269/ERR2696668/1...,ftp.sra.ebi.ac.uk/vol1/err/ERR269/008/ERR2696668
24414,PRJEB11419,SAMEA4786752,ERX2711082,ERR2696689,408170,human gut metagenome,ftp.sra.ebi.ac.uk/vol1/fastq/ERR269/009/ERR269...,ftp.sra.ebi.ac.uk/vol1/run/ERR269/ERR2696689/1...,ftp.sra.ebi.ac.uk/vol1/err/ERR269/009/ERR2696689
24417,PRJEB11419,SAMEA4786755,ERX2711085,ERR2696692,408170,human gut metagenome,ftp.sra.ebi.ac.uk/vol1/fastq/ERR269/002/ERR269...,ftp.sra.ebi.ac.uk/vol1/run/ERR269/ERR2696692/1...,ftp.sra.ebi.ac.uk/vol1/err/ERR269/002/ERR2696692
...,...,...,...,...,...,...,...,...,...
26688,PRJEB11419,SAMEA6675972,ERX4022103,ERR4020715,408170,human gut metagenome,ftp.sra.ebi.ac.uk/vol1/fastq/ERR402/005/ERR402...,ftp.sra.ebi.ac.uk/vol1/run/ERR402/ERR4020715/1...,ftp.sra.ebi.ac.uk/vol1/err/ERR402/005/ERR4020715
26689,PRJEB11419,SAMEA6675973,ERX4022104,ERR4020716,408170,human gut metagenome,ftp.sra.ebi.ac.uk/vol1/fastq/ERR402/006/ERR402...,ftp.sra.ebi.ac.uk/vol1/run/ERR402/ERR4020716/1...,ftp.sra.ebi.ac.uk/vol1/err/ERR402/006/ERR4020716
26692,PRJEB11419,SAMEA6675976,ERX4022107,ERR4020719,408170,human gut metagenome,ftp.sra.ebi.ac.uk/vol1/fastq/ERR402/009/ERR402...,ftp.sra.ebi.ac.uk/vol1/run/ERR402/ERR4020719/1...,ftp.sra.ebi.ac.uk/vol1/err/ERR402/009/ERR4020719
26693,PRJEB11419,SAMEA6675977,ERX4022108,ERR4020720,408170,human gut metagenome,ftp.sra.ebi.ac.uk/vol1/fastq/ERR402/000/ERR402...,ftp.sra.ebi.ac.uk/vol1/run/ERR402/ERR4020720/1...,ftp.sra.ebi.ac.uk/vol1/err/ERR402/000/ERR4020720


In [31]:
#write out files for other scripts to read
file_out = manifest_dir + 'sid_list.txt'
with open(file_out, 'w') as f:
    f.write(df_downloads.run_accession.str.cat(sep='\n'))

file_out = manifest_dir + 'ftp_list.txt'
with open(file_out, 'w') as f:
    f.write(df_downloads.fastq_ftp.str.cat(sep='\n'))

In [32]:
#add sampleid to metadata df
split_list = []

df_tmp = df_downloads[['submitted_ftp','run_accession']]
for index, rows in df_tmp.iterrows():
    split_one = rows['submitted_ftp'].rsplit('/',1)[1].rsplit('.',2)[0].rstrip('.R1')
    split_list.append(split_one)

df_tmp = df_tmp.assign(sampleid=split_list)
df_tmp.index=df_downloads['sample_accession']

In [33]:
def merge_meta(df1, df2):
    result = pd.concat([df_sids, df_tmp], axis=1, sort=False)
    return result

In [34]:
#merge metadf
df_cohort = merge_meta(df_sids,df_tmp)
df_cohort.drop(columns=['submitted_ftp','index','sample_name','s_ids'],inplace=True)
df_cohort['sample_name']=df_cohort.index
df_cohort.index=df_cohort['sampleid']
del df_downloads
df_cohort

Unnamed: 0_level_0,age_cat,alzheimers,bmi_cat,body_product,body_site,depression_bipolar_schizophrenia,epilepsy_or_seizure_disorder,ibd,ibd_diagnosis,ibd_diagnosis_refined,...,mental_illness_type_ptsd_posttraumatic_stress_disorder,mental_illness_type_schizophrenia,mental_illness_type_substance_abuse,race,sex,smoking_frequency,status,run_accession,sampleid,sample_name
sampleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10317.000105348,40s,I do not have this condition,overweight/obese,UBERON:feces,UBERON:feces,Unspecified,I do not have this condition,I do not have this condition,Unspecified,Unspecified,...,No,No,No,Caucasian,female,Never,unhealthy,ERR4019277,10317.000105348,SAMEA6674381
10317.000102654,40s,I do not have this condition,overweight/obese,,,,I do not have this condition,I do not have this condition,Unspecified,Unspecified,...,,,,Caucasian,female,Never,healthy,ERR4020715,10317.000102654,SAMEA6675972
10317.000097313,60s,I do not have this condition,Normal,UBERON:feces,UBERON:feces,Unspecified,I do not have this condition,I do not have this condition,Unspecified,Unspecified,...,No,No,No,Caucasian,male,Never,unhealthy,ERR2697855,10317.000097313,SAMEA4788481
10317.000097312,60s,I do not have this condition,Normal,UBERON:feces,UBERON:feces,Unspecified,I do not have this condition,I do not have this condition,Unspecified,Unspecified,...,No,No,No,Caucasian,male,Never,unhealthy,ERR2697854,10317.000097312,SAMEA4788480
10317.000103666,60s,I do not have this condition,Normal,,,,I do not have this condition,I do not have this condition,Unspecified,Unspecified,...,,,,Caucasian,male,Never,healthy,ERR4020721,10317.000103666,SAMEA6675978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10317.000089690,60s,I do not have this condition,overweight/obese,UBERON:feces,UBERON:feces,Unspecified,I do not have this condition,I do not have this condition,Unspecified,Unspecified,...,No,Yes,No,Caucasian,male,Never,unhealthy,ERR2696537,10317.000089690,SAMEA4786600
10317.000099103,60s,I do not have this condition,overweight/obese,,,,I do not have this condition,I do not have this condition,Unspecified,Unspecified,...,,,,Caucasian,male,Never,healthy,ERR4020516,10317.000099103,SAMEA6675771
10317.000099068,60s,I do not have this condition,overweight/obese,,,,I do not have this condition,I do not have this condition,Unspecified,Unspecified,...,,,,Caucasian,male,Never,healthy,ERR4020511,10317.000099068,SAMEA6675766
10317.000098925,60s,I do not have this condition,overweight/obese,,,,I do not have this condition,I do not have this condition,Unspecified,Unspecified,...,,,,Caucasian,male,Never,healthy,ERR4020491,10317.000098925,SAMEA6675746


<h3> include seq information <h3>

In [35]:
df_downloads = pd.read_csv(manifest_dir + 'SRARunTable.txt',sep=",")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [36]:
keep_cols=['Library Name','run_prefix (exp)','run_date (exp)','barcode (exp)', 'linker (exp)','pcr_primers (exp)',
           'primer (exp)','run_center (exp)','well_description (exp)','COUNTRY']
df_downloads = df_downloads[keep_cols]
df_downloads = df_downloads.drop_duplicates()
df_downloads

Unnamed: 0,Library Name,run_prefix (exp),run_date (exp),barcode (exp),linker (exp),pcr_primers (exp),primer (exp),run_center (exp),well_description (exp),COUNTRY
0,10317.000001041,Pool_S1_L001,04/20/15,AGATCGTGCCTA,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,Plate60_C7,USA
1,10317.000001656,Pool_S1_L001,04/20/15,AGTAGTTTCCTT,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,Plate59_A6,USA
2,10317.000002252,Pool_S1_L001,04/20/15,GAGTCCGTTGCT,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,Plate61_D10,USA
3,10317.000003491,Pool_S1_L001,04/20/15,TCCACAGGGTTC,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,Plate60_B7,USA
4,10317.000003492,Pool_S1_L001,04/20/15,CCTCACTAGCGA,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,Plate60_D7,USA
...,...,...,...,...,...,...,...,...,...,...
26915,10317.000102751,Paulus_LIBR_35_AGP_257-258_LisaSRSraw_JEckmann...,2/8/19,GAGCCCAAAGAG,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,American_Gut_Project_Plate_257_102751_C7,USA
26916,10317.000102784,Paulus_LIBR_35_AGP_257-258_LisaSRSraw_JEckmann...,2/8/19,CTTGTGCGACAA,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,American_Gut_Project_Plate_258_102784_D11,USA
26917,10317.000102882,Paulus_LIBR_35_AGP_257-258_LisaSRSraw_JEckmann...,2/8/19,AGTAGTTTCCTT,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,American_Gut_Project_Plate_257_102882_A6,United Kingdom
26918,10317.000102894,Paulus_LIBR_35_AGP_257-258_LisaSRSraw_JEckmann...,2/8/19,GACCCGTTTCGC,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,American_Gut_Project_Plate_257_102894_B6,United Kingdom


In [37]:
#merge metadf
df_downloads['sampleid']=df_downloads['Library Name']
df_cohort.drop(columns='sampleid',inplace=True)
df_cohort.reset_index(inplace=True)
df_cohort_l3 = pd.concat([df_cohort, df_downloads], axis=1, join="inner")
del df_downloads

#create plateid col
df_cohort_l3['plateid']=df_cohort_l3['well_description (exp)'].str.replace(r'_\w\w[0-9]','') #removes wellID >10
df_cohort_l3['plateid']=df_cohort_l3['plateid'].str.replace(r'_\w\w','') #removes wellID <10

#write dataset
df_cohort_l3.to_csv(manifest_dir + "cohort_meta_l3.csv")
df_cohort_l3

Unnamed: 0,sampleid,age_cat,alzheimers,bmi_cat,body_product,body_site,depression_bipolar_schizophrenia,epilepsy_or_seizure_disorder,ibd,ibd_diagnosis,...,run_date (exp),barcode (exp),linker (exp),pcr_primers (exp),primer (exp),run_center (exp),well_description (exp),COUNTRY,sampleid.1,plateid
0,10317.000105348,40s,I do not have this condition,overweight/obese,UBERON:feces,UBERON:feces,Unspecified,I do not have this condition,I do not have this condition,Unspecified,...,04/20/15,AGATCGTGCCTA,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,Plate60_C7,USA,10317.000001041,Plate60
1,10317.000102654,40s,I do not have this condition,overweight/obese,,,,I do not have this condition,I do not have this condition,Unspecified,...,04/20/15,AGTAGTTTCCTT,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,Plate59_A6,USA,10317.000001656,Plate59
2,10317.000097313,60s,I do not have this condition,Normal,UBERON:feces,UBERON:feces,Unspecified,I do not have this condition,I do not have this condition,Unspecified,...,04/20/15,GAGTCCGTTGCT,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,Plate61_D10,USA,10317.000002252,Plate61
3,10317.000097312,60s,I do not have this condition,Normal,UBERON:feces,UBERON:feces,Unspecified,I do not have this condition,I do not have this condition,Unspecified,...,04/20/15,TCCACAGGGTTC,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,Plate60_B7,USA,10317.000003491,Plate60
4,10317.000103666,60s,I do not have this condition,Normal,,,,I do not have this condition,I do not have this condition,Unspecified,...,04/20/15,CCTCACTAGCGA,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,Plate60_D7,USA,10317.000003492,Plate60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,10317.000089690,60s,I do not have this condition,overweight/obese,UBERON:feces,UBERON:feces,Unspecified,I do not have this condition,I do not have this condition,Unspecified,...,04/20/15,AATCTTGCGCCG,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,Plate58_D9,USA,10317.000020108,Plate58
112,10317.000099103,60s,I do not have this condition,overweight/obese,,,,I do not have this condition,I do not have this condition,Unspecified,...,04/20/15,TAGTCTAAGGGT,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,Plate60_G10,USA,10317.000020109,Plate60
113,10317.000099068,60s,I do not have this condition,overweight/obese,,,,I do not have this condition,I do not have this condition,Unspecified,...,04/20/15,AGCTCTAGAAAC,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,Plate60_C3,USA,10317.000020336,Plate60
114,10317.000098925,60s,I do not have this condition,overweight/obese,,,,I do not have this condition,I do not have this condition,Unspecified,...,04/20/15,CGCACCCATACA,GT,FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT,GTGTGYCAGCMGCCGCGGTAA,UCSDMI,Plate58_H10,USA,10317.000020368,Plate58


<h3> Prep Q2 manifest <h3>

In [38]:
for cols in df_cohort_l3.columns:
    print (cols)

sampleid
age_cat
alzheimers
bmi_cat
body_product
body_site
depression_bipolar_schizophrenia
epilepsy_or_seizure_disorder
ibd
ibd_diagnosis
ibd_diagnosis_refined
mental_illness
mental_illness_type_bipolar_disorder
mental_illness_type_depression
mental_illness_type_ptsd_posttraumatic_stress_disorder
mental_illness_type_schizophrenia
mental_illness_type_substance_abuse
race
sex
smoking_frequency
status
run_accession
sample_name
Library Name
run_prefix (exp)
run_date (exp)
barcode (exp)
linker (exp)
pcr_primers (exp)
primer (exp)
run_center (exp)
well_description (exp)
COUNTRY
sampleid
plateid


In [77]:
df_cohort_q2 = df_cohort_l3.copy()

#drop unneeded cols
df_cohort_q2.drop(columns=['well_description (exp)','sampleid'],inplace=True)

#remove (exp) from col name
df_cohort_q2.columns = df_cohort_q2.columns.str.replace(' \(exp\)','')

#fix other issues
df_cohort_q2 = df_cohort_q2.rename(columns={'Library Name': 'library_name','COUNTRY':'country','run_accession':'sampleid',
                                           'run_prefix':'Run-ID'})

#add fastq path
df_cohort_q2['fq1'] = 'e:/"My Files"/"My Files"/Dissertation/Analysis/Aim3/data/split_files/' + df_cohort_q2['sampleid'] + '_R1.fastq'
df_cohort_q2['fq2'] = 'e:/"My Files"/"My Files"/Dissertation/Analysis/Aim3/data/split_files/' + df_cohort_q2['sampleid'] + '_R2.fastq'


In [78]:
#rearraign cols
cols = list(df_cohort_q2.columns.values)
cols

['age_cat',
 'alzheimers',
 'bmi_cat',
 'body_product',
 'body_site',
 'depression_bipolar_schizophrenia',
 'epilepsy_or_seizure_disorder',
 'ibd',
 'ibd_diagnosis',
 'ibd_diagnosis_refined',
 'mental_illness',
 'mental_illness_type_bipolar_disorder',
 'mental_illness_type_depression',
 'mental_illness_type_ptsd_posttraumatic_stress_disorder',
 'mental_illness_type_schizophrenia',
 'mental_illness_type_substance_abuse',
 'race',
 'sex',
 'smoking_frequency',
 'status',
 'sampleid',
 'sample_name',
 'library_name',
 'Run-ID',
 'run_date',
 'barcode',
 'linker',
 'pcr_primers',
 'primer',
 'run_center',
 'country',
 'plateid',
 'fq1',
 'fq2']

In [79]:
df_cohort_q2 = df_cohort_q2[['sampleid','Run-ID','fq1','fq2','barcode','linker','pcr_primers','primer',
 'sample_name', 'library_name',  'run_date', 'run_center','plateid',
 'status','age_cat','bmi_cat','race', 'sex','country',
 'epilepsy_or_seizure_disorder',
 'ibd', 'ibd_diagnosis', 'ibd_diagnosis_refined',
 'mental_illness', 'mental_illness_type_bipolar_disorder', 'mental_illness_type_depression',
 'mental_illness_type_ptsd_posttraumatic_stress_disorder', 'mental_illness_type_schizophrenia',
 'mental_illness_type_substance_abuse']]

In [86]:
#write dataset
df_cohort_q2.to_csv(manifest_dir + "cohort_meta_q2.csv",index=False)