# ASD Meta-Analysis

#### This notebook contains the steps to process and merge the metadata files from all studies together for combines study analyses

In [12]:
#Import dependencies
from qiime2 import Visualization
import os
import qiime2 as q2
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import scipy
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

#### Load Metadata

In [13]:
amgut = pd.read_csv("./American_gut_metadata.txt",sep='\t',index_col=0)
berding = pd.read_csv("./meta-berding.txt",sep='\t',index_col=0)
cao = pd.read_csv("./meta-cao.txt",sep='\t',index_col=0)
chen = pd.read_csv("./meta-chen.txt",sep='\t',index_col=0)
dan = pd.read_csv("./meta-dan.txt",sep='\t',index_col=0)
david = pd.read_csv("./meta-david.txt",sep='\t',index_col=0)
huang = pd.read_csv("./meta-huang.txt",sep='\t',index_col=0)
fouquier = pd.read_csv("./meta-fouquier.txt",sep='\t',index_col=0)
kang = pd.read_csv("./meta-kang.txt",sep='\t',index_col=0)
kong = pd.read_csv("./meta-kong.txt",sep='\t',index_col=0)
liu = pd.read_csv("./meta-liu.txt",sep='\t',index_col=0)
son = pd.read_csv("./meta-son.txt",sep='\t',index_col=0)
zou = pd.read_csv("./meta-zou.txt",sep='\t',index_col=0)
zurita = pd.read_csv("./meta-zurita.txt",sep='\t',index_col=0)

In [14]:
amgut = amgut.drop(amgut.loc[amgut["Sex"]=="LabControl test"].index)

In [15]:
all_meta = [amgut, berding, cao, chen, dan, david, huang, fouquier, kang, kong, liu, son, zou, zurita]

all_meta_merged = pd.concat(all_meta)

all_meta_merged

Unnamed: 0_level_0,Age,Control_Relation,Control_Type,Control_relation,Country,Sample_size,Sex,Status,Study,Subjects_Location,Variable_Region,sequencing_depth_min
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
56608.10317.000059852,7,,Age_Sex_Match,No Relationship,,,male,ASD,American Gut,,V4,6000.0
56608.10317.000073749,9,,Age_Sex_Match,No Relationship,,,male,ASD,American Gut,,V4,6000.0
56608.10317.000072136,10,,Age_Sex_Match,No Relationship,,,male,ASD,American Gut,,V4,6000.0
56608.10317.000073612,18,,Age_Sex_Match,No Relationship,,,male,ASD,American Gut,,V4,6000.0
56754.10317.000021574,5,,Age_Sex_Match,No Relationship,,,male,ASD,American Gut,,V4,6000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
P53.stool,8,,Age_sex_no_relation,,Ecuador,50.0,male,Control,Zurita2019,,V4,
P54.stool,5,,Age_sex_no_relation,,Ecuador,50.0,female,Control,Zurita2019,,V4,
P55.stool,11,,Age_sex_no_relation,,Ecuador,50.0,male,Control,Zurita2019,,V4,
P56.stool,12,,Age_sex_no_relation,,Ecuador,50.0,male,Control,Zurita2019,,V4,


In [16]:
all_meta_merged['Age'] = pd.to_numeric(all_meta_merged['Age'], errors='coerce')

#### Define processing functions

In [17]:
def add_age(row): 
    if row['Age'] < 5 :
        return 'Below 5 years'
    if row['Age'] >= 5 and row['Age'] <= 7:
        return '5-7 years'
    if row['Age'] > 7:
        return 'Above 7 years'
    if row['Age'] == "Unknown":
        return "NaN"
    if row['Age'] == "NaN":
        return "NaN"
    else :
        return 'NaN'
    
def sequencing_depth_min(row): 
    if row['Study'] == "American Gut" :
        return 6000
    if row['Study'] == "Berding2020" :
        return 14300
    if row['Study'] == "Cao2021":
        return 5837
    if row['Study'] == "Chen2020":
        return 18417
    if row['Study'] == "Dan2020":
        return 26868
    if row['Study'] == "David2021":
        return 5559
    if row['Study'] == "Huang2021":
        return 14075
    if row['Study'] == "Fouquier2021":
        return 20428
    if row['Study'] == "Kang2017":
        return 5636
    if row['Study'] == "Kong2019":
        return 18116
    if row['Study'] == "Liu2019":
        return 22613
    if row['Study'] == "Son2015":
        return 49184
    if row['Study'] == "Zou2020":
        return 28246
    if row['Study'] == "Zurita2019":
        return 5802
    else :
        return "NaN"
    
def sequencing_depth_range(row):
    if row['Study'] == "American Gut" :
        return "< 6000"
    if row['Study'] == "Berding2020" :
        return "< 6000"
    if row['Study'] == "Cao2021":
        return "< 6000"
    if row['Study'] == "Chen2020":
        return "> 1400"
    if row['Study'] == "Dan2020":
        return "> 1400"
    if row['Study'] == "David2021":
        return "< 6000"
    if row['Study'] == "Huang2021":
        return "> 1400"
    if row['Study'] == "Fouquier2021":
        return "> 1400"
    if row['Study'] == "Kang2017":
        return "< 6000"
    if row['Study'] == "Kong2019":
        return "> 1400"
    if row['Study'] == "Liu2019":
        return "> 1400"
    if row['Study'] == "Son2015":
        return "> 1400"
    if row['Study'] == "Zou2020":
        return "> 1400"
    if row['Study'] == "Zurita2019":
        return "< 6000"
    else :
        return 'NaN'

def control_type_add(row):
    if row['Study'] == "American Gut" :
        return "No Relationship"
    if row['Study'] == "Berding2020" :
        return "No Relationship"
    if row['Study'] == "Cao2021":
        return "No Relationship"
    if row['Study'] == "Chen2020":
        return "Related"
    if row['Study'] == "Dan2020":
        return "No Relationship"
    if row['Study'] == "David2021":
        return "Related"
    if row['Study'] == "Huang2021":
        return "No Relationship"
    if row['Study'] == "Fouquier2021":
        return "Related"
    if row['Study'] == "Kang2017":
        return "No Relationship"
    if row['Study'] == "Kong2019":
        return "Related"
    if row['Study'] == "Liu2019":
        return "No Relationship"
    if row['Study'] == "Son2015":
        return "Related"
    if row['Study'] == "Zou2020":
        return "No Relationship"
    if row['Study'] == "Zurita2019":
        return "No Relationship"
    else :
        return 'NA'
    
def Berding_Sample_Size(row): 
    if row['Study'] == "Berding2020" :
        return 52
    else :
        return row['Sample_size']

def Berding_Country(row): 
    if row['Study'] == "Berding2020" :
        return "USA"
    if row['Study'] == "American Gut" :
        return "USA"
    if row['Study'] == "Kang2017" or row['Study'] == "kang" :
        return "USA"
    if row['Study'] == "Kong2019" or row['Study'] == "kong" :
        return "USA"
    else :
        return row['Country']
    
def samp_size(row): 
    if row['Study'] == "American Gut" :
        return 532
    if row['Study'] == "Berding2020" :
        return 52
    if row['Study'] == "Cao2021":
        return 86
    if row['Study'] == "Chen2020":
        return 123
    if row['Study'] == "Dan2020":
        return 286
    if row['Study'] == "David2021":
        return 135
    if row['Study'] == "Huang2021":
        return 83
    if row['Study'] == "Fouquier2021":
        return 78
    if row['Study'] == "Kang2017":
        return 38
    if row['Study'] == "Kong2019":
        return 45
    if row['Study'] == "Liu2019":
        return 50
    if row['Study'] == "Son2015":
        return 103
    if row['Study'] == "Zou2020":
        return 96
    if row['Study'] == "Zurita2019":
        return 50
    else :
        return "NaN"

In [22]:
all_meta_merged['Age_Range'] = all_meta_merged.apply (lambda row: add_age(row), axis=1)
all_meta_merged['sequencing_depth_min'] = all_meta_merged.apply (lambda row: sequencing_depth_min(row), axis=1)
all_meta_merged['seq_depth_range'] = all_meta_merged.apply (lambda row: sequencing_depth_range(row), axis=1)
all_meta_merged['Control_relation'] = all_meta_merged.apply (lambda row: control_type_add(row), axis=1)
all_meta_merged['Country'] = all_meta_merged.apply (lambda row: Berding_Country(row), axis=1)
all_meta_merged['Sample_size'] = all_meta_merged.apply (lambda row: samp_size(row), axis=1)
all_meta_merged['Sample_size'] = all_meta_merged.apply (lambda row: Berding_Sample_Size(row), axis=1)

In [64]:
all_meta_merged.to_csv("Master_complete_metadata.txt",sep='\t')

In [24]:
all_meta_merged['Study'].value_counts()

American Gut    524
Dan2020         286
David2021       125
Chen2020        123
Son2015         103
Zou2020          96
Cao2021          86
Huang2021        83
Fouquier2021     78
Berding2020      52
Zurita2019       50
Liu2019          50
Kang2017         38
Kong2019         36
Name: Study, dtype: int64

## Source truth metadata

#### Load original unprocessed metadata files 

In [44]:
amgut = pd.read_csv("./American_gut_metadata.txt",sep='\t',index_col=0)
berding_source = pd.read_csv("../../Berding_2020/sample_metadata.txt",sep='\t',index_col=0)
cao_source = pd.read_csv("../../Cao_2021/sample_metadata.txt",sep='\t',index_col=0)
chen_source = pd.read_csv("../../Chen_2020/sample_metadata.txt",sep='\t',index_col=0)
dan_source = pd.read_csv("../../Dan_2020/sample_metadata.txt",sep='\t',index_col=0)
david_source = pd.read_csv("../../David_2021/sample_metadata.txt",sep='\t',index_col=0)
huang_source = pd.read_csv("../../Huang_2021/sample_metadata.txt",sep='\t',index_col=0)
fouquier_source = pd.read_csv("../../Fouquier_2021/sample_metadata.txt",sep='\t',index_col=0)
kang_source = pd.read_csv("../../Kang_2017/sample_metadata_rf_kang.txt",sep='\t',index_col=0)
kong_source = pd.read_csv("../../Kong_2019/sample_metadata.txt",sep='\t',index_col=0)
liu_source = pd.read_csv("../../Liu_2019/sample_metadata.txt",sep='\t',index_col=0)
son_source = pd.read_csv("../../Son_2015/sample_metadata.txt",sep='\t',index_col=0)
zou_source = pd.read_csv("../../Zou_2020/sample_metadata.txt",sep='\t',index_col=0)
zurita_source = pd.read_csv("../../Zurita_2019/sample_metadata.txt",sep='\t',index_col=0)

In [45]:
kang_source = kang_source.drop(kang_source.loc[kang_source["collection-method"]=="swab"].index)
amgut = amgut.drop(amgut.loc[amgut["Sex"]=="LabControl test"].index)
zurita_source['Age'] = pd.to_numeric(zurita_source['Age'], errors='coerce')

In [46]:
all_original = [amgut, berding_source, cao_source, chen_source, dan_source, david_source, huang_source, 
                fouquier_source, kang_source, kong_source, liu_source, son_source, zou_source, zurita_source]

all_original_merged = pd.concat(all_original)

all_original_merged

Unnamed: 0,Abdominal_pain,Age,Alcohol_use,Allergies,Asian,Assay Type,Autism_ID,B.feed,BioProject,BioSample,...,sample_name.1,sequencing_depth_min,sev,severe,teeth.Br,tonsils.rem,trouble_sleeping,unique_id,weeks-since-experiment-start,what.type
56608.10317.000059852,,7,,,,,,,,,...,,6000.0,,,,,,,,
56608.10317.000073749,,9,,,,,,,,,...,,6000.0,,,,,,,,
56608.10317.000072136,,10,,,,,,,,,...,,6000.0,,,,,,,,
56608.10317.000073612,,18,,,,,,,,,...,,6000.0,,,,,,,,
56754.10317.000021574,,5,,,,,,,,,...,,6000.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P53.stool,,8,,,,,,,,,...,,,,,,,,,,
P54.stool,,5,,,,,,,,,...,,,,,,,,,,
P55.stool,,11,,,,,,,,,...,,,,,,,,,,
P56.stool,,12,,,,,,,,,...,,,,,,,,,,


In [47]:
ground_truth = all_original_merged[['Age','Sex','Status','Study','Variable_Region','Control_Type','Cohort',
                                    'Subjects_Location']]

#### Define metadata formatting functions

In [48]:
def country(row): 
    if row['Study'] == "American Gut" :
        return "USA"
    if row['Study'] == "Berding2020" :
        return "USA"
    if row['Study'] == "Cao2021":
        return "China"
    if row['Study'] == "Chen2020":
        return "China"
    if row['Study'] == "Dan2020":
        return "China"
    if row['Study'] == "David2021":
        return "USA"
    if row['Study'] == "Huang2021":
        return "China"
    if row['Study'] == "Fouquier2021":
        return "USA"
    if row['Study'] == "Kang2017":
        return "USA"
    if row['Study'] == "Kong2019":
        return "USA"
    if row['Study'] == "Liu2019":
        return "China"
    if row['Study'] == "Son2015":
        return "USA"
    if row['Study'] == "Zou2020":
        return "China"
    if row['Study'] == "Zurita2019":
        return "Equador"
    else :
        return 'NaN'
    
def Study(row): 
    if row['Study'] == "Berding2020" :
        return row['Study']
    if row['Study'] == "American Gut" :
        return row['Study']
    else :
        return row['Cohort']

In [49]:
# Apply formatting columns to merged metadata file

ground_truth['Age'] = pd.to_numeric(ground_truth['Age'], errors='coerce')
ground_truth['Age_Range'] = ground_truth.apply (lambda row: add_age(row), axis=1)
ground_truth['sequencing_depth_min'] = ground_truth.apply (lambda row: sequencing_depth_min(row), axis=1)
ground_truth['seq_depth_range'] = ground_truth.apply (lambda row: sequencing_depth_range(row), axis=1)
ground_truth['Control_relation'] = ground_truth.apply (lambda row: control_type_add(row), axis=1)
ground_truth['Study'] = ground_truth.apply (lambda row: Study(row), axis=1)
ground_truth['country'] = ground_truth.apply (lambda row: country(row), axis=1)

In [54]:
ground_truth.index.name = '#SampleID'

# Remove unwanted columns
del ground_truth["Cohort"]
del ground_truth["Subjects_Location"]

KeyError: 'Cohort'

In [52]:
ground_truth

Unnamed: 0_level_0,Age,Sex,Status,Study,Variable_Region,Control_Type,Age_Range,sequencing_depth_min,seq_depth_range,Control_relation,country
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
56608.10317.000059852,7.0,male,ASD,American Gut,V4,Age_Sex_Match,5-7 years,6000,< 6000,No Relationship,USA
56608.10317.000073749,9.0,male,ASD,American Gut,V4,Age_Sex_Match,Above 7 years,6000,< 6000,No Relationship,USA
56608.10317.000072136,10.0,male,ASD,American Gut,V4,Age_Sex_Match,Above 7 years,6000,< 6000,No Relationship,USA
56608.10317.000073612,18.0,male,ASD,American Gut,V4,Age_Sex_Match,Above 7 years,6000,< 6000,No Relationship,USA
56754.10317.000021574,5.0,male,ASD,American Gut,V4,Age_Sex_Match,5-7 years,6000,< 6000,No Relationship,USA
...,...,...,...,...,...,...,...,...,...,...,...
P53.stool,8.0,male,Control,Zurita2019,V4,Age_Sex_Match,Above 7 years,,,,Equador
P54.stool,5.0,female,Control,Zurita2019,V4,Age_Sex_Match,5-7 years,,,,Equador
P55.stool,11.0,male,Control,Zurita2019,V4,Age_Sex_Match,Above 7 years,,,,Equador
P56.stool,12.0,male,Control,Zurita2019,V4,Age_Sex_Match,Above 7 years,,,,Equador


#### Export file to plot figure 1 in R studio. 

In [53]:
ground_truth.to_csv("ground_truth.txt", sep='\t')