In [None]:
# !pip install biom-format

In [30]:
import biom
# import torch
# from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Meta data

In [31]:
# hospital meta data
names_meta_v = [9102, 9159, 9230, 9249]
get_file_meta = lambda x: 'data/hospital/sample_information_from_prep_'+str(x)+'.tsv'
hospital_meta = pd.concat([pd.read_csv(get_file_meta(i), sep='\t') for i in names_meta_v]).drop_duplicates()
hospital_meta.head()

Unnamed: 0,sample_name,sex,e_cq,n1_cq,n2_cq,rp_cq,title,empo_1,empo_2,empo_3,...,days_since_collection_started,followup_clinical_swab_result,viral_copies_mean_log_confirmed,viral_copies_mean_log_stringent,viral_copies_per_swab_stringent,sample_sarscov2_screening_result,patient_type_in_room_at_collection,sampling_session_has_patient_positives,sampling_session_has_surface_positives,qiita_study_id
0,13092.000117015.covid,not applicable,,,,31.771654,COVID-19 hospital study,Free-living,Non-saline,Surface (non-saline),...,0.0,pos,,,,not detected,unoccupied,n,n,13092
1,13092.000117016.covid,not applicable,,,,37.422719,COVID-19 hospital study,Free-living,Non-saline,Surface (non-saline),...,0.0,pos,,,,not detected,unoccupied,n,n,13092
2,13092.000117017.covid,not applicable,,,,,COVID-19 hospital study,Free-living,Non-saline,Surface (non-saline),...,0.0,pos,,,,not detected,unoccupied,n,n,13092
3,13092.000117018.covid,not applicable,,,,,COVID-19 hospital study,Free-living,Non-saline,Water (non-saline),...,0.0,pos,,,,not detected,unoccupied,n,n,13092
4,13092.000117019.covid,not applicable,,,,,COVID-19 hospital study,Free-living,Non-saline,Surface (non-saline),...,0.0,pos,,,,not detected,unoccupied,n,n,13092


# Microbiome data

In [32]:
def get_src(table):
    '''Returns a DataFrame with columns for sample, asv, and asv count'''
    s_ids = table.ids()
    obs_ids = table.ids(axis='observation')
    coo = table.matrix_data.tocoo()
    counts, rows, cols = coo.data, coo.row, coo.col
    
    src = pd.DataFrame({
        'sample': [s_ids[col] for col in cols],
        'asv': [obs_ids[row] for row in rows],
        'count': coo.data
    })
    return src

def get_data(table):
    '''Returns DataFrame indexed by sample name with one column for ASVs'''
    s_ids = table.ids()
    coo = table.matrix_data.tocoo()
    counts, rows, cols = coo.data, coo.row, coo.col
        
    # dictionary key: sample, value: asv
    asv_data = {s_id: [] for s_id in s_ids}
    for count, row, col in zip(counts, rows, cols):
        s_id = table.ids()[col]
        obs_seq = table.ids(axis='observation')[row]
        asv_data[s_id].append(obs_seq)

    # dictionary to array
    model_input = [np.array(value) for key, value in asv_data.items()]

    # return 
    input_df = pd.DataFrame({'ASVs': [row.tolist() for row in model_input]})
    input_df['sample'] = asv_data.keys()
    input_df = input_df.set_index('sample')
    return input_df

In [33]:
from biom.util import biom_open

In [34]:
# merge biome tables
table1 = biom.load_table("data/hospital/150/133520_all.biom")
table2 = biom.load_table("data/hospital/150/134073_all.biom")
table3 = biom.load_table("data/hospital/150/134769_all.biom")
table4 = biom.load_table("data/hospital/150/134858_all.biom")
merged_table = table1.merge(table2).merge(table3).merge(table4)

with biom_open('data/input/merged_biom_table.biom', 'w') as f:
    merged_table.to_hdf5(f, 'created table')


# load table as df
merged_table.to_dataframe()

Unnamed: 0,13092..54.covid,13092..55.covid,13092..56.covid,13092..61.covid,13092..62.covid,13092..63.covid,13092..64.covid,13092.000117015.covid,13092.000117016.covid,13092.000117017.covid,...,13092.v2.8.blank.A6,13092.v2.8.blank.B6,13092.v2.8.blank.C6,13092.v2.8.blank.D6,13092.v2.8.blank.D8,13092.v2.8.blank.E6,13092.v2.8.blank.F6,13092.v2.8.blank.G6,13092.v2.8.blank.H1,13092.v2.8.blank.H6
AAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGAC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCTACCGCGCTTCGCTTGGTCAACCCCTCAGCGGCAAAAATTAAAATTTTTACCGCTTCGGCGTTATAACCTCACACTCAATCTTTTATCACGAAGTCATGATTGAAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACTCTGCAGGTTGGATACGCCAATCATTTTTATCGAAGCGCGCATAAATTTGAGCAGATTTGTCGTCACAGGTTGCGCCGCCAAAACGTCGGCTACAGTAACTTTTCCCAGCCTCAATCTCATCTCTCTTTTTGCGTTCTGCTTCAAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCGGGTGCTCGCGTTGTTCGGAATGACTGGGCGTAAAGCGCGCGTAGGCGGATCGGTAAGTCAGAGGTGAAAGCCCGGGGCTCAACCCCGGAATGGCCTTTGAGACTCCCGGTCTTGAGTTCGAGAGAGGTGGGTGGAATTCCGAG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTTTCGACTCATCAGAAATATCCGAAAGTGTTAACTTCTGCGTCATGGAAGCGATAAAACTCTGCAGGTTGGATACGCCAATCATTTTTATCGAAGCGCGCATAAATTTGAGCAGATTTGTCGTCACAGGTTGCGCCGCCAAAACGTCGG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTTTCTCATTTTCCGCCAGCAGTCCACTTCGATTTAATTCGTAAACAAGCAGTAGTAATTCCTGCTTTATCAAGATAATTTTTCGACTCATCAGAAATATCCGAAAGTGTTAACTTCTGCGTCATGGAAGCGATAAAACTCTGCAGGTTG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
#organize ASVs by sample
merged_biom = get_data(merged_table)
merged_biom

Unnamed: 0_level_0,ASVs
sample,Unnamed: 1_level_1
13092..54.covid,[AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGA...
13092..55.covid,[AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGA...
13092..56.covid,[TACAGAGGATGCAAGCGTTATCCGGAATGATTGGGCGTAAAGCGT...
13092..61.covid,[TACAGAGGTCTCAAGCGTTGTTCGGAATCACTGGGCGTAAAGCGT...
13092..62.covid,[AACGTAGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGA...
...,...
13092.v2.8.blank.E6,[TACAGAGGGTGCGAGCGTTAATCGGATTTACTGGGCGTAAAGCGC...
13092.v2.8.blank.F6,[AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGA...
13092.v2.8.blank.G6,[CACGATTAACCCAAGCCAATAGAAGCCGGCGTAAAGAGCGTTTTA...
13092.v2.8.blank.H1,[TACGTAGGGGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGCGC...


# Merge meta and biome data

Sample sizes according to paper
- **forehead**
  - 79 total
  - 63 not detected and 16 positives
- **inside floor**
  - 107 total
  - 67 not detected and 40 positive
- **nares**
  - 76 total
  - 45 not detected and 31 positives
- **stool**
  - 44 total
  - 33 not detected and 11 positives

In [36]:
# merge meta data with all biome data 
meta_biom = hospital_meta[['sample_name', 'sample_sarscov2_screening_result', 'study_sample_type']].merge(merged_biom, left_on='sample_name', right_index=True)
meta_biom

Unnamed: 0,sample_name,sample_sarscov2_screening_result,study_sample_type,ASVs
0,13092.000117015.covid,not detected,outside floor,[AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGA...
1,13092.000117016.covid,not detected,outside door handle,[CACGTAGGGGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGGGC...
2,13092.000117017.covid,not detected,inside door handle,[CACAAGTAAGACAAGTGTTATTCATCATTATTAGGTTTAAAGCGT...
3,13092.000117018.covid,not detected,water,[TACGAAAGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGC...
4,13092.000117019.covid,not detected,keyboard,[GACAGAGGATGCAAGCGTTATCCGGAATGATTGGGCGTAAAGCGT...
...,...,...,...,...
330,13092.v2.14.ZymoMock.D7,positive,,[TACAGAGGATGCAAGCGTTATCCGGAATGATTGGGCGTAAAGCGT...
331,13092.v2.14.ZymoMock.E7,positive,,[TACGAAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGC...
332,13092.v2.14.ZymoMock.F7,positive,,[TACAGAGGGTGCGAGCGTTAATCGGATTTACTGGGCGTAAAGCGT...
333,13092.v2.14.ZymoMock.G7,not detected,,[TACGAAGGGGGCTAGCGTTGCTCGGAATCACTGGGCGTAAAGGGT...


In [8]:
# query to get relevant rows
data = meta_biom.query(
    "study_sample_type in ['stool', 'forehead', 'inside floor', 'nares'] & \
    sample_sarscov2_screening_result in ['not detected', 'positive']"
).reset_index(drop=True)
data

Unnamed: 0,sample_name,sample_sarscov2_screening_result,study_sample_type,ASVs
0,13092.000117020.covid,not detected,inside floor,[AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGA...
1,13092.000117027.covid,not detected,inside floor,[AACAGAGGGGGCGAGCGTTAATCATCATGACTGGGCGTAAAGGGT...
2,13092.000117035.covid,not detected,inside floor,[GACAGAGGATGCAAGCGTTATCTGGAATGATTGGGCGTAAGGCGT...
3,13092.000117050.covid,positive,inside floor,[AACGGGGGGGGCAAGTGTTCTTCGGAATGACTAGGCGTAAAGGGC...
4,13092.000117082.covid,not detected,inside floor,[AACGGAGGGAGTGAGTGTTATCCGTCAAAACTGGGCGTAAAGGGT...
...,...,...,...,...
332,13092.000119591.covid,positive,nares,[CACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTA...
333,13092.000119592.covid,positive,nares,[CACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTA...
334,13092.000119603.covid,not detected,stool,[AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGA...
335,13092.000119650.covid,not detected,inside floor,[AACGTAGGGTGCAAGCGTTGCCCGGAATTACTGGGTGTAAAGGGA...


In [9]:
data.iloc[214]

sample_name                                                     13092.000118492.covid
sample_sarscov2_screening_result                                         not detected
study_sample_type                                                               nares
ASVs                                [CACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTA...
Name: 214, dtype: object

In [None]:
# Pad asv array length
max_len = max(data['ASVs'].apply(len))  # longest is data.iloc[214]

data['ASVs'] = data['ASVs'].apply(lambda x: np.pad(x, (0, max_len - len(x)), mode='constant', constant_values="N"))
data.iloc[242].get('ASVs')

# Train and test split

In [11]:
# from sklearn.model_selection import train_test_split

# def get_train_test_split(df):
#     X = df.drop('sample_sarscov2_screening_result', axis=1)
#     y = df['sample_sarscov2_screening_result']
    
#     return train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [12]:
# stool = data.query("study_sample_type == 'stool'")
# forehead = data.query("study_sample_type == 'forehead'")
# nares = data.query("study_sample_type == 'nares'")
# inside_floor = data.query("study_sample_type == 'inside floor'")

# stool_X_train, stool_X_test, stool_y_train, stool_y_test = get_train_test_split(stool)
# forehead_X_train, forehead_X_test, forehead_y_train, forehead_y_test = get_train_test_split(forehead)
# nares_X_train, nares_X_test, nares_y_train, nares_y_test = get_train_test_split(nares)
# if_X_train, if_X_test, if_y_train, if_y_test = get_train_test_split(inside_floor)

In [13]:
# X_train = pd.concat([stool_X_train, forehead_X_train, nares_X_train, if_X_train]).reset_index(drop=True)
# X_train#.to_csv('samples_X_train.csv', index=False)

In [14]:
# y_train = pd.concat([stool_y_train, forehead_y_train, nares_y_train, if_y_train]).reset_index(drop=True)
# y_train.to_csv('samples_y_train.csv', index=False)

In [15]:
# X_test = pd.concat([stool_X_test, forehead_X_test, nares_X_test, if_X_test]).reset_index(drop=True)
# X_test.to_csv('samples_X_test.csv', index=False)

In [16]:
# y_test = pd.concat([stool_y_test, forehead_y_test, nares_y_test, if_y_test]).reset_index(drop=True)
# y_test.to_csv('samples_y_test.csv', index=False)

In [37]:
# split data
X = data.drop(columns=['study_sample_type', 'sample_sarscov2_screening_result'], axis=1)
y = data[['study_sample_type', 'sample_sarscov2_screening_result']]
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
y_test

Unnamed: 0,study_sample_type,sample_sarscov2_screening_result
322,stool,not detected
234,inside floor,positive
249,inside floor,positive
262,inside floor,not detected
36,inside floor,not detected
...,...,...
199,inside floor,not detected
269,forehead,not detected
92,forehead,positive
301,nares,positive


In [38]:
#Write txt file  with names of sample used for training
with open("data/input/training_samples.txt", "w") as f:
    for s in X_train['sample_name']:
        f.write(f'{s}\n')
#Read the names of each sample into a array
with open("data/input/training_samples.txt", "r") as f:
    samples = [s.strip() for s in f.readlines()]
samples

# hospital meta data
names_meta_v = [9102, 9159, 9230, 9249]
get_file_meta = lambda x: 'data/hospital/sample_information_from_prep_'+str(x)+'.tsv'
hospital_meta = pd.concat([pd.read_csv(get_file_meta(i), sep='\t') for i in names_meta_v]).drop_duplicates()
hospital_meta.head()
training_data = hospital_meta.loc[hospital_meta["sample_name"].isin(samples)]
#One hot encode covid result
def check_covid_positive(row):
    if row =='positive':
        return 1
    else:
        return 0
#Add has_covid column
training_data['has_covid'] = training_data['sample_sarscov2_screening_result'].apply(check_covid_positive)
#save training metadata to tsv
training_data.to_csv("data/input/training_metadata.tsv", sep="\t", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['has_covid'] = training_data['sample_sarscov2_screening_result'].apply(check_covid_positive)


In [39]:
# to csv
X_train.assign(study_sample_type=y_train['study_sample_type']).reset_index(drop=True).to_csv('data/input/samples_X_train.csv', index=False, sep='\t')
X_test.assign(study_sample_type=y_test['study_sample_type']).reset_index(drop=True).to_csv('data/input/samples_X_test.csv', index=False, sep='\t')
y_train.drop(columns=['study_sample_type']).reset_index(drop=True).to_csv('data/input/samples_y_train.csv', index=False, sep='\t')
y_test.drop(columns=['study_sample_type']).reset_index(drop=True).to_csv('data/input/samples_y_test.csv', index=False, sep='\t')