In [None]:
import biom
from biom.util import biom_open
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Metadata

In [None]:
# Hospital metadata: numeric identifiers embedded in the sample-information file names.
names_meta_v = [9102, 9159, 9230, 9249]


def get_file_meta(x):
    """Return the path of the sample-information TSV whose name ends in ``x``."""
    return 'data/hospital/sample_information_from_prep_' + str(x) + '.tsv'


# Read every file, stack the frames, and drop rows that are exact duplicates.
prep_frames = []
for prep_id in names_meta_v:
    prep_frames.append(pd.read_csv(get_file_meta(prep_id), sep='\t'))
hospital_meta = pd.concat(prep_frames).drop_duplicates()
hospital_meta.head()

In [None]:
# Keep only the sample name, the SARS-CoV-2 screening result and the sample type.
# Note: this rebinds `hospital_meta` to the three-column view.
metadata_columns = ['sample_name', 'sample_sarscov2_screening_result', 'study_sample_type']
hospital_meta = hospital_meta.loc[:, metadata_columns]
hospital_meta

In [None]:
# Keep rows whose sample type is one of the four listed types AND whose
# screening result is either 'not detected' or 'positive'.
row_filter = "study_sample_type in ['stool', 'forehead', 'inside floor', 'nares'] & \
    sample_sarscov2_screening_result in ['not detected', 'positive']"
relevant_rows = hospital_meta.query(row_filter)

# Renumber the surviving rows from 0 so later splits get a clean index.
data = relevant_rows.reset_index(drop=True)
data

# Microbiome data

In [None]:
# Load the four BIOM tables (loaded in the order listed).
table1, table2, table3, table4 = (
    biom.load_table(biom_path)
    for biom_path in (
        "data/hospital/150/133520_all.biom",
        "data/hospital/150/134073_all.biom",
        "data/hospital/150/134769_all.biom",
        "data/hospital/150/134858_all.biom",
    )
)

# Fold the tables into a single one, left to right.
merged_table = table1
for next_table in (table2, table3, table4):
    merged_table = merged_table.merge(next_table)

# Persist the merged table; 'created table' is passed as the second
# positional argument of to_hdf5 (generated_by in biom's API).
with biom_open('data/input/merged_biom_table.biom', 'w') as biom_file:
    merged_table.to_hdf5(biom_file, 'created table')

# Show the merged table as a DataFrame.
merged_table.to_dataframe()

# Train and test split

In [None]:
# Separate the remaining columns (X) from the two label columns (y).
label_columns = ['study_sample_type', 'sample_sarscov2_screening_result']
X = data.drop(columns=label_columns)
y = data[label_columns]

# Hold out 20% as the test set, stratifying on y (both label columns),
# with a fixed random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
y_test

In [None]:
def _write_then_read_names(path, names):
    """Write one name per line to ``path``, then read the file back.

    Parameters
    ----------
    path : str
        Text file to (over)write.
    names : iterable
        Values to write; each is formatted with an f-string and followed by a newline.

    Returns
    -------
    list of str
        The lines read back from ``path``, each stripped of surrounding whitespace.
    """
    with open(path, "w") as sink:
        sink.writelines(f'{name}\n' for name in names)
    with open(path, "r") as source:
        return [line.strip() for line in source]


# Names of the samples used for training (written to disk and read back as a list).
samples_train = _write_then_read_names("data/input/training_samples.txt", X_train['sample_name'])

# Names of the samples used for testing (written to disk and read back as a list).
samples_test = _write_then_read_names("data/input/test_samples.txt", X_test['sample_name'])


def check_covid_positive(row):
    """Encode a screening result as a binary flag.

    Parameters
    ----------
    row : object
        A single screening-result value (the function compares it directly
        against a string, so it is a scalar rather than a whole row).

    Returns
    -------
    int
        1 when ``row`` equals the string 'positive', otherwise 0.
    """
    return 1 if row == 'positive' else 0
# Rebuild the hospital metadata with all of its columns from the four
# sample-information files (the same files loaded at the top of the notebook).
names_meta_v = [9102, 9159, 9230, 9249]


def get_file_meta(x):
    """Return the path of the sample-information TSV whose name ends in ``x``."""
    return 'data/hospital/sample_information_from_prep_' + str(x) + '.tsv'


hospital_meta = pd.concat(
    [pd.read_csv(get_file_meta(prep_id), sep='\t') for prep_id in names_meta_v]
).drop_duplicates()

# Binary label: 1 where the screening result is 'positive', 0 for anything else.
hospital_meta['has_covid'] = hospital_meta['sample_sarscov2_screening_result'].apply(check_covid_positive)

# Select the metadata rows that belong to each split by sample name.
in_training_split = hospital_meta["sample_name"].isin(samples_train)
in_test_split = hospital_meta["sample_name"].isin(samples_test)
training_data = hospital_meta[in_training_split]
test_data = hospital_meta[in_test_split]

# Save the training and test metadata as tab-separated files.
training_data.to_csv("data/input/training_metadata.tsv", sep="\t", index=False)
test_data.to_csv("data/input/test_metadata.tsv", sep="\t", index=False)

In [None]:
# Export each split as tab-separated text (the files keep their .csv names).
# X files carry the sample type alongside the features; y files carry only
# the screening result.
export_plan = (
    (X_train, y_train, 'data/input/samples_X_train.csv', 'data/input/samples_y_train.csv'),
    (X_test, y_test, 'data/input/samples_X_test.csv', 'data/input/samples_y_test.csv'),
)
for features, labels, features_path, labels_path in export_plan:
    features_out = features.assign(study_sample_type=labels['study_sample_type']).reset_index(drop=True)
    features_out.to_csv(features_path, index=False, sep='\t')

    labels_out = labels.drop(columns=['study_sample_type']).reset_index(drop=True)
    labels_out.to_csv(labels_path, index=False, sep='\t')

## Train metadata per sample environment

In [None]:
# Reload the training metadata from the tab-separated file on disk.
training_metadata = pd.read_csv(
    'data/input/training_metadata.tsv',
    sep="\t",
)

In [None]:
# One training subset per sample type, selected on the study_sample_type column.
training_sample_type = training_metadata['study_sample_type']
training_metadata_inside_floor = training_metadata.loc[training_sample_type.eq('inside floor')]
training_metadata_forehead = training_metadata.loc[training_sample_type.eq('forehead')]
training_metadata_stool = training_metadata.loc[training_sample_type.eq('stool')]
training_metadata_nares = training_metadata.loc[training_sample_type.eq('nares')]

In [None]:
# Report the (rows, columns) shape of each per-sample-type training subset,
# in the order: inside floor, forehead, stool, nares.
# The last entry previously repeated the inside-floor shape instead of nares.
print(
    training_metadata_inside_floor.shape,
    training_metadata_forehead.shape,
    training_metadata_stool.shape,
    training_metadata_nares.shape,
)

In [None]:
# Write each per-sample-type training subset to its own tab-separated file.
training_exports = (
    (training_metadata_inside_floor, "data/input/training_metadata_inside_floor.tsv"),
    (training_metadata_forehead, "data/input/training_metadata_forehead.tsv"),
    (training_metadata_stool, "data/input/training_metadata_stool.tsv"),
    (training_metadata_nares, "data/input/training_metadata_nares.tsv"),
)
for subset_frame, subset_path in training_exports:
    subset_frame.to_csv(subset_path, sep="\t", index=False)

## Test metadata per sample environment

In [None]:
# Reload the test metadata from the tab-separated file on disk.
test_metadata = pd.read_csv(
    'data/input/test_metadata.tsv',
    sep="\t",
)

In [None]:
# One test subset per sample type, selected on the study_sample_type column.
test_sample_type = test_metadata['study_sample_type']
test_metadata_inside_floor = test_metadata.loc[test_sample_type.eq('inside floor')]
test_metadata_forehead = test_metadata.loc[test_sample_type.eq('forehead')]
test_metadata_stool = test_metadata.loc[test_sample_type.eq('stool')]
test_metadata_nares = test_metadata.loc[test_sample_type.eq('nares')]

In [None]:
# Report the (rows, columns) shape of each per-sample-type test subset,
# in the order: inside floor, forehead, stool, nares.
print(
    test_metadata_inside_floor.shape,
    test_metadata_forehead.shape,
    test_metadata_stool.shape,
    test_metadata_nares.shape,
)

In [None]:
# Write each per-sample-type test subset to its own tab-separated file.
test_exports = (
    (test_metadata_inside_floor, "data/input/test_metadata_inside_floor.tsv"),
    (test_metadata_forehead, "data/input/test_metadata_forehead.tsv"),
    (test_metadata_stool, "data/input/test_metadata_stool.tsv"),
    (test_metadata_nares, "data/input/test_metadata_nares.tsv"),
)
for subset_frame, subset_path in test_exports:
    subset_frame.to_csv(subset_path, sep="\t", index=False)