In [11]:
import os
import pandas as pd
from biom import load_table
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score

In [12]:

path = os.getcwd()
# Load TSV file as BIOM Table
tsv_filename = path+ '/Data/tax_defined_observations.tsv'
table = load_table(tsv_filename)

# Now you can work with the biom_table object
table

986 x 951 <class 'biom.table.Table'> with 76533 nonzero entries (8% dense)

In [13]:
# Load metadata CSV
metadata_path = path+ '/Data/metadata_filtered.txt'
metadata = pd.read_csv(metadata_path, sep = '\t')
metadata["cardiovascular_disease"] = metadata["cardiovascular_disease"].map({"Diagnosed by a medical professional (doctor, physician assistant)": 1, "I do not have this condition": 0})
metadata

Unnamed: 0,#SampleID,subset_diabetes,host_taxid,allergic_to_unspecified,qiita_study_id,host_common_name,non_food_allergies_sun,physical_specimen_remaining,alcohol_types_unspecified,body_site,...,allergic_to_i_have_no_food_allergies_that_i_know_of,taxon_id,subset_bmi,description,non_food_allergies_drug_eg_penicillin,subset_healthy,env_feature,body_habitat,public,sex
0,10317.000074244.69032,True,9606.0,No,10317.0,human,No,Yes,No,UBERON:feces,...,Yes,408170.0,False,American Gut Project Stool sample,No,False,human-associated habitat,UBERON:feces,Yes,female
1,10317.000097250.74745,True,9606.0,No,10317.0,human,No,Yes,No,UBERON:feces,...,Yes,408170.0,True,American Gut Project Stool sample,No,False,human-associated habitat,UBERON:feces,Yes,male
2,10317.000037974.57180,True,9606.0,TRUE,10317.0,human,FALSE,TRUE,FALSE,UBERON:feces,...,FALSE,408170.0,True,American Gut Project Stool Sample,TRUE,True,human-associated habitat,UBERON:feces,TRUE,male
3,10317.000110314.76911,False,9606.0,No,10317.0,human,No,Yes,Yes,UBERON:feces,...,Yes,408170.0,False,American Gut Project Stool sample,No,False,human-associated habitat,UBERON:feces,Yes,female
4,10317.000051141.58828,True,9606.0,FALSE,10317.0,human,FALSE,TRUE,FALSE,UBERON:feces,...,FALSE,408170.0,True,American Gut Project Stool Sample,FALSE,False,human-associated habitat,UBERON:feces,TRUE,male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1214,10317.000054290.59891,True,9606.0,FALSE,10317.0,human,FALSE,TRUE,FALSE,UBERON:feces,...,TRUE,408170.0,True,American Gut Project Stool Sample,FALSE,True,human-associated habitat,UBERON:feces,TRUE,female
1215,,,,,,,,,,,...,,,,,,,,,,
1216,,,,,,,,,,,...,,,,,,,,,,
1217,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# Load OTU table CSV
otu_table_path = path+ '/Data/tax_defined_observations.tsv'
otu_table = pd.read_csv(otu_table_path, sep='\t')
otu_table

Unnamed: 0,# Constructed from biom file,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 942,Unnamed: 943,Unnamed: 944,Unnamed: 945,Unnamed: 946,Unnamed: 947,Unnamed: 948,Unnamed: 949,Unnamed: 950,Unnamed: 951
0,#OTU ID,10317.000043142.57559,10317.000038011.57180,10317.000046290.61067,10317.000109775.81188,10317.000084678.59320,10317.000030301.60418,10317.000076569.71221,10317.000038264.60384,10317.000092657.60749,...,10317.000042641.60384,10317.000032878.61470,10317.000030076.61073,10317.000107693.74745,10317.000108395.76911,10317.000047586.67836,10317.000097351.60850,10317.000027732.61094,10317.000068298.60825,10317.000029345.56980
1,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,755,529,107,51,203,1568,7,135,278,...,221,30,113,322,224,1633,226,1148,174,755
2,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,749,449,113,58,187,114,0,585,636,...,422,31,230,282,29,487,333,189,246,974
3,k__Bacteria; p__Firmicutes; c__Bacilli; o__Tur...,0,2,0,0,4,42,0,39,0,...,0,3,0,0,0,10,0,33,4,0
4,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,2572,145,615,269,579,14,0,518,341,...,1030,60,371,1657,6,16,1066,39,183,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
982,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lac...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
983,k__Bacteria; p__Chloroflexi; c__C0119; o__; f_...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
984,k__Bacteria; p__Proteobacteria; c__Gammaproteo...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lac...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
otu_table.columns = otu_table.iloc[0]

# Drop the first row as it's now redundant
otu_table = otu_table[1:]
otu_table


Unnamed: 0,#OTU ID,10317.000043142.57559,10317.000038011.57180,10317.000046290.61067,10317.000109775.81188,10317.000084678.59320,10317.000030301.60418,10317.000076569.71221,10317.000038264.60384,10317.000092657.60749,...,10317.000042641.60384,10317.000032878.61470,10317.000030076.61073,10317.000107693.74745,10317.000108395.76911,10317.000047586.67836,10317.000097351.60850,10317.000027732.61094,10317.000068298.60825,10317.000029345.56980
1,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,755,529,107,51,203,1568,7,135,278,...,221,30,113,322,224,1633,226,1148,174,755
2,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,749,449,113,58,187,114,0,585,636,...,422,31,230,282,29,487,333,189,246,974
3,k__Bacteria; p__Firmicutes; c__Bacilli; o__Tur...,0,2,0,0,4,42,0,39,0,...,0,3,0,0,0,10,0,33,4,0
4,k__Bacteria; p__Firmicutes; c__Clostridia; o__...,2572,145,615,269,579,14,0,518,341,...,1030,60,371,1657,6,16,1066,39,183,32
5,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lac...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
982,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lac...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
983,k__Bacteria; p__Chloroflexi; c__C0119; o__; f_...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
984,k__Bacteria; p__Proteobacteria; c__Gammaproteo...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lac...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
df = otu_table
df = pd.DataFrame(df)
# Transpose the DataFrame
# table.set_index('genus', inplace=True)

# table = table.reset_index(drop=True)

transposed_df = df.T

# # Print the transposed DataFrame
print(transposed_df.index)
transposed_df
transposed_df.columns = transposed_df.iloc[0]

# Drop the first row as it's now redundant
transposed_df = transposed_df[1:]
transposed_df

Index(['#OTU ID', '10317.000043142.57559', '10317.000038011.57180',
       '10317.000046290.61067', '10317.000109775.81188',
       '10317.000084678.59320', '10317.000030301.60418',
       '10317.000076569.71221', '10317.000038264.60384',
       '10317.000092657.60749',
       ...
       '10317.000042641.60384', '10317.000032878.61470',
       '10317.000030076.61073', '10317.000107693.74745',
       '10317.000108395.76911', '10317.000047586.67836',
       '10317.000097351.60850', '10317.000027732.61094',
       '10317.000068298.60825', '10317.000029345.56980'],
      dtype='object', name=0, length=952)


#OTU ID,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Blautia; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__; s__,k__Bacteria; p__Firmicutes; c__Bacilli; o__Turicibacterales; f__Turicibacteraceae; g__Turicibacter; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__; s__,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Lactobacillaceae; g__Lactobacillus; s__coleohominis,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; ;,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Oscillospira; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Ruminococcus; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Gemmiger; s__formicilis,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Lactonifactor; s__longoviformis,...,k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Oxalobacteraceae; g__Janthinobacterium; s__,k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhizobiales; f__Xanthobacteraceae; g__; s__,k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__Morganella; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Selenomonas; s__noxia,k__Bacteria; p__Actinobacteria; c__Thermoleophilia; o__Solirubrobacterales; ; ;,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Streptococcaceae; g__Streptococcus; s__sobrinus,k__Bacteria; p__Chloroflexi; c__C0119; o__; f__; g__; s__,k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__Tatumella;,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Streptococcaceae; g__Streptococcus; s__minor,k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Oxalobacteraceae; g__; s__
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10317.000043142.57559,755,749,0,2572,0,171,421,1560,0,2,...,0,0,0,0,0,0,0,0,0,0
10317.000038011.57180,529,449,2,145,0,316,83,62,0,0,...,0,0,0,0,0,0,0,0,0,0
10317.000046290.61067,107,113,0,615,0,68,331,267,41,0,...,0,0,0,0,0,0,0,0,0,0
10317.000109775.81188,51,58,0,269,0,149,378,349,0,0,...,0,0,0,0,0,0,0,0,0,0
10317.000084678.59320,203,187,4,579,0,459,430,292,60,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10317.000047586.67836,1633,487,10,16,0,65,43,29,615,0,...,0,0,0,0,0,0,0,0,0,0
10317.000097351.60850,226,333,0,1066,0,503,381,189,91,0,...,0,0,0,0,0,0,0,0,0,0
10317.000027732.61094,1148,189,33,39,0,139,90,132,2,0,...,0,0,0,0,0,0,0,0,0,0
10317.000068298.60825,174,246,4,183,0,73,114,139,95,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:

df2 = transposed_df
# print(df2)
df2['SampleID'] = df2.index.tolist()
print(df2.index.tolist())

metadata = metadata.sort_values(by='#SampleID')
metadata = metadata.reset_index(drop=True)

# Drop duplicate SampleID values from data
df2 = df2.drop_duplicates('SampleID')
metadata = metadata.drop_duplicates('#SampleID')
df2

['10317.000043142.57559', '10317.000038011.57180', '10317.000046290.61067', '10317.000109775.81188', '10317.000084678.59320', '10317.000030301.60418', '10317.000076569.71221', '10317.000038264.60384', '10317.000092657.60749', '10317.000076296.57928', '10317.000046405.60472', '10317.000029385.56980', '10317.000082577.59320', '10317.000076358.60749', '10317.000110307.76911', '10317.000002942.56754', '10317.000090013.59872', '10317.000068089.61197', '10317.000071449.69032', '10317.000079147.59872', '10317.000084660.59320', '10317.000033355.61470', '10317.000044556.57559', '10317.000044556.57861', '10317.000085104.66110', '10317.000059954.58674', '10317.000108987.81188', '10317.000023880.57900', '10317.000028806.60384', '10317.000098610.58165', '10317.000063003.58828', '10317.000020525.58699', '10317.000097085.57363', '10317.000109209.81188', '10317.000054290.59891', '10317.000103739.67599', '10317.000066982.64045', '10317.000023026.56754', '10317.000042899.57861', '10317.000046933.58305',

#OTU ID,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Blautia; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__; s__,k__Bacteria; p__Firmicutes; c__Bacilli; o__Turicibacterales; f__Turicibacteraceae; g__Turicibacter; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__; s__,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Lactobacillaceae; g__Lactobacillus; s__coleohominis,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; ;,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Oscillospira; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Ruminococcus; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Gemmiger; s__formicilis,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Lactonifactor; s__longoviformis,...,k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhizobiales; f__Xanthobacteraceae; g__; s__,k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__Morganella; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Selenomonas; s__noxia,k__Bacteria; p__Actinobacteria; c__Thermoleophilia; o__Solirubrobacterales; ; ;,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Streptococcaceae; g__Streptococcus; s__sobrinus,k__Bacteria; p__Chloroflexi; c__C0119; o__; f__; g__; s__,k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__Tatumella;,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Streptococcaceae; g__Streptococcus; s__minor,k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Oxalobacteraceae; g__; s__,SampleID
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10317.000043142.57559,755,749,0,2572,0,171,421,1560,0,2,...,0,0,0,0,0,0,0,0,0,10317.000043142.57559
10317.000038011.57180,529,449,2,145,0,316,83,62,0,0,...,0,0,0,0,0,0,0,0,0,10317.000038011.57180
10317.000046290.61067,107,113,0,615,0,68,331,267,41,0,...,0,0,0,0,0,0,0,0,0,10317.000046290.61067
10317.000109775.81188,51,58,0,269,0,149,378,349,0,0,...,0,0,0,0,0,0,0,0,0,10317.000109775.81188
10317.000084678.59320,203,187,4,579,0,459,430,292,60,0,...,0,0,0,0,0,0,0,0,0,10317.000084678.59320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10317.000047586.67836,1633,487,10,16,0,65,43,29,615,0,...,0,0,0,0,0,0,0,0,0,10317.000047586.67836
10317.000097351.60850,226,333,0,1066,0,503,381,189,91,0,...,0,0,0,0,0,0,0,0,0,10317.000097351.60850
10317.000027732.61094,1148,189,33,39,0,139,90,132,2,0,...,0,0,0,0,0,0,0,0,0,10317.000027732.61094
10317.000068298.60825,174,246,4,183,0,73,114,139,95,0,...,0,0,0,0,0,0,0,0,0,10317.000068298.60825


In [18]:
# Merge the two DataFrames based on the common column '#OTU ID'
final_data = df2.merge(metadata[['#SampleID', 'cardiovascular_disease']], left_on='SampleID', right_on='#SampleID', how='left')

# Drop the duplicated '#SampleID' column
final_data.drop('SampleID', axis=1, inplace=True)

# Rename the 'cardiovascular_disease' column to 'CVD'
final_data.rename(columns={'cardiovascular_disease': 'CVD'}, inplace=True)

# Display the updated DataFrame
final_data



Unnamed: 0,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Blautia; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__; s__,k__Bacteria; p__Firmicutes; c__Bacilli; o__Turicibacterales; f__Turicibacteraceae; g__Turicibacter; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__; s__,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Lactobacillaceae; g__Lactobacillus; s__coleohominis,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; ;,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Oscillospira; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Ruminococcus; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Gemmiger; s__formicilis,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Lactonifactor; s__longoviformis,...,k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__Morganella; s__,k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Selenomonas; s__noxia,k__Bacteria; p__Actinobacteria; c__Thermoleophilia; o__Solirubrobacterales; ; ;,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Streptococcaceae; g__Streptococcus; s__sobrinus,k__Bacteria; p__Chloroflexi; c__C0119; o__; f__; g__; s__,k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__Tatumella;,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Streptococcaceae; g__Streptococcus; s__minor,k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Oxalobacteraceae; g__; s__,#SampleID,CVD
0,755,749,0,2572,0,171,421,1560,0,2,...,0,0,0,0,0,0,0,0,10317.000043142.57559,1.0
1,529,449,2,145,0,316,83,62,0,0,...,0,0,0,0,0,0,0,0,10317.000038011.57180,1.0
2,107,113,0,615,0,68,331,267,41,0,...,0,0,0,0,0,0,0,0,10317.000046290.61067,1.0
3,51,58,0,269,0,149,378,349,0,0,...,0,0,0,0,0,0,0,0,10317.000109775.81188,1.0
4,203,187,4,579,0,459,430,292,60,0,...,0,0,0,0,0,0,0,0,10317.000084678.59320,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
946,1633,487,10,16,0,65,43,29,615,0,...,0,0,0,0,0,0,0,0,10317.000047586.67836,0.0
947,226,333,0,1066,0,503,381,189,91,0,...,0,0,0,0,0,0,0,0,10317.000097351.60850,1.0
948,1148,189,33,39,0,139,90,132,2,0,...,0,0,0,0,0,0,0,0,10317.000027732.61094,1.0
949,174,246,4,183,0,73,114,139,95,0,...,0,0,0,0,0,0,0,0,10317.000068298.60825,0.0


In [19]:
# Assuming you have a DataFrame named 'data' with headers
# Extract the "CVD" and "SampleID" columns for later use
cvd_column = final_data['CVD']
sample_id_column = final_data['#SampleID']

# Remove "CVD" and "SampleID" columns from the DataFrame
final_data = final_data.drop(columns=['CVD', '#SampleID'])

# Convert the data to numeric values
final_data = final_data.apply(pd.to_numeric, errors='coerce')

# Calculate the sum of each column
column_sums = final_data.sum()

# Calculate relative abundances by dividing each column's values by its sum
relative_abundances = final_data.div(column_sums)
relative_abundances = relative_abundances.round(5)

row_sum = relative_abundances.iloc[3].sum()
print("Sum ", row_sum)
# Add back the "CVD" and "SampleID" columns to the DataFrame
relative_abundances['CVD'] = cvd_column.values
relative_abundances['#SampleID'] = sample_id_column.values

# Reset the index to remove the default index
relative_abundances.reset_index(drop=True, inplace=True)

# Print the resulting DataFrame with relative abundances and original columns
print(relative_abundances)


Sum  0.9703900000000001
     k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Blautia; s__  \
0                                              0.00121                                                  
1                                              0.00085                                                  
2                                              0.00017                                                  
3                                              0.00008                                                  
4                                              0.00033                                                  
..                                                 ...                                                  
946                                            0.00262                                                  
947                                            0.00036                                                  
948                            

### Store the data in a tsv file

In [22]:
# Specify the TSV file path
tsv_file_path = path+ '/Data/dataOutput.tsv'

# Save the DataFrame to a TSV file
relative_abundances.to_csv(tsv_file_path, sep='\t', index=False)

print(f"Data saved to '{tsv_file_path}'")

# transposed_df = final_data.T

# # # Print the transposed DataFrame
# print(transposed_df.index)
# tsv_file_path1 = 'dataOutputTranspose.csv'


# # Save the DataFrame to a TSV file
# transposed_df.to_csv(tsv_file_path1, sep='\t', index=False)

# print(f"Data saved to '{tsv_file_path1}'")


Data saved to '/Users/sharyu/Lab/Data/dataOutput.tsv'


#### data output.tsv file could be used for lda in Galaxy web application(https://huttenhower.sph.harvard.edu/galaxy/). Need to work on the analysis.