In [1]:
# Imports
import os

import bokeh
import bokeh.plotting
import numpy as np
import pandas as pd

# Direct Bokeh output to the notebook so figures are shown in cell outputs.
# NOTE(review): `bokeh.io` is not imported explicitly above; presumably it is
# loaded as a side effect of `import bokeh.plotting` — confirm, or add
# `import bokeh.io` to the import list.
bokeh.io.output_notebook()

In [2]:
# Directory that holds the cleaned input tables (relative to this notebook)
data_path = "../data/clean/"

# Full paths of the three input tables: taxonomy counts, sample metadata,
# and nutrient/temperature measurements
taxonomy_file, metadata_file, nutrient_file = (
    os.path.join(data_path, file_name)
    for file_name in (
        "taxonomic_profiles.csv",
        "companion_table_W1.csv",
        "nutrient_temp_table.csv",
    )
)

In [4]:
# Load the three input tables into pandas DataFrames.
# The taxonomy table passes header=[1], i.e. its column names are taken from
# the file's second line (row index 1) rather than the first.
taxa_df = pd.read_csv(taxonomy_file, header=[1])
# The metadata and nutrient tables use the default header (first line).
meta_df, cond_df = pd.read_csv(metadata_file), pd.read_csv(nutrient_file)

In [5]:
# Sample identifiers and location descriptors: six metadata columns are picked
# by position, and five of them are given short names.
id_df = (
    meta_df
    .iloc[:, [0, 4, 11, 14, 15, 16]]
    .rename(columns={
        'Sample label [TARA_station#_environmental-feature_size-fraction]': 'label',
        'PANGAEA sample identifier': 'pangea_id',
        'Marine pelagic biomes (Longhurst 2007)': 'Biome',
        'Ocean and sea regions (IHO General Sea Areas 1953) [MRGID registered at www.marineregions.com]': 'Region',
        'Marine pelagic biomes  (Longhurst 2007) [MRGID registered at www.marineregions.com] ': 'Province',
    })
)

# Environmental measurements per sample: column 0 plus columns 2-10 of the
# nutrient table, renamed to readable labels with units.
# NOTE: cond_df is overwritten with its own subset here, so this cell is not
# idempotent — re-run the cell that reads the CSV files before re-running it.
cond_df = (
    cond_df
    .iloc[:, [0, *range(2, 11)]]
    .rename(columns={
        'PANGAEA Sample ID': 'pangea_id',
        'Mean_Lat*': 'Latitude',
        'Mean_Long*': 'Longitude',
        'Mean_Depth [m]*': 'Depth (m)',
        'Mean_Temperature [deg C]*': 'Temperature (°C)',
        'Mean_Salinity [PSU]*': 'Salinity (PSU)',
        'Mean_Oxygen [umol/kg]*': 'Oxygen (µmol/kg)',
        'Mean_Nitrates[umol/L]*': 'Nitrates (µmol/L)',
        'NO2 [umol/L]**': 'NO2 (µmol/L)',
        'PO4 [umol/L]**': 'PO4 (µmol/L)',
    })
)

# Attach the measurements to the sample identifiers; the left join keeps every
# metadata row even when no measurement row shares its pangea_id.
id_df = id_df.merge(cond_df, how='left', on='pangea_id')

In [6]:
# Preview the taxonomy table (taxonomic ranks followed by per-sample count columns)
taxa_df

Unnamed: 0,Domain,Phylum,Class,Order,Family,Genus,OTU.rep,TARA_018_DCM_0.22-1.6,TARA_018_SRF_0.22-1.6,TARA_023_DCM_0.22-1.6,...,TARA_085_MES_0.22-3,TARA_085_SRF_0.22-3,TARA_093_DCM_0.22-3,TARA_093_SRF_0.22-3,TARA_094_SRF_0.22-3,TARA_096_SRF_0.22-3,TARA_098_DCM_0.22-3,TARA_098_MES_0.22-3,TARA_098_SRF_0.22-3,TARA_099_SRF_0.22-3
0,undef,undef,undef,undef,undef,undef,unclassified,5101,5314,2135,...,4310,6770,2253,813,2337,4100,4823,8847,2097,2763
1,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Rhodospirillaceae,AEGEAN-169 marine group,AACY024102418.157.1623,1021,1348,351,...,28,2,324,171,2720,3287,529,133,1480,1816
2,Bacteria,Cyanobacteria,Cyanobacteria,SubsectionI,FamilyI,Prochlorococcus,KC003383.1.1321,2697,2580,273,...,0,0,26,117,1084,989,1678,2,430,1088
3,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Rhodospirillaceae,AEGEAN-169 marine group,EU394547.1.1451,605,909,1018,...,26,0,2664,1174,1145,938,493,232,374,483
4,Bacteria,Cyanobacteria,Cyanobacteria,SubsectionI,FamilyI,Prochlorococcus,X52169.1.1473,3786,3672,225,...,0,0,16,99,1432,1174,905,0,75,529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35646,Bacteria,Proteobacteria,Gammaproteobacteria,Legionellales,Legionellaceae,Legionella,Z49716.1.1420,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35647,Bacteria,Proteobacteria,Gammaproteobacteria,Legionellales,Legionellaceae,Legionella,Z49738.1.1472,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35648,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Comamonadaceae,Acidovorax,Z93983.1.1388,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35649,Bacteria,Acidobacteria,Acidobacteria,Subgroup 6,,,Z95717.1.1545,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Keep the column at position 1 (Phylum) together with the columns at
# positions 7-145, which are melted below as per-sample counts.
phylum_and_counts = taxa_df.iloc[:, [1, *range(7, 146)]]

# Reshape to long form: one row per (Phylum, sample label) with its count.
long_counts = phylum_and_counts.melt(
    id_vars=["Phylum"],
    var_name="label",
    value_name="counts",
)

# Discard rows whose phylum is the placeholder 'undef'.
phyla_df = long_counts.loc[long_counts["Phylum"] != "undef"]

In [6]:
# Relative abundance of every phylum within every sample, collected as three
# parallel lists (value, sample label, phylum name) for the DataFrame built in
# the next cell.
abundances_list = []
sample_list = []
phyla_list = []
# Split the long table by sample once instead of boolean-masking the whole
# frame twice per sample. sort=False yields samples in order of first
# appearance, which is the same order label.unique() would give.
for sample, sample_counts in phyla_df.groupby('label', sort=False):
    # Total counts in this sample (all rows of the sample, any phylum)
    total = sample_counts['counts'].sum()
    # Per-phylum share of the sample total; the result is indexed by phylum name
    abundances = sample_counts.groupby('Phylum')['counts'].sum() / total
    abundances_list.extend(abundances.values)
    sample_list.extend([sample] * len(abundances))
    phyla_list.extend(abundances.index)

In [7]:
# Assemble the parallel lists into a long-form table:
# one row per (sample label, phylum) with its relative abundance.
phyla_relative_df = pd.DataFrame(
    dict(abundance=abundances_list, label=sample_list, phylum=phyla_list)
)

In [8]:
# Reshape to wide form: one row per sample label, one column per phylum.
phyla_wide = phyla_relative_df.pivot(
    index="label",
    columns=["phylum"],
    values="abundance",
)
# Cells left as NaN by the pivot (e.g. no row for that sample/phylum pair) become 0.
phyla_relative_df = phyla_wide.fillna(0)

In [9]:
# Move the sample label from the index back into a regular 'label' column
# so it can be used as the merge key in the next cell.
phyla_relative_df = phyla_relative_df.reset_index()

In [10]:
# Attach the sample metadata and environmental measurements from id_df,
# matching on the sample label. The left join keeps every abundance row even
# when id_df has no row with that label.
phyla_relative_df = phyla_relative_df.merge(id_df, how="left", on="label")

In [11]:
# Preview the merged table (per-phylum relative abundances plus sample metadata)
phyla_relative_df

Unnamed: 0,label,Acidobacteria,Actinobacteria,Aquificae,Armatimonadetes,BD1-5,BHI80-139,Bacteroidetes,Caldiserica,Candidate division BRC1,...,Province,Latitude,Longitude,Depth (m),Temperature (°C),Salinity (PSU),Oxygen (µmol/kg),Nitrates (µmol/L),NO2 (µmol/L),PO4 (µmol/L)
0,TARA_004_DCM_0.22-1.6,0.000058,0.047662,0.0,0.000000,0.000007,0.000000,0.087127,0.0,0.000000,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,36.57,-6.54,38.7,16.2,36.6,,,,
1,TARA_004_SRF_0.22-1.6,0.000034,0.023802,0.0,0.000011,0.000000,0.000000,0.066133,0.0,0.000000,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,36.55,-6.57,10.0,20.5,36.6,,,,
2,TARA_007_DCM_0.22-1.6,0.000080,0.044163,0.0,0.000000,0.000040,0.000000,0.062705,0.0,0.000000,...,"(MEDI) Mediterranean Sea, Black Sea Province [...",37.04,1.95,41.7,17.4,,,,0.00,0.01
3,TARA_007_SRF_0.22-1.6,0.000055,0.035580,0.0,0.000000,0.000055,0.000000,0.086398,0.0,0.000000,...,"(MEDI) Mediterranean Sea, Black Sea Province [...",37.02,1.95,7.5,23.8,37.5,,,0.00,0.06
4,TARA_009_DCM_0.22-1.6,0.000065,0.061861,0.0,0.000000,0.000022,0.000000,0.068969,0.0,0.000000,...,"(MEDI) Mediterranean Sea, Black Sea Province [...",39.07,5.86,55.0,16.2,37.8,,,0.02,0.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,TARA_151_DCM_0.22-3,0.000573,0.043616,0.0,0.000000,0.000000,0.000000,0.060330,0.0,0.000031,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,36.19,-28.88,77.6,16.8,36.2,228.5,1.6,0.01,0.01
135,TARA_151_SRF_0.22-3,0.000060,0.046929,0.0,0.000000,0.000009,0.000009,0.088150,0.0,0.000000,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,36.16,-29.01,5.4,17.3,36.2,232.1,0.3,0.02,0.01
136,TARA_152_MES_0.22-3,0.012129,0.022713,0.0,0.000010,0.000157,0.000000,0.027061,0.0,0.000157,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,43.74,-16.88,791.8,10.2,35.5,175.3,18.2,0.01,1.12
137,TARA_152_MIX_0.22-3,0.000139,0.034425,0.0,0.000009,0.000083,0.000000,0.097618,0.0,0.000000,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,43.79,-16.89,23.9,14.3,36.0,239.4,3.8,0.32,0.17


In [12]:
# Long descriptive metadata strings -> short codes. Each key is matched against
# whole cell values anywhere in the frame and swapped for its short form.
short_label_map = {
    # Environmental feature (sampling layer)
    '(DCM) deep chlorophyll maximum layer (ENVO:01000326)': 'DCM',
    '(SRF) surface water layer (ENVO:00002042)': 'SRF',
    '(MES) mesopelagic zone (ENVO:00000213) & marine oxygen minimum zone (ENVO:01000065)': 'MES/OMZ',
    '(MES) mesopelagic zone (ENVO:00000213)': 'MES',
    '(DCM) deep chlorophyll maximum layer (ENVO:01000326) & marine oxygen minimum zone (ENVO:01000065)': 'DCM/OMZ',
    '(MIX) marine epipelagic mixed layer (ENVO:01000061)': 'MIX',
    # Biome
    'Westerlies Biome': 'Westerlies',
    'Coastal Biome ': 'Coastal',
    'Trades Biome': 'Trades',
    'Polar Biome': 'Polar',
    # Ocean / sea region
    '(NAO) North Atlantic Ocean [MRGID:1912]': 'NAO',
    '(MS) Mediterranean Sea [MRGID:1905]': 'MS',
    '(RS) Red Sea [MRGID:4264]': 'RS',
    '(IO) Indian Ocean [MRGID:1904]': 'IO',
    '(SAO) South Atlantic Ocean [MRGID:1914]': 'SAO',
    '(SO) Southern Ocean [MRGID:1907]': 'SO',
    '(SPO) South Pacific Ocean [MRGID:1910]': 'SPO',
    '(NPO) North Pacific Ocean [MRGID:1908]': 'NPO',
    # Province
    '(NAST-E) North Atlantic Subtropical Gyral Province [MRGID:21467]': 'NAST-E',
    '(MEDI) Mediterranean Sea, Black Sea Province [MRGID:21465]': 'MEDI',
    '(REDS) Red Sea, Persian Gulf Province [MRGID:21474]': 'REDS',
    '(ARAB) Northwest Arabian Sea Upwelling Province [MRGID:21475]': 'ARAB',
    '(MONS) Indian Monsoon Gyres Province [MRGID:21471]': 'MONS',
    '(ISSG) Indian South Subtropical Gyre Province [MRGID:21472]': 'ISSG',
    '(EAFR) Eastern Africa Coastal Province [MRGID:21473]': 'EAFR',
    '(BENG) Benguela Current Coastal Province [MRGID:21470]': 'BENG',
    '(SATL) South Atlantic Gyral Province [MRGID:21459]': 'SATL',
    '(FKLD) Southwest Atlantic Shelves Province [MRGID:21469]': 'FKLD',
    '(ANTA) Antarctic Province [MRGID:21502]': 'ANTA',
    '(CHIL) Chile-Peru Current Coastal Province [MRGID:21495]': 'CHIL',
    '(SPSG) South Pacific Subtropical Gyre Province, North and South [MRGID:21486]': 'SPSG',
    '(PEOD) Pacific Equatorial Divergence Province [MRGID:21489]': 'PEOD',
    '(NPST) North Pacific Subtropical and Polar Front Provinces [MRGID:21484]': 'NPST',
    '(PNEC) North Pacific Equatorial Countercurrent Province [MRGID:21488]': 'PNEC',
    '(CAMR) Central American Coastal Province [MRGID:21494]': 'CAMR',
    '(GUIA) Guianas Coastal Province [MRGID:21463]': 'GUIA',
    '(CARB) Caribbean Province [MRGID:21466]': 'CARB',
    '(GFST) Gulf Stream Province [MRGID:21454]': 'GFST',
    '(NAST-W) North Atlantic Subtropical Gyral Province [MRGID:21455]': 'NAST-W',
}
phyla_relative_df = phyla_relative_df.replace(short_label_map)

In [14]:
# Preview the table again to check that the long metadata strings were shortened
phyla_relative_df

Unnamed: 0,label,Acidobacteria,Actinobacteria,Aquificae,Armatimonadetes,BD1-5,BHI80-139,Bacteroidetes,Caldiserica,Candidate division BRC1,...,Province,Latitude,Longitude,Depth (m),Temperature (°C),Salinity (PSU),Oxygen (µmol/kg),Nitrates (µmol/L),NO2 (µmol/L),PO4 (µmol/L)
0,TARA_004_DCM_0.22-1.6,0.000058,0.047662,0.0,0.000000,0.000007,0.000000,0.087127,0.0,0.000000,...,NAST-E,36.57,-6.54,38.7,16.2,36.6,,,,
1,TARA_004_SRF_0.22-1.6,0.000034,0.023802,0.0,0.000011,0.000000,0.000000,0.066133,0.0,0.000000,...,NAST-E,36.55,-6.57,10.0,20.5,36.6,,,,
2,TARA_007_DCM_0.22-1.6,0.000080,0.044163,0.0,0.000000,0.000040,0.000000,0.062705,0.0,0.000000,...,MEDI,37.04,1.95,41.7,17.4,,,,0.00,0.01
3,TARA_007_SRF_0.22-1.6,0.000055,0.035580,0.0,0.000000,0.000055,0.000000,0.086398,0.0,0.000000,...,MEDI,37.02,1.95,7.5,23.8,37.5,,,0.00,0.06
4,TARA_009_DCM_0.22-1.6,0.000065,0.061861,0.0,0.000000,0.000022,0.000000,0.068969,0.0,0.000000,...,MEDI,39.07,5.86,55.0,16.2,37.8,,,0.02,0.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,TARA_151_DCM_0.22-3,0.000573,0.043616,0.0,0.000000,0.000000,0.000000,0.060330,0.0,0.000031,...,NAST-E,36.19,-28.88,77.6,16.8,36.2,228.5,1.6,0.01,0.01
135,TARA_151_SRF_0.22-3,0.000060,0.046929,0.0,0.000000,0.000009,0.000009,0.088150,0.0,0.000000,...,NAST-E,36.16,-29.01,5.4,17.3,36.2,232.1,0.3,0.02,0.01
136,TARA_152_MES_0.22-3,0.012129,0.022713,0.0,0.000010,0.000157,0.000000,0.027061,0.0,0.000157,...,NAST-E,43.74,-16.88,791.8,10.2,35.5,175.3,18.2,0.01,1.12
137,TARA_152_MIX_0.22-3,0.000139,0.034425,0.0,0.000009,0.000083,0.000000,0.097618,0.0,0.000000,...,NAST-E,43.79,-16.89,23.9,14.3,36.0,239.4,3.8,0.32,0.17


In [13]:
# Write the final table (relative abundances + metadata) next to the input
# files in data_path; index=False leaves the row index out of the CSV.
relative_phyla_file = os.path.join(data_path, "relative_phyla_with_metadata.csv")
phyla_relative_df.to_csv(relative_phyla_file, index=False)