In [1]:
# Imports
import os

import bokeh
import bokeh.plotting
import numpy as np
import pandas as pd

# Direct Bokeh output to the notebook so figures render inline below their cells
bokeh.io.output_notebook()

In [2]:
# Directory that holds the cleaned input tables
data_path = "../data/clean/"

# Build the full path of each input table: taxonomy profiles, sample metadata,
# and nutrient/temperature measurements (in that order)
taxonomy_file, metadata_file, nutrient_file = (
    os.path.join(data_path, file_name)
    for file_name in (
        "taxonomic_profiles.csv",
        "companion_table_W1.csv",
        "nutrient_temp_table.csv",
    )
)

In [3]:
# Load the three input tables into pandas DataFrames.
# The taxonomy table uses row 1 (0-based) as its header row; the other two
# tables are read with pandas' default header handling.
taxa_df = pd.read_csv(taxonomy_file, header=[1])
meta_df, cond_df = (
    pd.read_csv(csv_path) for csv_path in (metadata_file, nutrient_file)
)

In [4]:
# Extract sample label, id, and location from metadata (columns picked by
# position), then shorten the verbose source headers. Columns that are not
# listed in the mapping keep their original name.
id_df = meta_df.iloc[:, [0, 4, 11, 14, 15, 16]].rename(columns={
    'Sample label [TARA_station#_environmental-feature_size-fraction]': 'label',
    'PANGAEA sample identifier': 'pangea_id',
    'Marine pelagic biomes (Longhurst 2007)': 'Biome',
    'Ocean and sea regions (IHO General Sea Areas 1953) [MRGID registered at www.marineregions.com]': 'Region',
    'Marine pelagic biomes  (Longhurst 2007) [MRGID registered at www.marineregions.com] ': 'Province'
})

# Extract temperature and id info for each sample.
# Mapping from the raw nutrient-table headers to the short names used downstream.
cond_column_names = {
    'PANGAEA Sample ID': 'pangea_id',
    'Mean_Lat*': 'Latitude',
    'Mean_Long*': 'Longitude',
    'Mean_Depth [m]*': 'Depth (m)',
    'Mean_Temperature [deg C]*': 'Temperature (°C)',
    'Mean_Salinity [PSU]*': 'Salinity (PSU)',
    'Mean_Oxygen [umol/kg]*': 'Oxygen (µmol/kg)',
    'Mean_Nitrates[umol/L]*': 'Nitrates (µmol/L)',
    'NO2 [umol/L]**': 'NO2 (µmol/L)',
    'PO4 [umol/L]**': 'PO4 (µmol/L)',
}
# Rename first, then select by the *new* names rather than by position.
# rename() ignores keys that are not present, so re-running this cell on the
# already-trimmed cond_df is a no-op instead of raising an IndexError, which is
# what re-slicing cond_df by position (up to position 10) did on a second run.
# NOTE(review): assumes the ten headers above are exactly the columns that sat at
# positions 0 and 2-10 of the nutrient table — confirm against the CSV.
cond_df = cond_df.rename(columns=cond_column_names)[list(cond_column_names.values())]

# Merge all relevant data into one df; the left join keeps every metadata row,
# including samples that have no matching row in the nutrient table
id_df = id_df.merge(cond_df, how='left', on='pangea_id')

In [5]:
# Display the merged sample metadata and environmental measurements table
id_df

Unnamed: 0,label,pangea_id,Environmental Feature,Biome,Region,Province,Latitude,Longitude,Depth (m),Temperature (°C),Salinity (PSU),Oxygen (µmol/kg),Nitrates (µmol/L),NO2 (µmol/L),PO4 (µmol/L)
0,TARA_004_DCM_0.22-1.6,TARA_X000000368,(DCM) deep chlorophyll maximum layer (ENVO:010...,Westerlies Biome,(NAO) North Atlantic Ocean [MRGID:1912],(NAST-E) North Atlantic Subtropical Gyral Prov...,36.57,-6.54,38.7,16.2,36.6,,,,
1,TARA_004_SRF_0.22-1.6,TARA_Y200000002,(SRF) surface water layer (ENVO:00002042),Westerlies Biome,(NAO) North Atlantic Ocean [MRGID:1912],(NAST-E) North Atlantic Subtropical Gyral Prov...,36.55,-6.57,10.0,20.5,36.6,,,,
2,TARA_007_DCM_0.22-1.6,TARA_A200000159,(DCM) deep chlorophyll maximum layer (ENVO:010...,Westerlies Biome,(MS) Mediterranean Sea [MRGID:1905],"(MEDI) Mediterranean Sea, Black Sea Province [...",37.04,1.95,41.7,17.4,,,,0.00,0.01
3,TARA_007_SRF_0.22-1.6,TARA_A200000113,(SRF) surface water layer (ENVO:00002042),Westerlies Biome,(MS) Mediterranean Sea [MRGID:1905],"(MEDI) Mediterranean Sea, Black Sea Province [...",37.02,1.95,7.5,23.8,37.5,,,0.00,0.06
4,TARA_009_DCM_0.22-1.6,TARA_X000001036,(DCM) deep chlorophyll maximum layer (ENVO:010...,Westerlies Biome,(MS) Mediterranean Sea [MRGID:1905],"(MEDI) Mediterranean Sea, Black Sea Province [...",39.07,5.86,55.0,16.2,37.8,,,0.02,0.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,TARA_151_SRF_0.22-3,TARA_B100001564,(SRF) surface water layer (ENVO:00002042),Westerlies Biome,(NAO) North Atlantic Ocean [MRGID:1912],(NAST-E) North Atlantic Subtropical Gyral Prov...,36.16,-29.01,5.4,17.3,36.2,232.1,0.3,0.02,0.01
240,TARA_152_MES_0.22-3,TARA_B100001179,(MES) mesopelagic zone (ENVO:00000213),Westerlies Biome,(NAO) North Atlantic Ocean [MRGID:1912],(NAST-E) North Atlantic Subtropical Gyral Prov...,43.74,-16.88,791.8,10.2,35.5,175.3,18.2,0.01,1.12
241,TARA_152_MIX_0.22-3,TARA_B100001175,(MIX) marine epipelagic mixed layer (ENVO:0100...,Westerlies Biome,(NAO) North Atlantic Ocean [MRGID:1912],(NAST-E) North Atlantic Subtropical Gyral Prov...,43.79,-16.89,23.9,14.3,36.0,239.4,3.8,0.32,0.17
242,TARA_152_SRF_0.22-3,TARA_B100001173,(SRF) surface water layer (ENVO:00002042),Westerlies Biome,(NAO) North Atlantic Ocean [MRGID:1912],(NAST-E) North Atlantic Subtropical Gyral Prov...,43.69,-16.85,5.4,14.3,36.0,243.1,3.0,0.31,0.16


In [6]:
# Drop the first six columns, then reshape from wide to long format:
# one row per (OTU.rep, sample label) pair with its value in "counts"
taxa_df = taxa_df.iloc[:, 6:].melt(
    id_vars=["OTU.rep"],
    var_name="label",
    value_name="counts",
)

In [7]:
abundances_list = []
sample_list = []
OTU_list = []
# Calculate relative abundances for each sample.
# Group the long table by sample once instead of rebuilding a boolean mask over
# every row twice per sample. sort=False yields the samples in order of first
# appearance, which is the same order taxa_df.label.unique() produces.
for sample, sample_counts in taxa_df.groupby('label', sort=False):
    # Sum of all counts in this sample: the normalising denominator
    total = sample_counts['counts'].sum()
    # Summed counts per OTU divided by the sample total
    abundances = sample_counts.groupby('OTU.rep')['counts'].sum() / total
    # Append three parallel lists: value, owning sample, and OTU identifier
    abundances_list.extend(abundances.values)
    sample_list.extend([sample] * len(abundances))
    OTU_list.extend(list(abundances.index))

In [8]:
# Assemble the three parallel lists into a long-format table with one row per
# (sample label, OTU) pair and its relative abundance
taxa_relative_df = pd.DataFrame(
    dict(
        abundance=abundances_list,
        label=sample_list,
        OTU=OTU_list,
    )
)

In [9]:
# Reshape to wide format: one row per sample label, one column per OTU.
# Any sample/OTU combination without a value is filled with 0.
taxa_relative_df = pd.pivot(
    taxa_relative_df,
    index="label",
    columns=["OTU"],
    values="abundance",
).fillna(0)

In [10]:
# Move the sample label out of the index and back into a regular column
taxa_relative_df = taxa_relative_df.reset_index(drop=False)

In [11]:
# Attach the sample metadata to each abundance row; the left join keeps every
# sample in the abundance table even if it has no metadata match
taxa_relative_df = pd.merge(taxa_relative_df, id_df, how="left", on="label")

In [12]:
# Display the wide relative-abundance table joined with the sample metadata
taxa_relative_df

Unnamed: 0,label,AAAA02020713.1.1297,AAAA02020732.66.1490,AAAA02029623.1.1373,AACY020016122.205.1491,AACY020061762.7.1504,AACY020068177.376.1853,AACY020073989.594.2129,AACY020075636.150.1670,AACY020080403.492.1950,...,Province,Latitude,Longitude,Depth (m),Temperature (°C),Salinity (PSU),Oxygen (µmol/kg),Nitrates (µmol/L),NO2 (µmol/L),PO4 (µmol/L)
0,TARA_004_DCM_0.22-1.6,0.0,0.0,0.0,0.000154,0.000035,0.000217,0.000000,0.001572,0.000035,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,36.57,-6.54,38.7,16.2,36.6,,,,
1,TARA_004_SRF_0.22-1.6,0.0,0.0,0.0,0.000143,0.000022,0.000790,0.000000,0.000088,0.000406,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,36.55,-6.57,10.0,20.5,36.6,,,,
2,TARA_007_DCM_0.22-1.6,0.0,0.0,0.0,0.000000,0.000013,0.001262,0.000000,0.000335,0.000013,...,"(MEDI) Mediterranean Sea, Black Sea Province [...",37.04,1.95,41.7,17.4,,,,0.00,0.01
3,TARA_007_SRF_0.22-1.6,0.0,0.0,0.0,0.000000,0.000000,0.000053,0.000000,0.000053,0.000000,...,"(MEDI) Mediterranean Sea, Black Sea Province [...",37.02,1.95,7.5,23.8,37.5,,,0.00,0.06
4,TARA_009_DCM_0.22-1.6,0.0,0.0,0.0,0.000000,0.000000,0.001531,0.000010,0.000864,0.000000,...,"(MEDI) Mediterranean Sea, Black Sea Province [...",39.07,5.86,55.0,16.2,37.8,,,0.02,0.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,TARA_151_DCM_0.22-3,0.0,0.0,0.0,0.000000,0.000000,0.000634,0.000000,0.001603,0.000067,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,36.19,-28.88,77.6,16.8,36.2,228.5,1.6,0.01,0.01
135,TARA_151_SRF_0.22-3,0.0,0.0,0.0,0.000000,0.000008,0.000236,0.000000,0.000675,0.000179,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,36.16,-29.01,5.4,17.3,36.2,232.1,0.3,0.02,0.01
136,TARA_152_MES_0.22-3,0.0,0.0,0.0,0.000000,0.000000,0.000119,0.000000,0.000027,0.000000,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,43.74,-16.88,791.8,10.2,35.5,175.3,18.2,0.01,1.12
137,TARA_152_MIX_0.22-3,0.0,0.0,0.0,0.000000,0.000000,0.000592,0.000026,0.001201,0.000185,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,43.79,-16.89,23.9,14.3,36.0,239.4,3.8,0.32,0.17


In [13]:
# Replace the verbose metadata values with short codes. The mapping is applied
# to every cell of the frame whose value equals one of the keys.
taxa_relative_df = taxa_relative_df.replace({
    # Environmental features
    '(DCM) deep chlorophyll maximum layer (ENVO:01000326)': 'DCM',
    '(SRF) surface water layer (ENVO:00002042)': 'SRF',
    '(MES) mesopelagic zone (ENVO:00000213) & marine oxygen minimum zone (ENVO:01000065)': 'MES/OMZ',
    '(MES) mesopelagic zone (ENVO:00000213)': 'MES',
    '(DCM) deep chlorophyll maximum layer (ENVO:01000326) & marine oxygen minimum zone (ENVO:01000065)': 'DCM/OMZ',
    '(MIX) marine epipelagic mixed layer (ENVO:01000061)': 'MIX',
    # Biomes
    'Westerlies Biome': 'Westerlies',
    'Coastal Biome ': 'Coastal',
    'Trades Biome': 'Trades',
    'Polar Biome': 'Polar',
    # Ocean and sea regions
    '(NAO) North Atlantic Ocean [MRGID:1912]': 'NAO',
    '(MS) Mediterranean Sea [MRGID:1905]': 'MS',
    '(RS) Red Sea [MRGID:4264]': 'RS',
    '(IO) Indian Ocean [MRGID:1904]': 'IO',
    '(SAO) South Atlantic Ocean [MRGID:1914]': 'SAO',
    '(SO) Southern Ocean [MRGID:1907]': 'SO',
    '(SPO) South Pacific Ocean [MRGID:1910]': 'SPO',
    '(NPO) North Pacific Ocean [MRGID:1908]': 'NPO',
    # Provinces
    '(NAST-E) North Atlantic Subtropical Gyral Province [MRGID:21467]': 'NAST-E',
    '(MEDI) Mediterranean Sea, Black Sea Province [MRGID:21465]': 'MEDI',
    '(REDS) Red Sea, Persian Gulf Province [MRGID:21474]': 'REDS',
    '(ARAB) Northwest Arabian Sea Upwelling Province [MRGID:21475]': 'ARAB',
    '(MONS) Indian Monsoon Gyres Province [MRGID:21471]': 'MONS',
    '(ISSG) Indian South Subtropical Gyre Province [MRGID:21472]': 'ISSG',
    '(EAFR) Eastern Africa Coastal Province [MRGID:21473]': 'EAFR',
    '(BENG) Benguela Current Coastal Province [MRGID:21470]': 'BENG',
    '(SATL) South Atlantic Gyral Province [MRGID:21459]': 'SATL',
    '(FKLD) Southwest Atlantic Shelves Province [MRGID:21469]': 'FKLD',
    '(ANTA) Antarctic Province [MRGID:21502]': 'ANTA',
    '(CHIL) Chile-Peru Current Coastal Province [MRGID:21495]': 'CHIL',
    '(SPSG) South Pacific Subtropical Gyre Province, North and South [MRGID:21486]': 'SPSG',
    '(PEOD) Pacific Equatorial Divergence Province [MRGID:21489]': 'PEOD',
    '(NPST) North Pacific Subtropical and Polar Front Provinces [MRGID:21484]': 'NPST',
    '(PNEC) North Pacific Equatorial Countercurrent Province [MRGID:21488]': 'PNEC',
    '(CAMR) Central American Coastal Province [MRGID:21494]': 'CAMR',
    '(GUIA) Guianas Coastal Province [MRGID:21463]': 'GUIA',
    '(CARB) Caribbean Province [MRGID:21466]': 'CARB',
    '(GFST) Gulf Stream Province [MRGID:21454]': 'GFST',
    '(NAST-W) North Atlantic Subtropical Gyral Province [MRGID:21455]': 'NAST-W',
})

In [14]:
# Write the final table next to the other cleaned inputs, without the row index
relative_taxonomy_file = os.path.join(
    data_path, "relative_taxonomies_with_metadata.csv"
)
taxa_relative_df.to_csv(path_or_buf=relative_taxonomy_file, index=False)