In [1]:
# Imports
import os

import bokeh
import bokeh.plotting
import numpy as np
import pandas as pd

# Send Bokeh output to the notebook.
# NOTE(review): only `bokeh` and `bokeh.plotting` are imported above; `bokeh.io`
# is presumably reachable as a side effect of the latter - confirm.
bokeh.io.output_notebook()

In [2]:
# Directory holding the raw input tables
data_path = "../data/raw/"

# Full path of every raw input file, built in one pass
OG_file, COG_file, NOG_file = (
    os.path.join(data_path, file_name)
    for file_name in (
        "TARA139.og.prok.fpkm.release",
        "COG.funccat.txt",
        "NOG.funccat.txt",
    )
)

In [3]:
# Load the three raw tables as tab-separated text.
# Both functional-category files are decoded as cp1252 and read with explicit
# column names ('OG', 'category') instead of names taken from the file.
funccat_options = dict(sep='\t', encoding='cp1252', names=['OG', 'category'])

OG_df = pd.read_csv(OG_file, sep='\t')
COG_df = pd.read_csv(COG_file, **funccat_options)
NOG_df = pd.read_csv(NOG_file, **funccat_options)

In [4]:
# Move every value one column to the right and put the index values into the
# first column, which the shift leaves empty.
# NOTE(review): presumably needed because the raw file's data rows carry one
# more field than its header, so pandas parsed the OG identifiers as the
# index - confirm against the file.
og_column = OG_df.columns[0]
OG_df = OG_df.shift(periods=1, axis='columns')
# Assign by column label so the column is replaced outright. Writing through
# .iloc sets values in place, which is deprecated when the existing column's
# dtype cannot hold the index values.
OG_df[og_column] = OG_df.index

In [5]:
# Directory holding the cleaned tables (rebinds data_path from here on)
data_path = "../data/clean/"

# Full path of every cleaned input file, built in one pass
metadata_file, nutrient_file, core_file = (
    os.path.join(data_path, file_name)
    for file_name in (
        "companion_table_W1.csv",
        "nutrient_temp_table.csv",
        "core_annos.csv",
    )
)

In [6]:
# Load the metadata, nutrient/temperature and core-annotation CSVs with the
# default pandas CSV settings
meta_df, cond_df, core_df = (
    pd.read_csv(csv_path)
    for csv_path in (metadata_file, nutrient_file, core_file)
)

In [7]:
# Short names for the verbose metadata headers
id_column_names = {
    'Sample label [TARA_station#_environmental-feature_size-fraction]': 'label',
    'PANGAEA sample identifier': 'pangea_id',
    'Marine pelagic biomes (Longhurst 2007)': 'Biome',
    'Ocean and sea regions (IHO General Sea Areas 1953) [MRGID registered at www.marineregions.com]': 'Region',
    'Marine pelagic biomes  (Longhurst 2007) [MRGID registered at www.marineregions.com] ': 'Province',
}

# Short names for the per-sample measurement headers
condition_column_names = {
    'PANGAEA Sample ID': 'pangea_id',
    'Mean_Lat*': 'Latitude',
    'Mean_Long*': 'Longitude',
    'Mean_Depth [m]*': 'Depth (m)',
    'Mean_Temperature [deg C]*': 'Temperature (°C)',
    'Mean_Salinity [PSU]*': 'Salinity (PSU)',
    'Mean_Oxygen [umol/kg]*': 'Oxygen (µmol/kg)',
    'Mean_Nitrates[umol/L]*': 'Nitrates (µmol/L)',
    'NO2 [umol/L]**': 'NO2 (µmol/L)',
    'PO4 [umol/L]**': 'PO4 (µmol/L)',
}

# Sample label, id and location columns, picked by position from the metadata
id_df = meta_df.iloc[:, [0, 4, 11, 14, 15, 16]].rename(columns=id_column_names)

# Id plus measurement columns, picked by position from the nutrient table
cond_df = cond_df.iloc[:, [0, 2, 3, 4, 5, 6, 7, 8, 9, 10]].rename(
    columns=condition_column_names
)

# Attach the measurements to each sample; samples without a match are kept
id_df = pd.merge(id_df, cond_df, how='left', on='pangea_id')

In [8]:
# Reshape to long form: one row per (OG, label) pair with its value in
# "counts", then rename the identifier column "og" to "OG"
OG_df = (
    OG_df
    .melt(id_vars=["og"], var_name="label", value_name="counts")
    .rename(columns={"og": "OG"})
)
OG_df.head()

Unnamed: 0,OG,label,counts
0,NOG243842,TARA_100_DCM_0.22-3,0.0
1,NOG243843,TARA_100_DCM_0.22-3,0.0
2,NOG243840,TARA_100_DCM_0.22-3,0.0
3,NOG41625,TARA_100_DCM_0.22-3,6.484686e-06
4,NOG41624,TARA_100_DCM_0.22-3,3.074175e-07


In [9]:
# One-letter functional category code -> descriptive category name.
# NOTE(review): the 'Q' ('Secondary structure') and 'U' ('trafficing') labels
# are reproduced verbatim; these names become column headers of the exported
# table, so any correction has to be coordinated with downstream consumers.
OG_category_dict = dict(
    A='RNA processing and modification',
    B='Chromatin structure and dynamics',
    C='Energy production and conversion',
    D='Cell cycle control and mitosis',
    E='Amino acid metabolism and transport',
    F='Nucleotide metabolism and transport',
    G='Carbohydrate metabolism and transport',
    H='Coenzyme metabolism',
    I='Lipid metabolism',
    J='Translation',
    K='Transcription',
    L='Replication and repair',
    M='Cell wall/membrane/envelope biogenesis',
    N='Cell motility',
    O='Post-translational modification, protein turnover, chaperone functions',
    P='Inorganic ion transport and metabolism',
    Q='Secondary structure',
    T='Signal transduction',
    U='Intracellular trafficing and secretion',
    V='Defense mechanisms',
    W='Extracellular structures',
    X='Mobilome: prophages, transposons',
    Y='Nuclear structure',
    Z='Cytoskeleton',
    R='General functional prediction only',
    S='Function unknown',
)

In [10]:
# Use dictionary to convert letter abbreviations to categories
def _to_primary_category(funccat_df):
    """Return a copy of ``funccat_df`` whose 'category' holds category names.

    Only the first character of each entry is used, so an entry listing
    several functions keeps its first one (how to handle multifunction genes
    is still undecided). The letter is then looked up in OG_category_dict;
    letters without an entry are kept as they are.

    Missing entries stay missing rather than being stringified to 'nan' and
    turned into a spurious 'n' category. Entries that already are full
    category names are left untouched, so running the cell a second time
    does not re-map them by their first letter.
    """
    raw = funccat_df['category']
    untouched = raw.isna() | raw.isin(list(OG_category_dict.values()))
    first_letter = raw.astype(str).str[0]
    return funccat_df.assign(
        category=raw.where(untouched, first_letter).replace(OG_category_dict)
    )


COG_df = _to_primary_category(COG_df)
NOG_df = _to_primary_category(NOG_df)

In [11]:
# Stack the COG and NOG category tables into a single lookup table
OG_category_df = pd.concat(objs=[COG_df, NOG_df], axis="index")

In [12]:
# Attach the functional category to every row; OGs without a category entry
# are kept and get a missing value
OG_df = pd.merge(OG_df, OG_category_df, how="left", on="OG")

In [13]:
# Add core annotations: align the key column name ("COG" -> "OG") first,
# then left-join so every row of OG_df is kept
core_df = core_df.rename(columns=dict(COG="OG"))
OG_df = pd.merge(OG_df, core_df, how="left", on="OG")

In [14]:
# Open question: OGs that map to no category could either be dropped or be
# relabelled as 'Function unknown'. Dropping them is believed to be closer to
# the paper's figure, so rows without a category are removed here.
OG_df = OG_df.dropna(subset=['category'])

In [15]:
# Keep only rows whose "Ocean core" flag equals False (rows with a missing
# flag do not compare equal to False and are dropped as well), and discard
# the two categories that carry no functional information.
non_core = OG_df["Ocean core"] == False
uninformative = OG_df["category"].isin(
    ["Function unknown", "General functional prediction only"]
)
OG_df = OG_df.loc[non_core & ~uninformative]

In [16]:
abundances_list = []
sample_list = []
function_list = []
# Calculate relative abundances for each sample: the summed counts of every
# category divided by the sample's total counts.
# groupby splits the table once instead of re-scanning the whole frame with a
# boolean mask for each sample; sort=False yields the samples in order of
# first appearance, the same order as iterating over label.unique().
for sample, sample_df in OG_df.groupby('label', sort=False):
    total = sample_df['counts'].sum()
    abundances = sample_df.groupby('category')['counts'].sum() / total
    abundances_list.extend(abundances.values)
    sample_list.extend([sample] * len(abundances))
    function_list.extend(abundances.index)

In [17]:
# Long-form table: one row per (sample label, function) with its abundance
OG_relative_df = pd.DataFrame(
    dict(
        abundance=abundances_list,
        label=sample_list,
        Function=function_list,
    )
)

In [18]:
# Wide form: one row per label, one column per function; combinations that
# are absent get an abundance of 0
OG_relative_df = (
    OG_relative_df
    .pivot(index="label", columns=["Function"], values="abundance")
    .fillna(value=0)
)

In [19]:
# Turn the index back into an ordinary column (needed as a merge key)
OG_relative_df = OG_relative_df.reset_index(drop=False)

In [20]:
# Attach the per-sample metadata; samples without metadata are kept
OG_relative_df = pd.merge(OG_relative_df, id_df, how="left", on="label")

In [21]:
# Display the merged table for a visual check
OG_relative_df

Unnamed: 0,label,Amino acid metabolism and transport,Carbohydrate metabolism and transport,Cell cycle control and mitosis,Cell motility,Cell wall/membrane/envelope biogenesis,Chromatin structure and dynamics,Coenzyme metabolism,Cytoskeleton,Defense mechanisms,...,Province,Latitude,Longitude,Depth (m),Temperature (°C),Salinity (PSU),Oxygen (µmol/kg),Nitrates (µmol/L),NO2 (µmol/L),PO4 (µmol/L)
0,TARA_004_DCM_0.22-1.6,0.085835,0.058738,0.001530,0.024384,0.022317,0.000112,0.039094,0.0,0.011553,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,36.57,-6.54,38.7,16.2,36.6,,,,
1,TARA_004_SRF_0.22-1.6,0.080779,0.046429,0.003667,0.035454,0.018266,0.000372,0.056070,0.0,0.045052,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,36.55,-6.57,10.0,20.5,36.6,,,,
2,TARA_007_DCM_0.22-1.6,0.052906,0.091517,0.001682,0.033234,0.019290,0.000533,0.032396,0.0,0.029613,...,"(MEDI) Mediterranean Sea, Black Sea Province [...",37.04,1.95,41.7,17.4,,,,0.00,0.01
3,TARA_007_SRF_0.22-1.6,0.065422,0.093215,0.010399,0.049789,0.033640,0.000824,0.039916,0.0,0.029616,...,"(MEDI) Mediterranean Sea, Black Sea Province [...",37.02,1.95,7.5,23.8,37.5,,,0.00,0.06
4,TARA_009_DCM_0.22-1.6,0.065958,0.058542,0.001770,0.026450,0.029016,0.000240,0.032416,0.0,0.006421,...,"(MEDI) Mediterranean Sea, Black Sea Province [...",39.07,5.86,55.0,16.2,37.8,,,0.02,0.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,TARA_151_DCM_0.22-3,0.070074,0.060041,0.002172,0.026843,0.041505,0.000439,0.030282,0.0,0.007994,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,36.19,-28.88,77.6,16.8,36.2,228.5,1.6,0.01,0.01
135,TARA_151_SRF_0.22-3,0.083750,0.064625,0.002399,0.049539,0.016098,0.000042,0.026838,0.0,0.021657,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,36.16,-29.01,5.4,17.3,36.2,232.1,0.3,0.02,0.01
136,TARA_152_MES_0.22-3,0.062500,0.038800,0.002463,0.007082,0.065313,0.001081,0.030726,0.0,0.006815,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,43.74,-16.88,791.8,10.2,35.5,175.3,18.2,0.01,1.12
137,TARA_152_MIX_0.22-3,0.094420,0.060860,0.001397,0.013899,0.027726,0.000095,0.035856,0.0,0.010554,...,(NAST-E) North Atlantic Subtropical Gyral Prov...,43.79,-16.89,23.9,14.3,36.0,239.4,3.8,0.32,0.17


In [22]:
# Verbose metadata label -> short code. Each long label sits next to its
# replacement so the pairs cannot drift out of step.
short_names = {
    # water layers
    '(DCM) deep chlorophyll maximum layer (ENVO:01000326)': 'DCM',
    '(SRF) surface water layer (ENVO:00002042)': 'SRF',
    '(MES) mesopelagic zone (ENVO:00000213) & marine oxygen minimum zone (ENVO:01000065)': 'MES/OMZ',
    '(MES) mesopelagic zone (ENVO:00000213)': 'MES',
    '(DCM) deep chlorophyll maximum layer (ENVO:01000326) & marine oxygen minimum zone (ENVO:01000065)': 'DCM/OMZ',
    '(MIX) marine epipelagic mixed layer (ENVO:01000061)': 'MIX',
    # biomes
    'Westerlies Biome': 'Westerlies',
    'Coastal Biome ': 'Coastal',
    'Trades Biome': 'Trades',
    'Polar Biome': 'Polar',
    # ocean and sea regions
    '(NAO) North Atlantic Ocean [MRGID:1912]': 'NAO',
    '(MS) Mediterranean Sea [MRGID:1905]': 'MS',
    '(RS) Red Sea [MRGID:4264]': 'RS',
    '(IO) Indian Ocean [MRGID:1904]': 'IO',
    '(SAO) South Atlantic Ocean [MRGID:1914]': 'SAO',
    '(SO) Southern Ocean [MRGID:1907]': 'SO',
    '(SPO) South Pacific Ocean [MRGID:1910]': 'SPO',
    '(NPO) North Pacific Ocean [MRGID:1908]': 'NPO',
    # provinces
    '(NAST-E) North Atlantic Subtropical Gyral Province [MRGID:21467]': 'NAST-E',
    '(MEDI) Mediterranean Sea, Black Sea Province [MRGID:21465]': 'MEDI',
    '(REDS) Red Sea, Persian Gulf Province [MRGID:21474]': 'REDS',
    '(ARAB) Northwest Arabian Sea Upwelling Province [MRGID:21475]': 'ARAB',
    '(MONS) Indian Monsoon Gyres Province [MRGID:21471]': 'MONS',
    '(ISSG) Indian South Subtropical Gyre Province [MRGID:21472]': 'ISSG',
    '(EAFR) Eastern Africa Coastal Province [MRGID:21473]': 'EAFR',
    '(BENG) Benguela Current Coastal Province [MRGID:21470]': 'BENG',
    '(SATL) South Atlantic Gyral Province [MRGID:21459]': 'SATL',
    '(FKLD) Southwest Atlantic Shelves Province [MRGID:21469]': 'FKLD',
    '(ANTA) Antarctic Province [MRGID:21502]': 'ANTA',
    '(CHIL) Chile-Peru Current Coastal Province [MRGID:21495]': 'CHIL',
    '(SPSG) South Pacific Subtropical Gyre Province, North and South [MRGID:21486]': 'SPSG',
    '(PEOD) Pacific Equatorial Divergence Province [MRGID:21489]': 'PEOD',
    '(NPST) North Pacific Subtropical and Polar Front Provinces [MRGID:21484]': 'NPST',
    '(PNEC) North Pacific Equatorial Countercurrent Province [MRGID:21488]': 'PNEC',
    '(CAMR) Central American Coastal Province [MRGID:21494]': 'CAMR',
    '(GUIA) Guianas Coastal Province [MRGID:21463]': 'GUIA',
    '(CARB) Caribbean Province [MRGID:21466]': 'CARB',
    '(GFST) Gulf Stream Province [MRGID:21454]': 'GFST',
    '(NAST-W) North Atlantic Subtropical Gyral Province [MRGID:21455]': 'NAST-W',
}

# Replace every exact occurrence of a long label, in any column
OG_relative_df = OG_relative_df.replace(short_names)

In [23]:
# Write the final table (without the row index) into the directory that
# data_path currently points to
relative_known_function_file = os.path.join(
    data_path, "relative_known_function_with_metadata.csv"
)
OG_relative_df.to_csv(path_or_buf=relative_known_function_file, index=False)