In [1]:
# Imports
import os

import bokeh
import bokeh.plotting
import numpy as np
import pandas as pd

bokeh.io.output_notebook()

In [2]:
# Set path to collect data files
data_path = "../data/clean/"

# Import necessary files
taxonomy_file = os.path.join(data_path, "taxonomic_profiles.csv")
metadata_file = os.path.join(data_path, "companion_table_W1.csv")
nutrient_file = os.path.join(data_path, "nutrient_temp_table.csv")

In [3]:
# Read each input data file into pandas
taxa_df = pd.read_csv(taxonomy_file, header=[1])
meta_df = pd.read_csv(metadata_file)
cond_df = pd.read_csv(nutrient_file)

In [4]:
# Calculate the total sum of OTU counts per sample
total_sums = taxa_df.iloc[:,7:].sum()

In [5]:
# Calculate the total sum of cyanobacteria phylum counts per sample
cyanobacteria_sums = taxa_df.loc[taxa_df["Phylum"] == "Cyanobacteria"].iloc[:,7:].sum()

In [6]:
# Calculate percent cyanobactria in each sample
cyano_df = pd.DataFrame(100*cyanobacteria_sums/total_sums, columns=["percent_cyano"])
cyano_df["label"] = cyano_df.index
cyano_df

Unnamed: 0,percent_cyano,label
TARA_018_DCM_0.22-1.6,13.142980,TARA_018_DCM_0.22-1.6
TARA_018_SRF_0.22-1.6,11.354963,TARA_018_SRF_0.22-1.6
TARA_023_DCM_0.22-1.6,7.337427,TARA_023_DCM_0.22-1.6
TARA_023_SRF_0.22-1.6,4.259198,TARA_023_SRF_0.22-1.6
TARA_025_DCM_0.22-1.6,16.825207,TARA_025_DCM_0.22-1.6
...,...,...
TARA_096_SRF_0.22-3,7.319551,TARA_096_SRF_0.22-3
TARA_098_DCM_0.22-3,17.968001,TARA_098_DCM_0.22-3
TARA_098_MES_0.22-3,0.265801,TARA_098_MES_0.22-3
TARA_098_SRF_0.22-3,10.250028,TARA_098_SRF_0.22-3


In [7]:
# Extract sample label, id, and location from metadata
id_df = meta_df.iloc[:,[0, 4, 15]]
id_df = id_df.rename(columns={"Sample label [TARA_station#_environmental-feature_size-fraction]": "label", 
                              "PANGAEA sample identifier": "pangea_id",
                              "Ocean and sea regions (IHO General Sea Areas 1953) [MRGID registered at www.marineregions.com]":"loc"})
id_df

Unnamed: 0,label,pangea_id,loc
0,TARA_004_DCM_0.22-1.6,TARA_X000000368,(NAO) North Atlantic Ocean [MRGID:1912]
1,TARA_004_SRF_0.22-1.6,TARA_Y200000002,(NAO) North Atlantic Ocean [MRGID:1912]
2,TARA_007_DCM_0.22-1.6,TARA_A200000159,(MS) Mediterranean Sea [MRGID:1905]
3,TARA_007_SRF_0.22-1.6,TARA_A200000113,(MS) Mediterranean Sea [MRGID:1905]
4,TARA_009_DCM_0.22-1.6,TARA_X000001036,(MS) Mediterranean Sea [MRGID:1905]
...,...,...,...
239,TARA_151_SRF_0.22-3,TARA_B100001564,(NAO) North Atlantic Ocean [MRGID:1912]
240,TARA_152_MES_0.22-3,TARA_B100001179,(NAO) North Atlantic Ocean [MRGID:1912]
241,TARA_152_MIX_0.22-3,TARA_B100001175,(NAO) North Atlantic Ocean [MRGID:1912]
242,TARA_152_SRF_0.22-3,TARA_B100001173,(NAO) North Atlantic Ocean [MRGID:1912]


In [8]:
# Extract temperature and id info for each sample
temp_df = cond_df.iloc[:,[0, 5]]
temp_df = temp_df.rename(columns={"PANGAEA Sample ID": "pangea_id", 
                              "Mean_Temperature [deg C]*": "temp_C"})
temp_df

Unnamed: 0,pangea_id,temp_C
0,TARA_B100000965,20.6
1,TARA_B100000959,13.0
2,TARA_B100000963,25.3
3,TARA_B100000902,19.6
4,TARA_B100000953,9.2
...,...,...
240,TARA_B100001013,8.1
241,TARA_B100001027,25.1
242,TARA_B100000886,23.8
243,*values indicate mean values based on CTD cast...,


In [9]:
# Merge all relevant data into one df
cyano_temp_df = id_df.merge(temp_df, how="left", on="pangea_id").merge(cyano_df, how="left", on="label")

In [10]:
# Display merged data
cyano_temp_df

Unnamed: 0,label,pangea_id,loc,temp_C,percent_cyano
0,TARA_004_DCM_0.22-1.6,TARA_X000000368,(NAO) North Atlantic Ocean [MRGID:1912],16.2,8.953754
1,TARA_004_SRF_0.22-1.6,TARA_Y200000002,(NAO) North Atlantic Ocean [MRGID:1912],20.5,12.160353
2,TARA_007_DCM_0.22-1.6,TARA_A200000159,(MS) Mediterranean Sea [MRGID:1905],17.4,9.226965
3,TARA_007_SRF_0.22-1.6,TARA_A200000113,(MS) Mediterranean Sea [MRGID:1905],23.8,4.535384
4,TARA_009_DCM_0.22-1.6,TARA_X000001036,(MS) Mediterranean Sea [MRGID:1905],16.2,5.045098
...,...,...,...,...,...
239,TARA_151_SRF_0.22-3,TARA_B100001564,(NAO) North Atlantic Ocean [MRGID:1912],17.3,10.383958
240,TARA_152_MES_0.22-3,TARA_B100001179,(NAO) North Atlantic Ocean [MRGID:1912],10.2,0.270159
241,TARA_152_MIX_0.22-3,TARA_B100001175,(NAO) North Atlantic Ocean [MRGID:1912],14.3,10.644666
242,TARA_152_SRF_0.22-3,TARA_B100001173,(NAO) North Atlantic Ocean [MRGID:1912],14.3,13.657929


In [13]:
# Extract relevant values from cleaned df
# [:242] since last value of all cols was nan
temps = np.array(cyano_temp_df["temp_C"])[:242]
cyano = np.nan_to_num(np.array(cyano_temp_df["percent_cyano"])[:242])
locs = np.array(cyano_temp_df["loc"])[:242]

In [14]:
# Create a color mapping for plotting (coloring by location)
color_dict = {}
colors = bokeh.palettes.Dark2[8]
for idx, loc in enumerate(np.unique(locs)):
    color_dict[loc] = colors[idx]

In [15]:
# Set up figure
p = bokeh.plotting.figure(width=900, height=500)

# Plot each point
for idx in range(len(temps)):
    p.circle(temps[idx], cyano[idx], legend_label = locs[idx], color=color_dict[locs[idx]], size=5)

# Plot options
p.add_layout(p.legend[0], 'right')
p.xaxis.axis_label = "temperature (C)"
p.yaxis.axis_label = "% cyanobacteria"
p.title = "% cyanobactera vs. temperature for various ocean sites"
p.legend.click_policy="hide"

# Show plot
bokeh.io.show(p)

# Make sure to save plot to plots folder by manually pressing save in the plot toolbar