In [None]:
!pip install opentree #use bash command line magic to install the open tree helpers

See the GBIF demo PDF (link) for background on this example.

In [None]:
import sys
from opentree import OT

In [None]:
# Ooh! We can mix together Python and bash commands :P
# The '!' at the start of a line means the command is run as a shell command
# This wget command pulls the GBIF data file from the internet and saves it as 'gbif_example.csv'
# (It is commented out here; remove the leading '#' to run it.)
#!wget -O gbif_example.csv https://raw.githubusercontent.com/McTavishLab/biodiversity_next/master/example.csv
# This is actually occurrence data downloaded from GBIF, doi https://doi.org/10.15468/dl.9bigak

In [None]:
# Peek at the first three lines of the GBIF file (the first line is the header row).
!head -n 3 ../tutorial/gbif_example.csv
# Oof! Lots of information.

In [None]:
filename = "../tutorial/gbif_example.csv"

# Read the file inside a context manager so the handle is closed even if reading fails.
with open(filename) as fi:
    # Save the first line separately as the header.
    # Strip the line ending before splitting: otherwise the last column name
    # keeps a trailing newline and can never be looked up by its real name.
    header = fi.readline().rstrip('\r\n').split('\t')

    # Read in the data: one tab-separated record per remaining line
    gbif_data = fi.readlines()

# Get the index of each column in the file, keyed by column name
col_dict = {}
for i, col in enumerate(header):
    col_dict[col] = i

# Would this make more sense to do in Pandas? Maybe! But I like loops.

In [None]:
# Now we know which column each of our data types is in.
# So much (many) data!
# Displaying the dict shows every column name mapped to its index.
col_dict

In [None]:
# As described in the TNRS section,
# we can use the OpenTree APIs to match our GBIF identifiers to Open Tree unique identifiers

match_dict = {} # Maps each GBIF taxon key to its Open Tree id (None when no match was found)
ott_ids = set() # The set of successfully matched Open Tree ids

# Loop through each line in the GBIF output
for lin in gbif_data:
    if not lin.strip():
        # Skip blank lines (e.g. a trailing empty line), which have no columns to index
        continue
    # Strip the line ending so the last field does not carry a trailing newline
    lii = lin.rstrip('\r\n').split('\t')
    gb_id = lii[col_dict['taxonKey']] # this grabs the GBIF id number from the right column
    sys.stdout.write(".") # progress bar: one dot per record
    sys.stdout.flush()
    if gb_id in match_dict:
        # Skip gb_ids you have already matched
        continue
    # Do a direct match to GBIF ids in the Open Tree taxonomy
    ott_id = OT.get_ottid_from_gbifid(gb_id)
    if ott_id is None:
        # Sometimes we don't have a record of the GBIF id, but we do have a taxon with that exact name
        # Search on the name
        spp_name = lii[col_dict['verbatimScientificName']]
        ott_id = OT.get_ottid_from_name(spp_name)
        if ott_id is None:
            # End the message with a newline so it does not run into the progress dots
            sys.stdout.write("Couldn't find an id for {}, gbif {}\n".format(spp_name, gb_id))
    # Record the outcome (including misses) so the same GBIF id is not looked up again
    match_dict[gb_id] = ott_id
    if ott_id is not None:
        # Only real ids go into the set that is later sent to the tree request
        ott_ids.add(ott_id)

In [None]:
# Lets grab a tree for those taxa!
output = OT.synth_induced_tree(ott_ids=list(ott_ids),  label_format='name')
treefile = "gbif_taxa.tre"
output.tree.write(path = treefile, schema = "newick")
sys.stdout.write("Tree written to {}\n".format(treefile))

In [None]:
# If we render the tree as a Newick string, we can paste it into icytree.org or itol.embl.de for a quick look
output.tree.as_string(schema="newick")

In [None]:
# Don't forget to cite your friendly phylogeneticists!
# Pull the 'supporting_studies' entry out of the tree response...
studies = output.response_dict['supporting_studies']
# ...and look up the citations for them
cites = OT.get_citations(studies) # this can be a bit slow
print(cites)

# DIY: Go to GBIF and choose a region of interest to you. Download the data as CSV, and see if you can get a phylogeny for those taxa!