# Astral tree on Amaranth reference- aligned reads using shorter windows

This notebook makes a species-level phylogeny for 38 Amaranth samples. <br>
Astral uses individual genes - here using tree_slider to determine the length - and estimates phylogeny in 2 step process. <br>
3RAD data assembled with ipyrad. modest filtering (see below) <br>
This is also the input for Nanuq, so output <br>

Nov 12, 2020

In [1]:
# import packages into python
import ipyrad as ip
import ipyrad.analysis as ipa
import toytree
import toyplot.svg
import pandas as pd

# connect to parallel client
#import ipyparallel as ipp
#ipyclient = ipp.Client()
#ip.cluster_info(ipyclient)

# print the version of ipyrad you are running
print('ipyrad', ipa.__version__)
print('toytree', toytree.__version__)

ipyrad 0.9.61
toytree 2.0.4


First, use tree slider to get 0.5MB windows across the entire genome 

In [2]:
# the path to your HDF5 formatted seqs file
data = "/rigel/dsi/users/slh2181/tuberculatus_plate/ipyrad/Good_samp_beet_noMaxSNP_outfiles/Good_samp_beet_noMaxSNP.seqs.hdf5"
#set output directory
OUTDIR = "/rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/"

In [3]:
# check scaffold idx (row) against scaffold names
ipa.treeslider(data).scaffold_table.head(20)

Unnamed: 0,scaffold_name,scaffold_length
0,Scaffold_10,22670516
1,Scaffold_11,22280117
2,Scaffold_12,22052327
3,Scaffold_13,20679869
4,Scaffold_14,20190685
5,Scaffold_15,17522127
6,Scaffold_16,16951160
7,Scaffold_1,38124660
8,Scaffold_2,35657244
9,Scaffold_3,30204323


In [4]:
imap={"acanthochiton": ["acanthochiton_SLH_AL_0001","acanthochiton_SLH_AL_0002"],
      "acutilobus": ["acutilobus_SLH_AL_0003","acutilobus_SLH_AL_0004"],
      "albus": ["albus_SLH_AL_0006", "albus_SLH_AL_0009", "albus_SLH_AL_0010"],
      "arenicola": ["arenicola_SLH_AL_0013","arenicola_SLH_AL_0018","arenicola_SLH_AL_0012","arenicola_SLH_AL_0015"],
 #     "asplundii": [],
      "australis": ["australis_SLH_AL_0020","australis_SLH_AL_0021"],
      "blitoides": ["blitoides_SLH_AL_0028"], #"blitoides_SLH_AL_0023",
      "blitum": ["blitum-blitum_SLH_AL_0029", "blitum-oleraceus_SLH_AL_0034", "blitum-pseudogracilis_SLH_AL_0037"],
      "californicus": ["californicus_SLH_AL_0039"],
      "cannabinus": ["cannabinus_SLH_AL_0040", "cannabinus_SLH_AL_0041"], #"cannabinus_SLH_AL_0042"],
      "caudatus": ["caudatus_SLH_AL_0102","caudatus_SLH_AL_0110","caudatus_SLH_AL_0116","caudatus_SLH_AL_0322","caudatus_SLH_AL_0540"],
      "crassipes": ["crassipes_SLH_AL_0599","crassipes_SLH_AL_0600"],
      "cruentus": ["cruentus_SLH_AL_0679", "cruentus_SLH_AL_0699", "cruentus_SLH_AL_0728", "cruentus_SLH_AL_0804", "cruentus_SLH_AL_0832"],
      "hybridus2": ["hybridus_SLH_AL_1060", "hybridus_SLH_AL_1098"],
      "deflexus": ["deflexus_SLH_AL_0951","deflexus_SLH_AL_0952", "deflexus_SLH_AL_0955","deflexus_SLH_AL_0953","deflexus_SLH_AL_0954"],
      "dubius": ["dubius_SLH_AL_0965","dubius_SLH_AL_0979","dubius_SLH_AL_0992"],
      "fimbriatus": [ "fimbriatus_SLH_AL_0998"], #"fimbriatus_SLH_AL_0997",
      "floridanus": ["floridanus_SLH_AL_1000"],
      "graecizans": ["graecizans-aschersonianus_SLH_AL_1009", "graecizans-silvestris_SLH_AL_1013", "graecizans-thellungianus_SLH_AL_1014", 
                    "asplundii_SLH_AL_0019"],
      "greggii": ["greggii_SLH_AL_1015", "greggii_SLH_AL_1016"],
      "hybridus1": ["hybridus_SLH_AL_0001-restricted", "hybridus_SLH_AL_1117"],
      "hybridus3": ["hybridus_SLH_AL_1099"], 
      "hypochondriacus": ["hypochondriacus_SLH_AL_1178", "hypochondriacus_SLH_AL_1197", "hypochondriacus_SLH_AL_1264", "hypochondriacus_SLH_AL_1285","hypochondriacus_SLH_AL_2282", "hypochondriacus_SLH_AL_2436"],
      "muricatus": ["muricatus_SLH_AL_2634"],
      "palmeri": ["palmeri-aff_SLH_AL_0017", "palmeri_SLH_AL_2637", "palmeri_SLH_AL_2644", "palmeri_SLH_AL_2647", "palmeri_SLH_AL_2649", "palmeri_SLH_AL_2650",
                  "palmeri_SLH_AL_2635","palmeri_SLH_AL_2636","palmeri_SLH_AL_2638","palmeri_SLH_AL_2639","palmeri_SLH_AL_2640","palmeri_SLH_AL_2641","palmeri_SLH_AL_2642","palmeri_SLH_AL_2643","palmeri_SLH_AL_2645","palmeri_SLH_AL_2646",
                  "palmeri_SLH_AL_159-contemp","palmeri_SLH_AL_163-contemp","palmeri_SLH_AL_173-contemp","palmeri_SLH_AL_174-contemp","palmeri_SLH_AL_235-contemp"],
      "powellii": ["powellii-bouchonii_SLH_AL_2653", "powellii-powellii_SLH_AL_2663", "powellii-powellii_SLH_AL_2665"],
      "pumilus": ["pumilus_SLH_AL_5-restricted","pumilus_SLH_AL_7-restricted"],
      "quitensis": ["quitensis_SLH_AL_2671", "quitensis_SLH_AL_2675","quitensis_SLH_AL_2753"],
      "retroflexus": ["retroflexus_SLH_AL_2770", "retroflexus_SLH_AL_2773", "retroflexus_SLH_AL_2780"],
      "spinosus": ["spinosus_SLH_AL_2792", "spinosus_SLH_AL_2793", "spinosus_SLH_AL_2806", "spinosus_SLH_AL_2809", "spinosus_SLH_AL_2811"],
      "standleyanus": ["standleyanus_SLH_AL_2815","standleyanus_SLH_AL_2816"],
      "tamaulipensis": ["tamaulipensis_SLH_AL_2817"],
      "torreyi": ["torreyi_SLH_AL_2818"],
      "tricolor": ["tricolor_SLH_AL_2869", "tricolor_SLH_AL_2940", "tricolor_SLH_AL_2953", "tricolor_SLH_AL_2978"],
      "tuberculatus": ["tuberculatus_SLH_AL_0009-restricted", "tuberculatus_SLH_AL_3003", "tuberculatus_SLH_AL_3017", "tuberculatus_SLH_AL_3027", "tuberculatus_SLH_AL_3045",
                        "tuberculatus_SLH_AL_2999","tuberculatus_SLH_AL_3000","tuberculatus_SLH_AL_3001","tuberculatus_SLH_AL_3002","tuberculatus_SLH_AL_3004","tuberculatus_SLH_AL_3005",
                        "tuberculatus_SLH_AL_3006","tuberculatus_SLH_AL_3007","tuberculatus_SLH_AL_3008","tuberculatus_SLH_AL_3009","tuberculatus_SLH_AL_3010",
                        "tuberculatus_SLH_AL_3011","tuberculatus_SLH_AL_3012","tuberculatus_SLH_AL_3013","tuberculatus_SLH_AL_3014","tuberculatus_SLH_AL_3015",
                        "tuberculatus_SLH_AL_3016","tuberculatus_SLH_AL_3018","tuberculatus_SLH_AL_3019","tuberculatus_SLH_AL_3020","tuberculatus_SLH_AL_3021",
                        "tuberculatus_SLH_AL_3022","tuberculatus_SLH_AL_3023","tuberculatus_SLH_AL_3024","tuberculatus_SLH_AL_3025","tuberculatus_SLH_AL_3026",
                        "tuberculatus_SLH_AL_3028","tuberculatus_SLH_AL_3029","tuberculatus_SLH_AL_3030","tuberculatus_SLH_AL_3031","tuberculatus_SLH_AL_3032",
                        "tuberculatus_SLH_AL_3033","tuberculatus_SLH_AL_3034","tuberculatus_SLH_AL_3035","tuberculatus_SLH_AL_3036",#"tuberculatus_SLH_AL_3037",
                        "tuberculatus_SLH_AL_3038","tuberculatus_SLH_AL_3039","tuberculatus_SLH_AL_3041","tuberculatus_SLH_AL_3042","tuberculatus_SLH_AL_3043",
                        "tuberculatus_SLH_AL_3044","tuberculatus_SLH_AL_3046","tuberculatus_SLH_AL_154-contemp","tuberculatus_SLH_AL_155-contemp",
                        "tuberculatus_SLH_AL_156-contemp","tuberculatus_SLH_AL_157-contemp","tuberculatus_SLH_AL_160-contemp","tuberculatus_SLH_AL_165-contemp",
                        "tuberculatus_SLH_AL_169-contemp","tuberculatus_SLH_AL_175-contemp","tuberculatus_SLH_AL_176-contemp","tuberculatus_SLH_AL_202-contemp",
                        "tuberculatus_SLH_AL_206-contemp","tuberculatus_SLH_AL_208-contemp","tuberculatus_SLH_AL_236-contemp","tuberculatus_SLH_AL_237-contemp"],
        "tucsonensis": ["tucsonensis_SLH_AL_3068"],
        "viridis": ["viridis_SLH_AL_3047", "viridis_SLH_AL_3062"],
        "watsonii": ["watsonii_SLH_AL_3065"],
        "wrightii": ["wrightii_SLH_AL_3066", "wrightii_SLH_AL_3067"],    
      "beet":["beet"],
     }

### Astral does not use bootstrap

In [5]:
#run RAxML on "genes" that are 1M bp long in treeslider
ts = ipa.treeslider(
        name='tub_consensus_500kb',  #this name can't end in _2
        data=data,
        workdir=OUTDIR,
        scaffold_idxs=range(16),
        window_size=500000,
        slide_size= 500000,  
        inference_method="raxml",  #options are raxml and mrbayes
        #inference_args={"N": 1, "T": 24}, #number of bootstraps and # cores
    consensus_reduce=True, # this tells it to make 1 sequence for each key in dictionary
        minsnps=50,  #within window_size, it must have at least this much diversity
        mincov=4,   #For example, mincov=0.5 will require that 50% of samples contain a site that is not N or - for the site to be included in the alignment. 
    rmincov= 0,     
    imap=imap,
    minmap= {i: 0.1 for i in imap}, #half of each key in imap dictionary
  #  keep_all_files=True,
)

In [6]:
#check that the names I specificed in IMAP are the names in the dataset.
# 1. Print names in dataset
goodnames = set(ts._pnames)
goodnames

{'acanthochiton_SLH_AL_0001',
 'acanthochiton_SLH_AL_0002',
 'acutilobus_SLH_AL_0003',
 'acutilobus_SLH_AL_0004',
 'albus_SLH_AL_0006',
 'albus_SLH_AL_0009',
 'albus_SLH_AL_0010',
 'arenicola_SLH_AL_0012',
 'arenicola_SLH_AL_0013',
 'arenicola_SLH_AL_0015',
 'arenicola_SLH_AL_0018',
 'asplundii_SLH_AL_0019',
 'australis_SLH_AL_0020',
 'australis_SLH_AL_0021',
 'beet',
 'blitoides_SLH_AL_0023',
 'blitoides_SLH_AL_0028',
 'blitum-blitum_SLH_AL_0029',
 'blitum-oleraceus_SLH_AL_0034',
 'blitum-pseudogracilis_SLH_AL_0037',
 'californicus_SLH_AL_0039',
 'cannabinus_SLH_AL_0040',
 'cannabinus_SLH_AL_0041',
 'caudatus_SLH_AL_0102',
 'caudatus_SLH_AL_0110',
 'caudatus_SLH_AL_0116',
 'caudatus_SLH_AL_0322',
 'caudatus_SLH_AL_0540',
 'crassipes_SLH_AL_0599',
 'crassipes_SLH_AL_0600',
 'cruentus_SLH_AL_0679',
 'cruentus_SLH_AL_0699',
 'cruentus_SLH_AL_0728',
 'cruentus_SLH_AL_0804',
 'cruentus_SLH_AL_0832',
 'deflexus_SLH_AL_0951',
 'deflexus_SLH_AL_0952',
 'deflexus_SLH_AL_0953',
 'deflexus_SLH_A

In [7]:
# 2. Print the names that have typos
for key, vals in ts.imap.items():
    diff = set(vals).difference(goodnames)
    print(diff)

set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()


In [8]:
ts.run(force=True, auto=True)

building database: nwindows=785; minsnps=50
[####################] 100% 4:21:46 | inferring trees 
tree_table written to /rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/tub_consensus_500kb.tree_table.csv


In [9]:
#Read in the data 
tree_table = pd.read_csv("/rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/tub_consensus_500kb.tree_table.csv", sep=",")

new = tree_table.tree[tree_table.tree.notna()].reset_index(drop=True)
new.to_csv(
    "/rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/tub_consensus_500kb.tree.csv",
    header=False, encoding='utf-8', index=False, sep=" ")

# Now try even smaller size genes

In [10]:
#run RAxML on "genes" that are 1M bp long in treeslider
ts2 = ipa.treeslider(
        name='tub_consensus_250kb',  #this name can't end in _2
        data=data,
        workdir=OUTDIR,
        scaffold_idxs=range(16),
        window_size=250000,
        slide_size= 250000,  
        inference_method="raxml",  #options are raxml and mrbayes
       # inference_args={"N": 100, "T": 24}, #number of bootstraps and # cores
    consensus_reduce=True, # this tells it to make 1 sequence for each key in dictionary
        minsnps=25,  #within window_size, it must have at least this much diversity
        mincov=4,   #For example, mincov=0.5 will require that 50% of samples contain a site that is not N or - for the site to be included in the alignment. 
    rmincov= 0,     
    imap=imap,
    minmap= {i: 0.1 for i in imap}, #half of each key in imap dictionary
 #   keep_all_files=True,
)

In [11]:
ts2.run(force=True, auto=True)

building database: nwindows=1575; minsnps=25
[####################] 100% 3:57:59 | inferring trees 
tree_table written to /rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/tub_consensus_250kb.tree_table.csv


In [12]:
#Read in the data 
tree_table2 = pd.read_csv("/rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/tub_consensus_250kb.tree_table.csv", sep=",")

new = tree_table2.tree[tree_table2.tree.notna()].reset_index(drop=True)
new.to_csv(
    "/rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/tub_consensus_250kb.tree.csv",
    header=False, encoding='utf-8', index=False, sep=" ")

### run Astral

In [13]:
Ast500 = ipa.astral(
    data = "/rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/tub_consensus_500kb.tree_table.csv",
    name='Astral_asplundii_500kB',
    workdir=OUTDIR,
    annotation=1,
)

Ast500.print_command()

Ast500.run()

java -jar /rigel/home/slh2181/miniconda3/bin/astral.5.7.1.jar -i /rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/tmptrees.txt -o /rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/Astral_asplundii_500kB.tre -t 1
[astral.5.7.1.jar]
inferred tree written to (/rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/Astral_asplundii_500kB.tre)


In [14]:
color_dict = {
"quitensis": "orange",# 'quitensis',
"caudatus": "orange",# 'caudatus',
"hypochondriacus": "orange",# 'hypochondriacus',
"reference": "orange",
"hybridus1": "orange",# 'hybridus',
"hybridus2": "orange",# 'hybridus',
"hybridus3": "orange",# 'hybridus',
"cruentus": "orange",# 'cruentus',
"wrightii": "orange",# 'wrightii',
"retroflexus": "orange",# 'retroflexus',
"powellii": "orange",# 'powellii',
"acutilobus": "orange",# 'acutilobus',
"watsonii": "orange",# 'watsonii',
"palmeri": "orange",# 'palmeri',
"spinosus": "orange",# 'spinosus',
"dubius": "orange",# 'dubius',
"arenicola": "blue",# 'arenicola',
"greggii": "blue",# 'greggii',
"acanthochiton": "blue",# 'acanthochiton',
"pumilus": "blue",# 'pumilus',
"floridanus": "blue",# 'floridanus',
"tuberculatus": "blue",# 'tuberculatus',
"cannabinus": "blue",# 'cannabinus',
"australis": "blue",# 'australis',
"viridis": "red",# 'viridis',
"deflexus": "red",# 'deflexus',
"muricatus": "red",# 'muricatus',
"standleyanus": "red",# 'standleyanus',
"asplundii": "purple",# 'asplundii',
"graecizans": "purple",# 'graecizans',
"tricolor": "purple",# 'tricolor',
"blitum": "purple",# 'blitum',
"albus": "green",# 'albus',
"californicus": "green",# 'californicus',
"blitoides": "green",# 'blitoides',
"torreyi": "green",# 'torreyi',
"crassipes": "green",# 'crassipes',
"tamaulipensis": "green",# 'tamaulipensis',
"fimbriatus": "green",# 'fimbriatus',
"tucsonensis": "green",# 'tucsonensis',
"beet": "black",# 'beet'
"unknown": "black",
}

In [15]:
rooted500 = toytree.tree(Ast500.tree).root("beet")

rooted500.treenode.support = 100
for node in rooted500.treenode.traverse():
    node.support = int(float(node.support))
#r2 = rooted1.collapse_nodes(min_support=75)

our_labels = rooted500.get_tip_labels()
sp_names= [i.split("_")[0].split("-")[0] for i in our_labels]
colors = [color_dict[sp] for sp in sp_names]


canvas, axes, mark=rooted500.draw(tip_labels_align=True, node_labels="support", 
                          use_edge_lengths=False,                        
                          node_labels_style={"font-size": "15px"},
                        #node_markers="r2x1.25",
                        node_sizes=27,
                        node_style={
                                "fill": "white",
                                "stroke": "black",
                        #        "stroke-width": 1,
                        },
                         # tip_labels=newnames,
                        tip_labels_colors=colors,
                        tip_labels_style={"font-size":"16px"},
                        width=600, height=800);

In [16]:
toyplot.svg.render(canvas,"/rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/Astral_aspluindii_500kB.svg")

In [17]:
Ast250 = ipa.astral(
    data = "/rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/tub_consensus_250kb.tree_table.csv",
    #data = ts2.tree_table,
    name='Astral_asplundii_250kB',
    workdir=OUTDIR,
    annotation=1,
)

Ast250.print_command()

Ast250.run()

java -jar /rigel/home/slh2181/miniconda3/bin/astral.5.7.1.jar -i /rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/tmptrees.txt -o /rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/Astral_asplundii_250kB.tre -t 1
[astral.5.7.1.jar]
inferred tree written to (/rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/Astral_asplundii_250kB.tre)


In [18]:
rooted250 = toytree.tree(Ast250.tree).root("beet")

rooted250.treenode.support = 100
for node in rooted250.treenode.traverse():
    node.support = int(float(node.support))
#r2 = rooted1.collapse_nodes(min_support=75)

our_labels = rooted250.get_tip_labels()
sp_names= [i.split("_")[0].split("-")[0] for i in our_labels]
colors = [color_dict[sp] for sp in sp_names]


canvas, axes, mark=rooted250.draw(tip_labels_align=True, node_labels="support", 
                          use_edge_lengths=False,                        
                          node_labels_style={"font-size": "15px"},
                        #node_markers="r2x1.25",
                        node_sizes=27,
                        node_style={
                                "fill": "white",
                                "stroke": "black",
                        #        "stroke-width": 1,
                        },
                         # tip_labels=newnames,
                        tip_labels_colors=colors,
                        tip_labels_style={"font-size":"16px"},
                        width=600, height=800);

In [19]:
toyplot.svg.render(canvas,"/rigel/dsi/users/slh2181/tuberculatus_plate/asplundii/hybridus_redo/vble_locus_length/Astral_asplundii_250kB.svg")