In [1]:
## Imports
## -------------------------------------------------------------------------------------------
import ipyrad as ip
import ipyparallel as ipp
import ipyrad.analysis as ipa

  from collections import Sequence
  from collections import Iterable
  from collections import Mapping, namedtuple, defaultdict, Sequence


In [3]:
## Start Ipcluster
## -------------------------------------------------------------------------------------------
## run the below line in terminal, or with '!' in jupyter notebooks
## !ipcluster start -n 40 --cluster-id="ipyrad" --daemonize
## this will stop the ipcluster
## !ipcluster stop --cluster-id="ipyrad" 

## link the cluster with ipyclient
ipyclient = ipp.Client(cluster_id="ipyrad")

In [4]:
## check to make sure the client is working by printing cores ready to go
print(len(ipyclient))

40


In [4]:
## demultiplex the first thuja library
data_lib1 = ip.Assembly("thuja_lib1")
data_lib1.set_params("raw_fastq_path", "./*lib1.fastq.gz")
data_lib1.set_params("barcodes_path", "./barcodes_lib1.txt")
data_lib1.set_params("datatype", "ddrad")
data_lib1.set_params("output_formats", "*")
data_lib1.set_params("restriction_overhang", ("TGCAG", "AATT"))

data_lib1.get_params()

New Assembly: thuja_lib1


0   assembly_name               thuja_lib1                                   
1   project_dir                 /mnt/lfs2/ruff6699/IpyradThujaAssembly       
2   raw_fastq_path              ./*lib1.fastq.gz                             
3   barcodes_path               ./barcodes_lib1.txt                          
4   sorted_fastq_path                                                        
5   assembly_method             denovo                                       
6   reference_sequence                                                       
7   datatype                    ddrad                                        
8   restriction_overhang        ('TGCAG', 'AATT')                            
9   max_low_qual_bases          5                                            
10  phred_Qscore_offset         33                                           
11  mindepth_statistical        6                                            
12  mindepth_majrule            6                               

In [5]:
## run the first library through step 1 of demultiplexing
data_lib1.run("1", ipyclient=ipyclient)

Parallel connection | watson.ibest.uidaho.edu: 40 cores
[####################] 100% 0:05:30 | chunking large files | s1 | |
[####################] 100% 0:06:19 | sorting reads        | s1 |
[####################] 100% 0:01:51 | writing/compressing  | s1 |


In [7]:
## check out the first demultiplexed library
data_lib1.stats

Unnamed: 0,state,reads_raw
C_1_308,1,5912670
C_1_309,1,4387472
C_1_310,1,771377
C_1_311,1,6266654
C_1_313,1,8818163
C_1_314,1,2209292
C_1_315,1,4441387
C_1_316,1,1191180
C_1_318,1,6564446
C_1_319,1,2894786


In [5]:
## Load the depultiplexed second library
data_lib2 = ip.load_json("thuja_lib2.json")
#data_lib2.stats

loading Assembly: thuja_lib2
from saved path: /mnt/lfs2/ruff6699/IpyradThujaAssembly/thuja_lib2.json


In [7]:
## merge the demiltiplexed libraries of thuja; there are 2
mergedThuja =  ip.merge("mergedThuja", [data_lib1, data_lib2])

NameError: name 'data_lib1' is not defined

In [6]:
mergedThuja =  ip.load_json("mergedThuja.json")

loading Assembly: mergedThuja
from saved path: /mnt/lfs2/ruff6699/IpyradThujaAssembly/mergedThuja.json


In [6]:
## changing filter adapters to 2 means reads are searched for illumina adaptors
mergedThuja.set_params("filter_adapters", 2)
## for steps 3 and 6
mergedThuja.set_params("clust_threshold", 0.8)
## for steps 4 and 5
mergedThuja.set_params("mindepth_statistical", 10)
mergedThuja.set_params("mindepth_majrule", 5)
mergedThuja.set_params("maxdepth", 50000)
mergedThuja.set_params("max_Hs_consens", 0.10)
mergedThuja.set_params("max_Ns_consens", 0.10)

## for step 7
mergedThuja.set_params("min_samples_locus", 10)

mergedThuja.get_params()

0   assembly_name               mergedThuja                                  
1   project_dir                 /mnt/lfs2/ruff6699/IpyradThujaAssembly       
2   raw_fastq_path              Merged: thuja_lib1, thuja_lib2               
3   barcodes_path               Merged: thuja_lib1, thuja_lib2               
4   sorted_fastq_path           Merged: thuja_lib1, thuja_lib2               
5   assembly_method             denovo                                       
6   reference_sequence                                                       
7   datatype                    ddrad                                        
8   restriction_overhang        ('TGCAG', 'AATT')                            
9   max_low_qual_bases          5                                            
10  phred_Qscore_offset         33                                           
11  mindepth_statistical        10                                           
12  mindepth_majrule            5                               

In [13]:
## run step 2, then branch
mergedThuja.run("2", ipyclient=ipyclient)

Parallel connection | watson.ibest.uidaho.edu: 40 cores
[####################] 100% 0:17:13 | processing reads     | s2 |


In [14]:
## branch after step 2
mergedThuja.branch("mergedThuja_AS2")

<ipyrad.core.assembly.Assembly at 0x7f866aaea5d0>

In [15]:
## run step 3, then branch
mergedThuja.run("3", ipyclient=ipyclient)

Parallel connection | watson.ibest.uidaho.edu: 40 cores
[####################] 100% 0:01:16 | dereplicating        | s3 |
[####################] 100% 0:04:41 | clustering/mapping   | s3 |
[####################] 100% 0:00:07 | building clusters    | s3 |
[####################] 100% 0:00:01 | chunking clusters    | s3 |
[####################] 100% 0:13:07 | aligning clusters    | s3 |
[####################] 100% 0:00:05 | concat clusters      | s3 |
[####################] 100% 0:00:13 | calc cluster stats   | s3 |


In [16]:
## branch after step 3
mergedThuja.branch("mergedThuja_AS3")

<ipyrad.core.assembly.Assembly at 0x7f866a3fcb90>

In [7]:
mergedThuja.stats

Unnamed: 0,state,reads_raw,reads_passed_filter,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
C_1_308,6,5912670,5909856,193559,86597,0.018012,0.001671,84349
C_1_309,6,4387472,4385453,179414,84233,0.018331,0.001743,82067
C_1_310,6,771377,770951,23070,7659,0.016059,0.001199,7454
C_1_311,6,6266654,6264381,160395,70030,0.019269,0.001556,68112
C_1_313,6,8818163,8814366,230087,106972,0.023008,0.001776,103092
C_1_314,6,2209292,2208551,79856,32464,0.023028,0.001592,31399
C_1_315,6,4441387,4440179,75887,28305,0.021828,0.001261,27542
C_1_316,6,1191180,1190619,53616,20074,0.016451,0.00121,19552
C_1_318,6,6564446,6561041,240657,96866,0.040005,0.00307,90993
C_1_319,6,2894786,2893366,64691,23793,0.018894,0.00116,23100


In [8]:
## run step 4, then branch
mergedThuja.run("4", ipyclient=ipyclient)

Parallel connection | watson.ibest.uidaho.edu: 40 cores
[####################] 100% 0:09:13 | inferring [H, E]     | s4 |


In [13]:
## branch after step 4
mergedThuja.branch("mergedThuja_AS4")

<ipyrad.core.assembly.Assembly at 0x7f171a48d490>

In [14]:
## run step 5, then branch
mergedThuja.run("5", ipyclient=ipyclient)

Parallel connection | watson.ibest.uidaho.edu: 40 cores
[####################] 100% 0:00:10 | calculating depths   | s5 |
[####################] 100% 0:00:11 | chunking clusters    | s5 |
[####################] 100% 0:15:18 | consens calling      | s5 |
[####################] 100% 0:00:27 | indexing alleles     | s5 |


In [15]:
## branch after step 5
mergedThuja.branch("mergedThuja_AS5")

<ipyrad.core.assembly.Assembly at 0x7f170913ddd0>

In [16]:
## run step 6, then branch
mergedThuja.run("6", ipyclient=ipyclient)

Parallel connection | watson.ibest.uidaho.edu: 40 cores
[####################] 100% 0:00:29 | concatenating inputs | s6 |
[####################] 100% 0:15:28 | clustering tier 1    | s6 |
[####################] 100% 0:00:03 | concatenating inputs | s6 |
[####################] 100% 0:06:41 | clustering across    | s6 |
[####################] 100% 0:00:20 | building clusters    | s6 |
[####################] 100% 0:01:23 | aligning clusters    | s6 |


In [17]:
## branch after step 6 and set up for minsamplocus=4
minsamp4 = mergedThuja.branch("mergedThuja_minsamp4")

In [22]:
## set min sample locus to 4 and run step 7
#minsamp4 = mergedThuja.branch("mergedThuja_minsamp4")
#minsamp4.set_params("min_samples_locus", 4)
minsamp4.run("7", ipyclient=ipyclient)

Parallel connection | watson.ibest.uidaho.edu: 40 cores
[####################] 100% 0:00:17 | applying filters     | s7 |
[####################] 100% 0:01:47 | building arrays      | s7 |
[####################] 100% 0:01:48 | writing conversions  | s7 |
[####################] 100% 0:06:30 | indexing vcf depths  | s7 |
[####################] 100% 0:06:40 | writing vcf output   | s7 |


In [1]:
!cat mergedThuja_minsamp4_outfiles/mergedThuja_minsamp4_stats.txt


## The number of loci caught by each filter.
## ipyrad API location: [assembly].stats_dfs.s7_filters

                            total_filters  applied_order  retained_loci
total_prefiltered_loci                  0              0         229215
filtered_by_rm_duplicates            7548           7548         221667
filtered_by_max_indels                503            503         221164
filtered_by_max_SNPs                11703          11526         209638
filtered_by_max_shared_het           2711           2540         207098
filtered_by_min_sample              82793          82614         124484
total_filtered_loci                105258         104731         124484


## The number of loci recovered for each Sample.
## ipyrad API location: [assembly].stats_dfs.s7_samples

         sample_coverage
C_1_308            26321
C_1_309            27330
C_1_310             4680
C_1_311            20252
C_1_313            25679
C_1_314            11508
C_1_315       

In [None]:
## set min sample locus to 10 and run step 7
minsamp10 = mergedThuja.branch("mergedThuja_minsamp10")
minsamp10.set_params("min_samples_locus", 10)
minsamp10.run("7", ipyclient=ipyclient)

Parallel connection | watson.ibest.uidaho.edu: 40 cores
[####################] 100% 0:00:13 | applying filters     | s7 |
[####################] 100% 0:00:50 | building arrays      | s7 |
[####################] 100% 0:00:41 | writing conversions  | s7 |
[####################] 100% 0:01:21 | indexing vcf depths  | s7 |
[                    ]   0% 0:02:31 | writing vcf output   | s7 |

In [8]:
mergedThuja.stats

Unnamed: 0,state,reads_raw,reads_passed_filter,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
C_1_308,6,5912670,5909856,193559,86597,0.018012,0.001671,84349
C_1_309,6,4387472,4385453,179414,84233,0.018331,0.001743,82067
C_1_310,6,771377,770951,23070,7659,0.016059,0.001199,7454
C_1_311,6,6266654,6264381,160395,70030,0.019269,0.001556,68112
C_1_313,6,8818163,8814366,230087,106972,0.023008,0.001776,103092
C_1_314,6,2209292,2208551,79856,32464,0.023028,0.001592,31399
C_1_315,6,4441387,4440179,75887,28305,0.021828,0.001261,27542
C_1_316,6,1191180,1190619,53616,20074,0.016451,0.00121,19552
C_1_318,6,6564446,6561041,240657,96866,0.040005,0.00307,90993
C_1_319,6,2894786,2893366,64691,23793,0.018894,0.00116,23100


In [9]:
## set min sample locus to 25 and run step 7
minsamp25 = mergedThuja.branch("mergedThuja_minsamp25")
minsamp25.set_params("min_samples_locus", 25)
minsamp25.run("7", ipyclient=ipyclient)

Parallel connection | crick.ibest.uidaho.edu: 40 cores
[####################] 100% 0:00:16 | applying filters     | s7 |
[####################] 100% 0:00:14 | building arrays      | s7 |
[####################] 100% 0:00:07 | writing conversions  | s7 |
[####################] 100% 0:00:14 | indexing vcf depths  | s7 |
[####################] 100% 0:00:43 | writing vcf output   | s7 |
