## Demultiplex outer barcodes in ipyrad

This requires running ipyrad step 1 twice: first to demultiplex on the outer (i7) barcodes, and then again to demultiplex on the inner (inline) barcodes. <br>
This notebook also includes MIGseq reads (Amaranth reads from a different project that use a different library preparation method) and Pedicularis reads (a different plant, same data type), because they were pooled with these samples on a single Illumina run. 


In [1]:
# import packages into python
import ipyrad as ip
import ipyparallel as ipp
import ipyrad.analysis as ipa
import pandas as pd

# Open a client connection to the ipyparallel cluster and report what it sees.
ipyclient = ipp.Client()
ip.cluster_info(ipyclient)

# Record the ipyrad version next to the results for provenance.
print("ipyrad {}".format(ipa.__version__))

Parallel connection | t103: 24 cores
ipyrad 0.9.26


In [2]:
# Locations of the raw reads, barcode tables and reference genomes used below.
# All of these files already exist; nothing is created in this cell.

# Directory roots shared by several of the paths.
RAW_AMARANTHUS_DIR = "/moto/eaton/projects/RAW_DATA/Amaranthus"
TUB_PLATE_DIR = "/moto/eaton/users/slh2181/tuberculatus_plate"
TUB_IPYRAD_DIR = TUB_PLATE_DIR + "/ipyrad"

# Raw (not yet demultiplexed) reads from the two sequencing runs.
COL_plates = RAW_AMARANTHUS_DIR + "/Amaranthus_COL_3RAD_run2_R*_.fastq.gz"
TUB_plate = TUB_PLATE_DIR + "/Run_191713587/bcl2fastq2/Undetermined_S0_R*_001.fastq.gz"

# Outer (i7) barcode tables, one per sequencing run.
BARCODES_I7_plate12 = RAW_AMARANTHUS_DIR + "/Amaranthus_COL_3RAD_run2_i7_barcodes.txt"
BARCODES_I7_tuberculatus = TUB_IPYRAD_DIR + "/i7_barcodes_tub.txt"

# Inner (inline) barcode tables, one per plate.
INNER_BARCODES_plate1 = TUB_IPYRAD_DIR + "/Amaranthus_UGA_3RAD_plate1_inline_barcodes.txt"
INNER_BARCODES_plate2 = RAW_AMARANTHUS_DIR + "/Amaranthus_COL_3RAD_plate2_inline_barcodes.txt"
INNER_BARCODES_plate3 = TUB_IPYRAD_DIR + "/inner_barcodes_tub.txt"

# Previously digested genome reads.
QUINOA = "/moto/eaton/projects/demux/digested_genomes/*.gz"

# Reference genomes: Amaranthus (REF1) and quinoa (REF2).
REF1 = "/moto/eaton/users/slh2181/reference/Ahypochondriacus_459_v2.0.fa"
REF2 = "/moto/eaton/projects/demux/quinoa.fna"

In [4]:
# list the i7 (outer) barcodes for the Nov 2019 run: one library name and one i7 index per line
! cat $BARCODES_I7_tuberculatus

Amaranthus_tuberculatus_3RAD	CTCGTCTT
MIGseq1a	ATGACCAG
MIGseq1b	AACCGTTC
MIGseq1c	TCCAATCG
MIGseq1d	CTGCACTT
MIGseq1e	CGCTTAAC
MIGseq1f	CACCACTA
MIGseq1g	ACAGCAAC
MIGseq1h	GGAAGGAT
MIGseq2a	CGATAGAG
MIGseq2b	TTCGTTGG
MIGseq2c	TGGAGAGT
MIGseq2d	TCAGACGA
MIGseq2e	GACGAATG
MIGseq2f	CATGAGGA
MIGseq2g	CGGTTGTT
MIGseq2h	TCCGTATG
Pedicularis_plate1	CGAACTGT
Pedicularis_plate2	TCGGTTAC


In [4]:
# list the i7 (outer) barcodes for the early-2019 Columbia run;
# several i7 indices are listed under each plate name
! cat $BARCODES_I7_plate12

plate1	CGAACTGT
plate1	CATTCGGT
plate1	AAGTCGAG
plate1	TATCGGTC
plate1	GTATTGGC
plate1	AACCTCCT
plate2	CTCGTCTT
plate2	TCGGTTAC
plate2	TATTCGCC
plate2	AGTCGCTT
plate2	TGGCACTA
plate2	GGTTGTCA


In [5]:
# preview the inline (inner) barcodes for plate 1: sample name followed by two barcode columns
# NOTE(review): the preview shows a mix of spaces and tabs between columns — confirm ipyrad parses this file as intended
! head -n 10 $INNER_BARCODES_plate1

SLH_AL_0001	CCGAAT   	CTAACG
SLH_AL_0003	TTAGGCA	    CTAACG
SLH_AL_0004	AACTCGTC	CTAACG
SLH_AL_0006	GGTCTACGT	CTAACG
SLH_AL_0009	GATACC		CTAACG
SLH_AL_0010	AGCGTTG	    CTAACG
SLH_AL_0013	CTGCAACT	CTAACG
SLH_AL_0017	TCATGGTCA	CTAACG
SLH_AL_0018	CCGAAT  	TCGGTAC
SLH_AL_0019	TTAGGCA	    TCGGTAC


In [6]:
# preview the inline (inner) barcodes for plate 2: sample name followed by two barcode columns
! head -n 10 $INNER_BARCODES_plate2

SLH_AL_0101-contemp	CCGAAT	CTAACG
SLH_AL_0012-contemp	TTAGGCA	CTAACG
SLH_AL_0095-contemp	AACTCGTC	CTAACG
SLH_AL_0072-contemp	GGTCTACGT	CTAACG
SLH_AL_0064-contemp	GATACC	CTAACG
SLH_AL_0077-contemp	AGCGTTG	CTAACG
SLH_AL_0104-contemp	CTGCAACT	CTAACG
SLH_AL_0036-contemp	TCATGGTCA	CTAACG
SLH_AL_0090-contemp	CCGAAT	TCGGTAC
SLH_AL_0027-contemp	TTAGGCA	TCGGTAC


In [7]:
# Demultiplex plate 1 from plate 2 on their i7 indices; the two plates were
# sequenced together in the Bendesky lab @ Columbia in early 2019.
demux_outer1 = ip.Assembly("demux_i7s")
demux_outer1.params.project_dir = "/moto/eaton/users/slh2181/tuberculatus_plate/ipyrad"
demux_outer1.params.barcodes_path = BARCODES_I7_plate12
demux_outer1.params.raw_fastq_path = COL_plates
demux_outer1.params.datatype = "pairddrad"
demux_outer1.params.restriction_overhang = ("ATCGG", "TAGCTT")

# important: set hackers params to tell it to demux on i7
demux_outer1.hackersonly.demultiplex_on_i7_tags = True
# the i7 barcodes file lists several indices under the same name (plate1 / plate2),
# so reads carrying any of those indices are pooled under that one name
demux_outer1.hackersonly.merge_technical_replicates = True

New Assembly: demux_i7s


In [9]:
# run demultiplexing (ipyrad step 1), this takes ~40 mins with 24 cores
# NOTE(review): no ipyclient is passed; auto=True presumably lets ipyrad manage the cluster connection itself — confirm
# NOTE(review): force=True presumably overwrites any existing step-1 output for this assembly — confirm
demux_outer1.run('1', auto=True, force=True)

Parallel connection | t103: 24 cores
[####################] 100% 0:36:18 | sorting reads        | s1 |
[####################] 100% 0:05:22 | writing/compressing  | s1 |


In [10]:
# show the raw read count recovered for each plate after i7 demultiplexing
demux_outer1.stats

Unnamed: 0,state,reads_raw
plate1,1,71586332
plate2,1,25966588


In [4]:
# Demultiplex the A. tuberculatus library from the MIGseq and Pedicularis libraries
# that shared the run sequenced at Admera in Nov 2019.
demux_outer2 = ip.Assembly("demux_tub")
demux_outer2.params.project_dir = "/moto/eaton/users/slh2181/tuberculatus_plate/ipyrad"
demux_outer2.params.barcodes_path = BARCODES_I7_tuberculatus
demux_outer2.params.raw_fastq_path = TUB_plate
demux_outer2.params.datatype = "pairddrad"
# restriction_overhang is deliberately left unset here, so the assembly keeps its default value
#demux_outer2.params.restriction_overhang = ("ATCGG", "TAGCTT") #Pedicularis is different and this shouldn't be needed

# important: set hackers params to tell it to demux on i7
demux_outer2.hackersonly.demultiplex_on_i7_tags = True
demux_outer2.hackersonly.merge_technical_replicates = True   #use this if you have the same sample with different barcodes

New Assembly: demux_tub


In [5]:
# display the parameter table to check the settings before running
# NOTE(review): the recorded output shows relative paths (./ipyrad, ./Run_191713587/...) although
# absolute paths are assigned in the setup cell — the output may come from an earlier run; confirm
demux_outer2.params

0   assembly_name               demux_tub                                    
1   project_dir                 ./ipyrad                                     
2   raw_fastq_path              ./Run_191713587/bcl2fastq2/Undetermined_S0_R*_001.fastq.gz
3   barcodes_path               ./ipyrad/i7_barcodes_tub.txt                 
4   sorted_fastq_path                                                        
5   assembly_method             denovo                                       
6   reference_sequence                                                       
7   datatype                    pairddrad                                    
8   restriction_overhang        ('TGCAG', '')                                
9   max_low_qual_bases          5                                            
10  phred_Qscore_offset         33                                           
11  mindepth_statistical        6                                            
12  mindepth_majrule            6                  

In [6]:
# run demultiplexing (ipyrad step 1), this takes 60 mins
# NOTE(review): no ipyclient is passed; auto=True presumably lets ipyrad manage the cluster connection itself — confirm
demux_outer2.run('1', auto=True, force=True)

Parallel connection | t103: 24 cores
[####################] 100% 0:36:14 | chunking large files | s1 | s1 |
[####################] 100% 0:17:07 | sorting reads        | s1 |
[####################] 100% 0:15:27 | writing/compressing  | s1 |


In [7]:
# show the raw read count recovered for each library after i7 demultiplexing
demux_outer2.stats
# recorded result for the library used downstream:
# Amaranthus_tuberculatus_3RAD, state 1, 133729311 raw reads

Unnamed: 0,state,reads_raw
Amaranthus_tuberculatus_3RAD,1,133729311
MIGseq1a,1,45608
MIGseq1b,1,46139
MIGseq1c,1,46716
MIGseq1d,1,40656
MIGseq1e,1,44530
MIGseq1f,1,44527
MIGseq1g,1,59566
MIGseq1h,1,39801
MIGseq2a,1,42456


In [3]:
# Inputs for the second (inline-barcode) round of demultiplexing.
# The UGA reads come straight from the raw-data archive; the other three globs
# point at the *_fastqs folders under the ipyrad project directory
# (presumably written by the demux_i7s / demux_tub runs above — confirm).
DEMUX_PROJECT_DIR = "/moto/eaton/users/slh2181/tuberculatus_plate/ipyrad"

RAW_DATA_UGA = "/moto/eaton/projects/RAW_DATA/Amaranthus/Amaranthus_UGA_3RAD_*.gz"
RAW_DATA_plate1 = DEMUX_PROJECT_DIR + "/demux_i7s_fastqs/plate1_*.gz"
RAW_DATA_plate2 = DEMUX_PROJECT_DIR + "/demux_i7s_fastqs/plate2_*.gz"
RAW_DATA_plate3 = DEMUX_PROJECT_DIR + "/demux_tub_fastqs/Amaranthus_*.gz"

In [4]:
# Digest settings shared by both genomes: same enzyme pair, fragment size
# window, copy number and read length.
digest_settings = dict(
    re1="AAGCTT",
    re2="CCGG",
    paired=True,
    min_size=200,
    max_size=1000,
    ncopies=10,
    readlen=150,
    #nscaffolds=20,
)

# digest the quinoa genome and write as fastq data
dg = ipa.digest_genome(
    fasta="/moto/eaton/projects/demux/quinoa.fna.gz",
    name="quinoa",
    workdir="/moto/eaton/users/slh2181/tuberculatus_plate/ipyrad/quinoa4",
    **digest_settings
)
dg.run()

# digest the beet genome and write as fastq data
dg2 = ipa.digest_genome(
    fasta="/moto/eaton/users/slh2181/reference/beet/GCF_000511025.2_RefBeet-1.2.2_genomic.fna.gz",
    name="beet",
    workdir="/moto/eaton/users/slh2181/tuberculatus_plate/ipyrad/beet4",
    **digest_settings
)
dg2.run()

extracted 281471 reads
extracted 110308 reads


In [5]:
def _digested_reference_library(name, fastq_glob, project_dir):
    """Create an Assembly that loads the fastq files of one digested genome.

    Parameters
    ----------
    name : str
        Assembly name.
    fastq_glob : str
        Glob matching the digested-genome fastq files (set as sorted_fastq_path).
    project_dir : str
        Directory the assembly writes into.

    Returns
    -------
    The configured ipyrad Assembly (step 1 not yet run).
    """
    library = ip.Assembly(name)
    # keep this assignment order: ipyrad params are property setters
    library.params.sorted_fastq_path = fastq_glob
    library.params.datatype = "pair3rad"
    library.params.project_dir = project_dir
    library.params.restriction_overhang = ("ATCGG", "TAGCTT")
    return library


# digested quinoa reference
lib5 = _digested_reference_library(
    "quinoa",
    "/moto/eaton/users/slh2181/tuberculatus_plate/ipyrad/quinoa4/quinoa_*.gz",
    "/moto/eaton/users/slh2181/tuberculatus_plate/ipyrad/quinoa_ref4",
)

# digested beet reference
lib6 = _digested_reference_library(
    "beet",
    "/moto/eaton/users/slh2181/tuberculatus_plate/ipyrad/beet4/beet*.gz",
    "/moto/eaton/users/slh2181/tuberculatus_plate/ipyrad/beet_ref4",
)

# step 1 only loads the reads here; the files are already one-per-sample
lib5.run('1', force=True, ipyclient=ipyclient)
lib6.run('1', force=True, ipyclient=ipyclient)

New Assembly: quinoa
New Assembly: beet
Parallel connection | t103: 24 cores
[####################] 100% 0:00:14 | loading reads        | s1 |
Parallel connection | t103: 24 cores
[####################] 100% 0:00:04 | loading reads        | s1 |


In [None]:
def _inline_demux_library(name, raw_fastq_path, barcodes_path):
    """Create an Assembly set up to demultiplex one library on its inline barcodes.

    All four libraries share the same datatype, project directory, barcode
    mismatch allowance and restriction overhangs; only the name, the input
    reads and the barcode table differ.

    Parameters
    ----------
    name : str
        Assembly name.
    raw_fastq_path : str
        Glob matching the reads to demultiplex.
    barcodes_path : str
        Path to the inline (inner) barcode table for this library.

    Returns
    -------
    The configured ipyrad Assembly (step 1 not yet run).
    """
    library = ip.Assembly(name)
    # keep this assignment order: ipyrad params are property setters, and the
    # original notebook set them in exactly this sequence
    library.params.raw_fastq_path = raw_fastq_path
    library.params.barcodes_path = barcodes_path
    library.params.datatype = "pair3rad"
    library.params.project_dir = "/moto/eaton/users/slh2181/tuberculatus_plate/ipyrad"
    library.params.max_barcode_mismatch = 1
    library.params.restriction_overhang = ("ATCGG", "TAGCTT")
    return library


# load each library from i7-demux'd fastq files
lib1 = _inline_demux_library("UGA_plate1", RAW_DATA_UGA, INNER_BARCODES_plate1)            # UGA
lib2 = _inline_demux_library("Columbia_plate1", RAW_DATA_plate1, INNER_BARCODES_plate1)    # Columbia plate 1
lib3 = _inline_demux_library("Columbia_contemp", RAW_DATA_plate2, INNER_BARCODES_plate2)   # Columbia (half) plate 2
lib4 = _inline_demux_library("tuberculatus", RAW_DATA_plate3, INNER_BARCODES_plate3)       # tuberculatus plate

# (the digested quinoa and beet reference libraries, lib5 and lib6, are loaded in their own cell)

# run step 1 to sort/load fastq files
# approximate run times: lib1 ~2 mins, lib2 ~half hr, lib3 ~10 mins, lib4 ~1 hr
for _library in (lib1, lib2, lib3, lib4):
    _library.run('1', force=True, ipyclient=ipyclient)

New Assembly: UGA_plate1
New Assembly: Columbia_plate1
New Assembly: Columbia_contemp
New Assembly: tuberculatus
Parallel connection | t103: 24 cores
[####################] 100% 0:01:13 | sorting reads        | s1 |
[####################] 100% 0:00:13 | writing/compressing  | s1 |
Parallel connection | t103: 24 cores
[                    ]   0% 0:04:46 | sorting reads        | s1 |

In [None]:
# Read the table that links each sample accession to its species name.
spnames = pd.read_csv("/moto/eaton/users/slh2181/tuberculatus_plate/ipyrad/Amaranth_sample_names_tub.csv")

# Keep only the accessions that have a species assigned.
spnames = spnames.loc[spnames["Species"].notna()]

# Map every sample name to "<species>_<sample>"; this dictionary is what the
# merge step uses to rename samples.
ndict = {
    sample: species + "_" + sample
    for sample, species in zip(spnames["Sample"], spnames["Species"])
}

In [None]:
# create a merged assembly
merged = ip.merge(
    name="229_samp_quinoa_beet2", 
    assemblies=[lib1, lib2, lib3, lib4, lib5, lib6],
    rename_dict=ndict,
    )
merged.save()

In [None]:
# set params on this assembly
merged.params.project_dir = "/moto/eaton/users/slh2181/tuberculatus_plate/ipyrad/"
# reference-based assembly against the Amaranthus genome (REF1)
merged.params.assembly_method = "reference"
merged.params.reference_sequence = REF1
# same overhangs and datatype as the individual libraries
merged.params.restriction_overhang = ("ATCGG", "TAGCTT")
merged.params.datatype = "pair3rad"
# NOTE(review): the params printout earlier in this notebook shows 33 for this
# setting; 43 is presumably a deliberate, stricter choice — confirm
merged.params.phred_Qscore_offset = 43
merged.params.filter_adapters = 3
# NOTE(review): the earlier params printout shows 6 for mindepth_majrule; it is lowered to 3 here
merged.params.mindepth_majrule = 3
merged.params.mindepth_statistical = 6
merged.params.max_barcode_mismatch = 1
# locus- and consensus-level filters
merged.params.min_samples_locus = 10
merged.params.max_Ns_consens = 0.07
merged.params.max_Hs_consens = 0.07
merged.params.max_Indels_locus = 10
# NOTE(review): "psl" is presumably a set of one-letter output format codes — confirm which formats it selects
merged.params.output_formats = "psl"
# display the full parameter table to verify the settings
merged.params

In [None]:
# run the remaining assembly steps (2 through 7) on the merged assembly,
# reusing the cluster client opened at the top of the notebook
merged.run("234567", force=True, ipyclient=ipyclient)

In [15]:
# flag (True) every sample whose step-7 sample_coverage is below 5000
# NOTE(review): sample_coverage is presumably the number of loci recovered per sample — confirm
merged.stats_dfs.s7_samples.sample_coverage < 5000

reference                    False
acanthochiton_SLH_AL_0001    False
acanthochiton_SLH_AL_0002    False
acutilobus_SLH_AL_0003       False
acutilobus_SLH_AL_0004       False
                             ...  
viridis_SLH_AL_3047          False
viridis_SLH_AL_3062          False
watsonii_SLH_AL_3065         False
wrightii_SLH_AL_3066         False
wrightii_SLH_AL_3067         False
Name: sample_coverage, Length: 231, dtype: bool

In [29]:
# Names of the samples whose step-7 sample_coverage falls below 4000.
s7_samples = merged.stats_dfs.s7_samples
mask = s7_samples.sample_coverage < 4000
lose = s7_samples.loc[mask].index.tolist()
lose

['beet',
 'blitoides_SLH_AL_0023',
 'cannabinus_SLH_AL_0042',
 'quinoa',
 'unknown_SLH_AL_0055-contemp',
 'unknown_SLH_AL_0062-contemp']

#### Although I did not use the assembly above, it shows that quinoa has too few loci, which led to removing quinoa in notebook 3. 