![image](silene.jpeg)

# sRNA profiles of flower dimorphism in _Silene latifolia_

#### Eddy J. Mendoza-Galindo
#### Advisor: Aline Muyle, CEFE Montpellier
June 2023

# <span style="color:#4dd98b"> Alingment and quantification

In [5]:
! cat scripts/count.sh

cd raw/sRNA_MGX/trimmed/
for file in *.fastq
do
echo "working with $file"
perl -e ' $count=0; $len=0; while(<>) { s/\r?\n//; s/\t/ /g; if (s/^@//) { if ($. != 1) { print "\n" } s/ |$/\t/; $count++; $_ .= "\t"; } else { s/ //g; $len += length($_) } print $_; } print "\n"; ' $file | sed -E 's/^.+\t(\w+)\+.*$/\1/g' | perl -e ' $col=0; while (<>) { s/\r?\n//; @F = split /\t/, $_; $len = length($F[$col]); print "$_\t$len\n" }; ' | awk '$2 ~ /(21|22|24)/ ' > ${file}_count.tsv
done


In [6]:
! bash scripts/count.sh

working with F1B_final_trimming.fastq
working with F1L_final_trimming.fastq
working with F2B_final_trimming.fastq
working with F2L_final_trimming.fastq
working with F3B_final_trimming.fastq
working with F3L_final_trimming.fastq
working with M1B_final_trimming.fastq
working with M1L_final_trimming.fastq
working with M2B_final_trimming.fastq
working with M2L_final_trimming.fastq
working with M4B_final_trimming.fastq
working with M4L_final_trimming.fastq


### <span style="color:#4dd98b"> Alingment, females without Y

In [2]:
# Remove Y chromosome from genome and map Females
! grep ">" genome/genome3.fa | grep -v "scaffold" | sed 's/>//' > genome/no_y.list
! seqtk subseq genome/genome3.fa genome/no_y.list > genome/no_y.fa
! grep ">" genome/no_y.fa

>chr1
>chr2
>chr3
>chr4
>chr5
>chr6
>chr7
>chr8
>chr9
>chr10
>chr11
>chr12


In [8]:
# create a list file of the bams
! ls females_no_y/F*.bam > bam.list
! ls males/M*.bam >> bam.list

### <span style="color:#4dd98b">Merge females (without y) and males (with y) (Inside the conda ShortStack4 environment so it works)

samtools merge -r -@ 12 -f -o merged.bam -b bam.list
    
#### <span style="color:#4dd98b"> First, PTGS (21-22 nt), asking for _de novo_ and template-based miRNA annotation
    
ShortStack --genomefile genome/genome3.fa --bamfile merged.bam --threads 8 --outdir only_21-22_known_de_novo_f-y --dicermax 22 --mmap u --dn_mirna --knownRNAs ../raw/caryophyllaceae_mirnas.fa 

#### <span style="color:#4dd98b"> Then, RDdM (24 nt), no miRNA identification
    
ShortStack --genomefile genome/genome3.fa --bamfile merged.bam --threads 8 --outdir only_24_f-y --dicermin 23 --mmap u 

### <span style="color:#4dd98b"> Depth Flowers

In [1]:
# get bam for each lenght and individual
! rm -r raw/bams_per_length/
! mkdir raw/bams_per_length

# first 21-22, females
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F1B_ptgs.bam females_no_y/F1B_dicer.bam 
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F2B_ptgs.bam females_no_y/F2B_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F3B_ptgs.bam females_no_y/F3B_dicer.bam 

# then 24, females
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F1B_rddm.bam females_no_y/F1B_dicer.bam 
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F2B_rddm.bam females_no_y/F2B_dicer.bam
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F3B_rddm.bam females_no_y/F3B_dicer.bam 

# first 21-22, males
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M1B_ptgs.bam males/M1B_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M2B_ptgs.bam males/M2B_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M4B_ptgs.bam males/M4B_dicer.bam

# then 24, males
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M1B_rddm.bam males/M1B_dicer.bam 
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M2B_rddm.bam males/M2B_dicer.bam
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M4B_rddm.bam males/M4B_dicer.bam

In [2]:
# calculate depth for each individual, only for flower buds and convert it to bed in flowers
# females
! samtools depth -@ 12 raw/bams_per_length/F*B_ptgs.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/females_flowers_ptgs.bed
! samtools depth -@ 12 raw/bams_per_length/F*B_rddm.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/females_flowers_rddm.bed
# males
! samtools depth -@ 12 raw/bams_per_length/M*B_ptgs.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/males_flowers_ptgs.bed
! samtools depth -@ 12 raw/bams_per_length/M*B_rddm.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/males_flowers_rddm.bed

In [5]:
# get the number of mapped reads for the normalization 
! echo "____________________ FEMALES FLOWERS_____________________"
! samtools view -c -F 260 -@ 12 females_no_y/F1B_dicer.bam
! samtools view -c -F 260 -@ 12 females_no_y/F2B_dicer.bam 
! samtools view -c -F 260 -@ 12 females_no_y/F3B_dicer.bam 
! echo "____________________ MALES FLOWERS_____________________"
! samtools view -c -F 260 -@ 12 males/M1B_dicer.bam
! samtools view -c -F 260 -@ 12 males/M2B_dicer.bam
! samtools view -c -F 260 -@ 12 males/M4B_dicer.bam

____________________ FEMALES FLOWERS_____________________
27241167
29660632
25216360
____________________ MALES FLOWERS_____________________
13200497
23914694
23578008


In [None]:
# Get depths for each gene
# 21/22

# Females
! bedtools intersect -a raw/depth/females_flowers_ptgs.bed -b annotation/mrnas.bed -wb | cut -f 1-6,10 > raw/depth/flowers_females_gene_depth_ptgs.tsv
# Males
! bedtools intersect -a raw/depth/males_flowers_ptgs.bed -b annotation/mrnas.bed -wb | cut -f 1-6,10 > raw/depth/flowers_males_gene_depth_ptgs.tsv

# 24

# Add promoter
! bedtools slop -i annotation/mrnas.bed -g chromSizes.txt -l 200 -r 0 -s > annotation/mrnas_plus_promoter.bed

# Females
! bedtools intersect -a raw/depth/females_flowers_rddm.bed -b annotation/mrnas_plus_promoter.bed -wb | cut -f 1-6,10 > raw/depth/flowers_females_gene_depth_rddm.tsv
# Males
! bedtools intersect -a raw/depth/males_flowers_rddm.bed -b annotation/mrnas_plus_promoter.bed -wb | cut -f 1-6,10 > raw/depth/flowers_males_gene_depth_rddm.tsv

### <span style="color:#4dd98b"> Depth Leaves

In [11]:
# first 21-22, females
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F1L_ptgs.bam females_no_y/F1L_dicer.bam 
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F2L_ptgs.bam females_no_y/F2L_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F3L_ptgs.bam females_no_y/F3L_dicer.bam 

# then 24, females
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F1L_rddm.bam females_no_y/F1L_dicer.bam 
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F2L_rddm.bam females_no_y/F2L_dicer.bam
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F3L_rddm.bam females_no_y/F3L_dicer.bam 

# first 21-22, males
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M1L_ptgs.bam males/M1L_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M2L_ptgs.bam males/M2L_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M4L_ptgs.bam males/M4L_dicer.bam

# then 24, males
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M1L_rddm.bam males/M1L_dicer.bam 
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M2L_rddm.bam males/M2L_dicer.bam
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M4L_rddm.bam males/M4L_dicer.bam

In [12]:
# Calculate depth for leaves

# females
! samtools depth -@ 12 raw/bams_per_length/F*L_ptgs.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/females_leaves_ptgs.bed
! samtools depth -@ 12 raw/bams_per_length/F*L_rddm.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/females_leaves_rddm.bed
# males
! samtools depth -@ 12 raw/bams_per_length/M*L_ptgs.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/males_leaves_ptgs.bed
! samtools depth -@ 12 raw/bams_per_length/M*L_rddm.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/males_leaves_rddm.bed

In [5]:
# get the number of mapped reads for the normalization 
! echo "____________________ FEMALES LEAVES_____________________"
! samtools view -c -F 260 -@ 12 females_no_y/F1L_dicer.bam
! samtools view -c -F 260 -@ 12 females_no_y/F2L_dicer.bam 
! samtools view -c -F 260 -@ 12 females_no_y/F3L_dicer.bam 
! echo "____________________ MALES LEAVES_____________________"
! samtools view -c -F 260 -@ 12 males/M1L_dicer.bam
! samtools view -c -F 260 -@ 12 males/M2L_dicer.bam
! samtools view -c -F 260 -@ 12 males/M4L_dicer.bam

____________________ FEMALES LEAVES_____________________
24695806
31425418
21443081
____________________ MALES LEAVES_____________________
20502566
17763390
26371650


In [6]:
# Get depths for each gene
# 21/22

# Females
! bedtools intersect -a raw/depth/females_leaves_ptgs.bed -b annotation/mrnas.bed -wb | cut -f 1-6,10 > raw/depth/leaves_females_gene_depth_ptgs.tsv
# Males
! bedtools intersect -a raw/depth/males_leaves_ptgs.bed -b annotation/mrnas.bed -wb | cut -f 1-6,10 > raw/depth/leaves_males_gene_depth_ptgs.tsv

# 24 (including promoter)

# Females
! bedtools intersect -a raw/depth/females_leaves_rddm.bed -b annotation/mrnas_plus_promoter.bed -wb | cut -f 1-6,10 > raw/depth/leaves_females_gene_depth_rddm.tsv
# Males
! bedtools intersect -a raw/depth/males_leaves_rddm.bed -b annotation/mrnas_plus_promoter.bed -wb | cut -f 1-6,10 > raw/depth/leaves_males_gene_depth_rddm.tsv

scaffold_1	44590435	44591544	Silat_scaffold_1_Gene.13294	0	-	sex_biased_leaves_sbge	male-biased	Sex-biased genes in leaves

scaffold_1	44590435	44591544	Silat_scaffold_1_Gene.13294	0	-	sex_biased_leaves_sbge	male-biased	Sex-biased genes in leaves

scaffold_1	44590435	44591544	Silat_scaffold_1_Gene.13294	0	-	sex_biased_leaves_sbge	male-biased	Sex-biased genes in leaves

scaffold_1	44590435	44591544	Silat_scaffold_1_Gene.13294	0	-	sex_biased_leaves_sbge	male-biased	Sex-biased genes in leaves



# <span style="color:#3de2d8"> RNA-seq analysis

In [5]:
# Build a Kallisto index
! kallisto index -i annotation/mrnas.idx annotation/mrnas.fa


[build] loading fasta file annotation/mrnas.fa
[build] k-mer length: 31
        from 27 target sequences
        with pseudorandom nucleotides
[build] counting k-mers ... done.
[build] building target de Bruijn graph ...  done 
[build] creating equivalence classes ...  done
[build] target de Bruijn graph has 454447 contigs and contains 50244895 k-mers 



In [12]:
! bash ../scripts/kallisto_paper.sh

######################### WORKING WITH C1_01_B ################################3

[quant] fragment length distribution will be estimated from the data
[index] k-mer length: 31
[index] number of targets: 35,459
[index] number of k-mers: 50,244,895
[index] number of equivalence classes: 116,882
[quant] running in paired-end mode
[quant] will process pair 1: ../../rna-seq/trimmed/C1_01_B_1.trimmed.fastq
                             ../../rna-seq/trimmed/C1_01_B_2.trimmed.fastq
[quant] finding pseudoalignments for the reads ... done
[quant] processed 26,007,298 reads, 20,035,547 reads pseudoaligned
[quant] estimated average fragment length: 178.205
[   em] quantifying the abundances ... done
[   em] the Expectation-Maximization algorithm ran for 688 rounds
[bstrp] number of EM bootstraps complete: 100012

######################### WORKING WITH C1_01_L ################################3

[quant] fragment length distribution will be estimated from the data
[index] k-mer length: 31
[index] num

In [3]:
# Merge results
! ls rna_seq/kallisto/ | head -n 16 > rna_seq/kallisto/samples.list
# first I manually changed the dir names so it was less messy (move "combined" from the start to the end of the name)
! bash ../scripts/merge_kallisto_raw_paper.sh 

C1_01_B RUNNING
C1_01_B DONE
_________
C1_01_L RUNNING
C1_01_L DONE
_________
C1_03_B RUNNING
C1_03_B DONE
_________
C1_03_L RUNNING
C1_03_L DONE
_________
C1_04_B_combined RUNNING
C1_04_B_combined DONE
_________
C1_04_L RUNNING
C1_04_L DONE
_________
C1_05_B_combined RUNNING
C1_05_B_combined DONE
_________
C1_05_L RUNNING
C1_05_L DONE
_________
C1_26_B RUNNING
C1_26_B DONE
_________
C1_26_L RUNNING
C1_26_L DONE
_________
C1_27_B RUNNING
C1_27_B DONE
_________
C1_27_L RUNNING
C1_27_L DONE
_________
C1_29_B_combined RUNNING
C1_29_B_combined DONE
_________
C1_29_L RUNNING
C1_29_L DONE
_________
C1_34_B_combined RUNNING
C1_34_B_combined DONE
_________
C1_34_L RUNNING
C1_34_L DONE
_________


# <span style="color:#3de2d8"> PTGS

In [34]:
# Get fasta sequences from the overlapping clusters

#! awk 'FNR==NR{a[$0];next}{if (($2 in a)){print}}' <(cut -f 2 results/overlaps/flowers.tsv | sort | uniq) raw/only_21-22_known_de_novo_f-y/Results.txt | awk '{print $2 "\t" "21-22" "\t" $11 "NNNN"; system("echo " $11 "| rev")}' > temp.tab
# I mannually removed the line breaks that were introduced with awk

! awk 'FNR==NR{a[$0];next}{if (($2 in a)){print}}' <(cut -f 2 results/overlaps/flowers.tsv | sort | uniq) raw/only_21-22_known_de_novo_f-y/Results.txt | awk '{print $2 "\t" "21-22" "\t" $11}' > temp.tab

! perl -e ' $len=0; while(<>) { s/\r?\n//; @F=split /\t/, $_; print ">$F[0]"; if (length($F[1])) { print " $F[1]" } print "\n"; $s=$F[2]; $len+= length($s); $s=~s/.{60}(?=.)/$&\n/g; print "$s\n"; } warn "\nConverted $. tab-delimited lines to FASTA format\nTotal sequence length: $len\n\n"; ' temp.tab > results/overlaps/srnas_flowers.fa

! rm temp.tab


Converted 12 tab-delimited lines to FASTA format
Total sequence length: 275



In [35]:
# Now for leaves

! awk 'FNR==NR{a[$0];next}{if (($2 in a)){print}}' <(cut -f 2 results/overlaps/leaves.tsv | sort | uniq) raw/only_21-22_known_de_novo_f-y/Results.txt | awk '{print $2 "\t" "21-22" "\t" $11}' > temp.tab
! perl -e ' $len=0; while(<>) { s/\r?\n//; @F=split /\t/, $_; print ">$F[0]"; if (length($F[1])) { print " $F[1]" } print "\n"; $s=$F[2]; $len+= length($s); $s=~s/.{60}(?=.)/$&\n/g; print "$s\n"; } warn "\nConverted $. tab-delimited lines to FASTA format\nTotal sequence length: $len\n\n"; ' temp.tab > results/overlaps/srnas_leaves.fa

! rm temp.tab


Converted 6 tab-delimited lines to FASTA format
Total sequence length: 141



In [90]:
# index genome for blast
! makeblastdb -in genome/silat.fa -dbtype nucl



Building a new DB, current time: 05/11/2023 16:55:31
New DB name:   /home/eddy/silene/genome/silat.fa
New DB title:  genome/silat.fa
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 19 sequences in 19.2123 seconds.




In [22]:
# Short BLAST against the genome, try 1
! blastn -db genome/silat.fa -query results/overlaps/srnas_hairpins_flowers.fa -num_threads 12 -task blastn-short -word_size 5 -gapopen 1 -gapextend 1 -out results/overlaps/srnas_hairpins_flowers.blast

#### <span style="color:#3de2d8">  Get inverted repeats from the genome (inside tes environment). 
einverted -sequence genome/silat.fa -threshold 80 -gap 8 -match 4 -mismatch -6 -maxrepeat 100 -outseq annotation/inv_rep.fa -outfile annotation/inv_rep.san
    
    Parameters used: [default]
    
    -gap penalty of 8 [12]
    -match score 4 [3]
    -mismatch score -6 [-4]
    -minimum score threshold 80 [50] (I guess it's calculated from the identity and the number of gaps -not documented)
    -maximum lenght from start to finish 100 (assuming at ~40bp in the terminal loop)

In [None]:
To create a bed file including the information about the repeat:
    
    - Chr
    - Start
    - End
    - ID
    - Score
    - Strand
    - Matching baisepairs be
    - Repeat length
    - Mismatches
    - Number of gaps
    - Total mismatches (gaps + mismatches)

In [None]:
paste \
<(grep "Score" inv_rep.san | sed -E 's/^(\w+):.+/\1/g' ) \
<(sed -n '/Score/ { n; p }' inv_rep.san | sed -E 's/\s*(\w+)\s*.+/\1/g' ) \
<(sed -n '/Score/ { n; n; n; p }' inv_rep.san | sed -E 's/\s*(\w+)\s*.+/\1/g' ) \
<(! echo inverted_repeat_{1..187797} | sed -E 's/\s/\n/g' | awk '{print $0 "\t" "0" "\t" "+"}') \
<(grep "Score" inv_rep.san | sed -E 's/.+: (\w+)\/(\w+).+, (\w+) ga.+/\1\t\2\t\3/g' | awk '{print $1 "\t" $2 "\t" $2-$1 "\t" $3 "\t" $2-$1+$3 }' ) \
  > inv_rep.bed

In [28]:
# Get fasta files for inverted repeats
! bedtools getfasta -nameOnly -fi genome/silat.fa -bed annotation/inv_rep.bed -fo annotation/inv_rep_full.fa
# make it a BLAST database
! makeblastdb -in annotation/inv_rep_full.fa -dbtype nucl



Building a new DB, current time: 05/30/2023 11:30:59
New DB name:   /home/eddy/silene/annotation/inv_rep_full.fa
New DB title:  annotation/inv_rep_full.fa
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 187797 sequences in 1.03208 seconds.




In [108]:
# BLAST sRNAs against the inverted repeat library
# alingment above 15 bp
# reduced penalty for mismatches
# reduced gap penalty

# Flowers
! blastn -db annotation/inv_rep_full.fa -query results/overlaps/srnas_flowers.fa -num_threads 12 -task blastn-short -word_size 15 -penalty -1 -gapopen 1 -outfmt 6 -out results/overlaps/srnas_flowers.blast 
# Leaves
! blastn -db annotation/inv_rep_full.fa -query results/overlaps/srnas_leaves.fa -num_threads 12 -task blastn-short -word_size 15 -penalty -1 -gapopen 1 -outfmt 6 -out results/overlaps/srnas_leaves.blast

In [109]:
# I got the reverse complements from <https://www.bioinformatics.org/sms2/rev_comp.html> and edited them manually
# Flowers
! blastn -db annotation/inv_rep_full.fa -query results/overlaps/srnas_flowers_reverse_comp.fa -num_threads 12 -task blastn-short -word_size 15 -penalty -1 -gapopen 1 -outfmt 6 -out results/overlaps/srnas_flowers_reverse_comp.blast
# Leaves
! blastn -db annotation/inv_rep_full.fa -query results/overlaps/srnas_leaves_reverse_comp.fa -num_threads 12 -task blastn-short -word_size 15 -penalty -1 -gapopen 1 -outfmt 6 -out results/overlaps/srnas_leaves_reverse_comp.blast

I was planning to select only the inverted repeats that aling to both the sRNA and its reverse complement but since it is already an inverted repeat, tables are basically the same. I only needed to select alingments >21 bp.

In [125]:
# Get the correspondences and annotations for the precursos

# Flowers
! join -1 4 -2 3 <(cut -f 1-4 annotation/inv_rep.bed | sort -k4) <(join -1 2 -2 1 results/overlaps/flowers.tsv <(awk '$3 >= 21' results/overlaps/srnas_flowers.blast | cut -f 1-2 ) )
! join -1 4 -2 3 <(cut -f 1-4 annotation/inv_rep.bed | sort -k4) <(join -1 2 -2 1 results/overlaps/flowers.tsv <(awk '$3 >= 21' results/overlaps/srnas_flowers.blast | cut -f 1-2 ) ) | sed -E 's/\s+/\t/g' > results/overlaps/flowers_precursos.tsv
! echo "_____________________________________________________________________"

# Leaves
! join -1 4 -2 3 <(cut -f 1-4 annotation/inv_rep.bed | sort -k4) <(join -1 2 -2 1 results/overlaps/leaves.tsv <(awk '$3 >= 21' results/overlaps/srnas_leaves.blast | cut -f 1-2 ) | sort -k3 )
! join -1 4 -2 3 <(cut -f 1-4 annotation/inv_rep.bed | sort -k4) <(join -1 2 -2 1 results/overlaps/leaves.tsv <(awk '$3 >= 21' results/overlaps/srnas_leaves.blast | cut -f 1-2 ) | sort -k3 ) | sed -E 's/\s+/\t/g' > results/overlaps/leaves_precursos.tsv

inverted_repeat_82099 chr9 171650675 171650772 Cluster_43988 Silat_chr12_Gene.1440
_____________________________________________________________________
inverted_repeat_100834 chr11 26842100 26842194 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_107555 chr11 133984933 133985003 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_115770 chr12 4887030 4887128 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_115770 chr12 4887030 4887128 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_12972 chr1 191921461 191921554 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_13134 chr1 194653142 194653237 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_13320 chr1 198785892 198785990 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_134649 chr12 276687495 276687593 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_13516 chr2 3592422 3592509 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_137010 chr12 319332388 31

The "precursos.tsv" files contain:

    - precursor ID
    - chr
    - start
    - end
    - sRNA cluster
    - sex-biased gene target

In [126]:
! makeblastdb -in annotation/genes.fa -dbtype nucl



Building a new DB, current time: 05/31/2023 18:09:34
New DB name:   /home/eddy/silene/annotation/genes.fa
New DB title:  annotation/genes.fa
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 13071 sequences in 1.10136 seconds.




In [134]:
# check if the flowers' precursor is overlapping to a gene or a TE
! bedtools intersect -a <(cut -f 2-4 results/overlaps/flowers_precursos.tsv) -b annotation/genes.bed -wb
! bedtools intersect -a <(cut -f 2-4 results/overlaps/flowers_precursos.tsv) -b annotation/tes.bed -wb

scaffold_1	760283	762422	Silat_scaffold_1_Gene.7463	0	-

scaffold_1	760283	762422	Silat_scaffold_1_Gene.7463	0	-

scaffold_1	1	38	Target "Motif:RLG_Athila_108210_FL_DL" 1 27

scaffold_1	1	38	Target "Motif:RLG_Athila_108210_FL_DL" 1 27



In [141]:
# check if the leaves' precursor actually comes from TEs
! bedtools intersect -a <(cut -f 2-4 results/overlaps/leaves_precursos.tsv) -b annotation/tes.bed -wb
! echo "_____________________________________________________________________"
# Now to TEsorter annotation
! bedtools intersect -a <(cut -f 2-4 results/overlaps/leaves_precursos.tsv) -b tesorter_genome/tes_genome.dom.bed -wb

scaffold_1	1	38	Target "Motif:RLG_Athila_108210_FL_DL" 1 27

chr11	133984933	133984954	chr11	133984897	133984954	Target "Motif:RLG_Retand_116_partial_D" 25880 25938
chr11	133984958	133985003	chr11	133984958	133985015	Target "Motif:RLG_Retand_116_partial_D" 43913 43967
chr11	133984970	133985003	chr11	133984970	133985189	Target "Motif:RLG_Retand_116_partial_D" 43919 44138
chr1	198785892	198785972	chr1	198785886	198785972	Target "Motif:RLG_Retand_116_partial_D" 37495 37580
chr2	3592427	3592503	chr2	3592427	3592503	Target "Motif:RLG_Retand_28930_FL_DL" 967 1048
chr2	3592439	3592508	chr2	3592439	3592508	Target "Motif:RLG_Retand_33319_FL_DL" 3636 3710
chr12	319495018	319495050	chr12	319494856	319495050	Target "Motif:RLG_Retand_13569_FL_DL" 296 488
chr12	319495081	319495113	chr12	319495081	319495245	Target "Motif:RLG_Retand_13569_FL_DL" 296 462
chr4	8272012	8272102	chr4	8272012	8272103	Target "Motif:RLG_Retand_28930_FL_DL" 958 1053
chr4	8272343	8272412	chr4	8272337	8272412	Target "Motif:RLG_R

In [128]:
# get the gene sequences of the genes that may be under PTGS to check if they are sex specific
! perl -e ' ($id,$fasta)=@ARGV; open(ID,$id); while (<ID>) { s/\r?\n//; /^>?(\S+)/; $ids{$1}++; } $num_ids = keys %ids; open(F, $fasta); $s_read = $s_wrote = $print_it = 0; while (<F>) { if (/^>(\S+)/) { $s_read++; if ($ids{$1}) { $s_wrote++; $print_it = 1; delete $ids{$1} } else { $print_it = 0 } }; if ($print_it) { print $_ } }; END { warn "Searched $s_read FASTA records.\nFound $s_wrote IDs out of $num_ids in the ID list.\n" } ' results/overlaps/genes.list annotation/genes.fa > results/overlaps/genes.fa

Searched 13071 FASTA records.
Found 2 IDs out of 2 in the ID list.


In [142]:
# Find similar sequences to the genes that have a sRNA with identified precursor
! blastn -db annotation/genes.fa -query results/overlaps/genes.fa -num_threads 12 -outfmt 6 -out results/overlaps/genes.blast

In [None]:
! RNAfold < annotation/inv_rep_full.fa −−outfile[=<filename>]


[36mInput string (upper or lower case); @ to quit[0m
[1m....,....1....,....2....,....3....,....4....,....5....,....6....,....7....,....8[0m


# <span style="color:#f0b27a"> RdDM

In [25]:
# Add -+ 5000bp to genes
! bedtools slop -i results/differential_expression/sex_biased_flowers_sbge.bed -g chromSizes.txt -b 2000 -s >  results/rddm/sb_long_genes.bed
! bedtools slop -i annotation/full_genes.bed -g chromSizes.txt -b 2000 -s >  annotation/long_genes.bed

In [24]:
# get overlaps

# flowers
! bedtools intersect -nonamecheck -a <(cut -f 1-4 results/differential_expression/sex_biased_flowers_rddm.bed) -b <(cut -f 1-4 results/rddm/sb_long_genes.bed) -wb | cut -f 4,8
! bedtools intersect -nonamecheck -a <(cut -f 1-4 results/differential_expression/sex_biased_flowers_rddm.bed) -b <(cut -f 1-4 results/rddm/sb_long_genes.bed) -wb | cut -f 4,8 > results/rddm/flowers.tsv
! echo "__________________________________________________________"

# leaves
# flowers
! bedtools intersect -nonamecheck -a <(cut -f 1-4 results/differential_expression/sex_biased_leaves_rddm.bed) -b <(cut -f 1-4 results/rddm/sb_long_genes.bed) -wb | cut -f 4,8
! bedtools intersect -nonamecheck -a <(cut -f 1-4 results/differential_expression/sex_biased_leaves_rddm.bed) -b <(cut -f 1-4 results/rddm/sb_long_genes.bed) -wb | cut -f 4,8 > results/rddm/leaves.tsv
! echo "__________________________________________________________"

Cluster_2028_RdDM	Silat_chr1_Gene.4771
Cluster_9316_RdDM	Silat_chr2_Gene.30622
Cluster_10374_RdDM	Silat_chr3_Gene.37746
Cluster_12819_RdDM	Silat_chr3_Gene.42197
Cluster_18293_RdDM	Silat_chr5_Gene.5379
Cluster_23734_RdDM	Silat_chr7_Gene.21503
Cluster_23735_RdDM	Silat_chr7_Gene.21503
Cluster_29947_RdDM	Silat_chr9_Gene.17911
Cluster_30499_RdDM	Silat_chr9_Gene.46125
Cluster_37902_RdDM	Silat_chr11_Gene.15485
Cluster_40701_RdDM	Silat_chr12_Gene.49048
Cluster_40705_RdDM	Silat_chr12_Gene.49048
Cluster_40829_RdDM	Silat_chr12_Gene.45298
Cluster_41057_RdDM	Silat_chr12_Gene.46166
Cluster_41091_RdDM	Silat_chr12_Gene.14608
Cluster_41602_RdDM	Silat_chr12_Gene.32908
Cluster_42879_RdDM	Silat_chr12_Gene.26394
Cluster_43013_RdDM	Silat_chr12_Gene.25578
Cluster_43985_RdDM	Silat_chr12_Gene.1440
Cluster_44141_RdDM	Silat_chr12_Gene.8361
Cluster_44454_RdDM	Silat_scaffold_1_Gene.21140
Cluster_44455_RdDM	Silat_scaffold_1_Gene.21140
Cluster_45098_RdDM	Silat_scaffold_1_Gene.33918
Cluster_45154_RdDM	Silat_scaffold_

In [29]:
# create bed of unbiased genes
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(cut -f 4 results/differential_expression/sex_biased_flowers_sbge.bed ) annotation/long_genes.bed | cut -f 1-4 > unbiased.genes.flowers

# create RdDM bed file
! tail -n +2 raw/only_24_f-y/Results.txt | awk '{print $3 "\t" $4 "\t" $5 "\t" $2 "_RdDM" }' > results/rddm/rddm.bed

# create list of unbiased clusters
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(cut -f 4 results/differential_expression/sex_biased_flowers_rddm.bed ) results/rddm/rddm.bed | cut -f 1-4 > unbiased.clusters.flowers
 
# calculations
! echo "Female-biased Genes with overlapping Female-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b <(grep "female-biased" results/differential_expression/sex_biased_flowers_rddm.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "Male-biased Genes with overlapping Female-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep -v "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b <(grep "female-biased" results/differential_expression/sex_biased_flowers_rddm.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "Female-biased Genes with overlapping Male-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b <(grep -v "female-biased" results/differential_expression/sex_biased_flowers_rddm.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "Male-biased Genes with overlapping Male-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep -v "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b <(grep -v "female-biased" results/differential_expression/sex_biased_flowers_rddm.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

# Unbiased genes
! echo "______________________________________"
! echo "UN-biased Genes with overlapping Female-biased sRNAS"
! bedtools intersect -nonamecheck -a unbiased.genes.flowers -b <(grep "female-biased" results/differential_expression/sex_biased_flowers_rddm.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "UN-biased Genes with overlapping Male-biased sRNAS"
! bedtools intersect -nonamecheck -a unbiased.genes.flowers -b <(grep -v "female-biased" results/differential_expression/sex_biased_flowers_rddm.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

# Unbiased sRNAs
! echo "Female-biased Genes with overlapping UN-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b unbiased.clusters.flowers | cut -f 4 | sort | uniq | wc -l

! echo "Male-biased Genes with overlapping UN-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep -v "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b unbiased.clusters.flowers | cut -f 4 | sort | uniq | wc -l

# Both unbiased
! echo "UN-biased Genes with overlapping UN-biased sRNAS"
! bedtools intersect -nonamecheck -a unbiased.genes.flowers -b unbiased.clusters.flowers | cut -f 4 | sort | uniq | wc -l

# Genes without overlappin sRNAs
! echo "______________________________________"
! echo "Female-biased Genes WITHOUT overlapping sRNAS"
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(bedtools intersect -nonamecheck -a <(grep "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b results/rddm/rddm.bed | cut -f 4 | sort | uniq ) <(grep "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "Male-biased Genes WITHOUT overlapping sRNAS"
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(bedtools intersect -nonamecheck -a <(grep -v "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b results/rddm/rddm.bed | cut -f 4 | sort | uniq ) <(grep -v "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

# No overlap and no sex bias
! echo "______________________________________"
! echo "UN-biased Genes WITHOUT overlapping sRNAS"
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(bedtools intersect -nonamecheck -a unbiased.genes.flowers -b results/rddm/rddm.bed | cut -f 4 | sort | uniq ) unbiased.genes.flowers | cut -f 4 | sort | uniq | wc -l

! rm unbiased*

Female-biased Genes with overlapping Female-biased sRNAS
5
Male-biased Genes with overlapping Female-biased sRNAS
6
Female-biased Genes with overlapping Male-biased sRNAS
1
Male-biased Genes with overlapping Male-biased sRNAS
10
______________________________________
UN-biased Genes with overlapping Female-biased sRNAS
52
UN-biased Genes with overlapping Male-biased sRNAS
39
Female-biased Genes with overlapping UN-biased sRNAS
398
Male-biased Genes with overlapping UN-biased sRNAS
803
UN-biased Genes with overlapping UN-biased sRNAS
7030
______________________________________
Female-biased Genes WITHOUT overlapping sRNAS
165
Male-biased Genes WITHOUT overlapping sRNAS
365
______________________________________
UN-biased Genes WITHOUT overlapping sRNAS
4254
