![image](silene.jpeg)

# sRNA profiles of flower dimorphism in _Silene latifolia_

#### Eddy J. Mendoza-Galindo
#### Advisor: Aline Muyle, CEFE Montpellier
April 2023

# <span style="color:#4dd98b"> Alingment and quantification

#### <span style="color:#4dd98b"> Exploration of sRNA abundance based on length and species, only for 21,22, 24

In [5]:
! cat scripts/count.sh

cd raw/sRNA_MGX/trimmed/
for file in *.fastq
do
echo "working with $file"
perl -e ' $count=0; $len=0; while(<>) { s/\r?\n//; s/\t/ /g; if (s/^@//) { if ($. != 1) { print "\n" } s/ |$/\t/; $count++; $_ .= "\t"; } else { s/ //g; $len += length($_) } print $_; } print "\n"; ' $file | sed -E 's/^.+\t(\w+)\+.*$/\1/g' | perl -e ' $col=0; while (<>) { s/\r?\n//; @F = split /\t/, $_; $len = length($F[$col]); print "$_\t$len\n" }; ' | awk '$2 ~ /(21|22|24)/ ' > ${file}_count.tsv
done


In [6]:
! bash scripts/count.sh

working with F1B_final_trimming.fastq
working with F1L_final_trimming.fastq
working with F2B_final_trimming.fastq
working with F2L_final_trimming.fastq
working with F3B_final_trimming.fastq
working with F3L_final_trimming.fastq
working with M1B_final_trimming.fastq
working with M1L_final_trimming.fastq
working with M2B_final_trimming.fastq
working with M2L_final_trimming.fastq
working with M4B_final_trimming.fastq
working with M4L_final_trimming.fastq


#### <span style="color:#4dd98b"> Select reads for 21, 22 and 24 nt long

In [1]:
! cat scripts/filter_size.sh

cd raw/
rm -r fastq/
mkdir fastq

out=fastq
files=sRNA_MGX/trimmed/*.fastq


for file in $files
do

name=$(echo $file | sed -E 's/^sR.*ed\/(\w+)_f.*/\1/g')

echo "WORKING WITH $name"

# Select reads of 21, 22 and 24 in length
seqtk comp $file | awk '$2 == 21' | cut -f 1 > 21.list
seqtk subseq $file 21.list > $out/${name}_21.fq

seqtk comp $file | awk '$2 == 22' | cut -f 1 > 22.list
seqtk subseq $file 22.list > $out/${name}_22.fq

seqtk comp $file | awk '$2 == 24' | cut -f 1 > 24.list
seqtk subseq $file 24.list > $out/${name}_24.fq

rm 21.list 22.list 24.list
cat ${name}_21.fq ${name}_22.fq ${name}_24.fq > ${name}_dicer.fq 
rm ${name}_21.fq ${name}_22.fq ${name}_24.fq 

done

In [4]:
! bash scripts/filter_size.sh

WORKING WITH F1B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F1L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F2B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F2L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F3B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F3L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M1B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M1L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M2B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M2L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M4B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M4L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done


In [6]:
#check lengths after filtering
! seqtk comp raw/fastq/M2B_dicer.fq | cut -f 2 | sort | uniq 
! seqtk comp raw/fastq/F1L_dicer.fq | cut -f 2 | sort | uniq 

21
22
24
21
22
24


#### <span style="color:#4dd98b"> ShortStack

We followed the Shortstack workflow,
Only uniquely-aligned reads are used as weights for placement of multi-mapped reads.

conda activate ShortStack4 # ShortStack only works under its environment

ShortStack --genomefile ../genome/silat.fa --readfile fastq/*.fq --threads 8 --knownRNAs caryophyllaceae_mirnas.fa --mmap u

In [26]:
# Remove Y chromosome from genome and map Females
! grep ">" genome/silat.fa | grep -v "scaffold" | sed 's/>//' > genome/no_y.list
! seqtk subseq genome/silat.fa genome/no_y.list > genome/no_y.fa
! grep ">" genome/no_y.fa

>chr1
>chr2
>chr3
>chr4
>chr5
>chr6
>chr7
>chr8
>chr9
>chr10
>chr11
>chr12


### <span style="color:#4dd98b"> Aling females to the genome without Y (ran inside raw/ and under the conda envirnonment of ShortStack$)
ShortStack --genomefile ../genome/no_y.fa --readfile fastq/F* --threads 8 --align_only --outdir females_no_y

### <span style="color:#4dd98b">Merge females (without y) and males (with y) (Inside SS environment so it works)

samtools merge -r -@ 12 -f -o raw/merged_females-y.bam -b raw/bam.list
    
#### <span style="color:#4dd98b"> First, PTGS (21-22 nt), asking for _de novo_ and template-based miRNA annotation
    
ShortStack --genomefile ../genome/silat.fa --bamfile merged_females-y.bam --threads 8 --outdir only_21-22_known_de_novo_f-y_try5 --dicermax 22 --mmap u --dn_mirna --knownRNAs caryophyllaceae_mirnas.fa 

#### <span style="color:#4dd98b"> Then, RDdM (24 nt), no miRNA identification
    
ShortStack --genomefile ../genome/silat.fa --bamfile merged_females-y.bam --threads 8 --outdir only_24_f-y --dicermin 23 --mmap u 

### <span style="color:#4dd98b"> Depth quantification

In [None]:
! bash scripts/mapping_depth.sh # Output is very heavy

### <span style="color:#4dd98b"> Circos plot

In [3]:
! bedtools makewindows -g chromSizes.txt -w 1000000 > annotation/1mb_windows.bed # create windows

In [26]:
# make merged aligments for each sex
! samtools merge -r -@ 12 -f -o raw/female_flower.bam raw/females_no_y/*B_dicer.bam 
! samtools merge -r -@ 12 -f -o raw/male_flower.bam raw/ShortStack_results/M*B_dicer.bam

[bam_translate] RG tag "F2B_dicer" on read "A00924:314:H5CTHDRX2:2:2260:8621:30953" encountered with no corresponding entry in header, tag lost. Unknown tags are only reported once per input file for each tag ID.
[bam_translate] RG tag "F3B_dicer" on read "A00924:314:H5CTHDRX2:2:2220:25192:21793" encountered with no corresponding entry in header, tag lost. Unknown tags are only reported once per input file for each tag ID.


In [22]:
# get bam for each lenght and individual
! rm -r raw/bams_per_length/
! mkdir raw/bams_per_length

# first 21-22, females
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F1B_ptgs.bam raw/females_no_y/F1B_dicer.bam 
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F2B_ptgs.bam raw/females_no_y/F2B_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F3B_ptgs.bam raw/females_no_y/F3B_dicer.bam 

# then 24, females
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F1B_rddm.bam raw/females_no_y/F1B_dicer.bam 
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F2B_rddm.bam raw/females_no_y/F2B_dicer.bam
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F3B_rddm.bam raw/females_no_y/F3B_dicer.bam 

# first 21-22, males
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M1B_ptgs.bam raw/ShortStack_results/M1B_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M2B_ptgs.bam raw/ShortStack_results/M2B_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M4B_ptgs.bam raw/ShortStack_results/M4B_dicer.bam

# then 24, males
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M1B_rddm.bam raw/ShortStack_results/M1B_dicer.bam 
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M2B_rddm.bam raw/ShortStack_results/M2B_dicer.bam
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M4B_rddm.bam raw/ShortStack_results/M4B_dicer.bam

rm: cannot remove 'raw/bams_per_length/': No such file or directory


In [9]:
# calculate depth for each individual, only for flower buds and convert it to bed, 21-22 in flowers
# females
! samtools depth -@ 12 raw/bams_per_length/F*B_ptgs.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/females_flowers_ptgs.bed
! samtools depth -@ 12 raw/bams_per_length/F*B_rddm.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/females_flowers_rddm.bed
# males
! samtools depth -@ 12 raw/bams_per_length/M*B_ptgs.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/males_flowers_ptgs.bed
! samtools depth -@ 12 raw/bams_per_length/M*B_rddm.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/males_flowers_rddm.bed

In [4]:
# get the number of mapped reads for the normalization 
! echo "____________________ FEMALES FLOWERS_____________________"
! samtools view -c -F 260 -@ 12 raw/females_no_y/F1B_dicer.bam
! samtools view -c -F 260 -@ 12 raw/females_no_y/F2B_dicer.bam 
! samtools view -c -F 260 -@ 12 raw/females_no_y/F3B_dicer.bam 
! echo "____________________ MALES FLOWERS_____________________"
! samtools view -c -F 260 -@ 12 raw/ShortStack_results/M1B_dicer.bam
! samtools view -c -F 260 -@ 12 raw/ShortStack_results/M2B_dicer.bam
! samtools view -c -F 260 -@ 12 raw/ShortStack_results/M4B_dicer.bam

____________________ FEMALES FLOWERS_____________________
54935002
29689313
25245689
____________________ MALES FLOWERS_____________________
13224600
23943681
23582102


In [11]:
# get the window identity, 21-22
! bedtools intersect -b annotation/1mb_windows.bed -a raw/depth/females_flowers_ptgs.bed -wb -wa | awk '{print $0 "\t" $7 "_" $8 "_" $9}' > raw/depth/females_flowers_ptgs.depth
! bedtools intersect -b annotation/1mb_windows.bed -a raw/depth/males_flowers_ptgs.bed -wb -wa | awk '{print $0 "\t" $7 "_" $8 "_" $9}' > raw/depth/males_flowers_ptgs.depth
# get the window identity, 24
! bedtools intersect -b annotation/1mb_windows.bed -a raw/depth/females_flowers_rddm.bed -wb -wa | awk '{print $0 "\t" $7 "_" $8 "_" $9}' > raw/depth/females_flowers_rddm.depth
! bedtools intersect -b annotation/1mb_windows.bed -a raw/depth/males_flowers_rddm.bed -wb -wa | awk '{print $0 "\t" $7 "_" $8 "_" $9}' > raw/depth/males_flowers_rddm.depth

scaffold_1	0	1000000

scaffold_1	0	1000000

scaffold_1	0	1000000

scaffold_1	0	1000000

scaffold_1	0	1000000

scaffold_1	0	1000000

scaffold_1	0	1000000

scaffold_1	0	1000000



In [15]:
# get bed file for TE distribution
! cut -f 1,4,5,9 annotation/Slat_v2_Class_I_II_TRF_MITE_sorted.gff > annotation/repeats.bed
! bedtools intersect -b annotation/1mb_windows.bed -a annotation/repeats.bed -wb -wa | awk '{print $0 "\t" $7 "_" $8 "_" $9}' > raw/depth/repeats.bedbed

! grep -v "monomer" annotation/repeats.bed | grep -v "(" | grep -v "rich" > annotation/tes.bed 

scaffold_1	0	1000000

scaffold_1	0	1000000



# <span style="color:#3de2d8"> RNA-seq analysis

In [None]:
# Check quality
#! mkdir rna-seq/fastqc
#! rna-seq/FastQC/fastqc -t 16 -o rna-seq/fastqc rna-seq/raw/*.gz
! multiqc --outdir rna-seq/fastqc rna-seq/fastqc/

In [None]:
# Trimming 
! bash scripts/trimm.sh

In [None]:
! mkdir rna-seq/trimmed/fastqc
! rna-seq/FastQC/fastqc -t 16 -o rna-seq/trimmed/fastqc rna-seq/trimmed/*.fastq
! multiqc --outdir rna-seq/trimmed/fastqc rna-seq/trimmed/fastqc/

In [72]:
# Create bed file for the annotation
! cut -f 1-6 -d ' ' annotation/ALLGenesFromVulgarisIntoLatifolia.txt | tail -n +2 | sed -E 's/(\w+)\s(\Ge.+)\::out.+(\s.\s\w+\s\w+)/\1\tSilat_\1_\2\t\3/' | sed -E 's/ /\t/g' | awk '{print $1 "\t" $4 "\t" $5 "\t" $2 "\t" 0 "\t" $3}' > annotation/transcripts.bed 

# Identify overlapping annotations
! bedtools intersect -a annotation/transcripts.bed -b annotation/transcripts.bed -wb | cut -f 4,10 | awk '{if ($1 != $2) {print $2}}' | sort | uniq > annotation/overlaps.txt
! wc -l annotation/overlaps.txt
# 11609 overlapping annotations were found !!!!
# Nor remove those annotations
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' annotation/overlaps.txt annotation/transcripts.bed > annotation/genes.bed

# Verify they don't overlap anymore
! bedtools intersect -a annotation/genes.bed -b annotation/genes.bed -wb | cut -f 4,10 | awk '{if ($1 != $2) {print}}'

# 22 loci had the same ID. Some mapped close, others not; others had only different extents but couldn't be removed before before cause they had the same ID
# To homogeneize and not loose them, I just selected those that have the longest genomic range
! cut -f 4 annotation/genes.bed | sort | uniq -d > annotation/duplicated.list
! awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' annotation/duplicated.list annotation/transcripts.bed | sort -k4 > annotation/duplicated.tsv
! awk '{len[$4] = $3 - $2 > len[$4] ? $3 - $2 : len[$4]; range[$4, len[$4]] = $0} END {for (id in len) print range[id, len[id]]}' annotation/duplicated.tsv | sort -k4 > annotation/duplicated.bed
# Now remove all duplicated entries and paste the selected ones
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' annotation/duplicated.list annotation/genes.bed > annotation/clean_genes.bed
! cat annotation/clean_genes.bed annotation/duplicated.bed > annotation/genes.bed
# Check no ID is duplicated
! cut -f 4 annotation/genes.bed | sort | uniq -d # NO ONE IS NOW!

# Get fasta files for genes 
! bedtools getfasta -nameOnly -fi genome/silat.fa -bed annotation/genes.bed -fo annotation/genes.fa

# Clean intermediate files 
! rm annotation/duplicated* annotation/clean*

scaffold_1	760283	762422	Silat_scaffold_1_Gene.7463	0	-

scaffold_1	760283	762422	Silat_scaffold_1_Gene.7463	0	-

11609 annotation/overlaps.txt
scaffold_1	760283	762422	Silat_scaffold_1_Gene.7463	0	-

scaffold_1	760283	762422	Silat_scaffold_1_Gene.7463	0	-



In [71]:
# Build a Kallisto index
! kallisto index -i annotation/genes.idx annotation/genes.fa


[build] loading fasta file annotation/genes.fa
[build] k-mer length: 31
        with pseudorandom nucleotides
[build] counting k-mers ... done.
[build] building target de Bruijn graph ...  done 
[build] creating equivalence classes ...  done
[build] target de Bruijn graph has 420049 contigs and contains 60722289 k-mers 



In [None]:
! bash scripts/kallisto.sh

In [74]:
# Merge results
! ls rna-seq/kallisto/ | head -n 16 > rna-seq/kallisto/samples.list
# first I manually changed the dir names so it was less messy
! bash scripts/merge_kallisto_raw.sh 

rm: cannot remove 'kallisto/raw_counts.tsv': No such file or directory
rm: cannot remove 'kallisto/tpm.tsv': No such file or directory
C1_01_B RUNNING
C1_01_B DONE
_________
C1_01_L RUNNING
C1_01_L DONE
_________
C1_03_B RUNNING
C1_03_B DONE
_________
C1_03_L RUNNING
C1_03_L DONE
_________
C1_04_B_combined RUNNING
C1_04_B_combined DONE
_________
C1_04_L RUNNING
C1_04_L DONE
_________
C1_05_B_combined RUNNING
C1_05_B_combined DONE
_________
C1_05_L RUNNING
C1_05_L DONE
_________
C1_26_B RUNNING
C1_26_B DONE
_________
C1_26_L RUNNING
C1_26_L DONE
_________
C1_27_B RUNNING
C1_27_B DONE
_________
C1_27_L RUNNING
C1_27_L DONE
_________
C1_29_B_combined RUNNING
C1_29_B_combined DONE
_________
C1_29_L RUNNING
C1_29_L DONE
_________
C1_34_B_combined RUNNING
C1_34_B_combined DONE
_________
C1_34_L RUNNING
C1_34_L DONE
_________


### <span style="color:#3de2d8"> Are those transcripts TEs?

Inside the "tes" environment I run:
    
TEsorter annotation/genes.fa -db rexdb-plant

The results are in tesorter_genes/

NOTE: The pipeline failed at some point but hmmscan did finished. Something went wrong while translating the genes,

In [84]:
# Clean the results
! grep "Silat" tesorter_genes/genes.fa.rexdb-plant.domtbl | sed -E 's/(Class.+)\s+\-.+(Silat.+)\|.+\-\s+\w+\s+(.+)\s+.+\-/\1   \2   \3/g' | sed -E 's/\s+/\t/'g | cut -f 1-3 > tesorter_genes/results.tsv

In [87]:
# How many annotations come from a TE?
! cut -f 2 tesorter_genes/results.tsv | sort | uniq | wc -l

903


In [138]:
# How many sex-biased genes come from a TE?

# From flowers
! awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' <(cut -f 2 tesorter_genes/results.tsv | sort | uniq ) results/differential_expression/sex_biased_flowers_sbge.bed | cut -f 4 | wc -l
! awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' <(cut -f 2 tesorter_genes/results.tsv | sort | uniq ) results/differential_expression/sex_biased_flowers_sbge.bed | cut -f 4
! echo "##########################"

# From leaves
! awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' <(cut -f 2 tesorter_genes/results.tsv | sort | uniq ) results/differential_expression/sex_biased_leaves_sbge.bed | cut -f 4 | wc -l
! awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' <(cut -f 2 tesorter_genes/results.tsv | sort | uniq ) results/differential_expression/sex_biased_leaves_sbge.bed | cut -f 4

114
Silat_chr1_Gene.45232
Silat_chr1_Gene.9530
Silat_chr1_Gene.16376
Silat_chr1_Gene.4771
Silat_chr1_Gene.21208
Silat_chr1_Gene.12285
Silat_chr1_Gene.5295
Silat_chr1_Gene.16635
Silat_chr1_Gene.20959
Silat_chr1_Gene.35042
Silat_chr1_Gene.34093
Silat_chr1_Gene.38146
Silat_chr1_Gene.22553
Silat_chr2_Gene.15808
Silat_chr2_Gene.27753
Silat_chr2_Gene.27487
Silat_chr2_Gene.23058
Silat_chr2_Gene.20713
Silat_chr2_Gene.21257
Silat_chr2_Gene.16042
Silat_chr2_Gene.28035
Silat_chr2_Gene.8318
Silat_chr2_Gene.46625
Silat_chr2_Gene.20910
Silat_chr3_Gene.18331
Silat_chr3_Gene.43796
Silat_chr3_Gene.37852
Silat_chr3_Gene.49131
Silat_chr3_Gene.25685
Silat_chr3_Gene.18507
Silat_chr4_Gene.18622
Silat_chr4_Gene.970
Silat_chr4_Gene.37017
Silat_chr4_Gene.21184
Silat_chr4_Gene.41565
Silat_chr4_Gene.20563
Silat_chr5_Gene.42038
Silat_chr5_Gene.39202
Silat_chr5_Gene.15331
Silat_chr5_Gene.11325
Silat_chr5_Gene.20981
Silat_chr5_Gene.6617
Silat_chr6_Gene.19787
Silat_chr6_Gene.16883
Silat_chr6_Gene.47075
Silat_chr6_Ge

114 TE-genes in Flowers

17 TE-genes in Leaves

# <span style="color:#3de2d8"> PTGS

### <span style="color:#3de2d8"> Statistical analysis

In [75]:
# Add 400bd upstream and downstream (UTR and promoter) strand-wise
! bedtools slop -i annotation/genes.bed -g chromSizes.txt -b 400 -s > annotation/full_genes.bed

In [None]:
# Benchmark for PTGS, get the intersections for the DE
! bedtools intersect -a results/differential_expression/sex_biased_flowers_sbge.bed -b results/differential_expression/sex_biased_flowers_ptgs.bed -wa -wb | cut -f 4,11 > results/overlaps/flowers.tsv
! bedtools intersect -a results/differential_expression/sex_biased_leaves_sbge.bed -b results/differential_expression/sex_biased_leaves_ptgs.bed -wa -wb | cut -f 4,11 > results/overlaps/leaves.tsv
! bedtools intersect -a results/differential_expression/tissue_biased_females_sbge.bed -b results/differential_expression/tissue_biased_females_ptgs.bed -wa -wb | cut -f 4,11 > results/overlaps/females.tsv
! bedtools intersect -a results/differential_expression/tissue_biased_males_sbge.bed -b results/differential_expression/tissue_biased_males_ptgs.bed -wa -wb | cut -f 4,11 > results/overlaps/males.tsv

In [30]:
# create a bed file for the srna clusters
! awk '{print $3 "\t" $4 "\t" $5 "\t" $2}' raw/only_21-22_known_de_novo_f-y/Results.txt | tail -n +2 > annotation/ptgs.bed

In [32]:
# NOTE: gene bed files already include +-400bp

# create lists of unbiased genes
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(cut -f 4 results/differential_expression/sex_biased_flowers_sbge.bed ) annotation/full_genes.bed | cut -f 1-4 > unbiased.genes.flowers

# the PTGS bed file was already done == annotation/ptgs.bed

# create list of unbiased clusters
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(cut -f 4 results/differential_expression/sex_biased_flowers_ptgs.bed ) annotation/ptgs.bed | cut -f 1-4 > unbiased.clusters.flowers
 
# calculations
! echo "Female-biased Genes with overlapping Female-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep "female-biased" results/differential_expression/sex_biased_flowers_sbge.bed | cut -f 1-4 ) -b <(grep "female-biased" results/differential_expression/sex_biased_flowers_ptgs.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "Male-biased Genes with overlapping Female-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep -v "female-biased" results/differential_expression/sex_biased_flowers_sbge.bed | cut -f 1-4 ) -b <(grep "female-biased" results/differential_expression/sex_biased_flowers_ptgs.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "Female-biased Genes with overlapping Male-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep "female-biased" results/differential_expression/sex_biased_flowers_sbge.bed | cut -f 1-4 ) -b <(grep -v "female-biased" results/differential_expression/sex_biased_flowers_ptgs.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "Male-biased Genes with overlapping Male-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep -v "female-biased" results/differential_expression/sex_biased_flowers_sbge.bed | cut -f 1-4 ) -b <(grep -v "female-biased" results/differential_expression/sex_biased_flowers_ptgs.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

# Unbiased genes
! echo "______________________________________"
! echo "UN-biased Genes with overlapping Female-biased sRNAS"
! bedtools intersect -nonamecheck -a unbiased.genes.flowers -b <(grep "female-biased" results/differential_expression/sex_biased_flowers_ptgs.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "UN-biased Genes with overlapping Male-biased sRNAS"
! bedtools intersect -nonamecheck -a unbiased.genes.flowers -b <(grep -v "female-biased" results/differential_expression/sex_biased_flowers_ptgs.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

# Unbiased sRNAs
! echo "Female-biased Genes with overlapping UN-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep "female-biased" results/differential_expression/sex_biased_flowers_sbge.bed | cut -f 1-4 ) -b unbiased.clusters.flowers | cut -f 4 | sort | uniq | wc -l

! echo "Male-biased Genes with overlapping UN-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep -v "female-biased" results/differential_expression/sex_biased_flowers_sbge.bed | cut -f 1-4 ) -b unbiased.clusters.flowers | cut -f 4 | sort | uniq | wc -l

# Both unbiased
! echo "UN-biased Genes with overlapping UN-biased sRNAS"
! bedtools intersect -nonamecheck -a unbiased.genes.flowers -b unbiased.clusters.flowers | cut -f 4 | sort | uniq | wc -l

# Genes without overlappin sRNAs
# first get the sb genes that overlap to a cluster, then remove from the total list of sb genes
! echo "______________________________________"
! echo "Female-biased Genes WITHOUT overlapping sRNAS"
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(bedtools intersect -nonamecheck -a <(grep "female-biased" results/differential_expression/sex_biased_flowers_sbge.bed | cut -f 1-4 ) -b annotation/ptgs.bed | cut -f 4 | sort | uniq ) <(grep "female-biased" results/differential_expression/sex_biased_flowers_sbge.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "Male-biased Genes WITHOUT overlapping sRNAS"
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(bedtools intersect -nonamecheck -a <(grep -v "female-biased" results/differential_expression/sex_biased_flowers_sbge.bed | cut -f 1-4 ) -b annotation/ptgs.bed | cut -f 4 | sort | uniq ) <(grep -v "female-biased" results/differential_expression/sex_biased_flowers_sbge.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

# No sex bias and overlap
# get the unbiased genes that have overlap, then remove those from the unbiased genes 
! echo "______________________________________"
! echo "UN-biased Genes WITHOUT overlapping sRNAS"
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(bedtools intersect -nonamecheck -a unbiased.genes.flowers -b annotation/ptgs.bed | cut -f 4 | sort | uniq ) unbiased.genes.flowers | cut -f 4 | sort | uniq | wc -l

! rm unbiased*

Female-biased Genes with overlapping Female-biased sRNAS
2
Male-biased Genes with overlapping Female-biased sRNAS
2
Female-biased Genes with overlapping Male-biased sRNAS
0
Male-biased Genes with overlapping Male-biased sRNAS
6
______________________________________
UN-biased Genes with overlapping Female-biased sRNAS
25
UN-biased Genes with overlapping Male-biased sRNAS
15
Female-biased Genes with overlapping UN-biased sRNAS
239
Male-biased Genes with overlapping UN-biased sRNAS
519
UN-biased Genes with overlapping UN-biased sRNAS
3563
______________________________________
Female-biased Genes WITHOUT overlapping sRNAS
328
Male-biased Genes WITHOUT overlapping sRNAS
652
______________________________________
UN-biased Genes WITHOUT overlapping sRNAS
7742


### <span style="color:#3de2d8"> DE clusters in TEs

In [72]:
# in FLOWERS, number of male-biased sRNA clusters overlapping with a TE
! bedtools intersect -a annotation/tes.bed -b <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' <(grep "flowers" results/DEsRNAClusters.tsv | grep "TRUE" | cut -f 1) annotation/ptgs.bed) -wb | cut -f 8 | sort | uniq | wc -l
! bedtools intersect -a annotation/tes.bed -b <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' <(grep "flowers" results/DEsRNAClusters.tsv | grep "FALSE" | cut -f 1) annotation/ptgs.bed) -wb | cut -f 8 | sort | uniq | wc -l
# Now in LEAVES
! bedtools intersect -a annotation/tes.bed -b <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' <(grep "leaves" results/DEsRNAClusters.tsv | grep "TRUE" | cut -f 1) annotation/ptgs.bed) -wb | cut -f 8 | sort | uniq | wc -l
! bedtools intersect -a annotation/tes.bed -b <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' <(grep "leaves" results/DEsRNAClusters.tsv | grep "FALSE" | cut -f 1) annotation/ptgs.bed) -wb | cut -f 8 | sort | uniq | wc -l

scaffold_1	205595	205995	Cluster_44358

scaffold_1	205595	205995	Cluster_44358

432
scaffold_1	1	38	Target "Motif:RLG_Athila_108210_FL_DL" 1 27

scaffold_1	1	38	Target "Motif:RLG_Athila_108210_FL_DL" 1 27

171
scaffold_1	205595	205995	Cluster_44358

scaffold_1	205595	205995	Cluster_44358

334
scaffold_1	1	38	Target "Motif:RLG_Athila_108210_FL_DL" 1 27

scaffold_1	1	38	Target "Motif:RLG_Athila_108210_FL_DL" 1 27

160


### <span style="color:#3de2d8"> Gene-level sRNA mapping - Flowers

In [53]:
# Get depths for each gene
# Flowers both sexes
! bedtools intersect -a raw/depth/females_flowers_ptgs.bed -b results/differential_expression/sex_biased_flowers_sbge.bed -wb | cut -f 1-6,10 > results/gene-level/flowers_depth_females.tsv
! bedtools intersect -a raw/depth/males_flowers_ptgs.bed -b results/differential_expression/sex_biased_flowers_sbge.bed -wb | cut -f 1-6,10 > results/gene-level/flowers_depth_males.tsv

scaffold_1	10977825	10980423	Silat_scaffold_1_Gene.18933	0	+	sex_biased_flowers_sbge

scaffold_1	10977825	10980423	Silat_scaffold_1_Gene.18933	0	+	sex_biased_flowers_sbge

scaffold_1	10977825	10980423	Silat_scaffold_1_Gene.18933	0	+	sex_biased_flowers_sbge

scaffold_1	10977825	10980423	Silat_scaffold_1_Gene.18933	0	+	sex_biased_flowers_sbge

Error: Unable to open file raw/depth/females_leaves_ptgs.bed. Exiting.
Error: Unable to open file raw/depth/males_leaves_ptgs.bed. Exiting.


### <span style="color:#3de2d8"> Gene-level sRNA mapping - Leaves

We didn't have depth for leaves so we create them, including 24nt sRNAs too for later

In [2]:
# first 21-22, females
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F1L_ptgs.bam raw/females_no_y/F1L_dicer.bam 
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F2L_ptgs.bam raw/females_no_y/F2L_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F3L_ptgs.bam raw/females_no_y/F3L_dicer.bam 

# then 24, females
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F1L_rddm.bam raw/females_no_y/F1L_dicer.bam 
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F2L_rddm.bam raw/females_no_y/F2L_dicer.bam
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F3L_rddm.bam raw/females_no_y/F3L_dicer.bam 

# first 21-22, males
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M1L_ptgs.bam raw/ShortStack_results/M1L_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M2L_ptgs.bam raw/ShortStack_results/M2L_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M4L_ptgs.bam raw/ShortStack_results/M4L_dicer.bam

# then 24, males
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M1L_rddm.bam raw/ShortStack_results/M1L_dicer.bam 
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M2L_rddm.bam raw/ShortStack_results/M2L_dicer.bam
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M4L_rddm.bam raw/ShortStack_results/M4L_dicer.bam

In [3]:
# Calculate depth for flowers

# females
! samtools depth -@ 12 raw/bams_per_length/F*L_ptgs.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/females_leaves_ptgs.bed
! samtools depth -@ 12 raw/bams_per_length/F*L_rddm.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/females_leaves_rddm.bed
# males
! samtools depth -@ 12 raw/bams_per_length/M*L_ptgs.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/males_leaves_ptgs.bed
! samtools depth -@ 12 raw/bams_per_length/M*L_rddm.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/males_leaves_rddm.bed

In [5]:
# get the number of mapped reads for the normalization 
! echo "____________________ FEMALES LEAVES_____________________"
! samtools view -c -F 260 -@ 12 raw/females_no_y/F1L_dicer.bam
! samtools view -c -F 260 -@ 12 raw/females_no_y/F2L_dicer.bam 
! samtools view -c -F 260 -@ 12 raw/females_no_y/F3L_dicer.bam 
! echo "____________________ MALES LEAVES_____________________"
! samtools view -c -F 260 -@ 12 raw/ShortStack_results/M1L_dicer.bam
! samtools view -c -F 260 -@ 12 raw/ShortStack_results/M2L_dicer.bam
! samtools view -c -F 260 -@ 12 raw/ShortStack_results/M4L_dicer.bam

____________________ FEMALES LEAVES_____________________
24695806
31425418
21443081
____________________ MALES LEAVES_____________________
20502566
17763390
26371650


In [6]:
# Get depths for each SB gene in leaves
# Flowers both sexes
! bedtools intersect -a raw/depth/females_leaves_ptgs.bed -b results/differential_expression/sex_biased_leaves_sbge.bed -wb | cut -f 1-6,10 > results/gene-level/leaves_depth_females.tsv
! bedtools intersect -a raw/depth/males_leaves_ptgs.bed -b results/differential_expression/sex_biased_leaves_sbge.bed -wb | cut -f 1-6,10 > results/gene-level/leaves_depth_males.tsv

scaffold_1	44590435	44591544	Silat_scaffold_1_Gene.13294	0	-	sex_biased_leaves_sbge	male-biased	Sex-biased genes in leaves

scaffold_1	44590435	44591544	Silat_scaffold_1_Gene.13294	0	-	sex_biased_leaves_sbge	male-biased	Sex-biased genes in leaves

scaffold_1	44590435	44591544	Silat_scaffold_1_Gene.13294	0	-	sex_biased_leaves_sbge	male-biased	Sex-biased genes in leaves

scaffold_1	44590435	44591544	Silat_scaffold_1_Gene.13294	0	-	sex_biased_leaves_sbge	male-biased	Sex-biased genes in leaves



### <span style="color:#3de2d8"> Looking for sRNA precursors

In [34]:
# Get fasta sequences from the overlapping clusters

#! awk 'FNR==NR{a[$0];next}{if (($2 in a)){print}}' <(cut -f 2 results/overlaps/flowers.tsv | sort | uniq) raw/only_21-22_known_de_novo_f-y/Results.txt | awk '{print $2 "\t" "21-22" "\t" $11 "NNNN"; system("echo " $11 "| rev")}' > temp.tab
# I mannually removed the line breaks that were introduced with awk

! awk 'FNR==NR{a[$0];next}{if (($2 in a)){print}}' <(cut -f 2 results/overlaps/flowers.tsv | sort | uniq) raw/only_21-22_known_de_novo_f-y/Results.txt | awk '{print $2 "\t" "21-22" "\t" $11}' > temp.tab

! perl -e ' $len=0; while(<>) { s/\r?\n//; @F=split /\t/, $_; print ">$F[0]"; if (length($F[1])) { print " $F[1]" } print "\n"; $s=$F[2]; $len+= length($s); $s=~s/.{60}(?=.)/$&\n/g; print "$s\n"; } warn "\nConverted $. tab-delimited lines to FASTA format\nTotal sequence length: $len\n\n"; ' temp.tab > results/overlaps/srnas_flowers.fa

! rm temp.tab


Converted 12 tab-delimited lines to FASTA format
Total sequence length: 275



In [35]:
# Now for leaves

! awk 'FNR==NR{a[$0];next}{if (($2 in a)){print}}' <(cut -f 2 results/overlaps/leaves.tsv | sort | uniq) raw/only_21-22_known_de_novo_f-y/Results.txt | awk '{print $2 "\t" "21-22" "\t" $11}' > temp.tab
! perl -e ' $len=0; while(<>) { s/\r?\n//; @F=split /\t/, $_; print ">$F[0]"; if (length($F[1])) { print " $F[1]" } print "\n"; $s=$F[2]; $len+= length($s); $s=~s/.{60}(?=.)/$&\n/g; print "$s\n"; } warn "\nConverted $. tab-delimited lines to FASTA format\nTotal sequence length: $len\n\n"; ' temp.tab > results/overlaps/srnas_leaves.fa

! rm temp.tab


Converted 6 tab-delimited lines to FASTA format
Total sequence length: 141



In [90]:
# index genome for blast
! makeblastdb -in genome/silat.fa -dbtype nucl



Building a new DB, current time: 05/11/2023 16:55:31
New DB name:   /home/eddy/silene/genome/silat.fa
New DB title:  genome/silat.fa
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 19 sequences in 19.2123 seconds.




In [22]:
# Short BLAST against the genome, try 1
! blastn -db genome/silat.fa -query results/overlaps/srnas_hairpins_flowers.fa -num_threads 12 -task blastn-short -word_size 5 -gapopen 1 -gapextend 1 -out results/overlaps/srnas_hairpins_flowers.blast

#### <span style="color:#3de2d8">  Get inverted repeats from the genome (inside tes environment). 
einverted -sequence genome/silat.fa -threshold 80 -gap 8 -match 4 -mismatch -6 -maxrepeat 100 -outseq annotation/inv_rep.fa -outfile annotation/inv_rep.san
    
    Parameters used: [default]
    
    -gap penalty of 8 [12]
    -match score 4 [3]
    -mismatch score -6 [-4]
    -minimum score threshold 80 [50] (I guess it's calculated from the identity and the number of gaps -not documented)
    -maximum lenght from start to finish 100 (assuming at ~40bp in the terminal loop)

In [None]:
To create a bed file including the information about the repeat:
    
    - Chr
    - Start
    - End
    - ID
    - Score
    - Strand
    - Matching baisepairs be
    - Repeat length
    - Mismatches
    - Number of gaps
    - Total mismatches (gaps + mismatches)

In [None]:
paste \
<(grep "Score" inv_rep.san | sed -E 's/^(\w+):.+/\1/g' ) \
<(sed -n '/Score/ { n; p }' inv_rep.san | sed -E 's/\s*(\w+)\s*.+/\1/g' ) \
<(sed -n '/Score/ { n; n; n; p }' inv_rep.san | sed -E 's/\s*(\w+)\s*.+/\1/g' ) \
<(! echo inverted_repeat_{1..187797} | sed -E 's/\s/\n/g' | awk '{print $0 "\t" "0" "\t" "+"}') \
<(grep "Score" inv_rep.san | sed -E 's/.+: (\w+)\/(\w+).+, (\w+) ga.+/\1\t\2\t\3/g' | awk '{print $1 "\t" $2 "\t" $2-$1 "\t" $3 "\t" $2-$1+$3 }' ) \
  > inv_rep.bed

In [28]:
# Get fasta files for inverted repeats
! bedtools getfasta -nameOnly -fi genome/silat.fa -bed annotation/inv_rep.bed -fo annotation/inv_rep_full.fa
# make it a BLAST database
! makeblastdb -in annotation/inv_rep_full.fa -dbtype nucl



Building a new DB, current time: 05/30/2023 11:30:59
New DB name:   /home/eddy/silene/annotation/inv_rep_full.fa
New DB title:  annotation/inv_rep_full.fa
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 187797 sequences in 1.03208 seconds.




In [108]:
# BLAST sRNAs against the inverted repeat library
# alingment above 15 bp
# reduced penalty for mismatches
# reduced gap penalty

# Flowers
! blastn -db annotation/inv_rep_full.fa -query results/overlaps/srnas_flowers.fa -num_threads 12 -task blastn-short -word_size 15 -penalty -1 -gapopen 1 -outfmt 6 -out results/overlaps/srnas_flowers.blast 
# Leaves
! blastn -db annotation/inv_rep_full.fa -query results/overlaps/srnas_leaves.fa -num_threads 12 -task blastn-short -word_size 15 -penalty -1 -gapopen 1 -outfmt 6 -out results/overlaps/srnas_leaves.blast

In [109]:
# I got the reverse complements from <https://www.bioinformatics.org/sms2/rev_comp.html> and edited them manually
# Flowers
! blastn -db annotation/inv_rep_full.fa -query results/overlaps/srnas_flowers_reverse_comp.fa -num_threads 12 -task blastn-short -word_size 15 -penalty -1 -gapopen 1 -outfmt 6 -out results/overlaps/srnas_flowers_reverse_comp.blast
# Leaves
! blastn -db annotation/inv_rep_full.fa -query results/overlaps/srnas_leaves_reverse_comp.fa -num_threads 12 -task blastn-short -word_size 15 -penalty -1 -gapopen 1 -outfmt 6 -out results/overlaps/srnas_leaves_reverse_comp.blast

I was planning to select only the inverted repeats that aling to both the sRNA and its reverse complement but since it is already an inverted repeat, tables are basically the same. I only needed to select alingments >21 bp.

In [125]:
# Get the correspondences and annotations for the precursos

# Flowers
! join -1 4 -2 3 <(cut -f 1-4 annotation/inv_rep.bed | sort -k4) <(join -1 2 -2 1 results/overlaps/flowers.tsv <(awk '$3 >= 21' results/overlaps/srnas_flowers.blast | cut -f 1-2 ) )
! join -1 4 -2 3 <(cut -f 1-4 annotation/inv_rep.bed | sort -k4) <(join -1 2 -2 1 results/overlaps/flowers.tsv <(awk '$3 >= 21' results/overlaps/srnas_flowers.blast | cut -f 1-2 ) ) | sed -E 's/\s+/\t/g' > results/overlaps/flowers_precursos.tsv
! echo "_____________________________________________________________________"

# Leaves
! join -1 4 -2 3 <(cut -f 1-4 annotation/inv_rep.bed | sort -k4) <(join -1 2 -2 1 results/overlaps/leaves.tsv <(awk '$3 >= 21' results/overlaps/srnas_leaves.blast | cut -f 1-2 ) | sort -k3 )
! join -1 4 -2 3 <(cut -f 1-4 annotation/inv_rep.bed | sort -k4) <(join -1 2 -2 1 results/overlaps/leaves.tsv <(awk '$3 >= 21' results/overlaps/srnas_leaves.blast | cut -f 1-2 ) | sort -k3 ) | sed -E 's/\s+/\t/g' > results/overlaps/leaves_precursos.tsv

inverted_repeat_82099 chr9 171650675 171650772 Cluster_43988 Silat_chr12_Gene.1440
_____________________________________________________________________
inverted_repeat_100834 chr11 26842100 26842194 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_107555 chr11 133984933 133985003 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_115770 chr12 4887030 4887128 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_115770 chr12 4887030 4887128 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_12972 chr1 191921461 191921554 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_13134 chr1 194653142 194653237 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_13320 chr1 198785892 198785990 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_134649 chr12 276687495 276687593 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_13516 chr2 3592422 3592509 Cluster_45289 Silat_scaffold_13_Gene.43660
inverted_repeat_137010 chr12 319332388 31

The "precursos.tsv" files contain:

    - precursor ID
    - chr
    - start
    - end
    - sRNA cluster
    - sex-biased gene target

In [126]:
! makeblastdb -in annotation/genes.fa -dbtype nucl



Building a new DB, current time: 05/31/2023 18:09:34
New DB name:   /home/eddy/silene/annotation/genes.fa
New DB title:  annotation/genes.fa
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 13071 sequences in 1.10136 seconds.




In [134]:
# check if the flowers' precursor is overlapping to a gene or a TE
! bedtools intersect -a <(cut -f 2-4 results/overlaps/flowers_precursos.tsv) -b annotation/genes.bed -wb
! bedtools intersect -a <(cut -f 2-4 results/overlaps/flowers_precursos.tsv) -b annotation/tes.bed -wb

scaffold_1	760283	762422	Silat_scaffold_1_Gene.7463	0	-

scaffold_1	760283	762422	Silat_scaffold_1_Gene.7463	0	-

scaffold_1	1	38	Target "Motif:RLG_Athila_108210_FL_DL" 1 27

scaffold_1	1	38	Target "Motif:RLG_Athila_108210_FL_DL" 1 27



In [141]:
# check if the leaves' precursor actually comes from TEs
! bedtools intersect -a <(cut -f 2-4 results/overlaps/leaves_precursos.tsv) -b annotation/tes.bed -wb
! echo "_____________________________________________________________________"
# Now to TEsorter annotation
! bedtools intersect -a <(cut -f 2-4 results/overlaps/leaves_precursos.tsv) -b tesorter_genome/tes_genome.dom.bed -wb

scaffold_1	1	38	Target "Motif:RLG_Athila_108210_FL_DL" 1 27

chr11	133984933	133984954	chr11	133984897	133984954	Target "Motif:RLG_Retand_116_partial_D" 25880 25938
chr11	133984958	133985003	chr11	133984958	133985015	Target "Motif:RLG_Retand_116_partial_D" 43913 43967
chr11	133984970	133985003	chr11	133984970	133985189	Target "Motif:RLG_Retand_116_partial_D" 43919 44138
chr1	198785892	198785972	chr1	198785886	198785972	Target "Motif:RLG_Retand_116_partial_D" 37495 37580
chr2	3592427	3592503	chr2	3592427	3592503	Target "Motif:RLG_Retand_28930_FL_DL" 967 1048
chr2	3592439	3592508	chr2	3592439	3592508	Target "Motif:RLG_Retand_33319_FL_DL" 3636 3710
chr12	319495018	319495050	chr12	319494856	319495050	Target "Motif:RLG_Retand_13569_FL_DL" 296 488
chr12	319495081	319495113	chr12	319495081	319495245	Target "Motif:RLG_Retand_13569_FL_DL" 296 462
chr4	8272012	8272102	chr4	8272012	8272103	Target "Motif:RLG_Retand_28930_FL_DL" 958 1053
chr4	8272343	8272412	chr4	8272337	8272412	Target "Motif:RLG_R

In [128]:
# get the gene sequences of the genes that may be under PTGS to check if they are sex specific
! perl -e ' ($id,$fasta)=@ARGV; open(ID,$id); while (<ID>) { s/\r?\n//; /^>?(\S+)/; $ids{$1}++; } $num_ids = keys %ids; open(F, $fasta); $s_read = $s_wrote = $print_it = 0; while (<F>) { if (/^>(\S+)/) { $s_read++; if ($ids{$1}) { $s_wrote++; $print_it = 1; delete $ids{$1} } else { $print_it = 0 } }; if ($print_it) { print $_ } }; END { warn "Searched $s_read FASTA records.\nFound $s_wrote IDs out of $num_ids in the ID list.\n" } ' results/overlaps/genes.list annotation/genes.fa > results/overlaps/genes.fa

Searched 13071 FASTA records.
Found 2 IDs out of 2 in the ID list.


In [142]:
# Find similar sequences to the genes that have a sRNA with identified precursor
! blastn -db annotation/genes.fa -query results/overlaps/genes.fa -num_threads 12 -outfmt 6 -out results/overlaps/genes.blast

In [None]:
! RNAfold < annotation/inv_rep_full.fa −−outfile[=<filename>]


[36mInput string (upper or lower case); @ to quit[0m
[1m....,....1....,....2....,....3....,....4....,....5....,....6....,....7....,....8[0m


# <span style="color:#f0b27a"> RdDM

### <span style="color:#f0b27a"> Cluster Overlaps

In [25]:
# Add -+ 5000bp to genes
! bedtools slop -i results/differential_expression/sex_biased_flowers_sbge.bed -g chromSizes.txt -b 2000 -s >  results/rddm/sb_long_genes.bed
! bedtools slop -i annotation/full_genes.bed -g chromSizes.txt -b 2000 -s >  annotation/long_genes.bed

In [24]:
# get overlaps

# flowers
! bedtools intersect -nonamecheck -a <(cut -f 1-4 results/differential_expression/sex_biased_flowers_rddm.bed) -b <(cut -f 1-4 results/rddm/sb_long_genes.bed) -wb | cut -f 4,8
! bedtools intersect -nonamecheck -a <(cut -f 1-4 results/differential_expression/sex_biased_flowers_rddm.bed) -b <(cut -f 1-4 results/rddm/sb_long_genes.bed) -wb | cut -f 4,8 > results/rddm/flowers.tsv
! echo "__________________________________________________________"

# leaves
# flowers
! bedtools intersect -nonamecheck -a <(cut -f 1-4 results/differential_expression/sex_biased_leaves_rddm.bed) -b <(cut -f 1-4 results/rddm/sb_long_genes.bed) -wb | cut -f 4,8
! bedtools intersect -nonamecheck -a <(cut -f 1-4 results/differential_expression/sex_biased_leaves_rddm.bed) -b <(cut -f 1-4 results/rddm/sb_long_genes.bed) -wb | cut -f 4,8 > results/rddm/leaves.tsv
! echo "__________________________________________________________"

Cluster_2028_RdDM	Silat_chr1_Gene.4771
Cluster_9316_RdDM	Silat_chr2_Gene.30622
Cluster_10374_RdDM	Silat_chr3_Gene.37746
Cluster_12819_RdDM	Silat_chr3_Gene.42197
Cluster_18293_RdDM	Silat_chr5_Gene.5379
Cluster_23734_RdDM	Silat_chr7_Gene.21503
Cluster_23735_RdDM	Silat_chr7_Gene.21503
Cluster_29947_RdDM	Silat_chr9_Gene.17911
Cluster_30499_RdDM	Silat_chr9_Gene.46125
Cluster_37902_RdDM	Silat_chr11_Gene.15485
Cluster_40701_RdDM	Silat_chr12_Gene.49048
Cluster_40705_RdDM	Silat_chr12_Gene.49048
Cluster_40829_RdDM	Silat_chr12_Gene.45298
Cluster_41057_RdDM	Silat_chr12_Gene.46166
Cluster_41091_RdDM	Silat_chr12_Gene.14608
Cluster_41602_RdDM	Silat_chr12_Gene.32908
Cluster_42879_RdDM	Silat_chr12_Gene.26394
Cluster_43013_RdDM	Silat_chr12_Gene.25578
Cluster_43985_RdDM	Silat_chr12_Gene.1440
Cluster_44141_RdDM	Silat_chr12_Gene.8361
Cluster_44454_RdDM	Silat_scaffold_1_Gene.21140
Cluster_44455_RdDM	Silat_scaffold_1_Gene.21140
Cluster_45098_RdDM	Silat_scaffold_1_Gene.33918
Cluster_45154_RdDM	Silat_scaffold_

### <span style="color:#f0b27a"> chi-square test

In [29]:
# create bed of unbiased genes
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(cut -f 4 results/differential_expression/sex_biased_flowers_sbge.bed ) annotation/long_genes.bed | cut -f 1-4 > unbiased.genes.flowers

# create RdDM bed file
! tail -n +2 raw/only_24_f-y/Results.txt | awk '{print $3 "\t" $4 "\t" $5 "\t" $2 "_RdDM" }' > results/rddm/rddm.bed

# create list of unbiased clusters
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(cut -f 4 results/differential_expression/sex_biased_flowers_rddm.bed ) results/rddm/rddm.bed | cut -f 1-4 > unbiased.clusters.flowers
 
# calculations
! echo "Female-biased Genes with overlapping Female-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b <(grep "female-biased" results/differential_expression/sex_biased_flowers_rddm.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "Male-biased Genes with overlapping Female-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep -v "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b <(grep "female-biased" results/differential_expression/sex_biased_flowers_rddm.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "Female-biased Genes with overlapping Male-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b <(grep -v "female-biased" results/differential_expression/sex_biased_flowers_rddm.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "Male-biased Genes with overlapping Male-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep -v "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b <(grep -v "female-biased" results/differential_expression/sex_biased_flowers_rddm.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

# Unbiased genes
! echo "______________________________________"
! echo "UN-biased Genes with overlapping Female-biased sRNAS"
! bedtools intersect -nonamecheck -a unbiased.genes.flowers -b <(grep "female-biased" results/differential_expression/sex_biased_flowers_rddm.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "UN-biased Genes with overlapping Male-biased sRNAS"
! bedtools intersect -nonamecheck -a unbiased.genes.flowers -b <(grep -v "female-biased" results/differential_expression/sex_biased_flowers_rddm.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

# Unbiased sRNAs
! echo "Female-biased Genes with overlapping UN-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b unbiased.clusters.flowers | cut -f 4 | sort | uniq | wc -l

! echo "Male-biased Genes with overlapping UN-biased sRNAS"
! bedtools intersect -nonamecheck -a <(grep -v "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b unbiased.clusters.flowers | cut -f 4 | sort | uniq | wc -l

# Both unbiased
! echo "UN-biased Genes with overlapping UN-biased sRNAS"
! bedtools intersect -nonamecheck -a unbiased.genes.flowers -b unbiased.clusters.flowers | cut -f 4 | sort | uniq | wc -l

# Genes without overlappin sRNAs
! echo "______________________________________"
! echo "Female-biased Genes WITHOUT overlapping sRNAS"
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(bedtools intersect -nonamecheck -a <(grep "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b results/rddm/rddm.bed | cut -f 4 | sort | uniq ) <(grep "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

! echo "Male-biased Genes WITHOUT overlapping sRNAS"
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(bedtools intersect -nonamecheck -a <(grep -v "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) -b results/rddm/rddm.bed | cut -f 4 | sort | uniq ) <(grep -v "female-biased" results/rddm/sb_long_genes.bed | cut -f 1-4 ) | cut -f 4 | sort | uniq | wc -l

# No overlap and no sex bias
! echo "______________________________________"
! echo "UN-biased Genes WITHOUT overlapping sRNAS"
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(bedtools intersect -nonamecheck -a unbiased.genes.flowers -b results/rddm/rddm.bed | cut -f 4 | sort | uniq ) unbiased.genes.flowers | cut -f 4 | sort | uniq | wc -l

! rm unbiased*

Female-biased Genes with overlapping Female-biased sRNAS
5
Male-biased Genes with overlapping Female-biased sRNAS
6
Female-biased Genes with overlapping Male-biased sRNAS
1
Male-biased Genes with overlapping Male-biased sRNAS
10
______________________________________
UN-biased Genes with overlapping Female-biased sRNAS
52
UN-biased Genes with overlapping Male-biased sRNAS
39
Female-biased Genes with overlapping UN-biased sRNAS
398
Male-biased Genes with overlapping UN-biased sRNAS
803
UN-biased Genes with overlapping UN-biased sRNAS
7030
______________________________________
Female-biased Genes WITHOUT overlapping sRNAS
165
Male-biased Genes WITHOUT overlapping sRNAS
365
______________________________________
UN-biased Genes WITHOUT overlapping sRNAS
4254


### <span style="color:#f0b27a"> candidates

In [1]:
# get gene sequences and look for paralongs in the genome
! perl -e ' ($id,$fasta)=@ARGV; open(ID,$id); while (<ID>) { s/\r?\n//; /^>?(\S+)/; $ids{$1}++; } $num_ids = keys %ids; open(F, $fasta); $s_read = $s_wrote = $print_it = 0; while (<F>) { if (/^>(\S+)/) { $s_read++; if ($ids{$1}) { $s_wrote++; $print_it = 1; delete $ids{$1} } else { $print_it = 0 } }; if ($print_it) { print $_ } }; END { warn "Searched $s_read FASTA records.\nFound $s_wrote IDs out of $num_ids in the ID list.\n" } ' results/rddm/genes.list annotation/genes.fa > results/rddm/genes.fa
! blastn -db annotation/genes.fa -query results/rddm/genes.fa -num_threads 12 -outfmt 6 -out results/rddm/genes.blast

Searched 13071 FASTA records.
Found 8 IDs out of 10 in the ID list.


In [10]:
# check if the non-TE genes have a TE insertion nearby
! bedtools window -a <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' results/rddm/non-te_genes.list annotation/full_genes.bed | cut -f 1-4) -b annotation/tes.bed -w 2500
! bedtools window -a <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' results/rddm/non-te_genes.list annotation/full_genes.bed | cut -f 1-4) -b annotation/tes.bed -w 2500 > results/rddm/tes_around_genes.tsv

! echo "##############################################"
# using TE domains
! bedtools window -a <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' results/rddm/non-te_genes.list annotation/full_genes.bed | cut -f 1-4) -b tesorter_genome/tes_genome.dom.bed -w 2500

chr3	1839790	1844111	Silat_chr3_Gene.37746	chr3	1837703	1837859	Target "Motif:rnd-5_family-2244" 3 160
chr3	1839790	1844111	Silat_chr3_Gene.37746	chr3	1837860	1837885	Target "Motif:RLC_SIRE_76712_FL_DL" 12966 13148
chr3	1839790	1844111	Silat_chr3_Gene.37746	chr3	1838213	1838685	Target "Motif:RLG_Tekay_54006_partial_D" 1492 1972
chr3	1839790	1844111	Silat_chr3_Gene.37746	chr3	1838729	1838781	Target "Motif:rnd-1_family-420" 139 191
chr3	1839790	1844111	Silat_chr3_Gene.37746	chr3	1838730	1838791	Target "Motif:rnd-1_family-420" 5 61
chr3	1839790	1844111	Silat_chr3_Gene.37746	chr3	1843979	1844190	Target "Motif:rnd-1_family-528" 1 212
chr3	1839790	1844111	Silat_chr3_Gene.37746	chr3	1844139	1844197	Target "Motif:RLG_Retand_14683_FL_DLTP" 3331 3389
chr3	1839790	1844111	Silat_chr3_Gene.37746	chr3	1844970	1845139	Target "Motif:RLG_Retand_90523_FL_DL" 16421 16577
chr3	1839790	1844111	Silat_chr3_Gene.37746	chr3	1844990	1845017	Target "Motif:RLG_Tekay_57285_FL_DL" 5093 5230
chr3	1839790	1844111	Sil

In [15]:
# check if the tf candidate has more clusters around
! bedtools window -a <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' <(echo Silat_chr3_Gene.37746) annotation/full_genes.bed | cut -f 1-4) -b results/rddm/rddm.bed -w 2000
! bedtools window -a <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' <(echo Silat_chr3_Gene.37746) annotation/full_genes.bed | cut -f 1-4) -b results/rddm/rddm.bed -w 2000 > results/rddm/clusters_around_genes.tsv

chr3	1839790	1844111	Silat_chr3_Gene.37746	chr3	1838131	1838554	Cluster_10372_RdDM
chr3	1839790	1844111	Silat_chr3_Gene.37746	chr3	1843893	1844308	Cluster_10373_RdDM
chr3	1839790	1844111	Silat_chr3_Gene.37746	chr3	1844928	1845346	Cluster_10374_RdDM


### <span style="color:#f0b27a"> gene-level mapping

In [8]:
# get long coordinates
! bedtools slop -i results/differential_expression/sex_biased_flowers_sbge.bed -g chromSizes.txt -b 2000 -s >  results/rddm/flowers_long_genes.bed
! bedtools slop -i results/differential_expression/sex_biased_leaves_sbge.bed -g chromSizes.txt -b 2000 -s >  results/rddm/leaves_long_genes.bed

In [9]:
# Get depths for each gene

# Flowers both sexes
! bedtools intersect -nonamecheck -a raw/depth/females_flowers_rddm.bed -b results/rddm/flowers_long_genes.bed -wb | cut -f 1-6,10 > results/rddm/flowers_depth_females.tsv
! bedtools intersect -nonamecheck -a raw/depth/males_flowers_rddm.bed -b results/rddm/flowers_long_genes.bed -wb | cut -f 1-6,10 > results/rddm/flowers_depth_males.tsv

# Leaves both sexes
! bedtools intersect -nonamecheck -a raw/depth/females_leaves_rddm.bed -b results/rddm/leaves_long_genes.bed -wb | cut -f 1-6,10 > results/rddm/leaves_depth_females.tsv
! bedtools intersect -nonamecheck -a raw/depth/males_leaves_rddm.bed -b results/rddm/leaves_long_genes.bed -wb | cut -f 1-6,10 > results/rddm/leaves_depth_males.tsv

# <span style="color:#ce7af0"> PopGen

In [11]:
! bedtools intersect -nonamecheck -a <(awk '$5 != "NA" ' popgen/pi_flower_sbg_pi.txt | cut -f 2-6 | tail -n +2) -b results/differential_expression/sex_biased_flowers_sbge.bed -wb | cut -f 1-5,9,13 | cut -f 6 | sort | uniq -dc

      2 Silat_chr10_Gene.12906
      2 Silat_chr10_Gene.31583


In [14]:
# get a bed file of the unbiased genes
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' <(cut -f 4 results/differential_expression/sex_biased_flowers_sbge.bed) annotation/full_genes.bed > results/differential_expression/unbiased_genes_flowers.bed
# calculations were done in my personal laptop