![image](silene.jpeg)

# sRNA profiles of flower dimorphism in _Silene latifolia_

#### Eddy J. Mendoza-Galindo
#### Advisor: Aline Muyle, CEFE Montpellier
April 2023

#### <span style="color:#4dd98b"> Exploration of sRNA abundance based on length and species, only for 21,22, 24

In [5]:
! cat scripts/count.sh

cd raw/sRNA_MGX/trimmed/
for file in *.fastq
do
echo "working with $file"
perl -e ' $count=0; $len=0; while(<>) { s/\r?\n//; s/\t/ /g; if (s/^@//) { if ($. != 1) { print "\n" } s/ |$/\t/; $count++; $_ .= "\t"; } else { s/ //g; $len += length($_) } print $_; } print "\n"; ' $file | sed -E 's/^.+\t(\w+)\+.*$/\1/g' | perl -e ' $col=0; while (<>) { s/\r?\n//; @F = split /\t/, $_; $len = length($F[$col]); print "$_\t$len\n" }; ' | awk '$2 ~ /(21|22|24)/ ' > ${file}_count.tsv
done


In [6]:
! bash scripts/count.sh

working with F1B_final_trimming.fastq
working with F1L_final_trimming.fastq
working with F2B_final_trimming.fastq
working with F2L_final_trimming.fastq
working with F3B_final_trimming.fastq
working with F3L_final_trimming.fastq
working with M1B_final_trimming.fastq
working with M1L_final_trimming.fastq
working with M2B_final_trimming.fastq
working with M2L_final_trimming.fastq
working with M4B_final_trimming.fastq
working with M4L_final_trimming.fastq


#### <span style="color:#4dd98b"> Select reads for 21, 22 and 24 nt long

In [1]:
! cat scripts/filter_size.sh

cd raw/
rm -r fastq/
mkdir fastq

out=fastq
files=sRNA_MGX/trimmed/*.fastq


for file in $files
do

name=$(echo $file | sed -E 's/^sR.*ed\/(\w+)_f.*/\1/g')

echo "WORKING WITH $name"

# Select reads of 21, 22 and 24 in length
seqtk comp $file | awk '$2 == 21' | cut -f 1 > 21.list
seqtk subseq $file 21.list > $out/${name}_21.fq

seqtk comp $file | awk '$2 == 22' | cut -f 1 > 22.list
seqtk subseq $file 22.list > $out/${name}_22.fq

seqtk comp $file | awk '$2 == 24' | cut -f 1 > 24.list
seqtk subseq $file 24.list > $out/${name}_24.fq

rm 21.list 22.list 24.list
cat ${name}_21.fq ${name}_22.fq ${name}_24.fq > ${name}_dicer.fq 
rm ${name}_21.fq ${name}_22.fq ${name}_24.fq 

done

In [4]:
! bash scripts/filter_size.sh

WORKING WITH F1B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F1L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F2B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F2L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F3B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F3L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M1B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M1L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M2B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M2L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M4B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M4L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done


In [6]:
#check lengths after filtering
! seqtk comp raw/fastq/M2B_dicer.fq | cut -f 2 | sort | uniq 
! seqtk comp raw/fastq/F1L_dicer.fq | cut -f 2 | sort | uniq 

21
22
24
21
22
24


# <span style="color:#4dd98b"> Alingment and quantification

We followed the Shortstack workflow,
Only uniquely-aligned reads are used as weights for placement of multi-mapped reads.

conda activate ShortStack4 # ShortStack only works under its environment

ShortStack --genomefile ../genome/silat.fa --readfile fastq/*.fq --threads 8 --knownRNAs caryophyllaceae_mirnas.fa --mmap u

In [26]:
# Remove Y chromosome from genome and map Females
! grep ">" genome/silat.fa | grep -v "scaffold" | sed 's/>//' > genome/no_y.list
! seqtk subseq genome/silat.fa genome/no_y.list > genome/no_y.fa
! grep ">" genome/no_y.fa

>chr1
>chr2
>chr3
>chr4
>chr5
>chr6
>chr7
>chr8
>chr9
>chr10
>chr11
>chr12


### <span style="color:#4dd98b"> Aling females to the genome without Y (ran inside raw/ and under the conda envirnonment of ShortStack$)
ShortStack --genomefile ../genome/no_y.fa --readfile fastq/F* --threads 8 --align_only --outdir females_no_y

### <span style="color:#4dd98b">Merge females (without y) and males (with y) (Inside SS environment so it works)

samtools merge -r -@ 12 -f -o raw/merged_females-y.bam -b raw/bam.list
    
#### <span style="color:#4dd98b"> First, PTGS (21-22 nt), asking for _de novo_ and template-based miRNA annotation
    
ShortStack --genomefile ../genome/silat.fa --bamfile merged_females-y.bam --threads 8 --outdir only_21-22_known_de_novo_f-y_try5 --dicermax 22 --mmap u --dn_mirna --knownRNAs caryophyllaceae_mirnas.fa 

#### <span style="color:#4dd98b"> Then, RDdM (24 nt), no miRNA identification
    
ShortStack --genomefile ../genome/silat.fa --bamfile merged_females-y.bam --threads 8 --outdir only_24_f-y --dicermin 23 --mmap u 

### <span style="color:#4dd98b"> Depth quantification

In [None]:
! bash scripts/mapping_depth.sh # Output is very heavy

### <span style="color:#4dd98b"> Circos plot

In [3]:
! bedtools makewindows -g chromSizes.txt -w 1000000 > annotation/1mb_windows.bed # create windows

In [26]:
# make merged aligments for each sex
! samtools merge -r -@ 12 -f -o raw/female_flower.bam raw/females_no_y/*B_dicer.bam 
! samtools merge -r -@ 12 -f -o raw/male_flower.bam raw/ShortStack_results/M*B_dicer.bam

[bam_translate] RG tag "F2B_dicer" on read "A00924:314:H5CTHDRX2:2:2260:8621:30953" encountered with no corresponding entry in header, tag lost. Unknown tags are only reported once per input file for each tag ID.
[bam_translate] RG tag "F3B_dicer" on read "A00924:314:H5CTHDRX2:2:2220:25192:21793" encountered with no corresponding entry in header, tag lost. Unknown tags are only reported once per input file for each tag ID.


In [22]:
# get bam for each lenght and individual
! rm -r raw/bams_per_length/
! mkdir raw/bams_per_length

# first 21-22, females
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F1B_ptgs.bam raw/females_no_y/F1B_dicer.bam 
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F2B_ptgs.bam raw/females_no_y/F2B_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/F3B_ptgs.bam raw/females_no_y/F3B_dicer.bam 

# then 24, females
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F1B_rddm.bam raw/females_no_y/F1B_dicer.bam 
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F2B_rddm.bam raw/females_no_y/F2B_dicer.bam
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/F3B_rddm.bam raw/females_no_y/F3B_dicer.bam 

# first 21-22, males
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M1B_ptgs.bam raw/ShortStack_results/M1B_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M2B_ptgs.bam raw/ShortStack_results/M2B_dicer.bam
! samtools view  -h -e 'length(seq)==21 || length(seq)==22' -@ 12 -o raw/bams_per_length/M4B_ptgs.bam raw/ShortStack_results/M4B_dicer.bam

# then 24, males
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M1B_rddm.bam raw/ShortStack_results/M1B_dicer.bam 
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M2B_rddm.bam raw/ShortStack_results/M2B_dicer.bam
! samtools view  -h -e 'length(seq)==24' -@ 12 -o raw/bams_per_length/M4B_rddm.bam raw/ShortStack_results/M4B_dicer.bam

rm: cannot remove 'raw/bams_per_length/': No such file or directory


In [9]:
# calculate depth for each individual, only for flower buds and convert it to bed, 21-22 in flowers
# females
! samtools depth -@ 12 raw/bams_per_length/F*ptgs.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/females_flowers_ptgs.bed
! samtools depth -@ 12 raw/bams_per_length/F*rddm.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/females_flowers_rddm.bed
# males
! samtools depth -@ 12 raw/bams_per_length/M*ptgs.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/males_flowers_ptgs.bed
! samtools depth -@ 12 raw/bams_per_length/M*rddm.bam | awk '{print $1 "\t" $2 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > raw/depth/males_flowers_rddm.bed

In [5]:
# get the number of mapped reads for the normalization
! echo "____________________ FEMALE 21-22nt _____________________"
! samtools view -c -F 260 -@ 12 raw/bams_per_length/F1B_ptgs.bam
! samtools view -c -F 260 -@ 12 raw/bams_per_length/F2B_ptgs.bam
! samtools view -c -F 260 -@ 12 raw/bams_per_length/F3B_ptgs.bam
! echo "____________________ FEMALE 24nt _____________________"
! samtools view -c -F 260 -@ 12 raw/bams_per_length/F1B_rddm.bam
! samtools view -c -F 260 -@ 12 raw/bams_per_length/F2B_rddm.bam
! samtools view -c -F 260 -@ 12 raw/bams_per_length/F3B_rddm.bam
! echo "____________________ MALE 21-22nt _____________________"
! samtools view -c -F 260 -@ 12 raw/bams_per_length/M1B_ptgs.bam
! samtools view -c -F 260 -@ 12 raw/bams_per_length/M2B_ptgs.bam
! samtools view -c -F 260 -@ 12 raw/bams_per_length/M4B_ptgs.bam
! echo "____________________ MALE 24nt _____________________"
! samtools view -c -F 260 -@ 12 raw/bams_per_length/M1B_rddm.bam
! samtools view -c -F 260 -@ 12 raw/bams_per_length/M2B_rddm.bam
! samtools view -c -F 260 -@ 12 raw/bams_per_length/M4B_rddm.bam


____________________ FEMALE 21-22nt _____________________
19022354
10510431
8511923
____________________ FEMALE 24nt _____________________
35912648
19178882
16733766
____________________ MALE 21-22nt _____________________
6486661
9227331
9622793
____________________ MALE 24nt _____________________
6737939
14716350
13959309


In [11]:
# get the window identity, 21-22
! bedtools intersect -b annotation/1mb_windows.bed -a raw/depth/females_flowers_ptgs.bed -wb -wa | awk '{print $0 "\t" $7 "_" $8 "_" $9}' > raw/depth/females_flowers_ptgs.depth
! bedtools intersect -b annotation/1mb_windows.bed -a raw/depth/males_flowers_ptgs.bed -wb -wa | awk '{print $0 "\t" $7 "_" $8 "_" $9}' > raw/depth/males_flowers_ptgs.depth
# get the window identity, 24
! bedtools intersect -b annotation/1mb_windows.bed -a raw/depth/females_flowers_rddm.bed -wb -wa | awk '{print $0 "\t" $7 "_" $8 "_" $9}' > raw/depth/females_flowers_rddm.depth
! bedtools intersect -b annotation/1mb_windows.bed -a raw/depth/males_flowers_rddm.bed -wb -wa | awk '{print $0 "\t" $7 "_" $8 "_" $9}' > raw/depth/males_flowers_rddm.depth

scaffold_1	0	1000000

scaffold_1	0	1000000

scaffold_1	0	1000000

scaffold_1	0	1000000

scaffold_1	0	1000000

scaffold_1	0	1000000

scaffold_1	0	1000000

scaffold_1	0	1000000



In [15]:
# get bed file for TE distribution
! cut -f 1,4,5,9 annotation/Slat_v2_Class_I_II_TRF_MITE_sorted.gff > annotation/repeats.bed
! bedtools intersect -b annotation/1mb_windows.bed -a annotation/repeats.bed -wb -wa | awk '{print $0 "\t" $7 "_" $8 "_" $9}' > raw/depth/repeats.bedbed

! grep -v "monomer" annotation/repeats.bed | grep -v "(" | grep -v "rich" > annotation/tes.bed 

scaffold_1	0	1000000

scaffold_1	0	1000000



# <span style="color:#3de2d8"> RNA-seq analysis

In [None]:
# Check quality
#! mkdir rna-seq/fastqc
#! rna-seq/FastQC/fastqc -t 16 -o rna-seq/fastqc rna-seq/raw/*.gz
! multiqc --outdir rna-seq/fastqc rna-seq/fastqc/

In [None]:
# Trimming 
! bash scripts/trimm.sh

In [None]:
! mkdir rna-seq/trimmed/fastqc
! rna-seq/FastQC/fastqc -t 16 -o rna-seq/trimmed/fastqc rna-seq/trimmed/*.fastq
! multiqc --outdir rna-seq/trimmed/fastqc rna-seq/trimmed/fastqc/

In [72]:
# Create bed file for the annotation
! cut -f 1-6 -d ' ' annotation/ALLGenesFromVulgarisIntoLatifolia.txt | tail -n +2 | sed -E 's/(\w+)\s(\Ge.+)\::out.+(\s.\s\w+\s\w+)/\1\tSilat_\1_\2\t\3/' | sed -E 's/ /\t/g' | awk '{print $1 "\t" $4 "\t" $5 "\t" $2 "\t" 0 "\t" $3}' > annotation/transcripts.bed 

# Identify overlapping annotations
! bedtools intersect -a annotation/transcripts.bed -b annotation/transcripts.bed -wb | cut -f 4,10 | awk '{if ($1 != $2) {print $2}}' | sort | uniq > annotation/overlaps.txt
! wc -l annotation/overlaps.txt
# 11609 overlapping annotations were found !!!!
# Nor remove those annotations
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' annotation/overlaps.txt annotation/transcripts.bed > annotation/genes.bed

# Verify they don't overlap anymore
! bedtools intersect -a annotation/genes.bed -b annotation/genes.bed -wb | cut -f 4,10 | awk '{if ($1 != $2) {print}}'

# 22 loci had the same ID. Some mapped close, others not; others had only different extents but couldn't be removed before before cause they had the same ID
# To homogeneize and not loose them, I just selected those that have the longest genomic range
! cut -f 4 annotation/genes.bed | sort | uniq -d > annotation/duplicated.list
! awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' annotation/duplicated.list annotation/transcripts.bed | sort -k4 > annotation/duplicated.tsv
! awk '{len[$4] = $3 - $2 > len[$4] ? $3 - $2 : len[$4]; range[$4, len[$4]] = $0} END {for (id in len) print range[id, len[id]]}' annotation/duplicated.tsv | sort -k4 > annotation/duplicated.bed
# Now remove all duplicated entries and paste the selected ones
! awk 'FNR==NR{a[$0];next}{if (!($4 in a)){print}}' annotation/duplicated.list annotation/genes.bed > annotation/clean_genes.bed
! cat annotation/clean_genes.bed annotation/duplicated.bed > annotation/genes.bed
# Check no ID is duplicated
! cut -f 4 annotation/genes.bed | sort | uniq -d # NO ONE IS NOW!

# Get fasta files for genes 
! bedtools getfasta -nameOnly -fi genome/silat.fa -bed annotation/genes.bed -fo annotation/genes.fa

# Clean intermediate files 
! rm annotation/duplicated* annotation/clean*

scaffold_1	760283	762422	Silat_scaffold_1_Gene.7463	0	-

scaffold_1	760283	762422	Silat_scaffold_1_Gene.7463	0	-

11609 annotation/overlaps.txt
scaffold_1	760283	762422	Silat_scaffold_1_Gene.7463	0	-

scaffold_1	760283	762422	Silat_scaffold_1_Gene.7463	0	-



In [71]:
# Build a Kallisto index
! kallisto index -i annotation/genes.idx annotation/genes.fa


[build] loading fasta file annotation/genes.fa
[build] k-mer length: 31
        with pseudorandom nucleotides
[build] counting k-mers ... done.
[build] building target de Bruijn graph ...  done 
[build] creating equivalence classes ...  done
[build] target de Bruijn graph has 420049 contigs and contains 60722289 k-mers 



In [None]:
! bash scripts/kallisto.sh

In [74]:
# Merge results
! ls rna-seq/kallisto/ | head -n 16 > rna-seq/kallisto/samples.list
# first I manually changed the dir names so it was less messy
! bash scripts/merge_kallisto_raw.sh 

rm: cannot remove 'kallisto/raw_counts.tsv': No such file or directory
rm: cannot remove 'kallisto/tpm.tsv': No such file or directory
C1_01_B RUNNING
C1_01_B DONE
_________
C1_01_L RUNNING
C1_01_L DONE
_________
C1_03_B RUNNING
C1_03_B DONE
_________
C1_03_L RUNNING
C1_03_L DONE
_________
C1_04_B_combined RUNNING
C1_04_B_combined DONE
_________
C1_04_L RUNNING
C1_04_L DONE
_________
C1_05_B_combined RUNNING
C1_05_B_combined DONE
_________
C1_05_L RUNNING
C1_05_L DONE
_________
C1_26_B RUNNING
C1_26_B DONE
_________
C1_26_L RUNNING
C1_26_L DONE
_________
C1_27_B RUNNING
C1_27_B DONE
_________
C1_27_L RUNNING
C1_27_L DONE
_________
C1_29_B_combined RUNNING
C1_29_B_combined DONE
_________
C1_29_L RUNNING
C1_29_L DONE
_________
C1_34_B_combined RUNNING
C1_34_B_combined DONE
_________
C1_34_L RUNNING
C1_34_L DONE
_________


# <span style="color:#3de2d8"> PTGS

In [75]:
# Add 400bd upstream and downstream (UTR and promoter) strand-wise
! bedtools slop -i annotation/genes.bed -g chromSizes.txt -b 400 -s > annotation/full_genes.bed

In [None]:
# Benchmark for PTGS, get the intersections for the DE
! bedtools intersect -a results/differential_expression/sex_biased_flowers_sbge.bed -b results/differential_expression/sex_biased_flowers_ptgs.bed -wa -wb | cut -f 4,11 > results/overlaps/flowers.tsv
! bedtools intersect -a results/differential_expression/sex_biased_leaves_sbge.bed -b results/differential_expression/sex_biased_leaves_ptgs.bed -wa -wb | cut -f 4,11 > results/overlaps/leaves.tsv
! bedtools intersect -a results/differential_expression/tissue_biased_females_sbge.bed -b results/differential_expression/tissue_biased_females_ptgs.bed -wa -wb | cut -f 4,11 > results/overlaps/females.tsv
! bedtools intersect -a results/differential_expression/tissue_biased_males_sbge.bed -b results/differential_expression/tissue_biased_males_ptgs.bed -wa -wb | cut -f 4,11 > results/overlaps/males.tsv

In [30]:
# create a bed file for the srna clusters
! awk '{print $3 "\t" $4 "\t" $5 "\t" $2}' raw/only_21-22_known_de_novo_f-y/Results.txt | tail -n +2 > annotation/ptgs.bed

In [40]:
# chi-square calculations. FLOWERS
! echo "BIASED GENES VS BIASED SRNAS"
! bedtools intersect -a results/differential_expression/sex_biased_flowers_sbge.bed -b results/differential_expression/sex_biased_flowers_ptgs.bed -wa -wb | cut -f 4 | sort | uniq | wc -l

# create lists of unbiased elements
! diff <(cut -f 4 annotation/genes.bed | sort) <(cut -f 4 results/differential_expression/sex_biased_flowers_sbge.bed | sort) | grep "Silat" | sed 's/< //g' > list.unbiased.genes
! diff <(cut -f 2 raw/only_21-22_known_de_novo_f-y/Results.txt | sort) <(cut -f 4 results/differential_expression/sex_biased_flowers_ptgs.bed | sort) | grep "Cluster" | sed 's/< //g' > list.unbiased.srnas
# intersect and count number of genes

! echo "UNBIASED GENES VS BIASED SRNAS"
! bedtools intersect -a <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' list.unbiased.genes annotation/genes.bed) -b results/differential_expression/sex_biased_flowers_ptgs.bed | cut -f 4 | sort | uniq | wc -l

! echo "BIASED GENES VS UNBIASED SRNAS"
! bedtools intersect -a results/differential_expression/sex_biased_flowers_sbge.bed -b <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' list.unbiased.srnas annotation/ptgs.bed) | cut -f 4 | sort | uniq | wc -l

! echo "UNBIASED GENES VS UNBIASED SRNAS"
! bedtools intersect -a <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' list.unbiased.genes annotation/genes.bed) -b <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' list.unbiased.srnas annotation/ptgs.bed) | cut -f 4 | sort | uniq | wc -l

! rm list.unbiased*

BIASED GENES VS BIASED SRNAS
scaffold_1	205595	205995	Cluster_44358	AUAACUCAAACUAGCUAGAUACAC	.	N	sex_biased_flowers_ptgs

scaffold_1	205595	205995	Cluster_44358	AUAACUCAAACUAGCUAGAUACAC	.	N	sex_biased_flowers_ptgs

10
UNBIASED GENES VS BIASED SRNAS
scaffold_1	205595	205995	Cluster_44358	AUAACUCAAACUAGCUAGAUACAC	.	N	sex_biased_flowers_ptgs

scaffold_1	205595	205995	Cluster_44358	AUAACUCAAACUAGCUAGAUACAC	.	N	sex_biased_flowers_ptgs

32
BIASED GENES VS UNBIASED SRNAS
scaffold_1	4556571	4556979	Cluster_44362

scaffold_1	4556571	4556979	Cluster_44362

758
UNBIASED GENES VS UNBIASED SRNAS
scaffold_1	4556571	4556979	Cluster_44362

scaffold_1	4556571	4556979	Cluster_44362

1764
BIASED GENES WITH NO OVERLAP
scaffold_1	205595	205995	Cluster_44358

scaffold_1	205595	205995	Cluster_44358

980
UNBIASED GENES WITH NO OVERLAP
scaffold_1	205595	205995	Cluster_44358

scaffold_1	205595	205995	Cluster_44358

11328


In [37]:
# chi-square calculations. LEAVES
! echo "BIASED GENES VS BIASED SRNAS"
! bedtools intersect -a results/differential_expression/sex_biased_leaves_sbge.bed -b results/differential_expression/sex_biased_leaves_ptgs.bed -wa -wb | cut -f 4 | sort | uniq | wc -l

# create lists of unbiased elements
! diff <(cut -f 4 annotation/genes.bed | sort) <(cut -f 4 results/differential_expression/sex_biased_leaves_sbge.bed | sort) | grep "Silat" | sed 's/< //g' > list.unbiased.genes
! diff <(cut -f 2 raw/only_21-22_known_de_novo_f-y/Results.txt | sort) <(cut -f 4 results/differential_expression/sex_biased_leaves_ptgs.bed | sort) | grep "Cluster" | sed 's/< //g' > list.unbiased.srnas
# intersect and count number of genes

! echo "UNBIASED GENES VS BIASED SRNAS"
! bedtools intersect -a <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' list.unbiased.genes annotation/genes.bed) -b results/differential_expression/sex_biased_leaves_ptgs.bed | cut -f 4 | sort | uniq | wc -l

! echo "BIASED GENES VS UNBIASED SRNAS"
! bedtools intersect -a results/differential_expression/sex_biased_leaves_sbge.bed -b <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' list.unbiased.srnas annotation/ptgs.bed) | cut -f 4 | sort | uniq | wc -l

! echo "UNBIASED GENES VS UNBIASED SRNAS"
! bedtools intersect -a <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' list.unbiased.genes annotation/genes.bed) -b <(awk 'FNR==NR{a[$0];next}{if (($4 in a)){print}}' list.unbiased.srnas annotation/ptgs.bed) | cut -f 4 | sort | uniq | wc -l

! rm list.unbiased*

BIASED GENES VS BIASED SRNAS
scaffold_1	205595	205995	Cluster_44358	AUAACUCAAACUAGCUAGAUACAC	.	N	sex_biased_leaves_ptgs

scaffold_1	205595	205995	Cluster_44358	AUAACUCAAACUAGCUAGAUACAC	.	N	sex_biased_leaves_ptgs

3
UNBIASED GENES VS BIASED SRNAS
scaffold_1	205595	205995	Cluster_44358	AUAACUCAAACUAGCUAGAUACAC	.	N	sex_biased_leaves_ptgs

scaffold_1	205595	205995	Cluster_44358	AUAACUCAAACUAGCUAGAUACAC	.	N	sex_biased_leaves_ptgs

27
BIASED GENES VS UNBIASED SRNAS
scaffold_1	3254854	3255458	Cluster_44360

scaffold_1	3254854	3255458	Cluster_44360

38
UNBIASED GENES VS UNBIASED SRNAS
scaffold_1	3254854	3255458	Cluster_44360

scaffold_1	3254854	3255458	Cluster_44360

2220


### <span style="color:#3de2d8"> Looking for precursors

In [89]:
# Get fasta sequences from the overlapping clusters
#! awk 'FNR==NR{a[$0];next}{if (($2 in a)){print}}' <(cut -f 2 results/overlaps/flowers.tsv | sort | uniq) raw/only_21-22_known_de_novo_f-y/Results.txt | awk '{print $2 "\t" "21-22" "\t" $11 "NNNN"; system("echo " $11 "| rev")}' > temp.tab

# I mannually removed the line breaks that were introduced with awk
! perl -e ' $len=0; while(<>) { s/\r?\n//; @F=split /\t/, $_; print ">$F[0]"; if (length($F[1])) { print " $F[1]" } print "\n"; $s=$F[2]; $len+= length($s); $s=~s/.{60}(?=.)/$&\n/g; print "$s\n"; } warn "\nConverted $. tab-delimited lines to FASTA format\nTotal sequence length: $len\n\n"; ' temp.tab > results/overlaps/srnas_hairpins_flowers.fa

! rm temp.tab


Converted 12 tab-delimited lines to FASTA format
Total sequence length: 598



In [92]:
# Now for leaves
#! awk 'FNR==NR{a[$0];next}{if (($2 in a)){print}}' <(cut -f 2 results/overlaps/leaves.tsv | sort | uniq) raw/only_21-22_known_de_novo_f-y/Results.txt | awk '{print $2 "\t" "21-22" "\t" $11 "NNNN"; system("echo " $11 "| rev")}' > temp.tab

# I mannually removed the line breaks that were introduced with awk
! perl -e ' $len=0; while(<>) { s/\r?\n//; @F=split /\t/, $_; print ">$F[0]"; if (length($F[1])) { print " $F[1]" } print "\n"; $s=$F[2]; $len+= length($s); $s=~s/.{60}(?=.)/$&\n/g; print "$s\n"; } warn "\nConverted $. tab-delimited lines to FASTA format\nTotal sequence length: $len\n\n"; ' temp.tab > results/overlaps/srnas_hairpins_leaves.fa

! rm temp.tab


Converted 6 tab-delimited lines to FASTA format
Total sequence length: 306



In [90]:
# index genome for blast
! makeblastdb -in genome/silat.fa -dbtype nucl



Building a new DB, current time: 05/11/2023 16:55:31
New DB name:   /home/eddy/silene/genome/silat.fa
New DB title:  genome/silat.fa
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 19 sequences in 19.2123 seconds.




In [100]:
! blastn -db genome/silat.fa -query results/overlaps/srnas_hairpins_flowers.fa -out results/overlaps/srnas_hairpins_flowers.blast
! blastn -db genome/silat.fa -query results/overlaps/srnas_hairpins_leaves.fa -out results/overlaps/srnas_hairpins_leaves.blast

CFastaReader: Hyphens are invalid and will be ignored around line 2
