![image](silene.jpeg)

# sRNA profiles of flower dimorphism in _Silene latifolia_

#### Eddy J. Mendoza-Galindo
#### Advisor: Aline Muyle, CEFE Montpellier
April 2023

#### <span style="color:#4dd98b"> Exploration of sRNA abundance by length and species, restricted to 21-, 22- and 24-nt reads</span>

In [5]:
! cat scripts/count.sh

# count.sh
# For every trimmed FASTQ file, write <file>_count.tsv containing one line per
# read of exactly 21, 22 or 24 nt: the read sequence and its length, separated
# by a tab.
#
# Pipeline stages:
#   1. perl : collapses each FASTQ record onto one line. Header lines (those
#             starting with "@") open a new output line; the following lines
#             (sequence, "+", quality) are appended without spaces.
#             NOTE(review): any line starting with "@" is treated as a header,
#             so a quality string that begins with "@" would split a record.
#   2. sed  : keeps only the word characters between the last tab and the
#             first "+", i.e. the sequence.
#   3. perl : appends the length of the first tab-separated column.
#   4. awk  : keeps rows whose length column is exactly 21, 22 or 24.
#             A regex match (/(21|22|24)/) would also accept lengths such as
#             121 or 224, so exact numeric comparisons are used instead.

# Abort if the input directory is missing rather than scanning the wrong place.
cd raw/sRNA_MGX/trimmed/ || exit 1

for file in *.fastq
do
  # With no matching files the glob stays literal; skip instead of failing.
  [ -e "$file" ] || continue

  echo "working with $file"

  perl -e ' $count=0; $len=0; while(<>) { s/\r?\n//; s/\t/ /g; if (s/^@//) { if ($. != 1) { print "\n" } s/ |$/\t/; $count++; $_ .= "\t"; } else { s/ //g; $len += length($_) } print $_; } print "\n"; ' "$file" \
    | sed -E 's/^.+\t(\w+)\+.*$/\1/g' \
    | perl -e ' $col=0; while (<>) { s/\r?\n//; @F = split /\t/, $_; $len = length($F[$col]); print "$_\t$len\n" }; ' \
    | awk '$2 == 21 || $2 == 22 || $2 == 24' > "${file}_count.tsv"
done


In [6]:
! bash scripts/count.sh

working with F1B_final_trimming.fastq
working with F1L_final_trimming.fastq
working with F2B_final_trimming.fastq
working with F2L_final_trimming.fastq
working with F3B_final_trimming.fastq
working with F3L_final_trimming.fastq
working with M1B_final_trimming.fastq
working with M1L_final_trimming.fastq
working with M2B_final_trimming.fastq
working with M2L_final_trimming.fastq
working with M4B_final_trimming.fastq
working with M4L_final_trimming.fastq


#### <span style="color:#4dd98b"> Select reads that are 21, 22 or 24 nt long</span>

In [1]:
! cat scripts/filter_size.sh

# filter_size.sh
# For every trimmed sRNA FASTQ file, keep only the reads that are exactly
# 21, 22 or 24 nt long and write them (in that order) to
# raw/fastq/<sample>_dicer.fq.
#
# Read lengths are taken from column 2 of `seqtk comp`; the matching read
# names (column 1) are passed to `seqtk subseq` to extract the records.

# Abort if raw/ is missing so the rm below can never run in another directory.
cd raw/ || exit 1

out=fastq

# Start from an empty output directory (no error if it does not exist yet).
rm -rf -- "$out"
mkdir -- "$out" || exit 1

# Scratch file for the read-name list; removed on any exit path.
list=$(mktemp) || exit 1
trap 'rm -f -- "$list"' EXIT

for file in sRNA_MGX/trimmed/*.fastq
do
  # With no matching files the glob stays literal; skip instead of failing.
  [ -e "$file" ] || continue

  # Sample name = file name up to the last "_f"
  # (e.g. F1B_final_trimming.fastq -> F1B).
  base=${file##*/}
  name=${base%_f*}

  echo "WORKING WITH $name"

  # The merged file is written inside $out, where later steps expect it.
  dicer="$out/${name}_dicer.fq"
  : > "$dicer"

  for size in 21 22 24
  do
    # Names of the reads whose length equals $size.
    seqtk comp "$file" | awk -v size="$size" '$2 == size' | cut -f 1 > "$list"
    # Append those reads to the per-sample output.
    seqtk subseq "$file" "$list" >> "$dicer"
    echo "${size}-nt sRNAS done"
  done

done

In [4]:
! bash scripts/filter_size.sh

WORKING WITH F1B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F1L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F2B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F2L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F3B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH F3L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M1B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M1L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M2B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M2L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M4B
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done
WORKING WITH M4L
21-nt sRNAS done
22-nt sRNAS done
24-nt sRNAS done


In [6]:
#check lengths after filtering
! seqtk comp raw/fastq/M2B_dicer.fq | cut -f 2 | sort | uniq 
! seqtk comp raw/fastq/F1L_dicer.fq | cut -f 2 | sort | uniq 

21
22
24
21
22
24


# <span style="color:#4dd98b"> Alignment and quantification</span>

We followed the ShortStack workflow.
Only uniquely aligned reads are used as weights for the placement of multi-mapped reads.

conda activate ShortStack4 # ShortStack only works under its environment

ShortStack --genomefile ../genome/silat.fa --readfile fastq/*.fq --threads 8 --knownRNAs caryophyllaceae_mirnas.fa --mmap u

In [26]:
# Remove Y chromosome from genome and map Females
! grep ">" genome/silat.fa | grep -v "scaffold" | sed 's/>//' > genome/no_y.list
! seqtk subseq genome/silat.fa genome/no_y.list > genome/no_y.fa
! grep ">" genome/no_y.fa

>chr1
>chr2
>chr3
>chr4
>chr5
>chr6
>chr7
>chr8
>chr9
>chr10
>chr11
>chr12


### <span style="color:#4dd98b"> Align females to the genome without Y (run inside raw/ and under the ShortStack conda environment)</span>
ShortStack --genomefile ../genome/no_y.fa --readfile fastq/F* --threads 8 --align_only --outdir females_no_y

### <span style="color:#4dd98b">Merge females (without Y) and males (with Y) (run inside the ShortStack environment so that it works)</span>

samtools merge -r -@ 12 -f -o raw/merged_females-y.bam -b raw/bam.list
    
#### <span style="color:#4dd98b"> First, PTGS (21-22 nt), asking for _de novo_ and template-based miRNA annotation
    
ShortStack --genomefile ../genome/silat.fa --bamfile merged_females-y.bam --threads 8 --outdir only_21-22_known_de_novo_f-y_try5 --dicermax 22 --mmap u --dn_mirna --knownRNAs caryophyllaceae_mirnas.fa 

#### <span style="color:#4dd98b"> Then, RdDM (24 nt), no miRNA identification</span>
    
ShortStack --genomefile ../genome/silat.fa --bamfile merged_females-y.bam --threads 8 --outdir only_24_f-y --dicermin 23 --mmap u 

### <span style="color:#4dd98b"> Depth quantification

In [None]:
! bash scripts/mapping_depth.sh # Output is very heavy

# <span style="color:#3de2d8"> RNA-seq analysis

In [None]:
# Check quality
#! mkdir rna-seq/fastqc
#! rna-seq/FastQC/fastqc -t 16 -o rna-seq/fastqc rna-seq/raw/*.gz
! multiqc --outdir rna-seq/fastqc rna-seq/fastqc/

In [None]:
# Trimming 
! bash scripts/trimm.sh

In [None]:
! mkdir rna-seq/trimmed/fastqc
! rna-seq/FastQC/fastqc -t 16 -o rna-seq/trimmed/fastqc rna-seq/trimmed/*.fastq
! multiqc --outdir rna-seq/trimmed/fastqc rna-seq/trimmed/fastqc/

In [4]:
# Build a Kallisto index
! kallisto index -i annotation/vulgaris_transcriptome.idx annotation/vulgaris_transcriptome.fa


[build] loading fasta file annotation/vulgaris_transcriptome.fa
[build] k-mer length: 31
        with pseudorandom nucleotides
[build] counting k-mers ... done.
[build] building target de Bruijn graph ...  done 
[build] creating equivalence classes ...  done
[build] target de Bruijn graph has 217304 contigs and contains 31569734 k-mers 



In [None]:
! bash scripts/kallisto.sh