## setup working directory

In [None]:
%%bash
# Create the sample_data directory layout used by the cells of this notebook,
# relative to the directory the notebook is executed from.
#
# `mkdir -p` makes the cell safe to re-run (an existing directory is not an
# error) and creates parents as needed, so no `cd` is required and a failed
# `cd` can no longer cause directories to be created in the wrong place.
set -euo pipefail

# reads/                               : kept from the original layout
# raw_reads/                           : the read-filtering cell loads its FASTQ input from here
# filtered_reads/                      : output of the read-filtering cell
# barcodes/barcodes_ordering_list/     : R1/R2/R3 barcode ordering lists (input)
# barcodes/barcodes_cleaned/           : cleaned per-round barcode files
# barcodes/barcode_combinations/       : file with all barcode combinations
# aligned_filtered_reads/              : bowtie2 SAM output
# tagged_aligned_filtered_reads/       : SAM with BC/UMI tags
# genfun_tagged_aligned_filtered_reads/: SAM with gene function tags
# DGE_matrix/                          : digital expression matrix and summary
mkdir -p \
    sample_data/reads \
    sample_data/raw_reads \
    sample_data/filtered_reads \
    sample_data/barcodes/barcodes_ordering_list \
    sample_data/barcodes/barcodes_cleaned \
    sample_data/barcodes/barcode_combinations \
    sample_data/aligned_filtered_reads \
    sample_data/tagged_aligned_filtered_reads \
    sample_data/genfun_tagged_aligned_filtered_reads \
    sample_data/DGE_matrix

## create cleaned files of all barcoding rounds

In [1]:
%%bash
# Run barcodes_to_txt.py once for each of the three barcoding rounds.
#   input : <home_folder>/barcodes/barcodes_ordering_list/R<n>_barcodes.txt
#   output: directory <home_folder>/barcodes/barcodes_cleaned, file name r<n>barcodes
#
# Stop at the first failing command so a broken round is not hidden by a
# later round that succeeds.
set -euo pipefail

scripts="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/SPLiT-seq_DGE_matrix"
home_folder="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/sample_data"

out="$home_folder/barcodes/barcodes_cleaned"

# Make sure the output directory exists before the script writes into it.
mkdir -p "$out"

# One invocation per barcoding round; only the round number differs.
for round in 1 2 3; do
    python3 "$scripts/barcodes_to_txt.py" \
        --in_dir "$home_folder/barcodes/barcodes_ordering_list/R${round}_barcodes.txt" \
        --out_dir "$out" \
        --file_name "r${round}barcodes"
done

## create file containing all possible combinations of barcodes

In [2]:
%%bash
# Build the file with every possible combination of R1, R2 and R3 barcodes
# by handing the three per-round barcode files to create_bc_comb.py.
scripts="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/SPLiT-seq_DGE_matrix"
home_folder="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/sample_data"

# Directory holding the per-round barcode files
# (presumably produced by the barcodes_to_txt.py step -- verify).
cleaned_dir=${home_folder}/barcodes/barcodes_cleaned
# Directory that receives the combinations file.
combinations_dir=${home_folder}/barcodes/barcode_combinations

python3 ${scripts}/create_bc_comb.py \
    --bc1 ${cleaned_dir}/r1barcodes.txt \
    --bc2 ${cleaned_dir}/r2barcodes.txt \
    --bc3 ${cleaned_dir}/r3barcodes.txt \
    --out_dir ${combinations_dir}

## filter raw reads

In [3]:
%%bash
# Filter the raw reads with filter_barcodes_advanced.py in "simple" mode,
# using the barcode combinations file as reference.
scripts="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/SPLiT-seq_DGE_matrix"
home_folder="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/sample_data"

# Raw input: barcode reads and genomic reads live side by side in raw_reads/.
raw_dir=${home_folder}/raw_reads
barcode_fastq=${raw_dir}/bc_1000.fastq
genomic_fastq=${raw_dir}/gen_1000.fastq

# Reference of barcode combinations and destination for the filtered reads.
reference=${home_folder}/barcodes/barcode_combinations/barcode_combinations.txt
filtered_dir=${home_folder}/filtered_reads

python3 ${scripts}/filter_barcodes_advanced.py \
    --bc_reference ${reference} \
    --bc_reads ${barcode_fastq} \
    --gen_reads ${genomic_fastq} \
    --out_dir ${filtered_dir} \
    --mode "simple"

reads have been aligned in mode: simple
filtered reads have sucessfully been written to files
The memory usage was: 0.064516096 GB


## create reference genome bowtie index

In [None]:
%%bash
# Build a bowtie2 index (output prefix "mm10") from the mm10 reference genome.
#
# NOTE(review): the alignment cell loads its index from
# <sample_data>/mm10_genome/genome_index/mus_musculus, i.e. a different folder
# and prefix than the index written here -- confirm which one is intended.
set -euo pipefail

home_folder="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/SPLiT-seq_DGE_matrix"

genome_archive="$home_folder/sample_bash_scripts/sample_data/mm10_genome/genome/mm10.fasta.zip"
genome="${genome_archive%.zip}"  # extracted FASTA: .../genome/mm10.fasta
out="$home_folder/sample_bash_scripts/sample_data/mm10_genome/genome_index/mm10"  # mm10 is the prefix for the output files

# bowtie2-build expects FASTA input; a .zip archive is not FASTA, so unpack it first.
# `unzip -p` streams the archive contents to stdout.
# Assumes the archive contains only FASTA data -- TODO confirm.
# Extract to a temporary name and rename on success, so an interrupted
# extraction is never mistaken for a complete genome on the next run.
if [ ! -s "$genome" ]; then
    unzip -p "$genome_archive" > "$genome.tmp"
    mv "$genome.tmp" "$genome"
fi

# Make sure the directory that receives the index files exists.
mkdir -p "$(dirname "$out")"

bowtie2-build \
     --threads 4 \
     "$genome" \
     "$out"

## align filtered genomic reads to reference genome

In [4]:
%%bash
# Align the filtered genomic reads to the reference genome with bowtie2
# (unpaired reads, 4 threads) and write the alignments as SAM.
set -euo pipefail

home_folder="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/sample_data"

bowtie2_index="$home_folder/mm10_genome/genome_index/mus_musculus"  # index prefix, not a file
genomic_reads="$home_folder/filtered_reads/filtered_genomic_reads.fastq"
output="$home_folder/aligned_filtered_reads/bowtie_aligned_reads.sam"

# Make sure the output directory exists before bowtie2 writes the SAM file.
mkdir -p "$(dirname "$output")"

bowtie2 \
     -p 4 \
     -x "$bowtie2_index" \
     -U "$genomic_reads" \
     -S "$output"

361 reads; of these:
  361 (100.00%) were unpaired; of these:
    97 (26.87%) aligned 0 times
    229 (63.43%) aligned exactly 1 time
    35 (9.70%) aligned >1 times
73.13% overall alignment rate


## add BC and UMI tags to the aligned reads (bowtie_aligned_reads.sam)

In [5]:
%%bash
# Run add_BC_UMI_tags.py on the bowtie2 alignments.
#   inputs : filtered barcode reads, filtered UMIs, aligned SAM file
#   output : file out_bc_umi.sam in tagged_aligned_filtered_reads/
set -euo pipefail

scripts="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/SPLiT-seq_DGE_matrix"
home_folder="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/sample_data"

path_to_bc="$home_folder/filtered_reads/filtered_bc_reads.txt"
path_to_umi="$home_folder/filtered_reads/filtered_UMIs.txt"
sam_in_path="$home_folder/aligned_filtered_reads/bowtie_aligned_reads.sam"
sam_out_path="$home_folder/tagged_aligned_filtered_reads"  # output directory
file_name="out_bc_umi.sam"

# Make sure the output directory exists before the script writes into it.
mkdir -p "$sam_out_path"

python3 "$scripts/add_BC_UMI_tags.py" \
     --bc_in "$path_to_bc" \
     --umi_in "$path_to_umi" \
     --sam_in "$sam_in_path" \
     --sam_out "$sam_out_path" \
     --file_name "$file_name"

361


## add gene function tags to the BC/UMI-tagged reads (out_bc_umi.sam)

In [6]:
%%bash
# Run the Drop-seq tool TagReadWithGeneFunction on the BC/UMI-tagged SAM file,
# using the mm10 refFlat file as annotation.
set -euo pipefail

home_folder="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/sample_data"
dropseq_tools="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/SPLiT-seq_DGE_matrix/sample_bash_scripts/drop_seq_tools"

tagged_aligned="$home_folder/tagged_aligned_filtered_reads/out_bc_umi.sam"
out="$home_folder/genfun_tagged_aligned_filtered_reads/genfun_tagged.sam"
annotated_genome="$home_folder/mm10_genome/genome_annotation/mm10.refFlat"

# Make sure the output directory exists before the tool writes the SAM file.
mkdir -p "$(dirname "$out")"

"$dropseq_tools/TagReadWithGeneFunction" \
     I="$tagged_aligned" \
     O="$out" \
     ANNOTATIONS_FILE="$annotated_genome"

INFO	2019-11-05 19:18:59	TagReadWithGeneFunction	

********** NOTE: Picard's command line syntax is changing.
**********
********** For more information, please see:
********** https://github.com/broadinstitute/picard/wiki/Command-Line-Syntax-Transition-For-Users-(Pre-Transition)
**********
********** The command line looks like this in the new syntax:
**********
**********    TagReadWithGeneFunction -I /Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/sample_data/tagged_aligned_filtered_reads/out_bc_umi.sam -O /Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/sample_data/genfun_tagged_aligned_filtered_reads/genfun_tagged.sam -ANNOTATIONS_FILE /Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/sample_data/mm10_genome/genome_annotation/mm10.refFlat
**********


19:19:00.549 INFO  NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/SPLiT-se

## calculate DGE matrix

In [7]:
%%bash
# Run the Drop-seq tool DigitalExpression on the gene-function-tagged SAM file
# to produce the DGE matrix and its summary file.
set -euo pipefail

home_folder="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/sample_data"
dropseq_tools="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/SPLiT-seq_DGE_matrix/sample_bash_scripts/drop_seq_tools"

genfun_tagged="$home_folder/genfun_tagged_aligned_filtered_reads/genfun_tagged.sam"
out="$home_folder/DGE_matrix/sample_dge_matrix.dge"
summary="$home_folder/DGE_matrix/sample_dge_matrix_summary.txt"

# Matrix and summary share one directory; make sure it exists before the tool writes.
mkdir -p "$(dirname "$out")"

# NUM_CORE_BARCODES=100: presumably restricts the matrix to the top 100 cell
# barcodes -- verify against the Drop-seq tools documentation.
"$dropseq_tools/DigitalExpression" \
     I="$genfun_tagged" \
     O="$out" \
     SUMMARY="$summary" \
     NUM_CORE_BARCODES=100

INFO	2019-11-05 19:20:12	DigitalExpression	

********** NOTE: Picard's command line syntax is changing.
**********
********** For more information, please see:
********** https://github.com/broadinstitute/picard/wiki/Command-Line-Syntax-Transition-For-Users-(Pre-Transition)
**********
********** The command line looks like this in the new syntax:
**********
**********    DigitalExpression -I /Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/sample_data/genfun_tagged_aligned_filtered_reads/genfun_tagged.sam -O /Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/sample_data/DGE_matrix/sample_dge_matrix.dge -SUMMARY /Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/sample_data/DGE_matrix/sample_dge_matrix_summary.txt -NUM_CORE_BARCODES 100
**********


19:20:13.361 INFO  NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/SPLiT-seq_DGE_matrix