## Downloading/installing required libraries

NOTE: At the moment this notebook is hard-coded to run the pipeline on the samples of GSE168595, and only the first sample listed in the metadata is processed.

In [None]:
# Install the system and Python tools used by the pipeline. `-y` pre-answers
# apt's confirmation prompt so the cell does not stall or abort when it is run
# non-interactively.
!sudo apt-get install -y fastqc
!sudo python3 -m pip install cutadapt
!pip install pysradb
# Fetch the dependencies archive from Google Drive, unpack it into the working
# directory, then delete the archive.
!gdown --id 1h2zIwVfOE0syTvVKlIHxgDtc0Fi2gMSq
!tar -zxvf dependencies.tar.gz 
!rm dependencies.tar.gz

## Granting execute permission to the fetched tools

In [None]:
# Mark both bundled command-line entry points as executable in one chmod call.
!chmod 755 dependencies/TrimGalore/trim_galore dependencies/enaBrowserTools/python3/enaDataGet

## Fetching datasets --> FASTQC --> TrimGalore --> Tophat 

In [None]:
import json
import os
import re
import shutil
import subprocess as sp
import zipfile

pwd = "dependencies/"

# Load the sample metadata from the dependencies folder and collect the
# accession of every sample listed under the third element of the JSON document.
with open(pwd + "metadata.json") as file:
    data = json.load(file)

samples = [entry['Accession'] for entry in data[2]['Samples']]
if not samples:
    raise ValueError("No samples listed in " + pwd + "metadata.json")

# Only the first sample is processed for now.
sample = samples[0]

#Downloading SRR files
# Ask pysradb to map the sample accession to its SRR run accession.
fetch_srr = sp.run(["pysradb", "gsm-to-srr", sample], capture_output=True, text=True)
console_output = fetch_srr.stdout
if fetch_srr.returncode != 0 or not console_output.split():
    # Without this check an empty reply surfaces as an opaque IndexError below.
    raise RuntimeError("pysradb could not resolve " + sample + " to an SRR run:\n" + fetch_srr.stderr)
# The run accession is taken as the last whitespace-separated token of the output.
SRR_file = (console_output.split()[-1])
print (SRR_file)

# Download into the current working directory: every later step addresses the
# sample folder as "./<SRR>", so the download target has to match it.
download_dir = os.getcwd()
ena = [pwd + "enaBrowserTools/python3/enaDataGet", "-f", "fastq", "-d", download_dir, SRR_file]
download = sp.run(ena, capture_output=True, text=True)
sample_dir = os.path.join(download_dir, SRR_file)
if download.returncode != 0 or not os.path.isdir(sample_dir):
    raise RuntimeError("enaDataGet failed to download " + SRR_file + ":\n" + download.stderr)
print ("Completed downloading files...\nRunning FASTQC")

# Folder that will receive the TrimGalore output; safe to re-run.
os.makedirs(os.path.join(sample_dir, "trimmed"), exist_ok=True)


In [None]:
#user defined function to run FASTQC
def run_fastqc(fastq_file):
  """Run the `fastqc` command on one or more FASTQ files.

  Args:
    fastq_file: Path to a FASTQ file, or several paths separated by
      whitespace. Each whitespace-separated token is passed to fastqc as a
      separate argument.

  Raises:
    RuntimeError: If fastqc exits with a non-zero status.
  """
  fastqc_cmd = ["fastqc"] + fastq_file.split()
  result = sp.run(fastqc_cmd, capture_output=True, text=True)
  # Output is captured, so a failure would otherwise go completely unnoticed.
  if result.returncode != 0:
    raise RuntimeError("FASTQC failed for " + fastq_file + ":\n" + result.stderr)
  print ("Completed running FASTQC...")

In [None]:
#User defined function to parse fastqc report
def fastqc_parse (folder, fqc_loc):
  """Count the modules marked "fail" in a zipped FastQC report.

  The member `<fqc_loc>/fastqc_data.txt` is extracted from
  `<folder>/<fqc_loc>.zip` into `folder` (overwriting any earlier copy) and
  then scanned.

  Args:
    folder: Directory that contains the report archive.
    fqc_loc: Report name without the ".zip" suffix; also the name of the
      top-level folder inside the archive.

  Returns:
    The number of module header lines (lines starting with ">>") whose
    status column is "fail".

  Raises:
    FileNotFoundError: If the archive, or fastqc_data.txt inside it, is missing.
  """
  archive = folder + "/" + fqc_loc + ".zip"
  member = fqc_loc + "/fastqc_data.txt"

  # Extract next to the archive rather than under a fixed absolute directory,
  # so the file that is read is always the one that was just extracted.
  with zipfile.ZipFile(archive) as report_zip:
    try:
      report_path = report_zip.extract(member, path=folder)
    except KeyError:
      raise FileNotFoundError(member + " not found in " + archive) from None

  fail = 0
  with open(report_path) as handle:
    for line in handle:
      # Only module headers carry a status; ignore "fail" appearing elsewhere
      # (for example inside a file name).
      if not line.startswith(">>"):
        continue
      fields = line.rstrip("\n").split("\t")
      if len(fields) > 1 and fields[-1].strip().lower() == "fail":
        fail += 1
  return fail

In [None]:
#User defined function to run TrimGalore
def trim_reads(file_to_trim,folder,paired=False):
  """Run the bundled TrimGalore script with FastQC on the given reads.

  Relies on the notebook-level names `pwd` (dependencies folder) and
  `SRR_file` (run accession); output is written to
  `./<SRR_file>/trimmed/<folder>`.

  Args:
    file_to_trim: Path to the FASTQ file, or two whitespace-separated paths
      for paired-end data.
    folder: Name of the sub-folder under `trimmed/` that receives the output.
    paired: When true, `--paired` is added to the command.

  Raises:
    RuntimeError: If trim_galore exits with a non-zero status.
  """
  print ("Running TrimGalore! ...")
  trim_cmd = [pwd + "TrimGalore/trim_galore", "--fastqc", "--illumina", "-q", "30"]
  if paired:
    trim_cmd.append("--paired")
  trim_cmd.extend(file_to_trim.split())
  # `-o` is the documented spelling of TrimGalore's output-directory option.
  trim_cmd.extend(["-o", "./" + SRR_file + "/trimmed/" + folder])

  result = sp.run(trim_cmd, capture_output=True, text=True)
  # Output is captured, so report a failure explicitly instead of claiming success.
  if result.returncode != 0:
    raise RuntimeError("TrimGalore failed for " + file_to_trim + ":\n" + result.stderr)
  print ("Completed running TrimGalore!")

In [None]:
#Running FASTQC and TrimGalore on downloaded files
SRR_dir = "./" + SRR_file
SRR_path = SRR_dir + "/" + SRR_file
trimmed_dir = SRR_dir + "/trimmed/round1"

# Decide between single-end and paired-end from the FASTQ files that are
# actually present. Counting directory entries breaks as soon as FASTQC has
# written its reports, i.e. whenever this cell is run a second time.
if os.path.isfile(SRR_path + "_1.fastq.gz") and os.path.isfile(SRR_path + "_2.fastq.gz"):
  paired = True
  raw_reads = [SRR_path + "_1.fastq.gz", SRR_path + "_2.fastq.gz"]
  raw_reports = [SRR_file + "_1_fastqc", SRR_file + "_2_fastqc"]
  trimmed_reads = [trimmed_dir + "/" + SRR_file + "_1_val_1.fq.gz",
                   trimmed_dir + "/" + SRR_file + "_2_val_2.fq.gz"]
  trimmed_reports = [SRR_file + "_1_val_1_fastqc", SRR_file + "_2_val_2_fastqc"]
elif os.path.isfile(SRR_path + ".fastq.gz"):
  paired = False
  raw_reads = [SRR_path + ".fastq.gz"]
  raw_reports = [SRR_file + "_fastqc"]
  trimmed_reads = [trimmed_dir + "/" + SRR_file + "_trimmed.fq.gz"]
  trimmed_reports = [SRR_file + "_trimmed_fastqc"]
else:
  raise FileNotFoundError("No FASTQ files found for " + SRR_file + " in " + SRR_dir)

# FASTQC on the raw reads, then TrimGalore (which runs FASTQC on its own
# output); count the failed modules before and after trimming.
run_fastqc(" ".join(raw_reads))
count_og = sum(fastqc_parse(SRR_dir, report) for report in raw_reports)
trim_reads(" ".join(raw_reads), "round1", paired)
count_trim = sum(fastqc_parse(trimmed_dir, report) for report in trimmed_reports)

if (count_og < count_trim):
  # Trimming made the report worse: keep the untouched reads.
  print ("File to use = original")
  reads_to_align = list(raw_reads)
else:
  if (count_og > count_trim):
    # Trimming helped. No second round is implemented, so continue with the
    # round-1 output instead of leaving reads_to_align empty, which made the
    # following cells fail.
    print ("Trimming reduced FASTQC failures; another trimming round may help")
  print ("File to use = trimmed")
  reads_to_align = list(trimmed_reads)


In [None]:
# Decompress the selected reads and remember the paths of the plain FASTQ files.
if not reads_to_align:
  raise ValueError("reads_to_align is empty; run the FASTQC/TrimGalore cell first")

fq_path = []
for gz_read in reads_to_align:
  fq_read = gz_read[:-3]  # drop the trailing ".gz"
  if os.path.exists(gz_read):
    # check=True surfaces gunzip failures; -f overwrites a stale decompressed copy.
    sp.run(["gunzip", "-f", gz_read], check=True)
  elif not os.path.exists(fq_read):
    raise FileNotFoundError("Neither " + gz_read + " nor " + fq_read + " exists")
  # else: already decompressed by an earlier run of this cell.
  fq_path.append(fq_read)

# Folder holding the reads, without the leading "./" and without the file name.
fq_direc = "/".join(reads_to_align[0].split('/')[1:-1])

In [None]:
# `-y` pre-answers apt's confirmation prompt so the cell runs unattended.
! sudo apt-get install -y tophat

In [None]:
# Download a file from Google Drive by its ID with gdown.
# NOTE(review): presumably this is N_crassa_genome.fna, which the next cell
# passes to bowtie2-build — confirm.
! gdown --id 1YA1EZ8YDHCEaM3dXGkmlWKZ5o5McSQ9I

In [None]:
# Build the bowtie2 index next to the reads. The index prefix is relative to
# the working directory, exactly as the tophat cell addresses it, instead of
# being pinned to an absolute directory.
index_genome = "bowtie2-build N_crassa_genome.fna " + fq_direc + "/N_Crassa-ref"
print(index_genome)
# check=True stops the notebook here if indexing fails, rather than letting
# the alignment step run against a missing index.
sp.run(index_genome.split(), check=True)

In [None]:
# Align the selected reads against the index built in the previous cell.
if len(fq_path) == 2:
  tophat_reads = fq_path[0] + " " + fq_path[1]
else:
  tophat_reads = fq_path[0]
tophat_job = "tophat -r 20 " + fq_direc + "/N_Crassa-ref " + tophat_reads

tophat_run = sp.run(tophat_job.split(), capture_output=True, text=True)
# The output is captured, so a failed alignment must be reported explicitly.
if tophat_run.returncode != 0:
  raise RuntimeError("tophat failed (exit code " + str(tophat_run.returncode) + "):\n" + tophat_run.stderr)
print ("Completed running TopHat")

In [None]:
# Remove the folder of the run that was processed above. Using SRR_file keeps
# the cleanup in step with the accession resolved from the metadata instead of
# a fixed run ID.
shutil.rmtree(SRR_file)