## Installing dependencies

In [None]:
# Attach Google Drive to the Colab VM; the project folder is reached through this mount.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
!sudo apt-get install fastqc
!sudo python3 -m pip install cutadapt
!pip install pysradb==1.0.1
%cd /content/drive/MyDrive/Bioethanol_pipeline
!pwd
!gdown --id 1D0UFQJrkBfXSiDXdsp0VQLA8YYWY0INk
!tar -zxvf dependencies.tar.gz 
!rm dependencies.tar.gz

## Granting execute permission to the fetched tools

In [None]:
!chmod 755 dependencies/TrimGalore/trim_galore
!chmod 755 dependencies/enaBrowserTools/python3/enaDataGet

## Fetching datasets --> FastQC --> TrimGalore --> HISAT2

In [None]:
import subprocess as sp
import os
import json
import re

# Folder (relative to the working directory) that holds the unpacked tools.
pwd = "dependencies/"

## Converting given GSE id to GSM ids

gse_id = input("Provide a GSE id:").strip()
fetch_gsm = sp.run(["pysradb", "gse-to-gsm", gse_id], capture_output=True, text=True)
console_output = fetch_gsm.stdout

# The first output line is treated as a header and skipped; blank lines are
# ignored. Deleting fixed positions (first/last) raised IndexError whenever
# the command printed nothing.
gsm_ids = [line for line in console_output.splitlines()[1:] if line.strip()]

# The GSM id is the second column. Split on any run of whitespace so padded
# or tab-separated columns still yield the id instead of an empty string.
samples = []
for gsm_id in gsm_ids:
	fields = gsm_id.split()
	if len(fields) >= 2:
		samples.append(fields[1])

if not samples:
	# Fail here with the tool's own message rather than with an IndexError later on.
	raise RuntimeError(
		"No GSM ids found for '" + gse_id + "'. pysradb stderr:\n" + fetch_gsm.stderr
	)

for sample in samples:
	## Downloading SRR files
	fetch_srr = sp.run(["pysradb", "gsm-to-srr", sample], capture_output=True, text=True)
	console_output = fetch_srr.stdout
	srr_fields = console_output.split()
	if not srr_fields:
		# No output means the lookup failed; taking [-1] of an empty list
		# would stop the whole run with IndexError, so skip this sample.
		print ("Could not resolve an SRR accession for " + sample + ", skipping")
		print (fetch_srr.stderr)
		continue
	# The last whitespace-separated token of the output is used as the accession.
	SRR_file = srr_fields[-1]
	print (SRR_file)

	# Download the FASTQ file(s); the rest of the loop expects them in ./<SRR_file>/
	ena = [pwd + "enaBrowserTools/python3/enaDataGet", "-f", "fastq", "-d", "./", SRR_file]
	download = sp.run(ena, capture_output=True, text=True)
	if not os.path.isdir("./" + SRR_file):
		# Without the download folder every later step would fail; report and move on.
		print ("Download of " + SRR_file + " failed, skipping")
		print (download.stderr)
		continue
	print ("Completed downloading files...\nRunning FASTQC")

	# Output folder for TrimGalore; exist_ok keeps re-runs from failing.
	os.makedirs("./" + SRR_file + "/trimmed", exist_ok=True)
	
	#user defined function to run FASTQC
	def run_fastqc(fastq_file):
		"""Run the `fastqc` command on one or more FASTQ files.

		fastq_file is a string with one path, or several paths separated by
		whitespace; every whitespace-separated token becomes its own
		command-line argument. The command's stderr is printed afterwards.
		"""
		print ("Running FASTQC on {}...".format(fastq_file))
		argv = "fastqc {}".format(fastq_file).split()
		result = sp.run(argv, capture_output=True, text=True)
		print (result.stderr)

	#User defined function to parse fastqc report
	def fastqc_parse (folder, fqc_loc):
		"""Count the lines containing "fail" in a FastQC data report.

		Extracts <fqc_loc>/fastqc_data.txt from <folder>/<fqc_loc>.zip into
		<folder>, then returns the number of lines of that file that contain
		the text "fail".

		folder  -- directory that holds the FastQC zip archive
		fqc_loc -- archive name without ".zip"; also the folder name inside the archive
		Raises FileNotFoundError (from open) when the report was not extracted.
		"""
		report = folder + "/" + fqc_loc + "/fastqc_data.txt"
		# -o overwrites a report extracted by an earlier run; otherwise unzip
		# leaves the old file in place and the count describes stale data.
		unzip = ["unzip", "-o", folder + "/" + fqc_loc + ".zip", fqc_loc + "/fastqc_data.txt", "-d", "./" + folder]
		sp.run(unzip, capture_output=True)

		print ("location: " + report)
		with open (report) as handle:
			# Plain substring test; equivalent to the regex search for the literal "fail".
			return sum(1 for line in handle if "fail" in line)

	#User defined function to run TrimGalore
	def trim_reads(file_to_trim,folder,paired=False):
		"""Run TrimGalore on the given read file(s).

		file_to_trim -- one path, or two whitespace-separated paths for paired reads
		folder       -- sub-folder of ./<SRR_file>/trimmed/ that receives the output
		paired       -- when true, `--paired` is added to the command line

		Uses the notebook-level names `pwd` (tools folder) and `SRR_file`
		(current accession). TrimGalore's stderr is printed when it finishes.
		"""
		print ("Running TrimGalore! ...")
		out_dir = "./" + SRR_file + "/trimmed/" + folder
		layout_flag = "--paired " if paired else ""
		trim_cmd = pwd + "TrimGalore/trim_galore --fastqc --illumina -q 30 " + layout_flag + file_to_trim + " -O " + out_dir

		result = sp.run(trim_cmd.split(), capture_output=True, text=True)
		print (result.stderr)
		print ("Completed running TrimGalore!")

	#Running FASTQC and TrimGalore on downloaded files
	SRR_path = "./" + SRR_file + "/" + SRR_file
	round1_dir = "./" + SRR_file + "/trimmed/round1"
	reads_to_align = []

	# Detect paired-end data from the mate files themselves. Counting the
	# directory entries treats a single-end sample as paired as soon as the
	# folder holds anything extra (for example reports left by an earlier run).
	is_paired = os.path.exists(SRR_path + "_1.fastq.gz") and os.path.exists(SRR_path + "_2.fastq.gz")

	if is_paired:
		original_reads = [SRR_path + "_1.fastq.gz", SRR_path + "_2.fastq.gz"]
		trimmed_reads = [round1_dir + "/" + SRR_file + "_1_val_1.fq.gz", round1_dir + "/" + SRR_file + "_2_val_2.fq.gz"]
		og_reports = [SRR_file + "_1_fastqc", SRR_file + "_2_fastqc"]
		trim_reports = [SRR_file + "_1_val_1_fastqc", SRR_file + "_2_val_2_fastqc"]
		pad = "\n\n"  # paired-end status messages are printed with blank lines around them
	else:
		original_reads = [SRR_path + ".fastq.gz"]
		trimmed_reads = [round1_dir + "/" + SRR_file + "_trimmed.fq.gz"]
		og_reports = [SRR_file + "_fastqc"]
		trim_reports = [SRR_file + "_trimmed_fastqc"]
		pad = ""

	# FastQC on the raw reads, one trimming round, then compare the "fail" counts
	# of the reports before and after trimming (summed over both mates if paired).
	run_fastqc(" ".join(original_reads))
	count_og = sum(fastqc_parse("./" + SRR_file, report) for report in og_reports)
	trim_reads(" ".join(original_reads), "round1", is_paired)
	count_trim = sum(fastqc_parse(round1_dir, report) for report in trim_reports)

	# Align the trimmed reads unless trimming made the report worse. When trimming
	# reduced the failures, this step used to announce "Trimming again" but never
	# re-trimmed and fell back to the raw reads, discarding the improvement.
	# NOTE(review): a second trimming round may have been intended here - confirm.
	if count_trim <= count_og:
		print (pad + "File to use = trimmed" + pad)
		reads_to_align = trimmed_reads
	else:
		print (pad + "File to use = original" + pad)
		reads_to_align = original_reads

	# Decompress the chosen reads. gunzip replaces "<name>.gz" with "<name>",
	# so the paths used for alignment are the read paths minus ".gz".
	fq_path = []
	for gz_read in reads_to_align:
		fq_path.append(gz_read[:-3])
		sp.run(["gunzip", gz_read])
	# Folder of the chosen reads without the leading "./"; index and SAM output go there.
	fq_direc = "/".join(reads_to_align[0].split('/')[1:-1])

	# Build the HISAT2 index of the reference genome next to the reads.
	# NOTE(review): this sits inside the per-sample loop, so the index is rebuilt
	# for every sample - consider building it once before the loop.
	fna_file_name = "Avibrio_tcellus_genome.fna"
	index_prefix = fq_direc + "/" + fna_file_name[0:-4] + "-ref"
	index_genome = ["./dependencies/hisat2-2.2.1/hisat2-build", "-p", "16", fna_file_name, index_prefix]
	sp.run(index_genome)

	# Align: -1/-2 for a pair of mate files, -U for a single file.
	hisat2_job = ["./dependencies/hisat2-2.2.1/hisat2", "-x", index_prefix]
	if len(fq_path) == 2:
		hisat2_job += ["-1", fq_path[0], "-2", fq_path[1]]
	else:
		hisat2_job += ["-U", fq_path[0]]
	hisat2_job += ["-S", fq_direc + "/" + SRR_file + ".sam"]

	# text=True so the captured stderr prints as readable text, not as a bytes repr.
	temp = sp.run(hisat2_job, capture_output=True, text=True)
	print (temp.stderr)

Provide a GSE id:GSE51745


IndexError: ignored

In [None]:
from google.colab import drive
drive.mount('/content/drive/My Drive/Bioethanol_pipeline')

In [None]:
# Recursively delete the SRR5684121/ folder, relative to the current working directory.
!rm -r SRR5684121/