In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Never truncate cell contents when rendering DataFrames (long file paths stay fully visible).
pd.options.display.max_colwidth = None

In [3]:
def printSlurmCommands(row, slurm_node, time, logfile_directory, path_2_script, commands_file):
    """Append one step-1 ``sbatch`` command line for a read pair to the commands file.

    Parameters
    ----------
    row : iterable of 11 values
        ``(input1, input2, samplename, RG, adapter1, adapter2, reference_file,
        LB, PU, CN, output_directory)`` in exactly this order (e.g. a DataFrame
        row supplied by ``DataFrame.apply(..., axis=1)``).
    slurm_node : str
        Value passed to ``sbatch -p``.
    time : str
        Value passed to ``sbatch --time``.
    logfile_directory : str
        Directory used for the job's ``--output`` log file; the commands file
        is also located inside this directory.
    path_2_script : str
        Script that ``sbatch`` submits; it receives the 11 row values as
        positional arguments.
    commands_file : str
        Name of the file (inside ``logfile_directory``) to append the command to.

    Returns
    -------
    None
    """
    (input1, input2, samplename, RG, adapter1, adapter2,
     reference_file, LB, PU, CN, output_directory) = row

    slurm_command = " ".join([
        "sbatch",
        f"-p {slurm_node} --ntasks-per-node=24 --nodes=1",
        f"--job-name=wgs_processing_{samplename}_step1",
        f"--time={time}",
        f"--output={logfile_directory}/{samplename}_{RG}_step1.logfile",
        f"{path_2_script} {input1} {input2} {samplename} {RG} {adapter1} {adapter2} {reference_file} {LB} {PU} {CN} {output_directory}",
    ])

    # Derive the target path from the arguments instead of relying on a
    # notebook-level `commands_file_path` global: the function then works on a
    # fresh kernel regardless of cell order and honours its `commands_file` parameter.
    commands_file_path = f"{logfile_directory}/{commands_file}"
    with open(commands_file_path, "a") as sbatch_commands:
        sbatch_commands.write(slurm_command + "\n")

In [20]:
def printStep2SlurmCommands(samplename, basepath, slurm_node, time, logfile_directory, path_2_script, commands_file):
    """Append one step-2 ``sbatch`` command line for a sample to the commands file.

    Parameters
    ----------
    samplename : str
        Sample identifier; used in the job name, the log file name, and as the
        first positional argument of the submitted script.
    basepath : str
        Second positional argument of the submitted script.
    slurm_node : str
        Value passed to ``sbatch -p``.
    time : str
        Value passed to ``sbatch --time``.
    logfile_directory : str
        Directory used for the job's ``--output`` log file; the commands file
        is also located inside this directory.
    path_2_script : str
        Script that ``sbatch`` submits.
    commands_file : str
        Name of the file (inside ``logfile_directory``) to append the command to.

    Returns
    -------
    None
    """
    slurm_command = " ".join([
        "sbatch",
        f"-p {slurm_node} --ntasks-per-node=24 --nodes=1",
        f"--job-name=wgs_processing_{samplename}_step2",
        f"--time={time}",
        f"--output={logfile_directory}/{samplename}_step2.logfile",
        f"{path_2_script} {samplename} {basepath}",
    ])

    # Derive the target path from the arguments instead of relying on a
    # notebook-level `commands_file_path` global: the function then works on a
    # fresh kernel regardless of cell order and honours its `commands_file` parameter.
    commands_file_path = f"{logfile_directory}/{commands_file}"
    with open(commands_file_path, "a") as sbatch_commands:
        sbatch_commands.write(slurm_command + "\n")

In [5]:
basepath = "/gpfs/home/shkhalid/scratch/WGS_gentoo_penguins/merged_fastq_files"

# Each input is a header-less, single-column file; map the column name it
# should receive to the file (inside `basepath`) that provides it.
input_sources = {
    "input1": "input_reads1",
    "input2": "input_reads2",
    "sample_name": "sample_ids",
    "RG": "read_groups",
}
loaded_inputs = {
    column: pd.read_csv(f"{basepath}/{file_name}", header=None).rename(columns={0: column})
    for column, file_name in input_sources.items()
}
set1_reads = loaded_inputs["input1"]
set2_reads = loaded_inputs["input2"]
sample_names = loaded_inputs["sample_name"]
read_groups = loaded_inputs["RG"]

# Line i of every file describes the same read pair, so the frames are
# placed side by side (aligned on their default integer index).
merged_df = pd.concat(
    [set1_reads, set2_reads, sample_names, read_groups],
    axis=1,
)

In [6]:
# Paths in this block should be given with respect to the singularity-mounted
# directories (here /mnt and /mnt2), not the host filesystem.

# Adapter sequences; passed to the step-1 script as positional arguments.
adapter1 = "AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA"
adapter2 = "AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG"
# Reference FASTA handed to the step-1 script.
reference_file = "/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta"
# NOTE(review): LB / PU / CN are presumably SAM read-group tags (library,
# platform unit, sequencing center) — confirm against the pipeline script.
LB = "Veeramah"
PU = "BGI"
CN = "BGI"
# Passed to the step-1 script as its last argument and to the step-2 command
# generator as `basepath`.
output_directory = "/mnt/WGS_gentoo_penguins/processed_bams"

In [7]:
# --- SLURM submission settings -------------------------------------------
slurm_nodes = "long-28core"   # passed to `sbatch -p`
time = "480:00"               # passed to `sbatch --time`
logfile_directory = "/gpfs/home/shkhalid/scratch/WGS_gentoo_penguins/logfiles" # path2 logfile should be the actual directory name not singularity mount dir
path_2_script = "/gpfs/home/shkhalid/scratch/processing/singularity_complete_WGS_processing_pipeline_step2.sh"
commands_file = "step2_login_commands"

# The generated sbatch commands are collected in this file.
commands_file_path = f"{logfile_directory}/{commands_file}"

timestamp = datetime.now().strftime("%Y/%m/%d_%H:%M:%S")

# Start the commands file from scratch ("w" truncates any previous content)
# with a header line recording when it was created. The context manager
# guarantees the handle is closed even if the write fails.
with open(commands_file_path, "w") as f:
    f.write(f"#command created at: {timestamp}\n")

In [8]:
# Attach the run-wide constants to every row so that each row carries the
# full argument list for the step-1 script. Insertion order below determines
# the order in which the new columns are appended.
constant_columns = {
    "adapter1": adapter1,
    "adapter2": adapter2,
    "reference_file": reference_file,
    "LB": LB,
    "PU": PU,
    "CN": CN,
    "output_directory": output_directory,
}
for column_name, constant_value in constant_columns.items():
    merged_df[column_name] = constant_value

In [9]:
# Columns in the order that printSlurmCommands unpacks a row.
step1_columns = [
    "input1", "input2", "sample_name", "RG",
    "adapter1", "adapter2", "reference_file",
    "LB", "PU", "CN",
    "output_directory",
]

# Preview the last 748 rows. To write the step-1 sbatch commands for these
# rows, chain the following onto the expression below:
#   .apply(lambda row : printSlurmCommands(row, slurm_nodes, time, logfile_directory, path_2_script, commands_file), axis = 1)
merged_df[step1_columns].tail(748)

Unnamed: 0,input1,input2,sample_name,RG,adapter1,adapter2,reference_file,LB,PU,CN,output_directory
400,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/GERL_1/V350094799_L01_B5GPENwpyfRAAFA-577_1.fq.gz,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/GERL_1/V350094799_L01_B5GPENwpyfRAAFA-577_2.fq.gz,GERL_1,V350094799_L01_B5GPENwpyfRAAFA-577_2,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
401,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/GERL_1/V350094799_L01_B5GPENwpyfRAAFA-578_1.fq.gz,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/GERL_1/V350094799_L01_B5GPENwpyfRAAFA-578_2.fq.gz,GERL_1,V350094799_L01_B5GPENwpyfRAAFA-578_2,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
402,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/GERL_1/V350094799_L01_B5GPENwpyfRAAFA-579_1.fq.gz,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/GERL_1/V350094799_L01_B5GPENwpyfRAAFA-579_2.fq.gz,GERL_1,V350094799_L01_B5GPENwpyfRAAFA-579_2,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
403,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/GERL_1/V350094799_L01_B5GPENwpyfRAAFA-580_1.fq.gz,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/GERL_1/V350094799_L01_B5GPENwpyfRAAFA-580_2.fq.gz,GERL_1,V350094799_L01_B5GPENwpyfRAAFA-580_2,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
404,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/GERL_1/V350094799_L02_B5GPENwpyfRAAFA-573_1.fq.gz,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/GERL_1/V350094799_L02_B5GPENwpyfRAAFA-573_2.fq.gz,GERL_1,V350094799_L02_B5GPENwpyfRAAFA-573_2,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
...,...,...,...,...,...,...,...,...,...,...,...
1143,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-552_1.fq.gz,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-552_2.fq.gz,NOBR_5,V350094799_L02_B5GPENwpyfRAAEA-552_2,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
1144,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-553_1.fq.gz,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-553_2.fq.gz,NOBR_5,V350094799_L02_B5GPENwpyfRAAEA-553_2,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
1145,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-554_1.fq.gz,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-554_2.fq.gz,NOBR_5,V350094799_L02_B5GPENwpyfRAAEA-554_2,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
1146,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-555_1.fq.gz,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-555_2.fq.gz,NOBR_5,V350094799_L02_B5GPENwpyfRAAEA-555_2,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams


In [10]:
# Display every row, restricted to the columns consumed by the step-1 script
# (in the order its arguments are passed).
merged_df.loc[
    :,
    [
        "input1", "input2", "sample_name", "RG",
        "adapter1", "adapter2", "reference_file",
        "LB", "PU", "CN",
        "output_directory",
    ],
]

Unnamed: 0,input1,input2,sample_name,RG,adapter1,adapter2,reference_file,LB,PU,CN,output_directory
0,/mnt2/F21FTSUSAT0760-01_GENwatxR/soapnuke/clean/MOOT_3/V350034126_L03_B5GGENwatxRAAEA-509_1.fq.gz,/mnt2/F21FTSUSAT0760-01_GENwatxR/soapnuke/clean/MOOT_3/V350034126_L03_B5GGENwatxRAAEA-509_2.fq.gz,MOOT_3,V350034126_L03_B5GGENwatxRAAEA-509_1,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
1,/mnt2/F21FTSUSAT0760-01_GENwatxR/soapnuke/clean/MOOT_3/V350034126_L03_B5GGENwatxRAAEA-510_1.fq.gz,/mnt2/F21FTSUSAT0760-01_GENwatxR/soapnuke/clean/MOOT_3/V350034126_L03_B5GGENwatxRAAEA-510_2.fq.gz,MOOT_3,V350034126_L03_B5GGENwatxRAAEA-510_1,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
2,/mnt2/F21FTSUSAT0760-01_GENwatxR/soapnuke/clean/MOOT_3/V350034126_L04_B5GGENwatxRAAEA-509_1.fq.gz,/mnt2/F21FTSUSAT0760-01_GENwatxR/soapnuke/clean/MOOT_3/V350034126_L04_B5GGENwatxRAAEA-509_2.fq.gz,MOOT_3,V350034126_L04_B5GGENwatxRAAEA-509_1,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
3,/mnt2/F21FTSUSAT0760-01_GENwatxR/soapnuke/clean/MOOT_3/V350034126_L04_B5GGENwatxRAAEA-510_1.fq.gz,/mnt2/F21FTSUSAT0760-01_GENwatxR/soapnuke/clean/MOOT_3/V350034126_L04_B5GGENwatxRAAEA-510_2.fq.gz,MOOT_3,V350034126_L04_B5GGENwatxRAAEA-510_1,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
4,/mnt2/F21FTSUSAT0760-01_GENwatxR/soapnuke/clean/NEKO_12/V350034126_L03_B5GGENwatxRAACA-505_1.fq.gz,/mnt2/F21FTSUSAT0760-01_GENwatxR/soapnuke/clean/NEKO_12/V350034126_L03_B5GGENwatxRAACA-505_2.fq.gz,NEKO_12,V350034126_L03_B5GGENwatxRAACA-505_1,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
...,...,...,...,...,...,...,...,...,...,...,...
1143,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-552_1.fq.gz,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-552_2.fq.gz,NOBR_5,V350094799_L02_B5GPENwpyfRAAEA-552_2,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
1144,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-553_1.fq.gz,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-553_2.fq.gz,NOBR_5,V350094799_L02_B5GPENwpyfRAAEA-553_2,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
1145,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-554_1.fq.gz,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-554_2.fq.gz,NOBR_5,V350094799_L02_B5GPENwpyfRAAEA-554_2,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams
1146,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-555_1.fq.gz,/mnt2/F22FTSUSAT0801_PENwpyfR/upload/soapnuke/clean/NOBR_5/V350094799_L02_B5GPENwpyfRAAEA-555_2.fq.gz,NOBR_5,V350094799_L02_B5GPENwpyfRAAEA-555_2,AGTCGGAGGCCAAGCGGTCTTAGGAAGACAANNNNNNNNNNCAACTCCTTGGCTCACA,AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG,/mnt2/gentoo_pacbio_ref/GEPE-SCRY-2.asm.bp.p_ctg.fasta,Veeramah,BGI,BGI,/mnt/WGS_gentoo_penguins/processed_bams


In [11]:
# Sanity check: (number of read pairs, number of script arguments).
columns_to_check = [
    "input1", "input2", "sample_name", "RG",
    "adapter1", "adapter2", "reference_file",
    "LB", "PU", "CN",
    "output_directory",
]
merged_df[columns_to_check].shape

(1148, 11)

In [12]:
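# Row split of merged_df between the two submission batches (400 + 748 = 1148 rows):
#   for milan: 400 rows (presumably the first 400, i.e. .head(400) — confirm)
#   for login: 748 rows (the last 748, i.e. .tail(748))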

In [13]:
base_folder = "/gpfs/home/shkhalid/scratch/WGS_gentoo_penguins"

# Expected outputs: one pattern per line, loaded as-is.
files_to_create = (
    pd.read_csv(f"{base_folder}/complete_list_of_files_to_create", header=None)
    .rename(columns={0: "files"})
)

# Outputs that already exist: one path per line.
files_created = (
    pd.read_csv(f"{base_folder}/files_created", header=None)
    .rename(columns={0: "files"})
)
# Reduce each path to its basename, keep the part before the first ".",
# and append "*RG.bam" so the value is comparable to `files_to_create`.
files_created["files"] = files_created["files"].apply(
    lambda path: path.split("/")[-1].split(".")[0] + "*RG.bam"
)

In [14]:
# Outer-join the created and expected lists; rows flagged "right_only" exist
# only in `files_to_create`, i.e. they are expected but have not been created.
merge_status = files_created.merge(
    files_to_create, on="files", how="outer", indicator=True
)
merge_status[merge_status["_merge"] == "right_only"]

Unnamed: 0,files,_merge
1094,JOUG_5_V350094945_L01_B5GPENwpyfRABGA-555_2*RG.bam,right_only
1095,JOUG_5_V350094945_L01_B5GPENwpyfRABGA-556_2*RG.bam,right_only
1096,JOUG_5_V350094945_L02_B5GPENwpyfRABGA-549_2*RG.bam,right_only
1097,JOUG_5_V350094945_L02_B5GPENwpyfRABGA-550_2*RG.bam,right_only
1098,JOUG_5_V350094945_L02_B5GPENwpyfRABGA-551_2*RG.bam,right_only
1099,JOUG_5_V350094945_L02_B5GPENwpyfRABGA-552_2*RG.bam,right_only
1100,JOUG_5_V350094945_L02_B5GPENwpyfRABGA-553_2*RG.bam,right_only
1101,JOUG_5_V350094945_L02_B5GPENwpyfRABGA-554_2*RG.bam,right_only
1102,JOUG_5_V350094945_L02_B5GPENwpyfRABGA-555_2*RG.bam,right_only
1103,JOUG_5_V350094945_L02_B5GPENwpyfRABGA-556_2*RG.bam,right_only


In [21]:
# One step-2 job per distinct sample (samples appear on several rows because
# each sample has multiple read groups).
list_of_samples = merged_df["sample_name"].unique()
for sample in list_of_samples:
    printStep2SlurmCommands(
        samplename=sample,
        basepath=output_directory,
        slurm_node=slurm_nodes,
        time=time,
        logfile_directory=logfile_directory,
        path_2_script=path_2_script,
        commands_file=commands_file,
    )

sbatch -p long-28core --ntasks-per-node=24 --nodes=1 --job-name=wgs_processing_MOOT_3_step2 --time=480:00 --output=/gpfs/home/shkhalid/scratch/WGS_gentoo_penguins/logfiles/MOOT_3_step2.logfile /gpfs/home/shkhalid/scratch/processing/singularity_complete_WGS_processing_pipeline_step2.sh MOOT_3 /mnt/WGS_gentoo_penguins/processed_bams
sbatch -p long-28core --ntasks-per-node=24 --nodes=1 --job-name=wgs_processing_NEKO_12_step2 --time=480:00 --output=/gpfs/home/shkhalid/scratch/WGS_gentoo_penguins/logfiles/NEKO_12_step2.logfile /gpfs/home/shkhalid/scratch/processing/singularity_complete_WGS_processing_pipeline_step2.sh NEKO_12 /mnt/WGS_gentoo_penguins/processed_bams
sbatch -p long-28core --ntasks-per-node=24 --nodes=1 --job-name=wgs_processing_PCHA_8_step2 --time=480:00 --output=/gpfs/home/shkhalid/scratch/WGS_gentoo_penguins/logfiles/PCHA_8_step2.logfile /gpfs/home/shkhalid/scratch/processing/singularity_complete_WGS_processing_pipeline_step2.sh PCHA_8 /mnt/WGS_gentoo_penguins/processed_bam