# Installation
Install the conda environment; installing it into the project directory is recommended:

`conda install -p ./.venv -c conda-forge -c bioconda --file ./notebooks/covid19_leader.requirements.txt`

`. activate ./.venv`

Add the associated input and output folders. The default locations are `./input` and `./output`.

NOTE: For `./sars_cov2_leader.sh` to work on macOS, you will also need to install `md5sum` (e.g. `brew install md5sha1sum`).

In [None]:
from dotenv import load_dotenv, find_dotenv
import os
from concurrent.futures import ThreadPoolExecutor # For parallel processing
import subprocess
import math
from pathlib import Path
from typing import Tuple

In [None]:
%pwd

In [None]:
# Load settings from the nearest .env file (if any) into the process environment.
load_dotenv(find_dotenv())
INPUT_DIR = os.getenv('INPUT_DIR', "./input")
SARS_COV_2_LEADER_PROGRAM_PATH = os.getenv('SARS_COV_2_LEADER_PROGRAM_PATH', "./scripts/sars_cov2_leader.sh")
# NOTE: the output folder is read from the OUT_DIR variable (not OUTPUT_DIR).
OUTPUT_DIR = os.getenv('OUT_DIR', "./output")
# Environment values are always strings, so convert explicitly: THREADS is
# used as a worker count and in arithmetic further down.
THREADS = int(os.getenv('THREADS', 12))
# A plain truthiness check would treat the string "False" as True, so parse
# the usual "off" spellings explicitly; anything else enables execution.
EXECUTE_COMMANDS = str(os.getenv('EXECUTE_COMMANDS', True)).strip().lower() not in ("", "0", "false", "no", "off")

PROJECT_PATH = os.getcwd()

In [None]:
def get_bam_files(input_dir:Path, bam_extension:str=".bam") -> list[Path]:
    """Recursively collect files under ``input_dir`` whose name ends with ``bam_extension``.

    Args:
        input_dir: Root folder to walk (top-down).
        bam_extension: Filename suffix to match; defaults to ``".bam"``.

    Returns:
        Absolute paths (as produced by ``os.path.abspath``, i.e. strings) of
        every matching file, in the order ``os.walk`` yields them.
    """
    return [
        os.path.abspath(os.path.join(folder, filename))
        for folder, _subfolders, filenames in os.walk(input_dir, topdown=True)
        for filename in filenames
        if filename.endswith(bam_extension)
    ]

In [None]:
# Collect every *.bam file found (recursively) under INPUT_DIR.
bam_files = get_bam_files(INPUT_DIR)

In [None]:
# Sanity check: show how many BAM files were found and the first ten paths.
print(f"Length of bam_files: {len(bam_files)}\nFirst 10: {bam_files[:10]}\n")

NOTE: If the command in the next cell fails, it creates a bunch of folders in the main directory.

In [None]:
def find_leaders_in_bam(bam_files: list[Path], output:Path, program_path:Path=SARS_COV_2_LEADER_PROGRAM_PATH, reference_name:str='MN908947.3', quality:int=30) -> list[Path]:
    """Run the leader script on every BAM file that has no output folder yet.

    For each BAM file a shell command is built that calls ``program_path``
    with ``-i <bam> -r <reference_name> -q <quality> -t 1 -o <leaders.txt>``
    and then moves the ``<sample>_leader_data`` folder from the current
    working directory into ``output``. Commands run in parallel on a thread
    pool sized from the module-level ``THREADS`` and are only executed when
    the module-level ``EXECUTE_COMMANDS`` is truthy.

    Args:
        bam_files: BAM files to process.
        output: Folder receiving the ``.leaders.txt`` files and the
            ``*_leader_data`` folders.
        program_path: Script to execute for each BAM file.
        reference_name: Value passed to the script's ``-r`` option.
        quality: Value passed to the script's ``-q`` option.

    Returns:
        The expected ``<sample>_leader_data`` folder inside ``output`` for
        every input BAM file, whether it already existed or was scheduled.
    """
    import shlex  # local import: only needed here to quote paths for the shell

    output_folders: list[Path] = []
    commands: list[str] = []
    # Each process runs really fast, so give every process a single thread and
    # run many processes side by side instead of pooling threads into one.
    process_threads: int = 1

    for bam_file in bam_files:
        # splitext instead of a fixed [:-4] slice so other extensions also work.
        sample_name: str = os.path.splitext(os.path.basename(bam_file))[0]
        produced_folder: str = f"{sample_name}_leader_data"
        expected_output_folder: Path = os.path.join(output, produced_folder)
        out_leaders_txt: Path = os.path.join(output, f"{sample_name}.leaders.txt")
        output_folders.append(expected_output_folder)
        if os.path.isdir(expected_output_folder):
            continue  # already processed on an earlier run
        # program_path is left unquoted so a value that already contains an
        # interpreter prefix keeps working; file arguments are quoted so
        # paths with spaces or shell metacharacters survive.
        commands.append(
            f"{program_path} -i {shlex.quote(str(bam_file))} -r {shlex.quote(str(reference_name))} "
            f"-q {quality} -t {process_threads} -o {shlex.quote(str(out_leaders_txt))};"
            f"mv {shlex.quote(produced_folder)} {shlex.quote(str(expected_output_folder))};"
        )

    if not EXECUTE_COMMANDS or not commands:
        return output_folders

    # THREADS may arrive as a string from the environment; never go below one worker.
    max_workers: int = max(1, int(THREADS) // process_threads)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        submitted = [
            (command, executor.submit(subprocess.run, command, shell=True))
            for command in commands
        ]
        for i, (command, future) in enumerate(submitted, start=1):
            try:
                completed = future.result()
            except Exception as e:  # keep going: one failing sample must not stop the batch
                print(f"Error: {e}")
                print(f"Command: {command}")
                continue
            if completed.returncode != 0:
                print(f"Error: exit status {completed.returncode}")
                print(f"Command: {command}")
                continue
            print(f"{i}/{len(submitted)} Ran command: {command}", end="\r")
    return output_folders

In [None]:
# Run the leader script for the BAM files found above, writing results into OUTPUT_DIR.
leader_folder = find_leaders_in_bam(bam_files, OUTPUT_DIR)

In [None]:
# Collect the *.bam files that now exist under OUTPUT_DIR.
# NOTE(review): presumably these are produced by the leader step — confirm.
output_bam_files = get_bam_files(OUTPUT_DIR)

In [None]:
def calculate_depth_on_leader_data(bam_files:list[Path], output:Path) -> list[Path]:
    """Run ``samtools depth`` for every BAM file that has no depth file yet.

    The depth file for ``<name>.bam`` is ``<output>/<name>.depth.txt``. Files
    that already exist are not recomputed. Every pending command is printed;
    it is only executed when the module-level ``EXECUTE_COMMANDS`` is truthy,
    using a thread pool sized from the module-level ``THREADS``.

    Args:
        bam_files: BAM files to compute depth for.
        output: Folder the ``.depth.txt`` files are written to.

    Returns:
        The depth file path for every input BAM file, whether it already
        existed or was scheduled for creation.
    """
    import shlex  # local import: only needed here to quote paths for the shell

    calculated_depth_files: list[Path] = []
    pending: list[tuple[Path, str]] = []  # (bam file, shell command) pairs
    for bam_file in bam_files:
        # splitext instead of a fixed [:-4] slice so other extensions also work.
        sample_name: str = os.path.splitext(os.path.basename(bam_file))[0]
        output_depth_file: Path = os.path.join(output, f"{sample_name}.depth.txt")
        calculated_depth_files.append(output_depth_file)
        if not os.path.isfile(output_depth_file):
            # Quote both paths so spaces or shell metacharacters cannot break the redirect.
            command: str = f"samtools depth {shlex.quote(str(bam_file))} > {shlex.quote(str(output_depth_file))}; "
            pending.append((bam_file, command))

    if not EXECUTE_COMMANDS:
        # Dry run: show what would be executed without starting any worker.
        for _bam_file, command in pending:
            print(f"\n{command}")
        return calculated_depth_files
    if not pending:
        return calculated_depth_files

    # THREADS may arrive as a string from the environment; never go below one worker.
    with ThreadPoolExecutor(max_workers=max(1, int(THREADS))) as executor:
        submitted = []
        for bam_file, command in pending:
            print(f"\n{command}")
            submitted.append((bam_file, command, executor.submit(subprocess.run, command, shell=True)))
        # Report against the BAM file that belongs to each finished command.
        for i, (bam_file, command, future) in enumerate(submitted, start=1):
            try:
                completed = future.result()
            except Exception as e:  # keep going: one failing sample must not stop the batch
                print(f"Error: {e}")
                print(f"Command: {command}")
                print(f"Error processing {bam_file}")
                continue
            if completed.returncode != 0:
                print(f"Error: exit status {completed.returncode}")
                print(f"Command: {command}")
                print(f"Error processing {bam_file}")
                continue
            print(f"{i}/{len(submitted)} Depth calculated: {bam_file}", end="\r")
    return calculated_depth_files

In [None]:
# Compute a depth file in OUTPUT_DIR for each BAM file collected from OUTPUT_DIR.
depth_files = calculate_depth_on_leader_data(output_bam_files, OUTPUT_DIR)

In [None]:
# Helper for when the depth files were already generated on an earlier run
def scan_for_depth_files(output:Path) -> list[Path]:
    """Recursively collect every ``*.depth.txt`` file below ``output``.

    Args:
        output: Root folder to walk.

    Returns:
        Absolute paths (strings from ``os.path.abspath``) of all files whose
        name ends with ``.depth.txt``, in ``os.walk`` order.
    """
    return [
        os.path.abspath(os.path.join(folder, filename))
        for folder, _subfolders, filenames in os.walk(output)
        for filename in filenames
        if filename.endswith(".depth.txt")
    ]

In [None]:
# Rebind depth_files to every *.depth.txt file currently present under OUTPUT_DIR.
depth_files = scan_for_depth_files(OUTPUT_DIR)

In [None]:
def parse_depth_on_sites_of_interest(depth_files: list[Path], sites_of_interest: list[int], output: Path):
    """Extract the depth at selected positions from several depth files into one TSV.

    Each depth file is read as tab-separated text with the position in the
    second column and the depth in the third. For every line whose position
    is in ``sites_of_interest`` a row ``position<TAB>depth<TAB>sample`` is
    written, where ``sample`` is the depth file's base name up to its first
    ``"."``. Blank lines and lines starting with ``#`` are skipped.

    Args:
        depth_files: Depth files to read.
        sites_of_interest: Positions to keep.
        output: Path of the TSV to create (overwritten if it exists).
    """
    # Build the lookup once: set membership is O(1) per depth line.
    wanted_sites = set(sites_of_interest)
    with open(output, "w") as out_file:
        for depth_file in depth_files:
            sample_name: str = os.path.basename(depth_file).split(".")[0]
            with open(depth_file, "r") as depth_file_stream:
                for line in depth_file_stream:
                    stripped = line.strip()
                    if not stripped or stripped.startswith("#"):
                        continue  # blank line or header/comment line
                    fields = stripped.split("\t")
                    position = int(fields[1])
                    depth = int(fields[2])
                    if position in wanted_sites:
                        print(f"{position}\t{depth}\t{sample_name}", file=out_file)

In [None]:
# Positions whose depth is extracted from every depth file.
# NOTE(review): the name suggests these come from a publication — confirm and cite the source.
sites_from_paper = [55, 21552, 25385, 26237, 26469, 27041, 27388, 27644, 27884, 28256, 29530]
# Write the depth at those positions, for all depth files, into one TSV in OUTPUT_DIR.
parse_depth_on_sites_of_interest(depth_files, sites_from_paper, f"{OUTPUT_DIR}/COVID_leader_splice_sites.tsv")

In [None]:
# Note: the proportions are based only on the sites included; beware of missing sites and of double counting (nearby sites which are artifacts)
def make_proportional_samples(parsed_depth_file:Path, proportional_file:Path):
    """Convert per-site counts into per-sample proportions.

    Reads a tab-separated file with rows ``position<TAB>count<TAB>sample``
    and writes ``sample<TAB>position<TAB>proportion<TAB>count`` rows preceded
    by a ``#`` header line. The proportion is the count divided by the sum of
    all counts recorded for that sample.

    Blank lines and lines starting with ``#`` are skipped. Rows that cannot
    be parsed are printed and ignored. If a position occurs more than once
    for a sample, the last count wins. A sample whose counts sum to zero gets
    a proportion of ``0.0`` for each of its positions.

    Args:
        parsed_depth_file: Input TSV to read.
        proportional_file: Output TSV to create (overwritten if it exists).
    """
    sample_count: dict[str, dict[int, int]] = {}
    with open(parsed_depth_file, "r") as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            fields = stripped.split("\t")
            try:
                pos = int(fields[0])
                counts = int(fields[1])
                sample = fields[2]
            except (ValueError, IndexError):
                # Malformed row: report it and carry on with the remaining rows.
                print(fields)
                continue
            sample_count.setdefault(sample, {})[pos] = counts
    with open(proportional_file, "w") as out_file:
        print("#sample_name\tposition\tproportion\tcount", file=out_file)
        for sample, counts_by_pos in sample_count.items():
            total = sum(counts_by_pos.values())
            for pos, counts in counts_by_pos.items():
                # Guard against a zero total so an all-zero sample cannot divide by zero.
                proportion = counts / total if total else 0.0
                print(f"{sample}\t{pos}\t{proportion}\t{counts}", file=out_file)


In [None]:
# Turn the per-site depth TSV into per-sample proportions, written next to it in OUTPUT_DIR.
make_proportional_samples(f"{OUTPUT_DIR}/COVID_leader_splice_sites.tsv", f"{OUTPUT_DIR}/COVID_leader_splice_sites.proportional.tsv")