### Extract Sample Names from TCGA Aliquot IDs (TCGA-BRCA)

In [None]:
import polars as pl
import numpy as np
import os

def get_filepaths(directory):
    """Return the path of every file found anywhere under ``directory``.

    The tree is walked with ``os.walk``; each returned path is the walked
    folder joined with the file name. Sub-directories themselves are not
    listed, only the files they contain.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    ]

# Locations used by this cell.
# NOTE(review): these are absolute, user-specific paths; adjust them when
# running the notebook elsewhere.
input_dir = "/scratch/users/ntu/suffiazi/outputs/BRCA-diff-footprinting/test"
aliquot_id_dir = "/home/users/ntu/suffiazi/scripts/gatk-workflow-scripts/output_files/aliquot_ids"

# get all file paths
filepaths = get_filepaths(input_dir)

# extract identifiers: the part of each file name before the first underscore
identifiers = [os.path.basename(path).split("_")[0] for path in filepaths]
print(identifiers)

# map each identifier to its file path; when several files share an identifier
# the last one wins (as before), but the collision is now reported instead of
# being dropped silently
id_filepath_dict = {}
for identifier, path in zip(identifiers, filepaths):
    if identifier in id_filepath_dict:
        print(
            f"Warning: identifier {identifier} maps to more than one file; "
            f"using {path} instead of {id_filepath_dict[identifier]}"
        )
    id_filepath_dict[identifier] = path

# initialize empty dictionary to store column names
id_cols_dict = {}

# loop through dictionary and collect the column names containing "aliquot"
for ids, paths in id_filepath_dict.items():
    # only the header is needed, so do not parse the data rows (n_rows=0)
    header = pl.read_csv(paths, separator="\t", n_rows=0)
    id_cols_dict[ids] = [col for col in header.columns if "aliquot" in col]

print(id_cols_dict)

# save each dictionary key-value pair as a one-column text file
# (create the output directory first so open() cannot fail on a fresh checkout)
os.makedirs(aliquot_id_dir, exist_ok=True)
for ids, cols in id_cols_dict.items():
    with open(os.path.join(aliquot_id_dir, f"{ids}_aliquot_IDs.txt"), "w") as f:
        f.writelines(col + "\n" for col in cols)

In [None]:
%%bash
# run this cell to pull uuid from aliquot ids on the GDC portal
#
# For every "<dataset_id>_aliquot_IDs.txt" file, each line is sent as a query to
# the GDC API, the aliquot nodes carrying both an aliquot_id and a submitter_id
# are extracted with jq, and the unique JSON lines are stored in
# "<dataset_id>_uuids.txt".

aliquot_dir="/home/users/ntu/suffiazi/scripts/gatk-workflow-scripts/output_files/aliquot_ids"
mapping_dir="/home/users/ntu/suffiazi/scripts/gatk-workflow-scripts/output_files/aliquot_id-uuid_mapping"

# make sure the output directory exists before redirecting into it
mkdir -p "${mapping_dir}"

for file in "${aliquot_dir}"/*_aliquot_IDs.txt; do
    # an unmatched glob is left as the literal pattern; skip it
    [ -e "${file}" ] || continue

    # get dataset id from file name
    dataset_id=$(basename "${file}" | cut -d "_" -f 1)
    tmp_file="${mapping_dir}/${dataset_id}_uuids.tmp"

    # start from an empty tmp file so leftovers from an aborted run are not mixed in
    : > "${tmp_file}"

    while read -r line; do
        # nothing to query for a blank line
        [ -n "${line}" ] || continue
        # -sS: hide the progress meter but still report errors
        curl -sS "https://api.gdc.cancer.gov/v0/all?query=${line}&size=5" | jq -c '.data.query.hits[].samples.hits.edges[].node.portions.hits.edges[].node.analytes.hits.edges[].node.aliquots.hits.edges[]?.node | select(.aliquot_id and .submitter_id)' | grep "_aliquot" >> "${tmp_file}"
    done < "${file}"

    # keep unique lines only and remove the tmp file
    sort -u "${tmp_file}" > "${mapping_dir}/${dataset_id}_uuids.txt"
    rm "${tmp_file}"
done

In [None]:
import json
import glob
import os
import polars as pl
from collections import Counter

def unique_value_generator():
    """Yield 1, 2, 3, ... without end.

    Used to hand out distinct numeric suffixes when technical replicates
    would otherwise share the same sample column name.
    """
    current = 0
    while True:
        current += 1
        yield current

# Read the per-dataset uuid files (one JSON object per line), look up each
# aliquot in the dataset's sample sheet and write a
# "<submitter_id>\t<dataset_id>_sampleNN" mapping file per dataset.
# NOTE(review): absolute, user-specific paths; adjust when running elsewhere.
directory = "/home/users/ntu/suffiazi/scripts/gatk-workflow-scripts/output_files/aliquot_id-uuid_mapping"
sampsheet_dir = "/home/users/ntu/suffiazi/scripts/atac-seq-workflow-scripts/output_files/exported_sampsheets"
mapping_dir = "/home/users/ntu/suffiazi/scripts/gatk-workflow-scripts/output_files/mapping_files"

# only pick up the "<dataset_id>_uuids.txt" files; anything else in the directory
# (e.g. a leftover "*_uuids.tmp") is not valid input for this step
items = os.listdir(directory)
files = sorted(
    item
    for item in items
    if item.endswith("_uuids.txt") and os.path.isfile(os.path.join(directory, item))
)

# create the output directory up front so the final open() cannot fail on it
os.makedirs(mapping_dir, exist_ok=True)

for file in files:
    # parse the file as JSON lines, ignoring blank lines
    with open(os.path.join(directory, file), "r") as f:
        json_objects = [json.loads(line) for line in f if line.strip()]

    # extract dataset id from file name
    dataset_id = file.split("_")[0]
    print(dataset_id)

    # get the path to the sampsheet file of the dataset id using wildcards
    sampsheet = glob.glob(f"{sampsheet_dir}/{dataset_id}*.csv")
    print(sampsheet)
    if not sampsheet:
        print("Error: No sampsheet found for dataset id " + dataset_id)
        continue
    # load the first matching sampsheet; report the real reason if it cannot be read
    try:
        df = pl.read_csv(sampsheet[0])
    except Exception as err:
        print(f"Error: Could not read sampsheet {sampsheet[0]} for dataset id {dataset_id}: {err}")
        continue

    # map every submitter_id to the column name derived from its SAMPLE number
    sample_ids = {}
    for obj in json_objects:
        print(obj["aliquot_id"] + "\t" + obj["submitter_id"])
        # generate a search string
        search_string = f"{obj['aliquot_id']}.bam"
        # find the SAMPLE number via the FILE column; literal=True so that the "."
        # in ".bam" is matched as a dot and not as a regex wildcard
        matches = df.filter(df["FILE"].str.contains(search_string, literal=True)).select("SAMPLE")
        # .item() needs exactly one matching row and raises otherwise
        sample_id = matches.item()
        # construct the new column name, zero-padding single-digit sample numbers
        prefix = "sample0" if sample_id < 10 else "sample"
        sample_colname = f"{dataset_id}_{prefix}{sample_id}"
        # add the sample id to the dictionary
        sample_ids[obj["submitter_id"]] = sample_colname

    # sort the dictionary by values
    sample_ids = dict(sorted(sample_ids.items(), key=lambda item: item[1]))
    print(sample_ids)
    # create a list of values from the dictionary
    sample_ids_list = list(sample_ids.values())
    # column names that occur more than once, with their counts
    count_holder = {k: v for k, v in Counter(sample_ids_list).items() if v > 1}
    if count_holder:
        print("Warning: Duplicate values found in dictionary")
        print(f"Listing duplicated samples: {count_holder}")

        # Create an instance of the generator
        # NOTE(review): one counter is shared by all duplicated names of a dataset,
        # so suffixes keep counting across different samples (_01, _02, _03, ...)
        # and become "_010" from the tenth replacement on. Confirm this is intended.
        unique_values = unique_value_generator()

        # Only values are reassigned (no keys added or removed), which is safe
        # while iterating over the dictionary.
        for key, value in sample_ids.items():
            if value in count_holder:
                # If the value is a duplicate, append a unique suffix
                sample_ids[key] = f"{value}_0{next(unique_values)}"

        # Print the updated dictionary
        print(sample_ids)

    # save the dictionary as tab-separated text file with each key-value pair on a new line
    with open(os.path.join(mapping_dir, f"{dataset_id}_sample_colname_mapping.txt"), "w") as f:
        for key, value in sample_ids.items():
            f.write(key + "\t" + str(value) + "\n")


### Extract Motif IDs from Bed Filename (TCGA-BRCA)

In [None]:
%%bash

# Strip the "_tfbs_merged_matrix-brca_brca.bed" suffix from every bed file name
# listed in the input file and write the resulting motif ids, one per line.
bedfile_names="/home/users/ntu/suffiazi/scripts/gatk-workflow-scripts/input_files/tfbs_bedfile_names.txt"
motif_prefix_list="/home/users/ntu/suffiazi/scripts/gatk-workflow-scripts/input_files/tfbs_motif_prefix_list.txt"

# refuse to touch the existing list when there is nothing to rebuild it from
if [ ! -r "${bedfile_names}" ]; then
    echo "Error: cannot read ${bedfile_names}" >&2
    exit 1
fi

# start from an empty list so re-running the cell does not append duplicates
: > "${motif_prefix_list}"

while read -r line; do
    printf '%s\n' "${line}"
    # get motif id from file name
    motif_id=${line%_tfbs_merged_matrix-brca_brca.bed}
    # append the motif id to the list (quoted: no word splitting or globbing)
    printf '%s\n' "${motif_id}" >> "${motif_prefix_list}"
done < "${bedfile_names}"
 

In [1]:
%%bash 
# Run the bcftools fill-tags plugin with "-t VAF" on one VCF as a trial;
# the annotated VCF is printed to stdout.
vcf="/scratch/users/ntu/suffiazi/outputs/brca-called-variants-diffmode/2GAMBDQ/2GAMBDQ_AHR_AHR_HUMAN.H11MO.0.B_qualgt10.var.flt.vcf"
bcftools +fill-tags "${vcf}" -- -t VAF

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##bcftoolsVersion=1.17+htslib-1.17
##bcftoolsCommand=mpileup -Ou -f /scratch/users/ntu/suffiazi/inputs/references/gatk4/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta -T /scratch/users/ntu/suffiazi/inputs/BRCA-diffmode_tfbs_bedfiles/AHR_AHR_HUMAN.H11MO.0.B_diffmode_TCGA-BRCA_fpscore_regions.bed -b /home/users/ntu/suffiazi/scripts/gatk-workflow-scripts/input_files/mpileup_lists/per-dataset/2GAMBDQ_bam_list.txt --annotate FORMAT/AD,FORMAT/DP
##reference=file:///scratch/users/ntu/suffiazi/inputs/references/gatk4/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422

#### Ad-hoc script to add VAF to VCF files (TCGA-BRCA)

In [7]:
%%bash
# For every dataset directory, run the bcftools fill-tags plugin (-t VAF) on each
# VCF and write the result next to it as "<name>_VAF.vcf".
variants_root="/scratch/users/ntu/suffiazi/outputs/brca-called-variants-diffmode"

for dir in "${variants_root}"/*; do
    if [ -d "$dir" ]; then
        echo "$dir"
        count=0
        for file in "$dir"/*.vcf; do
            # an unmatched glob is left as the literal pattern; skip it
            [ -e "$file" ] || continue
            # skip outputs of a previous run, otherwise re-running the cell
            # would produce *_VAF_VAF.vcf files
            case "$file" in
                *_VAF.vcf) continue ;;
            esac
            echo "$file"
            outfile="${file%.vcf}_VAF.vcf"
            if bcftools +fill-tags "$file" -- -t VAF > "$outfile"; then
                count=$((count+1))
                echo "File no. $count processed."
            else
                # do not leave a truncated VCF behind or count it as processed
                echo "Error: bcftools +fill-tags failed for $file" >&2
                rm -f "$outfile"
            fi
        done
    else
        echo "$dir: Not a directory!"
    fi
done


/scratch/users/ntu/suffiazi/outputs/brca-called-variants-diffmode/2GAMBDQ
/scratch/users/ntu/suffiazi/outputs/brca-called-variants-diffmode/2GAMBDQ/2GAMBDQ_AHR_AHR_HUMAN.H11MO.0.B_qualgt10.var.flt.vcf
File no. 1 processed.
/scratch/users/ntu/suffiazi/outputs/brca-called-variants-diffmode/2GAMBDQ/2GAMBDQ_AIRE_AIRE_HUMAN.H11MO.0.C_qualgt10.var.flt.vcf
File no. 2 processed.
/scratch/users/ntu/suffiazi/outputs/brca-called-variants-diffmode/2GAMBDQ/2GAMBDQ_ALX1_ALX1_HUMAN.H11MO.0.B_qualgt10.var.flt.vcf
File no. 3 processed.
/scratch/users/ntu/suffiazi/outputs/brca-called-variants-diffmode/2GAMBDQ/2GAMBDQ_ANDR_ANDR_HUMAN.H11MO.0.A_qualgt10.var.flt.vcf
File no. 4 processed.
/scratch/users/ntu/suffiazi/outputs/brca-called-variants-diffmode/2GAMBDQ/2GAMBDQ_AP2A_AP2A_HUMAN.H11MO.0.A_qualgt10.var.flt.vcf
File no. 5 processed.
/scratch/users/ntu/suffiazi/outputs/brca-called-variants-diffmode/2GAMBDQ/2GAMBDQ_AP2B_AP2B_HUMAN.H11MO.0.B_qualgt10.var.flt.vcf
File no. 6 processed.
/scratch/users/ntu/suf