## Pip installs

In [None]:
!pip install torch

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install pysam

In [None]:
!pip install HTSeq

## LLAMA loading

In [None]:
from transformers import AutoTokenizer, LlamaForCausalLM

In [None]:
"""
    path on clust to convert weights is:
    path on clust to convert tokenizer:
"""

# Load the causal-LM weights and the matching tokenizer from the same Hub repository.
llama_repo = "decapoda-research/llama-7b-hf"
model = LlamaForCausalLM.from_pretrained(llama_repo)
tokenizer = AutoTokenizer.from_pretrained(llama_repo)

## Tutorial Testing

In [None]:
from datasets import load_dataset

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load only the "train" split of the rotten_tomatoes dataset (tutorial example data).
dataset = load_dataset("rotten_tomatoes", split="train")


In [None]:
# Bare expression: the notebook displays the dataset object as the cell output.
dataset

In [None]:
# Rebind `tokenizer` to the tokenizer of the bert-base-cased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


In [None]:
# Run a toy DNA string through the currently loaded tokenizer and print the resulting encoding.
encoded_input = tokenizer("ACGTGGTATGATGATAGATGATGA")
print(encoded_input)

In [None]:
# Decode the ids back to text to inspect how the tokenizer handled the DNA string.
tokenizer.decode(encoded_input["input_ids"])

## Loading DNA sequences from a BAM file and creating a dataset for fine-tuning LLaMA to understand sequence positions and mutations

### Attempt with HTSeq

In [None]:
import HTSeq
from collections import defaultdict

In [None]:
import json

In [None]:
# Directory prefix for the hg38 reference files used below (cluster-specific path; note the trailing slash,
# the file names are appended by plain string concatenation).
ref_dir = "/oak/stanford/groups/cgawad/Reference_Files/GATK_Resource_Bundle_hg38/"

In [None]:
### TODO: take the GTF path as an input instead of hard-coding it
### Open the refGene GTF annotation under ref_dir with HTSeq's GFF reader.
gtf_file = HTSeq.GFF_Reader(ref_dir + "hg38.refGene.gtf.gz" )

In [None]:
# Path of the hg38 reference FASTA inside ref_dir.
hg38_fasta = ref_dir + "Homo_sapiens_assembly38.fasta" 

In [None]:
### using htseq's fasta reader, get ref genome hg38 sequences and put in a good format for the dataset

def addRef(fasta_dir):
    '''
        Given the path of a genome reference fasta, return a dict (to be dumped to json or turned into an
        hf dataset later) with one entry per sequence found in the fasta:
            1. iterate through the fasta and get sequence info
            2. for each sequence record its name (the chr) and its start/end position
            3. 'NA' for mutation, clinvar, cosmic
            4. 'NA' placeholders for refGene / NCBI until the annotation lookup is written

        Args:
            fasta_dir: path of the fasta file (despite the name it is handed straight to
                HTSeq.FastaReader, so it must be the file, not a directory).

        Returns:
            dict mapping '<name> START: 0 END: <length>' to a one-element list holding the record dict
            with keys 'seq', 'chr', 'pos', 'refGene', 'NCBI', 'mutation', 'clinvar', 'cosmic'.
    '''
    data_dict = {}
    for read in HTSeq.FastaReader(fasta_dir):
        ### TODO: may make more sense to iterate through the gtf file instead of the fasta so that records
        ### are per gene / feature rather than per whole fasta sequence
        ### every fasta record is a complete sequence, so its interval runs from 0 to its own length
        start, end = 0, len(read)
        ### same key layout as the one built in addBAM
        chr_pos = '%s START: %d END: %d' % (read.name, start, end)
        data_dict[chr_pos] = [
            {
                'seq': read,  # HTSeq sequence object as yielded by the reader; convert before json.dump
                'chr': read.name,
                'pos': '%d_%d' % (start, end),
                'refGene': 'NA',  # TODO: gene name or whether intronic, ncrna, splicing etc
                'NCBI': 'NA',  # TODO: info from ncbi database
                'mutation': 'NA',
                'clinvar': 'NA',
                'cosmic': 'NA',
            }
        ]
        print("Sequence '%s' has length %d." % (read.name, len(read)))
    return data_dict

In [None]:


def addBAM(bam_path, vcf_path, max_reads=3):
    '''
        Given a path pointing to a bam file, return a dict (to be dumped to json or turned into an hf
        dataset later) describing the aligned reads:
            1. iterate through the bam file and get sequence info for each aligned read
            2. for each read record the read name, the chr and the start/end position
            3. 'NA' for mutation, clinvar, cosmic
            4. 'NA' placeholders for refGene / NCBI until the annotation lookup is written

        Args:
            bam_path: path of the bam file, handed to HTSeq.BAM_Reader.
            vcf_path: path of the vcf file with the mutations of interest.
                TODO: not used yet -- reads are not filtered by the vcf.
            max_reads: number of alignments to look at before stopping (aligned or not). Defaults to 3,
                the cap used while testing; pass None to go through the whole file.

        Returns:
            dict mapping '<chrom> START: <start> END: <end>' to the list of record dicts of the reads
            aligned to exactly that interval. Each record has the keys 'read_name', 'seq', 'chr', 'pos',
            'refGene', 'NCBI', 'mutation', 'clinvar', 'cosmic'.

            desc of sam_alignment from htseq:
                >>> aln.iv
                <GenomicInterval object 'IV', [246048,246084), strand '+'>
                >>> aln.iv.chrom
                'IV'
                >>> aln.iv.start
                246048
                >>> aln.iv.end
                246084
                >>> aln.iv.strand
                '+'
    '''
    data_dict = {}
    with HTSeq.BAM_Reader(bam_path) as f:
        for i, sam_alignment in enumerate(f):
            ## for testing don't do the whole thing
            if max_reads is not None and i >= max_reads:
                break
            print(sam_alignment.read)
            ### TODO: did this a little backwards, should iterate through the gtf file to get gene name and
            ### interval, then (hopefully) use that to index into the bam file rather than walking all of it
            if not sam_alignment.aligned:
                continue
            iv = sam_alignment.iv
            ### not sure yet what is the most useful thing for llama to map all the info to, starting with
            ### a string with chrom and pos; iv.start / iv.end are ints, so they have to be formatted
            chrom_pos_identifier = '%s START: %d END: %d' % (iv.chrom, iv.start, iv.end)
            record = {
                'read_name': sam_alignment.read.name,
                'seq': sam_alignment.read,  # HTSeq read object; convert before json.dump
                'chr': iv.chrom,
                'pos': '%d_%d' % (iv.start, iv.end),
                'refGene': 'NA',  # TODO: gene name or whether intronic, ncrna, splicing etc
                'NCBI': 'NA',  # TODO: info from ncbi database
                'mutation': 'NA',
                'clinvar': 'NA',
                'cosmic': 'NA',
            }
            ### append rather than assign so reads sharing an interval do not overwrite each other
            data_dict.setdefault(chrom_pos_identifier, []).append(record)
    return data_dict

In [None]:
# exons = HTSeq.GenomicArrayOfSets( "auto", stranded=True )
# counter = 0
# for feature in gtf_file:
#     if counter <= 5:
#         print(feature)
#         if feature.type == "exon":
#            exons[ feature.iv ] += feature.attr["gene_id"]

### Attempt with pysam

In [None]:
!pip install pysam

In [None]:
import pysam

In [None]:
# BAM file to read from (cluster-specific path).
input_bam="/scratch/users/sschulz/CARTPt04_Scan2/CART-MRD-BALL-PTA-NEXTERA-WGS-CCT5007Pt04-A8_S50.recalibrated_realigned_deduped_sorted.bam"
# Chromosome names are formatted chr1, chr2, chr3, ... for hg38; other references may name them differently.
chrom='chr1'
# Start and end of the region of interest on `chrom`.
start_pos=0
end_pos=20

In [None]:
## the normal samtools command we want to run: samtools view CART-MRD-BALL-PTA-NEXTERA-WGS-CCT500*.bam chr1:1322100-1332100 | awk '{if($1 !~ /^@/) print $10}'

In [None]:
# Collect every alignment on `chrom` into a list. fetch() needs the BAM index; pass start_pos/end_pos
# as additional arguments to restrict the query to that window instead of the whole chromosome.
# The context manager closes the file once the records have been read.
with pysam.AlignmentFile(input_bam, "rb") as samfile:
    temp = list(samfile.fetch(chrom))


In [None]:
## Experiment: round-trip every record through its SAM text representation.
bam = pysam.AlignmentFile(input_bam, "rb")

# Header as a plain dict, taken from pysam itself.
header = bam.header.to_dict()
# Build the AlignmentHeader once; it is the same for every record.
rebuilt_header = pysam.AlignmentHeader.from_dict(header)

for rec in bam:
    # Convert to string and back so the record is re-bound to `rebuilt_header`.
    # This appears to be the only way to swap a record's header with pysam (sometime after 0.9).
    # NOTE: the rebuilt record is not stored anywhere yet.
    rec = pysam.AlignedSegment.fromstring(rec.to_string(), header=rebuilt_header)

In [None]:
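## NOTE: this attempt does not work as written: the three subprocess.run calls are never connected into a pipeline, and `process` (used as stdin for the awk call) is never defined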
# import os
# import subprocess

# # Specify the input and output directories
# input_dir = "/scratch/users/sschulz/CARTPt04_Scan2/"
# output_dir = "/scratch/users/sschulz/CARTPt04_Scan2/"

# # Iterate over the files in the input directory
# for filename in os.listdir(input_dir):
#     # Check if the file is a SAM file
#     if filename.endswith(".sam"):
#         # Construct the input and output file paths
#         input_file = os.path.join(input_dir, filename)
#         output_file = os.path.join(output_dir, filename.replace(".sam", "_gene_seq.txt"))
        
#         # Run the samtools and awk commands using subprocess
#         with open(output_file, "w") as f:
#             subprocess.run(['ml', 'load', 'biology', 'samtools'], stdout=subprocess.PIPE)
#             subprocess.run(["samtools", "view", input_file], stdout=subprocess.PIPE)
#             subprocess.run(["awk", "{if($1 !~ /^@/) print $10}"], stdin=process.stdout, stdout=f)


## Loading DNA sequence data from BAMs to create dataset for fine tuning DNABERT

In [None]:
import HTSeq
from collections import defaultdict

In [None]:
def prepareBertDataset(bam_file, vcf_file):
    '''
        Get the mutations from the vcf file, then get the read containing each mutation from the bam file,
        then mask that mutation and format the read for BERT (i.e. tokenize with the mask at the mutation
        position) so the model can learn what PTA artifacts look like.

        NOTE(review): not implemented yet -- the body is only this docstring, so calling it returns None.
    '''

## DNA tokenizer Test


In [None]:
from transformers import AutoTokenizer

# Rebind `tokenizer` to the tokenizer of the AIRI-Institute/gena-lm-bert-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("AIRI-Institute/gena-lm-bert-base")


In [None]:
# Tokenize the same toy DNA string as in the tutorial section, now with the currently loaded tokenizer.
encoded_input = tokenizer("ACGTGGTATGATGATAGATGATGA")


In [None]:
# Print the encoding produced for the single sequence.
print(encoded_input)

In [None]:
# Decode the ids back to text to inspect how the tokenizer split the sequence.
tokenizer.decode(encoded_input["input_ids"])

In [None]:
# Three toy DNA strings of different lengths, used to try batch tokenization.
batch_sequences = [
    "ACGTAGCTGACTGACTTAGTGA",
    "ACTAGCATGCATCGTAGCTAGCTAGACTGA",
    "ATATATATTACACACACGAGACTAGCTT",
]

In [None]:
# Tokenize the whole batch in one call with padding and truncation switched on.
encoded_input=tokenizer(batch_sequences, padding=True, truncation=True)

In [None]:
# Print the batch encoding.
print(encoded_input)

In [None]:
# Decode each id sequence of the batch and print it on its own line.
for token_ids in encoded_input['input_ids']:
    decoded = tokenizer.decode(token_ids)
    print(decoded)

In [None]:
## Same call as above (padding and truncation on, no explicit maximum length given), but with
## return_tensors="pt" so the encoding comes back as PyTorch tensors.
encoded_input = tokenizer(batch_sequences, padding=True, truncation=True, return_tensors="pt")

In [None]:
## so we can tokenize DNA sequences, but how do we 