## Pip installs

In [None]:
!pip install torch

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install pysam

In [None]:
!pip install HTSeq

In [5]:
!pip install enformer-pytorch>=0.5

In [3]:
!pip install polars

Collecting polars
  Downloading polars-0.16.18-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: polars
Successfully installed polars-0.16.18


## Load modules and variable declarations

In [1]:
import torch
import polars as pl
from enformer_pytorch import Enformer, GenomeIntervalDataset
from datasets import concatenate_datasets, load_dataset
import os
import numpy as np
import pandas as pd
import pysam

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tsv_dir= "/scratch/users/sschulz/pta_on_normal"
bed_path = "/scratch/users/sschulz/pta_on_normal/chr10.bed"


In [5]:
#good testing but use a gvcf instead containing all known mutations first
tsv_path = tsv_dir + "/CARTPt04_Scan2_svc_merged_extract_snp.hg38_multianno.tsv"

In [6]:
tsv = pd.read_table(tsv_path, sep='\t')

In [7]:
tsv['CHROM'][0]

'chr1'

In [23]:
bam_path=tsv_dir + '/CART-MRD-BALL-PTA-NEXTERA-WGS-CCT5007Pt04-D4_S26.realigned_deduped_sorted.bam'

## Function and class definitions

In [146]:
def makeLlamaDataset(tsv_dir, bam_path, bed_path):
    '''
        from a directory containing many annotated tsv files and a bed path, create a huggingface dataset for use in llama
        
        start by just passing lines from vcf to llama for fine tuning, along with a line that says 
        "The read/basepairs/sequence at this position is:
        The read information from reference is:"
        
        This is a pretty brute force way to do it but maybe it'll create something coherent from llama.
        
        
        Getting correct sequence instruction: 
        "instruction": f"The gene {gene} is mutated at the {start_pos} basepair. What is the sequence? What is the mutation?",
        "input": f"{read_seq}",
        "output": "5"
        
        Getting whether exonic or not/amino acid change:
        
        
        [WIP] Instrucitons incorporating answers from databases:
        
        Clinvar:
        
        NCBI:
        
        Genecards: 
        
    '''
    for filename in os.listdir(tsv_dir):
        if filename.endswith('tsv'):
            tsv_file = os.path.join(tsv_dir, filename)
            tsv_length=len(tsv_file)
            counter = 0
            print(tsv_file)
            for i in range(tsv_length):
                chrom = tsv['CHROM'][i]
                start_pos = tsv['POS'][i]
                sample = tsv['SAMPLE'][i]
                gene = tsv['Gene.refGene'][i]
                gt = tsv['GT'][i]
                alt = tsv['ALT'][i]
                if gt == '0/1' or gt == '1/1':
                    print(start_pos)
                    print(sample, gt)
                    print(alt)
                    print(gene)
                    
                    ### position of mutation is the position is says on the pileup - start position (0 indexed)
                    ## start position can be greater than or less than position of read start, but luckily
                    ## should be able to index the base that's changed either way 
                    
                    #
                    
                    samfile = pysam.AlignmentFile(bam_path, "rb" )
                    pileup = samfile.pileup(chrom, start_pos, start_pos+1, min_mapping_quality=58)
                    for read in pileup:
                        read_list = str(read).split('\t')
                        read_start = read_list[5]
                        read_seq = read_list[11]
                        
                        mutated_base= read_seq[int(read_start) - start_pos] 
                        
                
                        print(f"the start pos from tsv is {start_pos} the start pos from pileup is {read_start} the the gene is: "+ gene +  ' the read is: ' + str(read_list) + ' and the mutated base is: ' + mutated_base)
                        print('for sanity, the mutated allele was: ' + alt)
                # for x in pileup:
                #     if counter == 0:
                #         print(str(x))

            

In [102]:
read = 'GTGTCAGACACTGTGGTGGAGCCCTACAACGCCACCCTCTCAGTCCACCAGCTCATAGAAAATGTGGATGAGACCTTCTGCATAGATAACGAAGCGCTAT'

In [114]:
len(read) + index

5

In [108]:
index = 873537 - 873632

In [113]:
index

-95

In [110]:
read[index]

'A'

In [115]:
read[5]

'A'

In [147]:
makeLlamaDataset(tsv_dir, bam_path, bed_path)

/scratch/users/sschulz/pta_on_normal/CARTPt04_Scan2_svc_merged_extract_snp.hg38_multianno.tsv
138156
CART-MRD-BALL-PTA-NEXTERA-WGS-CCT5007Pt04-A8_S50 0/1
T
LOC729737
138156
CART-MRD-BALL-PTA-NEXTERA-WGS-CCT5007Pt04-E10_S68 0/1
T
LOC729737
138156
CART-MRD-BALL-PTA-NEXTERA-WGS-CCT5007Pt04-F6_S41 0/1
T
LOC729737
139836
CART-MRD-BALL-PTA-NEXTERA-WGS-CCT5007Pt04-A8_S50 0/1
C
LOC729737
139836
CART-MRD-BALL-PTA-NEXTERA-WGS-CCT5007Pt04-B4_S24 0/1
C
LOC729737
139836
CART-MRD-BALL-PTA-NEXTERA-WGS-CCT5007Pt04-E10_S68 0/1
C
LOC729737
139836
CART-MRD-BALL-PTA-NEXTERA-WGS-CCT5007Pt04-E11_S75 0/1
C
LOC729737
139836
CART-MRD-BALL-PTA-NEXTERA-WGS-CCT5007Pt04-E1_S5 0/1
C
LOC729737
139836
CART-MRD-BALL-PTA-NEXTERA-WGS-CCT5007Pt04-F11_S76 0/1
C
LOC729737
139836
CART-MRD-BALL-PTA-NEXTERA-WGS-CCT5007Pt04-F6_S41 0/1
C
LOC729737
873632
CART-MRD-BALL-PTA-NEXTERA-WGS-CCT5007Pt04-B8_S51 0/1
A
FAM41C
the start pos from tsv is 873632 the start pos from pileup is 873537 the the gene is: FAM41C the read is: ['0', '8