In [1]:
import numpy as np
import pandas as pd
import torch

import boda

import tqdm.notebook as tqdm

import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Convert reference MPRA to VCF

In [2]:
# Load the space-delimited MPRA table; the first line of the file is the header.
# NOTE(review): the truncated warning left in this cell's output looks like pandas'
# DtypeWarning (mixed-type columns under chunked parsing) -- confirm on re-run.
# low_memory=False makes pandas infer each column's dtype from the whole file.
mpra_19 = pd.read_table('MPRA_ALL_v3.txt', sep=' ', header=0, low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Write a minimal tab-separated VCF-style table (CHROM, POS, ID, REF, ALT).
# IDs are colon-delimited. A row is kept only when its ID has at least six
# fields and ends in ...:R:wC; the first four fields supply CHROM, POS, REF, ALT.
with open('troubleshoot.vcf', 'w') as f:
    print('#CHROM\tPOS\tID\tREF\tALT', file=f)
    # Only the 'IDs' column is needed, so iterate it directly instead of
    # materialising every row with iterrows().
    for oligo_id in mpra_19['IDs']:
        fields = oligo_id.split(':')
        # Six or more fields guarantees fields[-2] and fields[-1] both exist
        # and are trailing tags rather than the REF/ALT alleles.
        if len(fields) > 5 and fields[-1] == 'wC' and fields[-2] == 'R':
            chrom, pos, ref, alt = fields[:4]
            # int() rejects a non-numeric position; the value is written unshifted.
            print(f'chr{chrom}\t{int(pos)}\t{oligo_id}\t{ref}\t{alt}', file=f)


# Setup VCF dataset

In [4]:
# Load the reference FASTA through boda. In the logged run below this cell the
# pre-read took ~21 s and parsing ~12 min, so avoid re-running it needlessly.
REFERENCE_FASTA = 'male.hg19.fasta'
fasta_data = boda.data.Fasta(REFERENCE_FASTA)


pre-reading fasta into memory
100%|██████████████████████████| 61913917/61913917 [00:21<00:00, 2863952.17it/s]
finding keys
parsing
100%|███████████████████████████████████████████| 25/25 [12:00<00:00, 28.83s/it]
done


In [5]:
# Window settings passed positionally to VcfDataset.
# NOTE(review): presumably RELATIVE_START/RELATIVE_END place the variant inside
# the WINDOW_SIZE-long window -- confirm against boda.data.VcfDataset.
WINDOW_SIZE = 200
RELATIVE_START = 99
RELATIVE_END = 100

# Parse the VCF written above. chr_prefix is empty; the CHROM column in
# troubleshoot.vcf was already written with a leading 'chr'.
vcf = boda.data.VCF(
    'troubleshoot.vcf',
    chr_prefix='',
    max_allele_size=20,
    max_indel_size=20,
)

# Pair the parsed variants with the loaded FASTA; no flanking sequence is added.
vcf_data = boda.data.VcfDataset(
    vcf.vcf,
    fasta_data.fasta,
    WINDOW_SIZE,
    RELATIVE_START,
    RELATIVE_END,
    step_size=1,
    left_flank='',
    right_flank='',
)


loading DataFrame
Checking and filtering tokens
Allele length checks
Done
343562/343562 records have matching contig in FASTA
returned 343562/343562 records


# Check if sequences are identical matches

In [6]:
def vec2nt(vec):
    """Decode a 2-D encoded sequence tensor into a nucleotide string.

    Args:
        vec: 2-D tensor whose columns are sequence positions. Rows 0-3 are
            read as the A, C, G and T channels (so four rows are expected).

    Returns:
        str: one character per column. A column decodes to 'N' when none of
        its channels reaches 1 (for example an all-zero column).
    """
    alphabet = ('A', 'C', 'G', 'T', 'N')
    # Append an all-ones row that acts as the 'N' channel. It is created on
    # vec's device so the concatenation also works for non-CPU tensors.
    n_row = torch.ones((1, vec.shape[1]), device=vec.device)
    hold = torch.cat([vec, n_row], dim=0)
    # torch.argmax returns the first maximal index on ties, so a one-hot column
    # (which ties with the ones row) still decodes to its own base, not 'N'.
    best = torch.argmax(hold, dim=0).tolist()
    return "".join(alphabet[idx] for idx in best)

In [7]:
# Decode the first entry of the first record's 'ref' item for a visual check.
vec2nt(vcf_data[0]['ref'][0])

'AAAAAAAAAAAAAAAAAAAAAAAGTCTTTCCTCTCATCAACCCCCTACCCCCTGCCCCCGAACGTCCTCAACCCAAAGTAGCCAAGTTGGCTAATCTAACGCAAAGGGTGGGAGGCCACACCAAATGCACTGGTTACATAGCAAACACAAAGTCTGGCTGGGTTTTAAAGCAGCCTTCGTCTAAATCACGCAAGCACACA'

In [8]:
# Look up the MPRA table's nt_sequence for the same record, matched by its id.
mpra_19.set_index('IDs').loc[ vcf.vcf.loc[0,'id'], 'nt_sequence' ]

'AAAAAAAAAAAAAAAAAAAAAAAGTCTTTCCTCTCATCAACCCCCTACCCCCTGCCCCCGAACGTCCTCAACCCAAAGTAGCCAAGTTGGCTAATCTAACGCAAAGGGTGGGAGGCCACACCAAATGCACTGGTTACATAGCAAACACAAAGTCTGGCTGGGTTTTAAAGCAGCCTTCGTCTAAATCACGCAAGCACACA'

In [9]:
# Exact string comparison of the decoded sequence against the MPRA sequence.
vec2nt(vcf_data[0]['ref'][0]) == mpra_19.set_index('IDs').loc[ vcf.vcf.loc[0,'id'], 'nt_sequence' ]

True

In [10]:
# Decode the 'ref' sequence of every VCF record, keyed by the record's id.
# NOTE(review): assumes vcf.vcf carries a default 0..n-1 index so that label
# `idx` refers to the same record as vcf_data[idx] -- confirm.
n_records = vcf.vcf.shape[0]
loader_dict = {
    vcf.vcf.loc[idx, 'id']: vec2nt(vcf_data[idx]['ref'][0])
    for idx in tqdm.tqdm(range(n_records))
}


  0%|          | 0/343562 [00:00<?, ?it/s]

In [11]:
# Turn the {id: sequence} mapping into a one-column frame indexed by 'IDs'.
# Building it from a Series avoids creating a 2 x N frame and transposing it,
# and also yields a valid (empty) frame when loader_dict is empty.
extract_seqs = (
    pd.Series(loader_dict, name='nt_sequence')
    .rename_axis('IDs')
    .to_frame()
)

In [12]:
# Compare the known MPRA sequence with the extracted sequence for each id.
# NOTE: despite its name, mismatch_filter is True where the two sequences
# MATCH exactly; later cells negate it (~) to select the mismatches.
known_seqs = mpra_19.set_index('IDs').loc[extract_seqs.index, 'nt_sequence']
loader_seqs = extract_seqs.loc[extract_seqs.index, 'nt_sequence']
mismatch_filter = known_seqs == loader_seqs

In [13]:
# Mean of the boolean mask = fraction of ids whose sequences match exactly.
mismatch_filter.mean()

0.9234199358485513

## result
About 92% of the sequences extracted from the VCF coordinates exactly match the known sequence in the MPRA.

# Check mismatches
checking if mismatches are indels

In [14]:
# Rows where the mask is False, i.e. extracted sequence differs from the MPRA one.
extract_seqs.loc[~mismatch_filter]

Unnamed: 0_level_0,nt_sequence
IDs,Unnamed: 1_level_1
1:26878979:A:AT:R:wC,AAAAAAAAAAAAAAAAAAAAAGGATTTGAGCTAGAAAATGGGACCA...
13:72497708:AT:A:R:wC,AAAAAAAAAAAAAAAAAAACTTCCCTCTAAATACACACATTAATAA...
3:70506530:CA:C:R:wC,CAAAAAAAAAAAAAAAAAAAAGAAAGAAAAAAAGAAAGAAAGAAAG...
3:53801691:AT:A:R:wC,CAAAAAAAAAAAAAAAACAACAAAATAAAATTCCCAACATGCAGAT...
14:102751522:CA:C:R:wC,AAAAAAAAAAAAAAAGAAAATGCAGGCTTGAGAACTAGATCTAGAA...
...,...
7:4810197:CTCT:C:R:wC,GTTGCAGTGAGCCGAGATCATGCCACTGCACTCCAGCCTGGGCAAC...
18:12438917:TGGA:T:R:wC,CCTGCAGCGGTGCTCGCTATCCTTGACCAAGCTGGAGAACAAGAGT...
22:27697920:GT:G:R:wC,CTGCAGCCTTGACCTCCCCAGCTCAAGCAATCCTCCCACCTCAGCC...
4:106276362:A:AT:R:wC,CTGCAGCCTTGATTTCCTGGGTTCAGGTGATTCTCCCACCTCAGCC...


In [15]:
# For each mismatching id, flag whether fields 2 and 3 of the colon-delimited
# id (the REF and ALT alleles written to the VCF) have equal length.
mismatch_ids = extract_seqs.loc[~mismatch_filter].index
same_length_flags = []
for seq_id in mismatch_ids:
    fields = seq_id.split(':')
    same_length_flags.append(len(fields[2]) == len(fields[3]))
var_len_match = np.array(same_length_flags)
# Fraction of mismatching ids whose REF and ALT are the same length.
var_len_match.mean()

0.00011402508551881414

In [16]:
# Mismatching rows whose REF and ALT have equal length (so not indels).
extract_seqs.loc[~mismatch_filter].loc[var_len_match]

Unnamed: 0_level_0,nt_sequence
IDs,Unnamed: 1_level_1
9:139217089:C:T:R:wC,NNNNNNNNGATCTTCTTGGGTCTCAGGTTCCAACATATGAAGGCTG...
11:69139893:T:G:R:wC,NNNNNNNNTGATGGTTTTGGTGATGGCGGTGGTGGTGGTGATGGTG...
19:20523319:G:T:R:wC,TTGAGACCCTCATACCTGATTCTGGCCTCCCCTTAGAGTCACGTGA...


In [17]:
# Show one of the equal-length mismatches in full to inspect it untruncated.
extract_seqs.loc['19:20523319:G:T:R:wC', 'nt_sequence']

'TTGAGACCCTCATACCTGATTCTGGCCTCCCCTTAGAGTCACGTGAAGCCCTTCATTAAAACAACATGGATACTTCCACCCAGAACAATAAACAGAACCGGTGGGGAAGGCACAAGAGATTTCTGCAAATTGGCCATGTGATCCTAATGAGAAGCCTGGGCTGATAACCACTAAGCTAAGCATTGCCTCTCAAGCTNNNN'

## result
All but 3 of the mismatched sequences are indels; the remaining 3 contain N's in the extracted reference sequence.