In [40]:
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

import boda

import tqdm.notebook as tqdm

import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Convert reference MPRA to VCF

In [2]:
# Load the MPRA table: space-delimited text with the column names in the first row.
# NOTE(review): the recorded output below is the tail of a warning trace — possibly a
# pandas DtypeWarning about mixed-type columns; confirm and consider explicit dtypes.
mpra_19 = pd.read_table('MPRA_ALL_v3.txt', sep=' ', header=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [110]:
# Write a minimal VCF with one record per MPRA oligo whose ID has more than five
# ':'-separated fields and ends in ...:R:wC. Coordinates and alleles are taken
# from the first four fields of the ID; the full ID is kept as the VCF ID column.
with open('troubleshoot.vcf', 'w') as f:
    print('#CHROM\tPOS\tID\tREF\tALT', file=f)
    # Only the ID string is needed, so iterate the column directly instead of
    # materialising a Series per row with iterrows().
    for variant_id in mpra_19['IDs']:
        fields = variant_id.split(':')
        if len(fields) <= 5:
            continue
        chrom, pos, ref, alt, *stuff = fields
        # More than five fields guarantees `stuff` holds at least two items.
        if stuff[-2:] == ['R', 'wC']:
            # POS is written exactly as given in the ID (no coordinate shift).
            print(f'chr{chrom}\t{int(pos)}\t{variant_id}\t{ref}\t{alt}', file=f)


# Setup VCF dataset

In [4]:
# Load the hg19 reference FASTA through boda. Per the recorded log this pre-reads
# the file into memory and then parses 25 entries (about 12 minutes).
fasta_data = boda.data.Fasta('male.hg19.fasta')


pre-reading fasta into memory
100%|██████████████████████████| 61913917/61913917 [00:21<00:00, 2840868.13it/s]
finding keys
parsing
100%|███████████████████████████████████████████| 25/25 [11:56<00:00, 28.67s/it]
done


In [111]:
# Parse the VCF written earlier in this notebook. Its CHROM values already carry a
# 'chr' prefix, and chr_prefix is passed as an empty string here.
# NOTE(review): max_allele_size / max_indel_size presumably cap allele and indel
# lengths at 20 nt — confirm against boda.data.VCF.
vcf = boda.data.VCF(
    'troubleshoot.vcf', chr_prefix='', 
    max_allele_size=20, max_indel_size=20,
)

# Window length in nucleotides; the decoded sequences shown further down are 200 nt long.
WINDOW_SIZE = 200
# NOTE(review): presumably the range of variant offsets inside the window that the
# dataset tiles over (here a single offset with step_size=1) — confirm against VcfDataset.
RELATIVE_START = 99
RELATIVE_END = 100

# Dataset yielding 'ref'/'alt' windows around each VCF record, with empty flanks.
vcf_data = boda.data.VcfDataset(
    vcf.vcf, fasta_data.fasta, WINDOW_SIZE, 
    RELATIVE_START, RELATIVE_END, step_size=1, 
    left_flank='', right_flank=''
)


loading DataFrame
Checking and filtering tokens
Allele length checks
Done
343562/343562 records have matching contig in FASTA
returned 343562/343562 records


# Check if sequences are identical matches

In [112]:
def vec2nt(vec):
    """Decode a one-hot nucleotide matrix into a string.

    Args:
        vec: 2-D tensor whose rows 0-3 correspond to A, C, G, T and whose
            columns are sequence positions.

    Returns:
        str with one character per column. A column decodes to 'N' when none
        of its entries reaches 1 (for example an all-zero column).
    """
    # Append a constant row of ones that stands in for 'N'. It is created on the
    # same device as `vec` so that non-CPU inputs can be concatenated.
    n_row = torch.ones((1, vec.shape[1]), device=vec.device)
    hold = torch.cat([vec, n_row], dim=0)
    # A base with value 1 ties with the N row; torch.argmax reports the first
    # maximal index, so the base wins and N is chosen only when no base reaches 1.
    codes = torch.argmax(hold, dim=0).tolist()
    return "".join('ACGTN'[code] for code in codes)

In [113]:
# Decode the first 'ref' window of record 0 back into a nucleotide string.
vec2nt(vcf_data[0]['ref'][0])

'AAAAAAAAAAAAAAAAAAAAAAAGTCTTTCCTCTCATCAACCCCCTACCCCCTGCCCCCGAACGTCCTCAACCCAAAGTAGCCAAGTTGGCTAATCTAACGCAAAGGGTGGGAGGCCACACCAAATGCACTGGTTACATAGCAAACACAAAGTCTGGCTGGGTTTTAAAGCAGCCTTCGTCTAAATCACGCAAGCACACA'

In [114]:
# Look up the MPRA sequence recorded for the same record, keyed by the VCF id.
mpra_19.set_index('IDs').loc[ vcf.vcf.loc[0,'id'], 'nt_sequence' ]

'AAAAAAAAAAAAAAAAAAAAAAAGTCTTTCCTCTCATCAACCCCCTACCCCCTGCCCCCGAACGTCCTCAACCCAAAGTAGCCAAGTTGGCTAATCTAACGCAAAGGGTGGGAGGCCACACCAAATGCACTGGTTACATAGCAAACACAAAGTCTGGCTGGGTTTTAAAGCAGCCTTCGTCTAAATCACGCAAGCACACA'

In [115]:
# Sanity check on record 0: the window extracted from the FASTA should equal the MPRA sequence.
vec2nt(vcf_data[0]['ref'][0]) == mpra_19.set_index('IDs').loc[ vcf.vcf.loc[0,'id'], 'nt_sequence' ]

True

In [116]:
# Decode the first 'ref' window of every VCF record and key it by the record id.
# Using .loc[idx, 'id'] with idx from range(...) relies on the frame being labelled 0..N-1.
loader_dict = {
    vcf.vcf.loc[idx, 'id']: vec2nt(vcf_data[idx]['ref'][0])
    for idx in tqdm.tqdm(range(vcf.vcf.shape[0]))
}


  0%|          | 0/343562 [00:00<?, ?it/s]

In [117]:
# Turn the {id: sequence} mapping into a one-column frame indexed by 'IDs'.
# Building it straight from the dict avoids a 2 x N intermediate frame plus a
# transpose, and an empty mapping simply yields an empty frame.
extract_seqs = (
    pd.Series(loader_dict, name='nt_sequence')
    .rename_axis('IDs')
    .to_frame()
)

In [118]:
# Element-wise comparison of the MPRA sequence with the FASTA-extracted sequence per ID.
# Despite the name, True means the two sequences MATCH; mismatches are selected with ~mismatch_filter.
mismatch_filter = mpra_19.set_index('IDs').loc[extract_seqs.index, 'nt_sequence'] == extract_seqs.loc[extract_seqs.index,'nt_sequence']

In [119]:
# Fraction of IDs whose extracted sequence equals the MPRA sequence (True == match).
mismatch_filter.mean()

0.9234199358485513

## result
About 92% of the sequences extracted from the VCF coordinates exactly match the known sequence in the MPRA.

# Check mismatches
checking if mismatches are indels

In [120]:
# FASTA-extracted sequences for the IDs that do not match the MPRA sequence.
extract_seqs.loc[~mismatch_filter]

Unnamed: 0_level_0,nt_sequence
IDs,Unnamed: 1_level_1
1:26878979:A:AT:R:wC,AAAAAAAAAAAAAAAAAAAAAGGATTTGAGCTAGAAAATGGGACCA...
13:72497708:AT:A:R:wC,AAAAAAAAAAAAAAAAAAACTTCCCTCTAAATACACACATTAATAA...
3:70506530:CA:C:R:wC,CAAAAAAAAAAAAAAAAAAAAGAAAGAAAAAAAGAAAGAAAGAAAG...
3:53801691:AT:A:R:wC,CAAAAAAAAAAAAAAAACAACAAAATAAAATTCCCAACATGCAGAT...
14:102751522:CA:C:R:wC,AAAAAAAAAAAAAAAGAAAATGCAGGCTTGAGAACTAGATCTAGAA...
...,...
7:4810197:CTCT:C:R:wC,GTTGCAGTGAGCCGAGATCATGCCACTGCACTCCAGCCTGGGCAAC...
18:12438917:TGGA:T:R:wC,CCTGCAGCGGTGCTCGCTATCCTTGACCAAGCTGGAGAACAAGAGT...
22:27697920:GT:G:R:wC,CTGCAGCCTTGACCTCCCCAGCTCAAGCAATCCTCCCACCTCAGCC...
4:106276362:A:AT:R:wC,CTGCAGCCTTGATTTCCTGGGTTCAGGTGATTCTCCCACCTCAGCC...


In [123]:
# MPRA sequences for the first 20 mismatching IDs, for comparison with the extracted ones.
mpra_19.set_index('IDs').loc[extract_seqs.index, 'nt_sequence'].loc[~mismatch_filter].iloc[:20]

IDs
1:26878979:A:AT:R:wC          AAAAAAAAAAAAAAAAAAAAGGATTTGAGCTAGAAAATGGGACCAT...
13:72497708:AT:A:R:wC         AAAAAAAAAAAAAAAAAACTTCCCTCTAAATACACACATTAATAAT...
3:70506530:CA:C:R:wC          AAAAAAAAAAAAAAAAAAAAGAAAGAAAAAAAGAAAGAAAGAAAGA...
3:53801691:AT:A:R:wC          AAAAAAAAAAAAAAAACAACAAAATAAAATTCCCAACATGCAGATA...
14:102751522:CA:C:R:wC        AAAAAAAAAAAAAAGAAAATGCAGGCTTGAGAACTAGATCTAGAAG...
21:37562412:TC:T:R:wC         AAAAAAAAAAAATGATTTTCAGCTGGGCACAGTGGCTTACACCTGT...
9:73060384:A:ATTTTC:R:wC      AAAAAAAAAAAAAAGTATTGTAGAGGAAAGAAGTTCTAGCTAATGC...
1:165583188:T:TAA:R:wC        AAAAAAAGATATTTGCAATATATATGATAGCAAAAGCCTAATTTCT...
15:41533058:C:CTT:R:wC        AAAAAAAAAAAAGATCTATGAAGAAATGAAAATGGCAGTGTTGGGA...
1:11075825:C:CAG:R:wC         AAAAAAAAAAAATTATCTCCCCTTCTCTTCTAAATAACCTAAGGTT...
19:4135623:CT:C:R:wC          AAAAAAAAAAAGGAATAAGGAAATATTTTCCAGTATTCAGAAATGA...
1:203688453:A:ATTGCCT:R:wC    AAAAAAAAAAAAAAAAAGCAAGAGAGATAAAATACATGGTTCTAAA...
5:74639544:CTTGTA:C:R:wC      AAAAAA

In [107]:
# NOTE(review): this cell was last run as In [107], before the VCF-writing cell (In [110])
# that keeps only IDs ending in ':R:wC'. On a fresh Restart & Run All an ':A:wC' ID is
# presumably absent from extract_seqs and this lookup would raise KeyError — confirm.
extract_seqs.loc['7:4810197:CTCT:C:A:wC', 'nt_sequence']

'AGGTTGCAGTGAGCCGAGATCATGCCACTGCACTCCAGCCTGGGCAACAGAGGGAGACTCTGTCTCAAAAATAATAATCAAATCAAGTATTTTAAGTTTGGCTCTTTTTTTCAAGAAAGGCTTTTTGGATACCTAGAATACCTTCATGTATGTGGCTTGATTTCTGTTTCAGGAAAAGGGGGGCGGGAGGAGAGACAACA'

In [122]:
# FASTA-extracted window for one mismatching deletion variant (REF 'CTCT', ALT 'C').
extract_seqs.loc['7:4810197:CTCT:C:R:wC', 'nt_sequence']

'GTTGCAGTGAGCCGAGATCATGCCACTGCACTCCAGCCTGGGCAACAGAGGGAGACTCTGTCTCAAAAATAATAATCAAATCAAGTATTTTAAGTTTGGCTCTTCTTTTTTTCAAGAAAGGCTTTTTGGATACCTAGAATACCTTCATGTATGTGGCTTGATTTCTGTTTCAGGAAAAGGGGGGCGGGAGGAGAGACAAC'

In [109]:
# MPRA sequence recorded for the same ID, to compare against the extracted window.
mpra_19.set_index('IDs').loc['7:4810197:CTCT:C:R:wC', 'nt_sequence']

'TGCAGTGAGCCGAGATCATGCCACTGCACTCCAGCCTGGGCAACAGAGGGAGACTCTGTCTCAAAAATAATAATCAAATCAAGTATTTTAAGTTTGGCTCTTTTTTTCAAGAAAGGCTTTTTGGATACCTAGAATACCTTCATGTATGTGGCTTGATTTCTGTTTCAGGAAAAGGGGGGCGGGAGGAGAGACAACAG'

In [100]:
# For every mismatching ID, flag whether REF (field 2) and ALT (field 3) have the
# same length (i.e. the variant is not an indel), then report the flagged fraction.
var_len_match = np.array([
    len(fields[2]) == len(fields[3])
    for fields in (
        seq_id.split(':') for seq_id in extract_seqs.loc[~mismatch_filter].index
    )
])
var_len_match.mean()

0.00013016313779937522

In [101]:
# Mismatching sequences whose REF and ALT alleles have equal length (not indels).
# NOTE(review): the recorded output lists ':A:wC' IDs from an earlier run (In [101]);
# re-run after the current VCF filter to refresh it.
extract_seqs.loc[~mismatch_filter].loc[var_len_match]

Unnamed: 0_level_0,nt_sequence
IDs,Unnamed: 1_level_1
9:139217089:C:T:A:wC,NNNNNNNNGATCTTCTTGGGTCTCAGGTTCCAACATATGAAGGCTG...
11:69139893:T:G:A:wC,NNNNNNNNTGATGGTTTTGGTGATGGCGGTGGTGGTGGTGATGGTG...
19:20523319:G:T:A:wC,TTGAGACCCTCATACCTGATTCTGGCCTCCCCTTAGAGTCACGTGA...


In [17]:
# Extracted window for one equal-length mismatch; the recorded output ends in a run of N's.
extract_seqs.loc['19:20523319:G:T:R:wC', 'nt_sequence']

'TTGAGACCCTCATACCTGATTCTGGCCTCCCCTTAGAGTCACGTGAAGCCCTTCATTAAAACAACATGGATACTTCCACCCAGAACAATAAACAGAACCGGTGGGGAAGGCACAAGAGATTTCTGCAAATTGGCCATGTGATCCTAATGAGAAGCCTGGGCTGATAACCACTAAGCTAAGCATTGCCTCTCAAGCTNNNN'

## result
All but 3 of the mismatched sequences are indels; the remaining 3 have N's in the extracted reference sequence.

# Test liftover

In [77]:
# Load the GRCh38 (no-alt analysis set) reference FASTA for the liftover comparison.
fasta_38 = boda.data.Fasta('GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta')

pre-reading fasta into memory
100%|██████████████████████████| 44284892/44284892 [00:16<00:00, 2742527.68it/s]
finding keys
parsing
100%|█████████████████████████████████████████| 195/195 [11:44<00:00,  3.61s/it]
done


In [78]:
# Same setup as the hg19 section, now with the lifted-over VCF and the hg38 FASTA.
# NOTE: this rebinds `vcf` and `vcf_data`; cells above that use these names will see
# the hg38 objects if they are re-run after this point.
# NOTE(review): judging by the file name, records have hg38 coordinates but keep
# their hg19-based IDs — confirm.
vcf = boda.data.VCF(
    'gtex_hg38_coord_hg19_ID.vcf', chr_prefix='', 
    max_allele_size=20, max_indel_size=20,
)

# Same values as in the hg19 section above.
WINDOW_SIZE = 200
RELATIVE_START = 99
RELATIVE_END = 100

# Dataset yielding 'ref'/'alt' windows around each lifted record, with empty flanks.
vcf_data = boda.data.VcfDataset(
    vcf.vcf, fasta_38.fasta, WINDOW_SIZE, 
    RELATIVE_START, RELATIVE_END, step_size=1, 
    left_flank='', right_flank=''
)


loading DataFrame
Checking and filtering tokens
Allele length checks
Done
210121/210121 records have matching contig in FASTA
returned 210121/210121 records


In [79]:
# Inspect one dataset item; per the recorded output it is a dict with 'ref' and 'alt' tensors.
vcf_data[1]

{'ref': tensor([[[1., 1., 1.,  ..., 0., 0., 1.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 0., 0.],
          [0., 0., 0.,  ..., 0., 1., 0.]],
 
         [[0., 1., 0.,  ..., 0., 0., 0.],
          [0., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [1., 0., 0.,  ..., 1., 1., 1.]]]),
 'alt': tensor([[[1., 1., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 1., 0.],
          [0., 0., 0.,  ..., 1., 0., 1.]],
 
         [[1., 0., 1.,  ..., 0., 0., 0.],
          [0., 1., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 1., 1.]]])}

In [82]:
# Decode the first 'ref' window of every record in the currently loaded VCF and
# remember, per record id, both the decoded sequence and the record's contig.
loader_dict, chrom_key = {}, {}

for idx in tqdm.tqdm(range(vcf.vcf.shape[0])):
    variant_id, contig = vcf.vcf.loc[idx, 'id'], vcf.vcf.loc[idx, 'chrom']
    loader_dict[variant_id] = vec2nt(vcf_data[idx]['ref'][0])
    chrom_key[variant_id] = contig


  0%|          | 0/210121 [00:00<?, ?it/s]

In [83]:
# Turn the two {id: value} mappings into one-column frames indexed by 'IDs'.
# Building them straight from the dicts avoids a 2 x N intermediate frame plus a
# transpose, and an empty mapping simply yields an empty frame.
liftover_seqs = (
    pd.Series(loader_dict, name='nt_sequence')
    .rename_axis('IDs')
    .to_frame()
)

liftover_chrom = (
    pd.Series(chrom_key, name='chrom')
    .rename_axis('IDs')
    .to_frame()
)

In [None]:
# Preview the sequences extracted at the lifted-over coordinates.
liftover_seqs

In [84]:
# IDs present in both the hg19-extracted and the liftover-extracted sequence tables.
intersect_ids = set(extract_seqs.index) & set(liftover_seqs.index)
len(intersect_ids)

199935

In [85]:
# Per shared ID: does the hg19-extracted window equal the liftover-extracted window?
# pandas >= 2.0 rejects a set as a .loc indexer, so the shared IDs are materialised
# as a list once and reused for both frames, which keeps the rows in the same order.
shared_ids = list(intersect_ids)
concord = extract_seqs.loc[shared_ids, 'nt_sequence'] == liftover_seqs.loc[shared_ids, 'nt_sequence']

In [86]:
# IDs whose hg19-extracted and liftover-extracted windows differ. Selecting from
# `concord` itself avoids indexing with a set (rejected by pandas >= 2.0) and
# computes the discordant IDs once instead of three times.
discordant_ids = concord[~concord].index

# hg19 contig is the first ':'-separated field of the ID; the other contig is the
# one recorded for that ID in liftover_chrom.
mismatch_contig_source_19 = [ x.split(':')[0] for x in discordant_ids ]
mismatch_contig_source_38 = [ liftover_chrom.loc[x, 'chrom'] for x in discordant_ids ]

# One discordant ID per line.
with open('liftover_fails.txt', 'w') as f:
    for x in discordant_ids:
        print(x, file=f)

In [88]:
from collections import Counter

# Tally the discordant sequences per contig: first by the contig parsed from the
# ID (printed with a 'chr' prefix), then by the contig stored for the lifted record.
print("in hg19")
for contig, count in Counter(mismatch_contig_source_19).items():
    print(f"chr{contig}: {count}")

print("")

print("in hg38")
for contig, count in Counter(mismatch_contig_source_38).items():
    print(f"{contig}: {count}")

in hg19
chr10: 419
chr8: 102
chr17: 222
chr1: 826
chr11: 26
chr22: 211
chr19: 290
chr9: 463
chrX: 300
chr20: 52
chr12: 43
chr14: 218
chr7: 310
chr18: 23
chr2: 84
chr15: 244
chr4: 56
chr21: 16
chr3: 65
chr6: 17
chr5: 22
chr16: 14
chr13: 3

in hg38
chr10: 420
chr8: 101
chr17: 222
chr1: 828
chr11: 28
chr22: 225
chr19: 290
chr9: 458
chrX: 299
chr20: 69
chr12: 42
chr14: 205
chr7: 309
chr18: 23
chr2: 86
chr15: 241
chr4: 51
chr21: 17
chr3: 62
chr6: 17
chr5: 19
chr16: 11
chr13: 3


In [89]:
# Shape of the chr22 entry in the hg38 FASTA container; the recorded output is (4, 50818468).
fasta_38.fasta['chr22'].shape

(4, 50818468)

In [None]:
# Fraction of discordant sequences whose contig differs between the two builds.
# The hg19 contigs are parsed from the ID without a 'chr' prefix, while the hg38
# contigs carry it, so the prefix is added before comparing.
# NOTE(review): the name `mismatch_contig_source` is not defined anywhere visible and
# the `_19` / `_38` variants are plain lists (no .mean()); this metric is the inferred
# intent — confirm.
contig_changed = np.array([
    f'chr{contig_19}' != contig_38
    for contig_19, contig_38 in zip(mismatch_contig_source_19, mismatch_contig_source_38)
])
print(contig_changed.mean())