In [1]:
import pandas as pd
from pvtools import Sequence, LookupTable
import json
import numpy as np

# Section 1. Build a lookup table for the test gene

In [2]:
# Load the three inputs for the test gene. Each Sequence is built from a FASTA
# file paired with a JSON file: the gene's own record (ng) and the GRCh37 (g7)
# and GRCh38 (g8) records.
ng = Sequence(fasta_file='TestGene/TestGene.fasta', json_file='TestGene/TestGene.json')
g7 = Sequence(fasta_file='TestGene/GRCh37.fasta', json_file='TestGene/GRCh37.json')
g8 = Sequence(fasta_file='TestGene/GRCh38.fasta', json_file='TestGene/GRCh38.json')

In [3]:
# Combine the gene, GRCh37 and GRCh38 sequences into one lookup table.
lookup = LookupTable(ng, g7, g8)

In [4]:
# Show the table held by the lookup object (one row per position; middle rows are elided in the display).
lookup.df

Unnamed: 0,Start_Position,ATG_Position,Transcript_Position,GRCh37_Position,GRCh38_Position,Allele,Exon_Annotation,CDS_Annotation
0,1,-350,c.-250,10001,20001,G,Upstream,Upstream
1,2,-349,c.-249,10002,20002,A,Upstream,Upstream
2,3,-348,c.-248,10003,20003,T,Upstream,Upstream
3,4,-347,c.-247,10004,20004,T,Upstream,Upstream
4,5,-346,c.-246,10005,20005,C,Upstream,Upstream
...,...,...,...,...,...,...,...,...
3495,3496,3145,c.*1946,13496,23496,G,Downstream,Downstream
3496,3497,3146,c.*1947,13497,23497,G,Downstream,Downstream
3497,3498,3147,c.*1948,13498,23498,C,Downstream,Downstream
3498,3499,3148,c.*1949,13499,23499,A,Downstream,Downstream


In [5]:
# Save the test-gene lookup table to 'TestGene/Lookup.tsv'.
lookup.to_tsv('TestGene/Lookup.tsv')

# Section 2. Build a lookup table for SLCO1B1

In [6]:
# Load the three inputs for SLCO1B1: the NG_011745.1 record (ng) and the
# GRCh37 (g7) and GRCh38 (g8) records, each from a FASTA file plus a JSON file.
# NOTE: this rebinds ng, g7 and g8, which held the test-gene sequences in
# Section 1; every later cell that uses these names refers to SLCO1B1.
ng = Sequence(fasta_file='SLCO1B1/NG_011745.1.fasta', json_file='SLCO1B1/NG_011745.1.json')
g7 = Sequence(fasta_file='SLCO1B1/GRCh37.fasta', json_file='SLCO1B1/GRCh37.json')
g8 = Sequence(fasta_file='SLCO1B1/GRCh38.fasta', json_file='SLCO1B1/GRCh38.json')

In [7]:
# Combine the SLCO1B1 sequences into one lookup table (replaces the test-gene `lookup`).
lookup = LookupTable(ng, g7, g8)

In [8]:
# Show the SLCO1B1 lookup table (middle rows are elided in the display).
lookup.df

Unnamed: 0,Start_Position,ATG_Position,Transcript_Position,GRCh37_Position,GRCh38_Position,Allele,Exon_Annotation,CDS_Annotation
0,1,-15381,c.-5104,21279128,21126194,G,Upstream,Upstream
1,2,-15380,c.-5103,21279129,21126195,A,Upstream,Upstream
2,3,-15379,c.-5102,21279130,21126196,T,Upstream,Upstream
3,4,-15378,c.-5101,21279131,21126197,T,Upstream,Upstream
4,5,-15377,c.-5100,21279132,21126198,C,Upstream,Upstream
...,...,...,...,...,...,...,...,...
115598,115599,100217,c.*2603,21394726,21241792,A,Downstream,Downstream
115599,115600,100218,c.*2604,21394727,21241793,A,Downstream,Downstream
115600,115601,100219,c.*2605,21394728,21241794,A,Downstream,Downstream
115601,115602,100220,c.*2606,21394729,21241795,C,Downstream,Downstream


In [9]:
# Save the SLCO1B1 lookup table to 'SLCO1B1/Lookup.tsv'.
lookup.to_tsv('SLCO1B1/Lookup.tsv')

# Section 3. Perform sanity checks

## Section 3-1. Compare the sequence identity

In [None]:
# Chained comparison: True only when the NG_011745.1, GRCh37 and GRCh38
# sequences are all equal to one another.
ng.seq == g7.seq == g8.seq

## Section 3-2. Confirm the mRNA sequence

In [None]:
# Load the NM_006446.5 record and compare its sequence with the result of
# ng.transcribe(); the cell displays True when the two are equal.
# NOTE(review): transcribe() presumably returns the mRNA derived from the gene
# record -- confirm against pvtools.
nm = Sequence(fasta_file='SLCO1B1/NM_006446.5.fasta', json_file='SLCO1B1/NM_006446.5.json')
ng.transcribe() == nm.seq

## Section 3-3. Confirm the protein sequence

In [10]:
# Cross-check the amino-acid changes listed in the manual table against the
# protein sequence stored on `ng`: the residue written before the position
# number of each entry should equal the residue found at that position.
df = pd.read_table('SLCO1B1/manual.tsv')

# Only entries that contain a position number can be checked.
b = [item for item in df['Impact'].to_list() if any(ch.isdigit() for ch in item)]

# All digits of an entry, read together, give its 1-based residue position.
pos_list = [int(''.join(filter(str.isdigit, item))) for item in b]

# Residue found at each of those positions in the stored protein sequence.
ref_aa = [ng.data['Protein'][pos - 1] for pos in pos_list]

# One line per entry: match flag, residue from the table, residue from the
# sequence, position. A False flag marks a disagreement to investigate.
for item, pos, expected in zip(b, pos_list, ref_aa):
    aa = item.partition(str(pos))[0]
    print(aa == expected, aa, expected, pos)

False A R 57
True G G 71
True F F 73
True V V 82
True N N 130
True S S 137
True N N 151
True R R 152
True P P 155
True E E 156
True Y Y 173
True V V 174
True L L 191
True L L 193
True F F 199
True I I 222
True D D 241
True I I 245
True R R 253
True R R 253
True L L 294
True I I 353
True Y Y 362
True F F 400
True V V 416
True N N 432
True G G 437
True D D 462
True P P 484
True G G 488
True I I 499
True L L 643
True D D 655
True E E 667
True H H 678
True S S 682


## Section 3-4. Comparison with the manual table

In [12]:
# Sanity check: the reference allele recorded for each SNP in the manual table
# must equal the base at the same 1-based position of the NG_011745.1 sequence.
df = pd.read_table('SLCO1B1/manual.tsv')

# Rows marked '.' have no NG_011745.1 position and are skipped. The .copy()
# detaches the filtered rows from the original frame so that the column
# assignments below do not raise pandas' SettingWithCopyWarning.
df = df[df['NG_011745.1 (start=1)'] != '.'].copy()

# The reference allele is the text before '>' in the SNP column.
df['Ref'] = df['SNP'].str.split('>').str[0]
df['NG_011745.1 (start=1)'] = df['NG_011745.1 (start=1)'].astype(int)

# NOTE: rows that share a position collapse to the last row's Ref in this dict.
ng_dict = dict(zip(df['NG_011745.1 (start=1)'], df['Ref']))

results = []
mismatches = []
for pos, allele in ng_dict.items():
    observed = ng.seq[pos-1]  # table positions are 1-based
    matched = observed == allele
    results.append(matched)
    if not matched:
        mismatches.append((pos, allele, observed))

if all(results):
    print('All alleles passed')
else:
    # Report every disagreement instead of ending silently, so that a failed
    # check cannot be mistaken for a cell that produced no output.
    print(f'{len(mismatches)} of {len(results)} alleles failed')
    for pos, allele, observed in mismatches:
        print(f'position {pos}: manual table has {allele!r}, sequence has {observed!r}')

All alleles passed
