In [1]:
import pandas as pd
from pvtools import Sequence, LookupTable, dbSNP
import json
import numpy as np

# Section 1. Build an example lookup table for the test gene

In [2]:
# Load one Sequence per FASTA/JSON pair for the test gene: the gene's own
# files plus the GRCh37 and GRCh38 files.
ng = Sequence(
    fasta_file='TestGene/TestGene.fasta',
    json_file='TestGene/TestGene.json',
)
g7 = Sequence(
    fasta_file='TestGene/GRCh37.fasta',
    json_file='TestGene/GRCh37.json',
)
g8 = Sequence(
    fasta_file='TestGene/GRCh38.fasta',
    json_file='TestGene/GRCh38.json',
)

In [3]:
# Build the test-gene lookup table from the gene sequence and the two
# genome-build sequences loaded in the previous cell.
lookup = LookupTable(ng, g7, g8)

In [4]:
# Display the table held in lookup.df. The output below lists, for each
# Start_Position, the ATG / transcript / GRCh37 / GRCh38 coordinates, the
# allele, and the exon and CDS annotations.
lookup.df

Unnamed: 0,Start_Position,ATG_Position,Transcript_Position,GRCh37_Position,GRCh38_Position,Allele,Exon_Annotation,CDS_Annotation
0,1,-350,c.-250,10001,20001,G,Upstream,Upstream
1,2,-349,c.-249,10002,20002,A,Upstream,Upstream
2,3,-348,c.-248,10003,20003,T,Upstream,Upstream
3,4,-347,c.-247,10004,20004,T,Upstream,Upstream
4,5,-346,c.-246,10005,20005,C,Upstream,Upstream
...,...,...,...,...,...,...,...,...
3495,3496,3145,c.*1946,13496,23496,G,Downstream,Downstream
3496,3497,3146,c.*1947,13497,23497,G,Downstream,Downstream
3497,3498,3147,c.*1948,13498,23498,C,Downstream,Downstream
3498,3499,3148,c.*1949,13499,23499,A,Downstream,Downstream


In [5]:
# Save the test-gene lookup table to TestGene/Lookup.tsv
# (presumably tab-separated, going by the method name -- confirm in pvtools).
lookup.to_tsv('TestGene/Lookup.tsv')

# Section 2. Build a lookup table for SLCO1B1

In [6]:
# Rebind ng, g7 and g8 to the SLCO1B1 sequences (NG_011745.1 plus the GRCh37
# and GRCh38 files). The sanity checks in Section 3 rely on these bindings.
ng = Sequence(
    fasta_file='SLCO1B1/NG_011745.1.fasta',
    json_file='SLCO1B1/NG_011745.1.json',
)
g7 = Sequence(
    fasta_file='SLCO1B1/GRCh37.fasta',
    json_file='SLCO1B1/GRCh37.json',
)
g8 = Sequence(
    fasta_file='SLCO1B1/GRCh38.fasta',
    json_file='SLCO1B1/GRCh38.json',
)

In [7]:
# Build the SLCO1B1 lookup table. This rebinds `lookup`, so the checks in
# Section 3 query the SLCO1B1 table rather than the test-gene one.
lookup = LookupTable(ng, g7, g8)

In [8]:
# Display the SLCO1B1 table held in lookup.df (same columns as the test-gene
# table shown in Section 1).
lookup.df

Unnamed: 0,Start_Position,ATG_Position,Transcript_Position,GRCh37_Position,GRCh38_Position,Allele,Exon_Annotation,CDS_Annotation
0,1,-15381,c.-5104,21279128,21126194,G,Upstream,Upstream
1,2,-15380,c.-5103,21279129,21126195,A,Upstream,Upstream
2,3,-15379,c.-5102,21279130,21126196,T,Upstream,Upstream
3,4,-15378,c.-5101,21279131,21126197,T,Upstream,Upstream
4,5,-15377,c.-5100,21279132,21126198,C,Upstream,Upstream
...,...,...,...,...,...,...,...,...
115598,115599,100217,c.*2603,21394726,21241792,A,Downstream,Downstream
115599,115600,100218,c.*2604,21394727,21241793,A,Downstream,Downstream
115600,115601,100219,c.*2605,21394728,21241794,A,Downstream,Downstream
115601,115602,100220,c.*2606,21394729,21241795,C,Downstream,Downstream


In [9]:
# Save the SLCO1B1 lookup table to SLCO1B1/Lookup.tsv
# (presumably tab-separated, going by the method name -- confirm in pvtools).
lookup.to_tsv('SLCO1B1/Lookup.tsv')

# Section 3. Perform sanity checks

## Section 3-1. Compare the sequence identity

In [10]:
# Chained comparison: evaluates as (ng.seq == g7.seq) and (g7.seq == g8.seq),
# so the cell shows True only when all three SLCO1B1 sequences are equal.
ng.seq == g7.seq == g8.seq

True

## Section 3-2. Confirm the mRNA sequence

In [11]:
# Load the NM_006446.5 files and compare that sequence with the result of
# ng.transcribe(); the cell's value is the outcome of the comparison.
nm = Sequence(
    fasta_file='SLCO1B1/NM_006446.5.fasta',
    json_file='SLCO1B1/NM_006446.5.json',
)
ng.transcribe() == nm.seq

True

## Section 3-3. Confirm the protein sequence

In [12]:
def _split_impact(impact):
    """Split an Impact string at its first run of digits.

    Returns a ``(prefix, position)`` tuple: ``prefix`` is the text before the
    first digit and ``position`` is the first contiguous run of digits as an
    int. Returns ``None`` when the string contains no digit.

    Only the first run is used, so a value that carries a second number is not
    misread as one long position built from all of its digits.
    """
    start = next((k for k, ch in enumerate(impact) if ch.isdigit()), None)
    if start is None:
        return None
    end = start
    while end < len(impact) and impact[end].isdigit():
        end += 1
    return impact[:start], int(impact[start:end])


# Only SNP rows whose Issue column is '.' are checked.
df = pd.read_table('SLCO1B1/SNP_Table.tsv')
df = df[df['Issue'] == '.']

# Keep the Impact values that contain a residue number, together with their
# parsed (reported amino acid, residue number) parts.
impact_list = []
parsed_impacts = []
for item in df['Impact'].to_list():
    parts = _split_impact(item)
    if parts is not None:
        impact_list.append(item)
        parsed_impacts.append(parts)

pos_list = [pos for _, pos in parsed_impacts]

# pos - 1 turns the residue number parsed from Impact into a 0-based index
# into ng.data['Protein'].
aa_list = [ng.data['Protein'][pos - 1] for pos in pos_list]

# Compare the amino acid found in the protein with the one named in Impact.
results = [
    ref_aa == test_aa
    for ref_aa, (test_aa, _) in zip(aa_list, parsed_impacts)
]

print("All amino acids matched:", all(results))

All amino acids matched: True


## Section 3-4. Confirm the coordinates

In [13]:
# For the SNP rows whose Issue column is '.', look up every coordinate column
# in the lookup table by Start_Position and compare it with the value stored
# in the SNP table.
df = pd.read_table('SLCO1B1/SNP_Table.tsv')
df = df[df['Issue'] == '.']

start_positions = df['Start_Position'].astype(int)

# One list of looked-up values per coordinate column, in the column order below.
looked_up = {
    column: [lookup.find('Start_Position', column, start) for start in start_positions]
    for column in ('ATG_Position', 'Transcript_Position', 'GRCh37_Position', 'GRCh38_Position')
}
atg_pos = looked_up['ATG_Position']
transcript_pos = looked_up['Transcript_Position']
grch37_pos = looked_up['GRCh37_Position']
grch38_pos = looked_up['GRCh38_Position']

# Transcript_Position is compared as-is (values such as 'c.-5104' are text);
# the other three columns are cast to int first.
atg_matches = df['ATG_Position'].astype(int) == atg_pos
transcript_matches = df['Transcript_Position'] == transcript_pos
grch37_matches = df['GRCh37_Position'].astype(int) == grch37_pos
grch38_matches = df['GRCh38_Position'].astype(int) == grch38_pos

print('ATG_Position:', all(atg_matches))
print('Transcript_Position:', all(transcript_matches))
print('GRCh37_Position:', all(grch37_matches))
print('GRCh38_Position:', all(grch38_matches))

ATG_Position: True
Transcript_Position: True
GRCh37_Position: True
GRCh38_Position: True


## Section 3-5. Confirm the alleles

In [14]:
# Compare each Reference_Allele in the SNP table (rows whose Issue is '.')
# with the Allele the lookup table holds at the same Start_Position.
df = pd.read_table('SLCO1B1/SNP_Table.tsv')
df = df.loc[df['Issue'] == '.']

test_alleles = df['Reference_Allele'].to_list()

ref_alleles = []
for start in df['Start_Position'].astype(int):
    ref_alleles.append(lookup.find('Start_Position', 'Allele', start))

print("All alleles matched:", test_alleles == ref_alleles)

All alleles matched: True


## Section 3-6. Confirm the rs IDs

In [15]:
# Compare the rs_ID of every SNP row whose Issue is '.' with what each dbSNP
# table returns for that row's GRCh37 / GRCh38 position.
df = pd.read_table('SLCO1B1/SNP_Table.tsv')
df = df.loc[df['Issue'] == '.']
test_ids = df['rs_ID'].to_list()

db7 = dbSNP('SLCO1B1/GRCh37_dbSNP.tsv')
db8 = dbSNP('SLCO1B1/GRCh38_dbSNP.tsv')

# get_ref is queried with the pair (position - 1, position) for each row.
ref_ids7 = []
for position in df['GRCh37_Position']:
    end = int(position)
    ref_ids7.append(db7.get_ref(end - 1, end))

ref_ids8 = []
for position in df['GRCh38_Position']:
    end = int(position)
    ref_ids8.append(db8.get_ref(end - 1, end))

# test_ids and both reference lists hold one entry per row, so they pair up
# one-to-one.
results7 = [test_id == ref_id for test_id, ref_id in zip(test_ids, ref_ids7)]
results8 = [test_id == ref_id for test_id, ref_id in zip(test_ids, ref_ids8)]

print("All rs Ids matched for GRCh37:", all(results7))
print("All rs Ids matched for GRCh38:", all(results8))

All rs Ids matched for GRCh37: True
All rs Ids matched for GRCh38: True
