In [1]:
import json
import os
import pickle

import numpy as np
import pandas as pd
import wget #!pip install python3-wget

# Getting Swissmodel dataframe

In [2]:
# Directory (relative to this notebook) that holds the pickled mutation table
# and receives the per-protein JSON files downloaded further down.
data_path = '../data/'
# List what is already on disk before loading or downloading anything.
os.listdir(data_path)

['P0DTC1.json', 'raw_page', 'swissmodel_dataframe.pkl']

In [3]:
# Load the pickled mutation table and preview its first rows.
# NOTE(review): pickle.load executes arbitrary code from the file; only use it
# on a pickle produced by a trusted step of this project.
mutations_path = f'{data_path}swissmodel_dataframe.pkl'
with open(mutations_path, 'rb') as pickle_file:
    mutations_df = pickle.load(pickle_file)

mutations_df.head()

Unnamed: 0,Protein,Frequency,From,To,Position,Row
0,P0DTC1,3e-05,S,A,4393,0
1,P0DTC1,3e-05,S,L,4393,0
2,P0DTC1,0.00016,A,D,4394,1
3,P0DTC1,0.00016,A,S,4394,1
4,P0DTC1,0.00016,A,T,4394,1


# Getting all unique protein names

In [4]:
# Distinct protein identifiers, in order of first appearance in the table.
all_proteins = mutations_df['Protein'].unique().tolist()
print(all_proteins)

['P0DTC1', 'P0DTC2', 'P0DTC3', 'P0DTC4', 'P0DTC5', 'P0DTC6', 'P0DTC7', 'P0DTC8', 'P0DTC9', 'P0DTD1', 'P0DTD8']


## Downloading Protein JSONs

Assembling URL

In [5]:
# URL template for a protein's SWISS-MODEL repository entry; `{}` is filled with
# the protein identifier. The template must not end in whitespace, otherwise the
# space becomes part of the requested URL and of any filename derived from it.
base_url = "https://swissmodel.expasy.org/repository/uniprot/{}.json"

In [11]:
# NOTE(review): `result` is not assigned in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel. Presumably it held the return
# value of an earlier, since-removed download call -- confirm, then restore the
# assignment or delete this cell.
result

'P0DTC1.json '

In [14]:
# Download the SWISS-MODEL repository JSON of every protein into `data_path`,
# one file per protein named "<protein>.json".
for protein in all_proteins:
    print(protein)
    path = data_path + protein + '.json'
    # wget.download() never overwrites: when `path` already exists it stores the
    # new copy under a numbered name such as "P0DTC1 (1).json". Skip files that
    # are already on disk so re-running this cell neither litters the data
    # directory with duplicates nor hits the server again.
    if os.path.exists(path):
        continue
    wget.download(base_url.format(protein), out=path)

P0DTC1
P0DTC2
P0DTC3
P0DTC4
P0DTC5
P0DTC6
P0DTC7
P0DTC8
P0DTC9
P0DTD1
P0DTD8


## Checking data
 
Checks that the original amino acid recorded for each mutation matches the residue found at that position in the protein sequence.

In [75]:
# Read the pickled mutation table again under the name `mutation_df` and show
# its first rows.
mutation_df_path = f'{data_path}swissmodel_dataframe.pkl'
with open(mutation_df_path, 'rb') as pickle_file:
    mutation_df = pickle.load(pickle_file)
mutation_df.head()

Unnamed: 0,Protein,Frequency,From,To,Position,Row
0,P0DTC1,3e-05,S,A,4393,0
1,P0DTC1,3e-05,S,L,4393,0
2,P0DTC1,0.00016,A,D,4394,1
3,P0DTC1,0.00016,A,S,4394,1
4,P0DTC1,0.00016,A,T,4394,1


Running the check for every protein; the first one (P0DTC1) has the smallest number of mutation rows.

In [74]:
# Sanity-check the mutation table against the downloaded protein JSONs: for every
# protein, the JSON sequence must have the length it declares, and the 'From'
# residue of each mutation must equal the residue at its (1-based) 'Position'.
# Prints "<protein> <number of mutation rows>" per protein and raises
# AssertionError on the first mismatch.
for protein in all_proteins:
    # Only the parsing needs the file handle; close it before the checks run.
    with open(data_path + protein + '.json', 'r') as f:
        result_dict = json.load(f)

    sequence = result_dict['result']['sequence']
    declared_length = result_dict['result']['sequence_length']
    assert len(sequence) == declared_length, (
        f"{protein}: sequence has {len(sequence)} residues, "
        f"JSON declares {declared_length}"
    )

    protein_df = mutations_df.loc[mutations_df.Protein == protein]
    print(protein, len(protein_df))

    for position, original in zip(protein_df['Position'], protein_df['From']):
        # Without this bound a position of 0 (or below) would wrap around to a
        # negative index and be compared against a residue from the end of
        # the sequence.
        assert 1 <= position <= len(sequence), (
            f"{protein}: position {position} is outside 1..{len(sequence)}"
        )
        assert original == sequence[position - 1], (
            f"{protein}: expected {original!r} at position {position}, "
            f"sequence has {sequence[position - 1]!r}"
        )

P0DTC1 26
P0DTC2 2761
P0DTC3 1109
P0DTC4 199
P0DTC5 376
P0DTC6 241
P0DTC7 625
P0DTC8 524
P0DTC9 1276
P0DTD1 15723
P0DTD8 168
