# Extracting features from protein sequence data
Protein sequence data is stored in `../data/{PROTEIN}.json`, one JSON file per protein ID found in the `Protein` column of the mutations dataframe.

In [2]:
import json
import os
import pickle

import numpy as np
import pandas as pd

In [8]:
# Directory holding the pickled dataframe and the per-protein JSON files.
data_path = '../data/'
df_path = data_path + 'swissmodel_dataframe.pkl'

# NOTE(review): pickle.load can execute arbitrary code; only open files
# from a trusted source.
with open(df_path, 'rb') as pickle_file:
    mutations_df = pickle.load(pickle_file)

mutations_df.head()

Unnamed: 0,Protein,Frequency,From,To,Position,Row
0,P0DTC1,3e-05,S,A,4393,0
1,P0DTC1,3e-05,S,L,4393,0
2,P0DTC1,0.00016,A,D,4394,1
3,P0DTC1,0.00016,A,S,4394,1
4,P0DTC1,0.00016,A,T,4394,1


In [20]:
# Show every distinct original residue appearing in the 'From' column.
source_residues = mutations_df['From'].unique()
print(list(source_residues))

['S', 'A', 'D', 'G', 'F', 'V', 'L', 'P', 'Q', 'C', 'N', 'T', 'R', 'Y', 'K', 'H', 'W', 'I', 'E', 'M']


**NOTE: 'Z' is used as a placeholder symbol for a missing amino acid (this happens when the mutated position is at the beginning or the end of the sequence, so it has no left or right neighbor).**

In [21]:
def _neighbor_residues(sequence, position, missing='Z'):
    """Return the residues directly left and right of a 1-based position.

    Parameters
    ----------
    sequence : str
        Residue sequence read from the protein's JSON file.
    position : int
        1-based position of the mutated residue within ``sequence``.
    missing : str
        Symbol returned for a neighbor that falls outside the sequence.

    Returns
    -------
    tuple of (str, str)
        ``(left, right)`` neighbors, with ``missing`` substituted at
        either end of the sequence.
    """
    index = position - 1  # convert to a 0-based string index
    # Explicit bounds checks: a negative index would silently wrap around to
    # the end of the string instead of raising IndexError.
    left = sequence[index - 1] if index >= 1 else missing
    right = sequence[index + 1] if index + 1 < len(sequence) else missing
    return left, right


all_proteins = list(mutations_df.Protein.unique())

# Load and validate each protein's sequence once; the file is closed before
# any row processing starts.
sequences = {}
for protein in all_proteins:
    with open(data_path+protein+'.json', 'r') as f:
        result_dict = json.load(f)
    sequence = result_dict['result']['sequence']
    assert len(sequence) == result_dict['result']['sequence_length'], \
        'sequence length mismatch for ' + str(protein)
    print(protein, int((mutations_df.Protein == protein).sum()))
    sequences[protein] = sequence

# Walk the rows in dataframe order so that left_aa / right_aa line up
# positionally with mutations_df even if rows of the same protein are not
# stored contiguously.
left_aa = []
right_aa = []
for protein, position, from_aa in zip(mutations_df['Protein'],
                                      mutations_df['Position'],
                                      mutations_df['From']):
    sequence = sequences[protein]
    assert 1 <= position <= len(sequence), \
        'position %s out of range for %s' % (position, protein)
    assert from_aa == sequence[position - 1], \
        'residue mismatch at %s position %s' % (protein, position)
    left, right = _neighbor_residues(sequence, position)
    left_aa.append(left)
    right_aa.append(right)

P0DTC1 26
P0DTC2 2761
P0DTC3 1109
P0DTC4 199
P0DTC5 376
P0DTC6 241
P0DTC7 625
P0DTC8 524
P0DTC9 1276
P0DTD1 15723
P0DTD8 168


# Appending features to mutation data

In [25]:
# Attach the neighbor-residue features as new columns; each list must hold
# one entry per dataframe row.
for column_name, column_values in (('LeftAA', left_aa), ('RightAA', right_aa)):
    mutations_df[column_name] = column_values
mutations_df.head()

Unnamed: 0,Protein,Frequency,From,To,Position,Row,LeftAA,RightAA
0,P0DTC1,3e-05,S,A,4393,0,Q,A
1,P0DTC1,3e-05,S,L,4393,0,Q,A
2,P0DTC1,0.00016,A,D,4394,1,S,D
3,P0DTC1,0.00016,A,S,4394,1,S,D
4,P0DTC1,0.00016,A,T,4394,1,S,D


# Saving new dataframe

In [27]:
# Destination for the feature-augmented dataframe, next to the raw input.
processed_df_path = os.path.join(data_path, 'processed_df.pkl')

In [26]:
# Persist the feature-augmented dataframe to its own file. Writing to df_path
# would overwrite the raw input pickle and leave processed_df_path unwritten.
with open(processed_df_path, 'wb') as f:
    pickle.dump(mutations_df, f)

Testing the saved file by loading it back and inspecting the first rows.

In [28]:
# Round-trip check: read the processed pickle back from disk and preview it.
with open(processed_df_path, 'rb') as saved_file:
    loaded_df = pickle.load(saved_file)

loaded_df.head()

Unnamed: 0,Protein,Frequency,From,To,Position,Row,LeftAA,RightAA
0,P0DTC1,3e-05,S,A,4393,0,Q,A
1,P0DTC1,3e-05,S,L,4393,0,Q,A
2,P0DTC1,0.00016,A,D,4394,1,S,D
3,P0DTC1,0.00016,A,S,4394,1,S,D
4,P0DTC1,0.00016,A,T,4394,1,S,D
