In [234]:
from bs4 import BeautifulSoup # pip install BeautifulSoup4
import requests
import pickle # pip install pickleshare
import re
import pandas as pd # pip install pandas

## Getting all the data from Swiss Model

In [7]:
data = requests.get("https://swissmodel.expasy.org/repository/user_annotation_project/BdWKmZ")

In [24]:
# Saves data to a file
raw_page_path = '../data/raw_page'

with open(raw_page_path, 'wb') as f:
    pickle.dump(data, f)

## Scraping

In [32]:
soup = BeautifulSoup(str(data.content), 'html.parser')

In [66]:
all_tr = soup.find_all('tr')

In [79]:
# Discards the first row
rows = [element.text for element in soup.find_all('tr')[1:]]
print(len(rows))

7533


Good news: 7533 is the number of annotations in the table!

In [235]:
rows[0]

'P0DTC143934393 #fffdfd -\\n            0.000030 (S4393A (nsp11:1), S4393L (nsp11:1))No'

### Getting desired data through regular expressions

In [240]:
protein_pattern = re.compile(r'^\w{6}')
matches = protein_pattern.findall(rows[0])
print(matches[0])

P0DTC1


In [242]:
frequency_pattern = re.compile(r'\d\.\d{6}')
matches = frequency_pattern.findall(rows[0])
print(matches)

['0.000030']


In [251]:
annotation_pattern = re.compile(r'(?<=[\s(])[A-Z]\d+[A-Z]')
matches = annotation_pattern.findall(rows[3])
print(matches)

['A4396E', 'A4396S', 'A4396T', 'A4396V']


In [258]:
matches[0][1:-1]

'4396'

In [260]:
from tqdm.notebook import tqdm

In [264]:
swiss_df = pd.DataFrame(columns=['Protein', 'Frequency', 'From', 'To', 'Position', 'Row'])
i = 0
with tqdm(total=len(rows)) as pbar:
    for n, row in enumerate(rows):
        protein = protein_pattern.findall(row)[0]
        freq = frequency_pattern.findall(row)[0]
        annotations = annotation_pattern.findall(row)
        for annotation in annotations:
            from_ = annotation[0]
            to = annotation[-1:]
            position = annotation[1:-1]
            swiss_df.loc[i] = [protein, freq, from_, to, position, n]
            i += 1
        pbar.update(1)

  0%|          | 0/7533 [00:00<?, ?it/s]

In [265]:
swiss_df.head()

Unnamed: 0,Protein,Frequency,From,To,Position,Row
0,P0DTC1,3e-05,S,A,4393,0
1,P0DTC1,3e-05,S,L,4393,0
2,P0DTC1,0.00016,A,D,4394,1
3,P0DTC1,0.00016,A,S,4394,1
4,P0DTC1,0.00016,A,T,4394,1


In [266]:
swiss_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23028 entries, 0 to 23027
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Protein    23028 non-null  object
 1   Frequency  23028 non-null  object
 2   From       23028 non-null  object
 3   To         23028 non-null  object
 4   Position   23028 non-null  object
 5   Row        23028 non-null  object
dtypes: object(6)
memory usage: 1.2+ MB


### Changing data types

In [313]:
type_dict = {"Protein": str,
            "Frequency": float,
            "From": "category",
            "To": "category",
            "Position": int,
            "Row": int}

swiss_df = swiss_df.astype(type_dict)
swiss_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23028 entries, 0 to 23027
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   Protein    23028 non-null  object  
 1   Frequency  23028 non-null  float64 
 2   From       23028 non-null  category
 3   To         23028 non-null  category
 4   Position   23028 non-null  int64   
 5   Row        23028 non-null  int64   
dtypes: category(2), float64(1), int64(2), object(1)
memory usage: 945.9+ KB


### Saving dataframe

In [319]:
swiss_df_path = '../data/swissmodel_dataframe.pkl'
with open(swiss_df_path, 'wb') as f:
    pickle.dump(swiss_df, f)

In [322]:
# testing load function
with open(swiss_df_path, 'rb') as f:
    test = pickle.load(f)

test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23028 entries, 0 to 23027
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   Protein    23028 non-null  object  
 1   Frequency  23028 non-null  float64 
 2   From       23028 non-null  category
 3   To         23028 non-null  category
 4   Position   23028 non-null  int64   
 5   Row        23028 non-null  int64   
dtypes: category(2), float64(1), int64(2), object(1)
memory usage: 944.8+ KB


In [323]:
test.tail()

Unnamed: 0,Protein,Frequency,From,To,Position,Row
23023,P0DTD8,0.00039,A,I,43,7532
23024,P0DTD8,0.00039,A,L,43,7532
23025,P0DTD8,0.00039,A,S,43,7532
23026,P0DTD8,0.00039,A,T,43,7532
23027,P0DTD8,0.00039,A,V,43,7532
