In [1]:
import os
from os import path
# Select Qt's 'offscreen' platform plugin through the environment.
# NOTE(review): presumably needed so a Qt-backed dependency can load on a
# machine without a display -- confirm which import requires it.
os.environ['QT_QPA_PLATFORM'] = 'offscreen'

In [2]:
import sys
# Put ../scripts near the front of the module search path (index 1).
# NOTE(review): presumably this is where the local `parseaf` module lives -- confirm.
sys.path.insert(1, '../scripts')

In [6]:
import random
import pandas as pd
from itertools import groupby
from operator import itemgetter
from Bio import SeqIO
import parseaf as pa
import mdtraj as md

In [4]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from
# pandas and other libraries; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

## Functions

In [47]:
def find_helix_indices(secstr):
    """Return the positions in ``secstr`` whose label equals ``'H'``.

    Parameters
    ----------
    secstr : iterable
        Per-position secondary-structure labels (e.g. a string).

    Returns
    -------
    list of int
        Zero-based positions, in ascending order, where the label is ``'H'``.
    """
    return [pos for pos, code in enumerate(secstr) if code == 'H']

In [58]:
def find_disordered_indices(ss):
    """Return the positions in ``ss`` whose label equals ``'C'``.

    Parameters
    ----------
    ss : iterable
        Per-position secondary-structure labels (e.g. a string).

    Returns
    -------
    list of int
        Zero-based positions, in ascending order, where the label is ``'C'``.
    """
    return [pos for pos, code in enumerate(ss) if code == 'C']

In [48]:
def list_to_regions(indices, minlen=25):
    """Collapse integer positions into runs of consecutive values.

    A run continues while each value is exactly one greater than the value
    before it; any other step starts a new run.

    Parameters
    ----------
    indices : iterable of int
        Positions to group, in the order they should be scanned.
    minlen : int, optional
        Minimum run length (``end - start + 1``) for a run to be reported.

    Returns
    -------
    list of tuple
        ``(start, end)`` pairs, both ends inclusive, in scan order.
    """
    regions = []
    run_start = None
    run_end = None
    for idx in indices:
        if run_start is not None and idx == run_end + 1:
            # Still inside the current consecutive run.
            run_end = idx
            continue
        # The run (if any) just ended: keep it when it is long enough.
        if run_start is not None and run_end - run_start + 1 >= minlen:
            regions.append((run_start, run_end))
        run_start = run_end = idx
    # Flush the final run.
    if run_start is not None and run_end - run_start + 1 >= minlen:
        regions.append((run_start, run_end))
    return regions

In [8]:
def append_sequence(row, fdir='/mnt/d/research/drummond-lab/data/yeast-alphafold-output-unzipped/'):
    """Return the slice of a model's sequence described by ``row``.

    The PDB file is located as ``AF-<uni_id>-F1-model_v1.pdb`` inside
    ``fdir`` and read with ``pa.read_seq_from_pdb``.

    Parameters
    ----------
    row : mapping
        Must provide ``'uni_id'``, ``'start'`` and ``'end'``. ``'end'`` is
        treated as inclusive.
    fdir : str, optional
        Directory holding the PDB files. Defaults to the location that was
        previously hard-coded, so existing callers (e.g. ``df.apply``) are
        unaffected; pass another directory to run on a different machine.

    Returns
    -------
    The result of slicing the value returned by ``pa.read_seq_from_pdb``
    from ``start`` through ``end`` (presumably a string -- confirm against
    ``parseaf``).
    """
    fname = 'AF-' + str(row['uni_id']) + '-F1-model_v1.pdb'
    # os.path.join tolerates a directory given with or without a trailing slash.
    fpath = os.path.join(fdir, fname)
    seq = pa.read_seq_from_pdb(fpath)
    return seq[row['start']:row['end'] + 1]

In [52]:
def append_len_region(row):
    """Return the length of the ``'region_seq'`` entry of ``row``.

    Parameters
    ----------
    row : mapping
        Any object indexable by ``'region_seq'`` (e.g. a DataFrame row).

    Returns
    -------
    int
        ``len`` of the stored region sequence.
    """
    region = row['region_seq']
    return len(region)

## Load PDB data

In [31]:
# Parse ../../data/ss_dis.txt into two dicts of line fragments:
#   seqs[key]   -> list of data lines following a '>...:sequence' header
#   secstr[key] -> list of data lines following a '>...:secstr' header
# Header lines start with '>' and are split on ':'; the first field is used as
# the key and the last field selects the record kind. Records of any other
# kind are skipped.
# NOTE(review): only the first header field is used as the key, so headers that
# share it overwrite each other and the last one wins -- confirm this is intended.
seqs = {}
secstr = {}
record_id = None
record_kind = None
with open("../../data/ss_dis.txt") as handle:
    for line in handle:
        # Strip only the newline. Slicing off the last character would eat a
        # real residue/label when the final line has no trailing newline, and
        # a full strip() would drop the spaces that a later cell converts to 'C'.
        line = line.rstrip("\n")
        if not line:
            continue
        if line.startswith(">"):
            fields = line[1:].split(':')
            # Dedicated names instead of `id` / `type`, which would shadow the
            # builtins for the rest of the kernel session.
            record_id = fields[0]
            record_kind = fields[-1]
            if record_kind == 'sequence':
                seqs[record_id] = []
            elif record_kind == 'secstr':
                secstr[record_id] = []
            continue
        # Data line: attach it to the record opened by the latest header.
        if record_kind == 'sequence':
            seqs[record_id].append(line)
        elif record_kind == 'secstr':
            secstr[record_id].append(line)

In [33]:
# Collapse each entry's list of line fragments into a single sequence string.
for entry_key in seqs:
    seqs[entry_key] = ''.join(seqs[entry_key])

In [34]:
# Collapse each entry's list of line fragments into a single label string and
# turn blank (space) labels into 'C'.
# The joined value is assigned directly rather than through a variable named
# `str`: rebinding that builtin at notebook scope makes later `str(...)` calls
# (e.g. inside `append_sequence`) fail with "'str' object is not callable".
for entry_key in secstr:
    secstr[entry_key] = ''.join(secstr[entry_key]).replace(" ", "C")

## Extracting pure helices & pure disordered regions

In [53]:
fdir = '/mnt/d/research/drummond-lab/data/cerevisiae-alphafold-output/'
# Collect every helix run of at least 25 residues as one record per region,
# then build the frame once. Growing a DataFrame row by row is quadratic, and
# DataFrame.append no longer exists in pandas >= 2.0.
helix_records = []
for pdb_id, seq in seqs.items():
    helix_indices = find_helix_indices(secstr[pdb_id])
    regions = list_to_regions(helix_indices, minlen=25)
    for start, end in regions:
        # `end` is inclusive, hence the +1 when slicing.
        helix_records.append({'id': pdb_id, 'region_seq': seq[start:end + 1], 'label': 'helix'})
# A list (not a set) of column names keeps the column order deterministic.
df = pd.DataFrame(helix_records, columns=['id', 'region_seq', 'label'])

In [54]:
# Vectorized string length; avoids a Python-level call per row and also works
# when the frame has no rows.
df['len_region'] = df['region_seq'].str.len()

In [55]:
# Preview the first ten rows of the helix-region table.
df.head(10)

Unnamed: 0,label,region_seq,id,len_region
0,helix,ADAQGAMNKALELFRKDIAAKYKEL,102M,25
1,helix,ADAQGAMNKALELFRKDIAAKYKEL,104M,25
2,helix,ADAQGAMNKALELFRKDIAAKYKEL,105M,25
3,helix,ADAQGAMNKALELFRKDIAAKYKEL,106M,25
4,helix,ADAQGAMNKALELFRKDIAAKYKEL,107M,25
5,helix,ADAQGAMNKALELFRKDIAAKYKEL,109M,25
6,helix,QQEAALVDMVNDGVEDLRCKYISLIYT,10GS,27
7,helix,ADAQGAMNKALELFRKDIAAKYKEL,110M,25
8,helix,ADAQGAMNKALELFRKDIAAKYKEL,112M,25
9,helix,FSTLKSTVEAIWAGIKATEAAVSEE,11AS,25


In [59]:
fdir = '/mnt/d/research/drummond-lab/data/cerevisiae-alphafold-output/'
# Collect every run of at least 25 'C'-labelled residues as one record per
# region, then build the frame once. Growing a DataFrame row by row is
# quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
disordered_records = []
for pdb_id, seq in seqs.items():
    dis_indices = find_disordered_indices(secstr[pdb_id])
    regions = list_to_regions(dis_indices, minlen=25)
    for start, end in regions:
        # `end` is inclusive, hence the +1 when slicing.
        disordered_records.append({'id': pdb_id, 'region_seq': seq[start:end + 1], 'label': 'disordered'})
# A list (not a set) of column names keeps the column order deterministic.
disdf = pd.DataFrame(disordered_records, columns=['id', 'region_seq', 'label'])

In [60]:
# Vectorized string length; avoids a Python-level call per row and also works
# when the frame has no rows.
disdf['len_region'] = disdf['region_seq'].str.len()

In [64]:
# Stack helix and disordered regions into one table. ignore_index=True gives
# the result a fresh 0..n-1 index; otherwise both halves keep their own labels
# and a lookup such as df_all.loc[0] would match one row from each.
df_all = pd.concat([df, disdf], axis=0, ignore_index=True)
df_all.head(10)

Unnamed: 0,label,region_seq,id,len_region
0,helix,ADAQGAMNKALELFRKDIAAKYKEL,102M,25
1,helix,ADAQGAMNKALELFRKDIAAKYKEL,104M,25
2,helix,ADAQGAMNKALELFRKDIAAKYKEL,105M,25
3,helix,ADAQGAMNKALELFRKDIAAKYKEL,106M,25
4,helix,ADAQGAMNKALELFRKDIAAKYKEL,107M,25
5,helix,ADAQGAMNKALELFRKDIAAKYKEL,109M,25
6,helix,QQEAALVDMVNDGVEDLRCKYISLIYT,10GS,27
7,helix,ADAQGAMNKALELFRKDIAAKYKEL,110M,25
8,helix,ADAQGAMNKALELFRKDIAAKYKEL,112M,25
9,helix,FSTLKSTVEAIWAGIKATEAAVSEE,11AS,25


In [65]:
# Write the combined region table to CSV, leaving out the DataFrame index.
df_all.to_csv('../../data/pdb_regions.csv', index=False)