In [1]:
import sys
sys.path.insert(1, '../scripts')

In [2]:
import os
import random
import pandas as pd
from itertools import groupby
from operator import itemgetter
import parseaf as pa
import mdtraj as md

In [3]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation/FutureWarnings raised
# by libraries such as pandas are hidden too — consider narrowing it.
warnings.filterwarnings('ignore')

## Functions

In [5]:
def find_helix_indices(ss, bfactor, min_bfactor=70):
    """Return the positions that are helical and meet the B-factor cutoff.

    Parameters
    ----------
    ss : sequence of str
        Per-residue secondary-structure codes; ``'H'`` marks a helix.
    bfactor : sequence of numbers
        Per-residue values, indexed like ``ss``.
        # presumably AlphaFold pLDDT stored in the PDB B-factor column — confirm
    min_bfactor : number, optional
        Inclusive lower bound a residue's ``bfactor`` must reach (default 70,
        the value previously hard-coded).

    Returns
    -------
    list of int
        Indices ``i`` with ``ss[i] == 'H'`` and ``bfactor[i] >= min_bfactor``,
        in increasing order.
    """
    return [
        i
        for i, label in enumerate(ss)
        if label == 'H' and bfactor[i] >= min_bfactor
    ]

In [6]:
def find_disordered_indices(ss, bfactor, confident_cutoff=70, low_cutoff=50):
    """Return the positions treated as disordered.

    A position ``i`` is selected when either

    * ``ss[i] == 'C'`` and ``bfactor[i] >= confident_cutoff``, or
    * ``bfactor[i] < low_cutoff`` (regardless of the secondary-structure code).

    Parameters
    ----------
    ss : sequence of str
        Per-residue secondary-structure codes; ``'C'`` marks coil.
    bfactor : sequence of numbers
        Per-residue values, indexed like ``ss``.
        # presumably AlphaFold pLDDT stored in the PDB B-factor column — confirm
    confident_cutoff : number, optional
        Inclusive lower bound for the coil rule (default 70, as before).
    low_cutoff : number, optional
        Exclusive upper bound for the low-value rule (default 50, as before).

    Returns
    -------
    list of int
        Selected indices in increasing order.
    """
    return [
        i
        for i, label in enumerate(ss)
        if (label == 'C' and bfactor[i] >= confident_cutoff)
        or bfactor[i] < low_cutoff
    ]

In [7]:
def list_to_regions(indices, minlen=25):
    """Collapse runs of consecutive integers into ``(first, last)`` tuples.

    A run continues while each value is exactly one greater than the value
    before it.  Only runs spanning at least ``minlen`` positions
    (``last - first + 1 >= minlen``) are reported, in the order they occur.
    """
    regions = []
    run_first = run_last = None
    for value in indices:
        if run_first is not None and value - run_last == 1:
            # Still inside the current consecutive run.
            run_last = value
            continue
        # A new run starts here; flush the previous one if it is long enough.
        if run_first is not None and run_last - run_first + 1 >= minlen:
            regions.append((run_first, run_last))
        run_first = run_last = value
    # Flush the final run.
    if run_first is not None and run_last - run_first + 1 >= minlen:
        regions.append((run_first, run_last))
    return regions

In [9]:
def append_len_region(row):
    """Return the length of the value stored under ``row['region_seq']``."""
    region_seq = row['region_seq']
    return len(region_seq)

## Extracting pure helices & pure disordered regions based on AlphaFold output

In [9]:
# Read the headerless ORF list into a one-column frame whose column is named 'orf'.
df_all_orfs = pd.read_table('../../data/sc_orfs/yeast-all-orfs.txt', header=None, names=['orf'])

In [10]:
# Plain Python list of the values in the 'orf' column.
all_orfs = df_all_orfs['orf'].tolist()

In [11]:
# Build the ORF -> UniProt lookup from fixed-width columns of the mapping file:
# characters 75-94 of each line are the key, characters 95-105 the value
# (trailing whitespace stripped).  A later line with the same key wins.
with open('../../data/misc/uniprot-to-sgdid.txt') as mapping_file:
    mappings = {
        line[75:95].rstrip(): line[95:106].rstrip()
        for line in mapping_file
    }

In [13]:
# Keep the UniProt id of every ORF whose AlphaFold model file exists on disk.
fdir = '/Users/rosalindpan/drummondlab/hcrpaper_data/scerevisiae_alphafold/'
uni_id_with_pdb = [
    mapped_id
    for mapped_id in (mappings[orf] for orf in all_orfs)
    if os.path.exists(fdir + 'AF-' + str(mapped_id) + '-F1-model_v2.pdb')
]

In [14]:
# Collect helical regions (as selected by find_helix_indices/list_to_regions)
# for every yeast protein that has an AlphaFold model file.
fdir = '/Users/rosalindpan/drummondlab/hcrpaper_data/scerevisiae_alphafold/'
helix_records = []
for uni_id in uni_id_with_pdb:
    fpath = fdir + 'AF-' + str(uni_id) + '-F1-model_v2.pdb'
    pdb = pa.read_af_output(fdir, uni_id)
    # compute_dssp returns one row per frame; take the first frame.
    ss = md.compute_dssp(pdb, simplified=True)[0]
    bfactor = pa.read_bfactor_from_pdb(fpath)
    seq = pa.read_seq_from_pdb(fpath)
    for start, end in list_to_regions(find_helix_indices(ss, bfactor)):
        # `end` is inclusive, hence the +1 in the slice.
        helix_records.append({'uni_id': uni_id,
                              'start': start,
                              'end': end,
                              'region_seq': seq[start:end + 1]})
# Build the frame once instead of appending row by row: DataFrame.append is
# quadratic and was removed in pandas 2.0.  An explicit column list (a set is
# unordered and rejected by pandas 2.0) fixes the column order.
df = pd.DataFrame(helix_records, columns=['start', 'end', 'uni_id', 'region_seq'])

In [15]:
# Preview the first rows of the helix-region table.
df.head()

Unnamed: 0,start,end,uni_id,region_seq
0,1143,1185,P39702,VQDLKQLLLNVFNTYKLERSLSELIQKIIEDSSQDLVQQYRKF
1,508,551,P10591,KEDIEKMVAEAEKFKEEDEKESQRIASKNQLESIAYSLKNTISE
2,138,208,P39704,DIIANNAVEEIDRNLNKITKTLNYLRAREWRNMSTVNSTESRLTWL...
3,103,142,P18411,KQMFLGSLFGVVLGVTVAKISILFMYVGITSMLLCEWLRY
4,45,93,P18410,PASMIFRNLLILEDDLRRQAHEQKILKWQFTLFLASMAGVGAFTFYELY


In [16]:
# Length of each region sequence, computed column-wise; unlike a row-wise
# apply this also works when the frame has no rows.
df['len_region'] = df['region_seq'].map(len)
# Every row in this frame came from the helix selection.
df['label'] = 'helix'

In [17]:
# Collect disordered regions (as selected by find_disordered_indices/
# list_to_regions) for every yeast protein that has an AlphaFold model file.
fdir = '/Users/rosalindpan/drummondlab/hcrpaper_data/scerevisiae_alphafold/'
disorder_records = []
for uni_id in uni_id_with_pdb:
    fpath = fdir + 'AF-' + str(uni_id) + '-F1-model_v2.pdb'
    pdb = pa.read_af_output(fdir, uni_id)
    # compute_dssp returns one row per frame; take the first frame.
    ss = md.compute_dssp(pdb, simplified=True)[0]
    bfactor = pa.read_bfactor_from_pdb(fpath)
    seq = pa.read_seq_from_pdb(fpath)
    for start, end in list_to_regions(find_disordered_indices(ss, bfactor)):
        # `end` is inclusive, hence the +1 in the slice.
        disorder_records.append({'uni_id': uni_id,
                                 'start': start,
                                 'end': end,
                                 'region_seq': seq[start:end + 1]})
# Build the frame once instead of appending row by row: DataFrame.append is
# quadratic and was removed in pandas 2.0.  An explicit column list (a set is
# unordered and rejected by pandas 2.0) fixes the column order.
disdf = pd.DataFrame(disorder_records, columns=['start', 'end', 'uni_id', 'region_seq'])

In [18]:
# Length of each region sequence, computed column-wise; unlike a row-wise
# apply this also works when the frame has no rows.
disdf['len_region'] = disdf['region_seq'].map(len)
# Every row in this frame came from the disordered selection.
disdf['label'] = 'disordered'

In [19]:
# Stack the helix rows on top of the disordered rows (row-wise concatenation
# is the default) and preview the first ten rows.
df_all = pd.concat([df, disdf])
df_all.head(10)

Unnamed: 0,start,end,uni_id,region_seq,len_region,label
0,1143,1185,P39702,VQDLKQLLLNVFNTYKLERSLSELIQKIIEDSSQDLVQQYRKF,43,helix
1,508,551,P10591,KEDIEKMVAEAEKFKEEDEKESQRIASKNQLESIAYSLKNTISE,44,helix
2,138,208,P39704,DIIANNAVEEIDRNLNKITKTLNYLRAREWRNMSTVNSTESRLTWL...,71,helix
3,103,142,P18411,KQMFLGSLFGVVLGVTVAKISILFMYVGITSMLLCEWLRY,40,helix
4,45,93,P18410,PASMIFRNLLILEDDLRRQAHEQKILKWQFTLFLASMAGVGAFTFYELY,49,helix
5,159,184,P18410,WDEKYTDSVRFVSRTIAYCNIYCLKK,26,helix
6,225,252,P18410,AEIREGWEIYRDEFWAREGARRRKQAHE,28,helix
7,372,407,P18409,VHKQKLLNDLTYAFSSSLRKIDEERSTIEKFDNKIN,36,helix
8,176,249,P31376,DKIEQKWQDEQELKKKEKELKRKNDAEAKRLRMEERKRQQMQKKIA...,74,helix
9,171,216,P31385,PIEMEEQRMTALKEITDIEYKFAQLRQKLYDNQLVRLQTELQMCLE,46,helix


In [20]:
# Persist the combined yeast regions; the row index is not written.
df_all.to_csv('../../data/af_regions/sc_af_regions_all.csv', index=False)

## Doing it again for fission yeast

In [11]:
# Collect helical regions from every .pdb file in the fission-yeast
# AlphaFold directory.
fdir = '/Users/rosalindpan/drummondlab/hcrpaper_data/pombe_alphafold/'
helix_records = []
# Sorted so the row order does not depend on the OS directory listing order.
for filename in sorted(os.listdir(fdir)):
    if not filename.endswith(".pdb"):
        continue
    fpath = fdir + filename
    # The second dash-separated token of the file name is used as the id.
    uni_id = filename.split('-')[1]
    # NOTE(review): the structure is loaded via (fdir, uni_id) while B-factors
    # and sequence are read from fpath — confirm both resolve to the same file.
    pdb = pa.read_af_output(fdir, uni_id)
    # compute_dssp returns one row per frame; take the first frame.
    ss = md.compute_dssp(pdb, simplified=True)[0]
    bfactor = pa.read_bfactor_from_pdb(fpath)
    seq = pa.read_seq_from_pdb(fpath)
    for start, end in list_to_regions(find_helix_indices(ss, bfactor)):
        # `end` is inclusive, hence the +1 in the slice.
        helix_records.append({'uni_id': uni_id,
                              'start': start,
                              'end': end,
                              'region_seq': seq[start:end + 1]})
# Build the frame once instead of appending row by row: DataFrame.append is
# quadratic and was removed in pandas 2.0.  An explicit column list (a set is
# unordered and rejected by pandas 2.0) fixes the column order.
df = pd.DataFrame(helix_records, columns=['start', 'end', 'uni_id', 'region_seq'])

In [12]:
# Length of each region sequence, computed column-wise; unlike a row-wise
# apply this also works when the frame has no rows.
df['len_region'] = df['region_seq'].map(len)
# Every row in this frame came from the helix selection.
df['label'] = 'helix'

In [13]:
# Collect disordered regions from every .pdb file in the fission-yeast
# AlphaFold directory.
fdir = '/Users/rosalindpan/drummondlab/hcrpaper_data/pombe_alphafold/'
disorder_records = []
# Sorted so the row order does not depend on the OS directory listing order.
for filename in sorted(os.listdir(fdir)):
    if not filename.endswith(".pdb"):
        continue
    fpath = fdir + filename
    # The second dash-separated token of the file name is used as the id.
    uni_id = filename.split('-')[1]
    # NOTE(review): the structure is loaded via (fdir, uni_id) while B-factors
    # and sequence are read from fpath — confirm both resolve to the same file.
    pdb = pa.read_af_output(fdir, uni_id)
    # compute_dssp returns one row per frame; take the first frame.
    ss = md.compute_dssp(pdb, simplified=True)[0]
    bfactor = pa.read_bfactor_from_pdb(fpath)
    seq = pa.read_seq_from_pdb(fpath)
    for start, end in list_to_regions(find_disordered_indices(ss, bfactor)):
        # `end` is inclusive, hence the +1 in the slice.
        disorder_records.append({'uni_id': uni_id,
                                 'start': start,
                                 'end': end,
                                 'region_seq': seq[start:end + 1]})
# Build the frame once instead of appending row by row: DataFrame.append is
# quadratic and was removed in pandas 2.0.  An explicit column list (a set is
# unordered and rejected by pandas 2.0) fixes the column order.
disdf = pd.DataFrame(disorder_records, columns=['start', 'end', 'uni_id', 'region_seq'])

In [14]:
# Length of each region sequence, computed column-wise; unlike a row-wise
# apply this also works when the frame has no rows.
disdf['len_region'] = disdf['region_seq'].map(len)
# Every row in this frame came from the disordered selection.
disdf['label'] = 'disordered'

In [15]:
# Stack helix rows on top of disordered rows (row-wise concatenation is the default).
df_pombe = pd.concat([df, disdf])

In [16]:
# Persist the combined fission-yeast regions; the row index is not written.
df_pombe.to_csv('../../data/af_regions/pombe_af_regions.csv', index=False)

## Doing it one more time for *Homo sapiens*

In [18]:
# Collect helical regions from every first-fragment (F1) v3 model file in the
# human AlphaFold directory.
fdir = '/Users/rosalindpan/drummondlab/hcrpaper_data/hsapiens_alphafold/'
helix_records = []
# Sorted so the row order does not depend on the OS directory listing order.
for filename in sorted(os.listdir(fdir)):
    if not filename.endswith("F1-model_v3.pdb"):
        continue
    fpath = fdir + filename
    # The second dash-separated token of the file name is used as the id.
    uni_id = filename.split('-')[1]
    # NOTE(review): the structure is loaded via (fdir, uni_id) while B-factors
    # and sequence are read from fpath — confirm both resolve to the same
    # (v3) file.
    pdb = pa.read_af_output(fdir, uni_id)
    # compute_dssp returns one row per frame; take the first frame.
    ss = md.compute_dssp(pdb, simplified=True)[0]
    bfactor = pa.read_bfactor_from_pdb(fpath)
    seq = pa.read_seq_from_pdb(fpath)
    for start, end in list_to_regions(find_helix_indices(ss, bfactor)):
        # `end` is inclusive, hence the +1 in the slice.
        helix_records.append({'uni_id': uni_id,
                              'start': start,
                              'end': end,
                              'region_seq': seq[start:end + 1]})
# Build the frame once instead of appending row by row: DataFrame.append is
# quadratic and was removed in pandas 2.0.  An explicit column list (a set is
# unordered and rejected by pandas 2.0) fixes the column order.
df = pd.DataFrame(helix_records, columns=['start', 'end', 'uni_id', 'region_seq'])

In [19]:
# Length of each region sequence, computed column-wise; unlike a row-wise
# apply this also works when the frame has no rows.
df['len_region'] = df['region_seq'].map(len)
# Every row in this frame came from the helix selection.
df['label'] = 'helix'

In [20]:
# Collect disordered regions from every first-fragment (F1) v3 model file in
# the human AlphaFold directory.
fdir = '/Users/rosalindpan/drummondlab/hcrpaper_data/hsapiens_alphafold/'
disorder_records = []
# Sorted so the row order does not depend on the OS directory listing order.
for filename in sorted(os.listdir(fdir)):
    if not filename.endswith("F1-model_v3.pdb"):
        continue
    fpath = fdir + filename
    # The second dash-separated token of the file name is used as the id.
    uni_id = filename.split('-')[1]
    # NOTE(review): the structure is loaded via (fdir, uni_id) while B-factors
    # and sequence are read from fpath — confirm both resolve to the same
    # (v3) file.
    pdb = pa.read_af_output(fdir, uni_id)
    # compute_dssp returns one row per frame; take the first frame.
    ss = md.compute_dssp(pdb, simplified=True)[0]
    bfactor = pa.read_bfactor_from_pdb(fpath)
    seq = pa.read_seq_from_pdb(fpath)
    for start, end in list_to_regions(find_disordered_indices(ss, bfactor)):
        # `end` is inclusive, hence the +1 in the slice.
        disorder_records.append({'uni_id': uni_id,
                                 'start': start,
                                 'end': end,
                                 'region_seq': seq[start:end + 1]})
# Build the frame once instead of appending row by row: DataFrame.append is
# quadratic and was removed in pandas 2.0.  An explicit column list (a set is
# unordered and rejected by pandas 2.0) fixes the column order.
disdf = pd.DataFrame(disorder_records, columns=['start', 'end', 'uni_id', 'region_seq'])

In [21]:
# Length of each region sequence, computed column-wise; unlike a row-wise
# apply this also works when the frame has no rows.
disdf['len_region'] = disdf['region_seq'].map(len)
# Every row in this frame came from the disordered selection.
disdf['label'] = 'disordered'

In [22]:
# Stack helix rows on top of disordered rows (row-wise concatenation is the default).
df_human = pd.concat([df, disdf])

In [23]:
# Persist the combined human regions; the row index is not written.
df_human.to_csv('../../data/af_regions/hsapiens_af_regions.csv', index=False)

## Doing it one more time for *C. elegans*

In [25]:
# Collect helical regions from every first-fragment (F1) v3 model file in the
# C. elegans AlphaFold directory.
fdir = '/Users/rosalindpan/drummondlab/hcrpaper_data/celegans_alphafold/'
helix_records = []
# Sorted so the row order does not depend on the OS directory listing order.
for filename in sorted(os.listdir(fdir)):
    if not filename.endswith("F1-model_v3.pdb"):
        continue
    fpath = fdir + filename
    # The second dash-separated token of the file name is used as the id.
    uni_id = filename.split('-')[1]
    # Structure, B-factors and sequence are all read from the same file here.
    pdb = md.load(fpath)
    # compute_dssp returns one row per frame; take the first frame.
    ss = md.compute_dssp(pdb, simplified=True)[0]
    bfactor = pa.read_bfactor_from_pdb(fpath)
    seq = pa.read_seq_from_pdb(fpath)
    for start, end in list_to_regions(find_helix_indices(ss, bfactor)):
        # `end` is inclusive, hence the +1 in the slice.
        helix_records.append({'uni_id': uni_id,
                              'start': start,
                              'end': end,
                              'region_seq': seq[start:end + 1]})
# Build the frame once instead of appending row by row: DataFrame.append is
# quadratic and was removed in pandas 2.0.  An explicit column list (a set is
# unordered and rejected by pandas 2.0) fixes the column order.
df = pd.DataFrame(helix_records, columns=['start', 'end', 'uni_id', 'region_seq'])

In [26]:
# Length of each region sequence, computed column-wise; unlike a row-wise
# apply this also works when the frame has no rows.
df['len_region'] = df['region_seq'].map(len)
# Every row in this frame came from the helix selection.
df['label'] = 'helix'

In [27]:
# Collect disordered regions from every first-fragment (F1) v3 model file in
# the C. elegans AlphaFold directory.
fdir = '/Users/rosalindpan/drummondlab/hcrpaper_data/celegans_alphafold/'
disorder_records = []
# Sorted so the row order does not depend on the OS directory listing order.
for filename in sorted(os.listdir(fdir)):
    if not filename.endswith("F1-model_v3.pdb"):
        continue
    fpath = fdir + filename
    # The second dash-separated token of the file name is used as the id.
    uni_id = filename.split('-')[1]
    # Structure, B-factors and sequence are all read from the same file here.
    pdb = md.load(fpath)
    # compute_dssp returns one row per frame; take the first frame.
    ss = md.compute_dssp(pdb, simplified=True)[0]
    bfactor = pa.read_bfactor_from_pdb(fpath)
    seq = pa.read_seq_from_pdb(fpath)
    for start, end in list_to_regions(find_disordered_indices(ss, bfactor)):
        # `end` is inclusive, hence the +1 in the slice.
        disorder_records.append({'uni_id': uni_id,
                                 'start': start,
                                 'end': end,
                                 'region_seq': seq[start:end + 1]})
# Build the frame once instead of appending row by row: DataFrame.append is
# quadratic and was removed in pandas 2.0.  An explicit column list (a set is
# unordered and rejected by pandas 2.0) fixes the column order.
disdf = pd.DataFrame(disorder_records, columns=['start', 'end', 'uni_id', 'region_seq'])

In [28]:
# Length of each region sequence, computed column-wise; unlike a row-wise
# apply this also works when the frame has no rows.
disdf['len_region'] = disdf['region_seq'].map(len)
# Every row in this frame came from the disordered selection.
disdf['label'] = 'disordered'

In [29]:
# Stack helix rows on top of disordered rows (row-wise concatenation is the default).
df_celegans = pd.concat([df, disdf])

In [30]:
# Persist the combined C. elegans regions; the row index is not written.
df_celegans.to_csv('../../data/af_regions/celegans_af_regions.csv', index=False)

## Extracting random non-highly-charged regions from yeast AlphaFold output

In [4]:
def append_structure_label(row):
    """Return the structure label for the region described by ``row``.

    Reads ``row['uni_id']``, ``row['left.bound']`` and ``row['right.bound']``
    and forwards them, together with the yeast AlphaFold directory, to
    ``pa.get_structure_label``; its result is returned unchanged.
    """
    return pa.get_structure_label(
        '/Users/rosalindpan/drummondlab/hcrpaper_data/scerevisiae_alphafold/',
        row['uni_id'],
        row['left.bound'],
        row['right.bound'],
    )

In [5]:
# Read the headerless ORF list into a one-column frame whose column is named 'orf'.
# (Same load as in the first section of this notebook.)
df_all_orfs = pd.read_table('../../data/sc_orfs/yeast-all-orfs.txt', header=None, names=['orf'])

In [6]:
# Plain Python list of the values in the 'orf' column.
all_orfs = df_all_orfs['orf'].tolist()

In [7]:
# Build the ORF -> UniProt lookup from fixed-width columns of the mapping file:
# characters 75-94 of each line are the key, characters 95-105 the value
# (trailing whitespace stripped).  A later line with the same key wins.
with open('../../data/misc/uniprot-to-sgdid.txt') as mapping_file:
    mappings = {
        line[75:95].rstrip(): line[95:106].rstrip()
        for line in mapping_file
    }

In [8]:
# Keep the UniProt id of every ORF whose AlphaFold model file exists on disk.
fdir = '/Users/rosalindpan/drummondlab/hcrpaper_data/scerevisiae_alphafold/'
uni_id_with_pdb = [
    mapped_id
    for mapped_id in (mappings[orf] for orf in all_orfs)
    if os.path.exists(fdir + 'AF-' + str(mapped_id) + '-F1-model_v2.pdb')
]

In [9]:
# Load the table of verified ORFs; its 'systematic_name' column is used next.
df_valid_orfs = pd.read_csv('../../data/sc_orfs/verified_orfs_with_msa.csv')

In [10]:
# Plain Python list of the values in the 'systematic_name' column.
valid_orfs = df_valid_orfs['systematic_name'].to_list()

In [11]:
# Rebuild the ORF -> UniProt lookup from fixed-width columns of the mapping
# file: characters 75-94 of each line are the key, characters 95-105 the value
# (trailing whitespace stripped).  A later line with the same key wins.
with open('../../data/misc/uniprot-to-sgdid.txt') as mapping_file:
    mappings = {
        line[75:95].rstrip(): line[95:106].rstrip()
        for line in mapping_file
    }

In [12]:
# Translate each verified ORF name to its UniProt id (KeyError if unmapped).
valid_orfs_uniid = [mappings[orf_name] for orf_name in valid_orfs]

In [13]:
# Ids that both have a model file on disk and belong to the verified ORF list.
# Sorted because the iteration order of a set of strings is not stable between
# interpreter runs, which would make index-based random sampling from this
# list irreproducible even with a fixed random seed.
valid_uniid = sorted(set(uni_id_with_pdb) & set(valid_orfs_uniid))

In [14]:
# Number of proteins available for random sampling.
len(valid_uniid)

4660

In [15]:
def extract_random_region_from_proteome(uni_id_with_pdb, regionlen, label):
    """Draw a random protein segment whose structure label equals ``label``.

    Repeatedly picks a random protein and a random start position until
    ``pa.get_structure_label`` returns ``label`` for the chosen bounds.

    Parameters
    ----------
    uni_id_with_pdb : list
        Not used by the body; kept so existing callers keep working.
        NOTE(review): proteins are drawn from the module-level ``valid_uniid``
        list instead — confirm that this is intended.
    regionlen : int
        Offset between the left and right bound.  The returned sequence covers
        both bounds, i.e. it is ``regionlen + 1`` characters long.
        NOTE(review): confirm the extra residue is intended.
    label : object
        Value the region's structure label must equal.

    Returns
    -------
    tuple
        ``(uni_id, region_seq, left_bound, right_bound)`` with
        ``right_bound == left_bound + regionlen``.

    Notes
    -----
    The loop has no upper bound on attempts.  ``regionlen`` is printed every
    500 unsuccessful attempts (previously on every attempt from the 500th on,
    which flooded the output) so a stuck search stays visible.
    """
    fdir = '/Users/rosalindpan/drummondlab/hcrpaper_data/scerevisiae_alphafold/'
    attempts = 0
    while True:
        uni_id = valid_uniid[random.randrange(len(valid_uniid))]
        fpath = fdir + 'AF-' + str(uni_id) + '-F1-model_v2.pdb'
        seq = pa.read_seq_from_pdb(fpath)
        # Only proteins strictly longer than the requested offset can host it.
        if len(seq) > regionlen:
            starti = random.randrange(len(seq) - regionlen)
            endi = starti + regionlen
            if pa.get_structure_label(fdir, uni_id, starti, endi) == label:
                return uni_id, seq[starti:(endi + 1)], starti, endi
        attempts += 1
        if attempts % 500 == 0:
            print(regionlen)

In [16]:
# Load the charged-region table; the cells below use its 'label' and
# 'region.len' columns.
df_hc = pd.read_csv('../../data/charged_regions/cr_trimmed_filtered_aflabel.csv')

In [17]:
# Drop rows whose label is 'unclassified'.
df_hc = df_hc.loc[df_hc['label'] != 'unclassified']

In [18]:
# Drop every row that has a missing value in any column (the default mode).
df_hc = df_hc.dropna()

In [21]:
# For every charged region, draw five random regions with the same
# 'region.len' and structure label.
# NOTE(review): no random seed is set in this notebook, so the draws differ
# between runs — consider seeding `random` before this cell.
relaxed_records = []
for _ in range(5):
    for _, row in df_hc.iterrows():
        label = row['label']
        picked_id, picked_seq, left_bound, right_bound = (
            extract_random_region_from_proteome(uni_id_with_pdb, row['region.len'], label)
        )
        relaxed_records.append({'uni_id': picked_id,
                                'seq': picked_seq,
                                'left_bound': left_bound,
                                'right_bound': right_bound,
                                'label': label})
# Build the frame once instead of appending row by row: DataFrame.append is
# quadratic and was removed in pandas 2.0.
df_relaxed = pd.DataFrame(
    relaxed_records,
    columns=['uni_id', 'seq', 'left_bound', 'right_bound', 'label'],
)

In [20]:
# Number of random regions drawn.
# NOTE(review): the recorded output (1826) is not a multiple of 5 although the
# sampling loop runs 5 passes, and this cell's execution count precedes that
# loop's — the saved output looks stale; re-run to confirm.
len(df_relaxed)

1826

In [22]:
# Persist the random regions; the row index is not written.
df_relaxed.to_csv('../../data/af_regions/random_af_regions_low_thresh.csv', index=False)