In [1]:
import sys
sys.path.insert(1, '../scripts')

In [23]:
from os import path
import random
import numpy as np
import pandas as pd
import collections
import mdtraj as md
import parseaf as pa

## Functions

In [3]:
def read_seq_from_pdb(filepath):
    """Read the one-letter sequence and the mean B-factor value from a PDB file.

    Only ``ATOM`` records of backbone nitrogen atoms (atom name ``N``) are
    used, so each residue contributes exactly one entry. The value taken from
    the B-factor column is collected in ``pLDDTs``; the notebook uses this on
    AlphaFold models, which presumably store per-residue pLDDT there.

    Parameters
    ----------
    filepath : str
        Path to a PDB-format text file.

    Returns
    -------
    tuple
        ``(sequence, mean_value)`` where ``sequence`` is the one-letter amino
        acid string and ``mean_value`` is ``np.mean`` of the B-factor column
        over the selected atoms (``nan`` if no atom was selected).

    Raises
    ------
    KeyError
        If a residue name is not one of the 20 standard three-letter codes.
    ValueError
        If the B-factor field of a selected record is not a number.
    """
    three_to_one_map =  {'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E', 'PHE':'F', \
                         'GLY':'G', 'HIS':'H', 'ILE':'I', 'LYS':'K', 'LEU':'L', \
                         'MET':'M', 'ASN':'N', 'PRO':'P', 'GLN':'Q', 'ARG':'R', \
                         'SER':'S', 'THR':'T', 'VAL':'V', 'TRP':'W', 'TYR':'Y', '-':'-'}
    resids = []
    pLDDTs = []
    with open(filepath) as file:
        for line in file:
            # One backbone N per residue: atom name sits in columns 13-16,
            # so characters 14-15 are 'N ' only for the plain nitrogen atom.
            if line[0:4] == "ATOM" and line[13:15] == 'N ':
                resids.append(line[17:20])
                # The B-factor field is PDB columns 61-66, i.e. line[60:66].
                # The former line[61:67] dropped the first digit, turning a
                # six-character value such as '100.00' into 0.0.
                pLDDTs.append(float(line[60:66]))
    return ''.join([three_to_one_map[aa] for aa in resids]), np.mean(pLDDTs)

In [4]:
def get_percent_helix(af_pdb):
    """Return the fraction of residues that DSSP assigns to helix.

    Despite the name, the result is a fraction between 0 and 1, not a
    percentage.

    Parameters
    ----------
    af_pdb : object or None
        Structure accepted by ``md.compute_dssp`` (the notebook passes the
        result of ``md.load``), or ``None``.

    Returns
    -------
    float or None
        Number of residues labelled ``'H'`` in the first DSSP assignment
        divided by the total number of assigned residues; ``None`` when
        ``af_pdb`` is ``None``.
    """
    if af_pdb is None:
        return None
    # Only the first row of the assignment is used (presumably frame 0).
    ss = md.compute_dssp(af_pdb, simplified=True)[0]
    n_res = len(ss)
    if n_res == 0:
        # Nothing to count; the old fixed divisor also yielded 0.0 here.
        return 0.0
    helix_cnt = collections.Counter(ss)['H']
    # Normalise by the actual residue count. The previous hard-coded 50 was
    # only correct for 50-residue structures.
    return helix_cnt / n_res

## Polyampholytes

In [6]:
# Labels 'seq1' ... 'seq30', one per polyampholyte sequence.
sv_keys = [f'seq{n}' for n in range(1, 31)]

In [7]:
# Thirty polyampholyte sequences written with E and K only, listed in the
# same order as the 'seq1' ... 'seq30' keys that later cells pair them with.
# NOTE(review): each string is presumably 50 residues long and these look like
# the sv1-sv30 charge-patterning variants (the result files use an `sv`
# prefix) - confirm length and provenance.
sv_seqs = ["EKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEK",
"EEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEK",
"KEKKKEKKEEKKEEKEKEKEKEEKKKEEKEKEKEKKKEEKEKEEKKEEEE",
"KEKEKKEEKEKKEEEKKEKEKEKKKEEKKKEEKEEKKEEKKKEEKEEEKE",
"KEKEEKEKKKEEEEKEKKKKEEKEKEKEKEEKKEEKKKKEEKEEKEKEKE",
"EEEKKEKKEEKEEKKEKKEKEEEKKKEKEEKKEEEKKKEKEEEEKKKKEK",
"EEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEK",
"KKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKE",
"EEKKEEEKEKEKEEEEEKKEKKEKKEKKKEEKEKEKKKEKKKKEKEEEKE",
"EKKKKKKEEKKKEEEEEKKKEEEKKKEKKEEKEKEEKEKKEKKEEKEEEE",
"EKEKKKKKEEEKKEKEEEEKEEEEKKKKKEKEEEKEEKKEEKEKKKEEKK",
"EKKEEEEEEKEKKEEEEKEKEKKEKEEKEKKEKKKEKKEEEKEKKKKEKK",
"KEKKKEKEKKEKKKEEEKKKEEEKEKKKEEKKEKKEKKEEEEEEEKEEKE",
"EKKEKEEKEEEEKKKKKEEKEKKEKKKKEKKKKKEEEEEEKEEKEKEKEE",
"KKEKKEKKKEKKEKKEEEKEKEKKEKKKKEKEKKEEEEEEEEKEEKKEEE",
"EKEKEEKKKEEKKKKEKKEKEEKKEKEKEKKEEEEEEEEEKEKKEKKKKE",
"EKEKKKKKKEKEKKKKEKEKKEKKEKEEEKEEKEKEKKEEKKEEEEEEEE",
"KEEKKEEEEEEEKEEKKKKKEKKKEKKEEEKKKEEKKKEEEEEEKKKKEK",
"EEEEEKKKKKEEEEEKKKKKEEEEEKKKKKEEEEEKKKKKEEEEEKKKKK",
"EEKEEEEEEKEEEKEEKKEEEKEKKEKKEKEEKKEKKKKKKKKKKKKEEE",
"EEEEEEEEEKEKKKKKEKEEKKKKKKEKKEKKKKEKKEEEEEEKEEEKKK",
"KEEEEKEEKEEKKKKEKEEKEKKKKKKKKKKKKEKKEEEEEEEEKEKEEE",
"EEEEEKEEEEEEEEEEEKEEKEKKKKKKEKKKKKKKEKEKKKKEKKEEKK",
"EEEEKEEEEEKEEEEEEEEEEEEKKKEEKKKKKEKKKKKKKEKKKKKKKK",
"EEEEEEEEEEEKEEEEKEEKEEKEKKKKKKKKKKKKKKKKKKEEKKEEKE",
"KEEEEEEEKEEKEEEEEEEEEKEEEEKEEKKKKKKKKKKKKKKKKKKKKE",
"KKEKKKEKKEEEEEEEEEEEEEEEEEEEEKEEKKKKKKKKKKKKKKKEKK",
"EKKKKKKKKKKKKKKKKKKKKKEEEEEEEEEEEEEEEEEEKKEEEEEKEK",
"KEEEEKEEEEEEEEEEEEEEEEEEEEEKKKKKKKKKKKKKKKKKKKKKKK",
"EEEEEEEEEEEEEEEEEEEEEEEEEKKKKKKKKKKKKKKKKKKKKKKKKK"]

In [8]:
# Collect the helix fraction of every rank-1 model found for sv1 ... sv30.
fdir = '/mnt/d/research/drummond-lab/hcrpaper/sv_results/rank1/'
phelix = []
for i in range(1, 31):
    # The index of the rank-1 model is part of the file name, so probe
    # model indices 1-5 and process whichever files exist.
    for j in range(1, 6):
        fpath = f'{fdir}sv{i}_unrelaxed_rank_1_model_{int(j)}.pdb'
        if not path.exists(fpath):
            continue
        pdb = md.load(fpath)
        p = get_percent_helix(pdb)
        print(read_seq_from_pdb(fpath), p)
        phelix.append(p)

('EKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEKEK', 97.7094) 0.96
('EEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEEEKKKEK', 97.73) 0.96
('KEKKKEKKEEKKEEKEKEKEKEEKKKEEKEKEKEKKKEEKEKEEKKEEEE', 97.68580000000001) 0.96
('KEKEKKEEKEKKEEEKKEKEKEKKKEEKKKEEKEEKKEEKKKEEKEEEKE', 97.77400000000002) 0.96
('KEKEEKEKKKEEEEKEKKKKEEKEKEKEKEEKKEEKKKKEEKEEKEKEKE', 97.7528) 0.96
('EEEKKEKKEEKEEKKEKKEKEEEKKKEKEEKKEEEKKKEKEEEEKKKKEK', 97.8582) 0.96
('EEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEK', 97.86640000000001) 0.96
('KKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKKKKEEEEKE', 97.60840000000002) 0.96
('EEKKEEEKEKEKEEEEEKKEKKEKKEKKKEEKEKEKKKEKKKKEKEEEKE', 97.8268) 0.96
('EKKKKKKEEKKKEEEEEKKKEEEKKKEKKEEKEKEEKEKKEKKEEKEEEE', 97.75280000000001) 0.96
('EKEKKKKKEEEKKEKEEEEKEEEEKKKKKEKEEEKEEKKEEKEKKKEEKK', 97.88799999999999) 0.96
('EKKEEEEEEKEKKEEEEKEKEKKEKEEKEKKEKKKEKKEEEKEKKKKEKK', 97.65299999999999) 0.96
('KEKKKEKEKKEKKKEEEKKKEEEKEKKKEEKKEKKEKKEEEEEEEKEEKE', 97.84240000000001) 0.96
('EKKEKEEKEEEEKKKKKEEKEKK

## Polyampholytes - E to D

In [9]:
# Derive the D/K variants: every E becomes D, every K is kept, and any other
# character is dropped.
sv_seqs2 = [
    ''.join('D' if aa == 'E' else 'K' for aa in seq if aa in ('E', 'K'))
    for seq in sv_seqs
]

In [10]:
# Two-column table pairing each key with its D/K sequence (one row per entry
# of sv_seqs2, in order).
# Built with a single constructor call: DataFrame.append was removed in
# pandas 2.0, and while it existed it copied the whole frame on every row.
df = pd.DataFrame(
    [{'key': sv_keys[i], 'seq': seq} for i, seq in enumerate(sv_seqs2)],
    columns=['key', 'seq'],
)

In [11]:
# Write one single-record FASTA file per row into ./sv_seqs_KD/
# (the directory must already exist).
for key, seq in zip(df['key'], df['seq']):
    # `with` closes the handle even if the write raises.
    with open("./sv_seqs_KD/" + str(key) + ".fasta", "w") as ofile:
        ofile.write(">" + key + "\n" + seq + "\n")

In [13]:
# Collect the helix fraction of every rank-1 model found for the D/K variants.
fdir = '/mnt/d/research/drummond-lab/hcrpaper/result_KD/'
phelix = []
for i in range(1, 31):
    # Probe model indices 1-5 and process whichever rank-1 files exist.
    for j in range(1, 6):
        fpath = f'{fdir}seq{i}_unrelaxed_rank_1_model_{int(j)}.pdb'
        if not path.exists(fpath):
            continue
        pdb = md.load(fpath)
        p = get_percent_helix(pdb)
        print(read_seq_from_pdb(fpath), p)
        phelix.append(p)

('DKDKDKDKDKDKDKDKDKDKDKDKDKDKDKDKDKDKDKDKDKDKDKDKDK', 68.1948) 0.0
('DDDKKKDDDKKKDDDKKKDDDKKKDDDKKKDDDKKKDDDKKKDDDKKKDK', 66.8868) 0.0
('KDKKKDKKDDKKDDKDKDKDKDDKKKDDKDKDKDKKKDDKDKDDKKDDDD', 69.257) 0.0
('KDKDKKDDKDKKDDDKKDKDKDKKKDDKKKDDKDDKKDDKKKDDKDDDKD', 68.2378) 0.0
('KDKDDKDKKKDDDDKDKKKKDDKDKDKDKDDKKDDKKKKDDKDDKDKDKD', 66.9384) 0.0
('DDDKKDKKDDKDDKKDKKDKDDDKKKDKDDKKDDDKKKDKDDDDKKKKDK', 67.8452) 0.0
('DDDDKKKKDDDDKKKKDDDDKKKKDDDDKKKKDDDDKKKKDDDDKKKKDK', 95.54979999999999) 0.96
('KKKKDDDDKKKKDDDDKKKKDDDDKKKKDDDDKKKKDDDDKKKKDDDDKD', 93.64980000000001) 0.96
('DDKKDDDKDKDKDDDDDKKDKKDKKDKKKDDKDKDKKKDKKKKDKDDDKD', 67.5426) 0.0
('DKKKKKKDDKKKDDDDDKKKDDDKKKDKKDDKDKDDKDKKDKKDDKDDDD', 70.4384) 0.0
('DKDKKKKKDDDKKDKDDDDKDDDDKKKKKDKDDDKDDKKDDKDKKKDDKK', 68.32480000000001) 0.0
('DKKDDDDDDKDKKDDDDKDKDKKDKDDKDKKDKKKDKKDDDKDKKKKDKK', 76.80279999999999) 0.68
('KDKKKDKDKKDKKKDDDKKKDDDKDKKKDDKKDKKDKKDDDDDDDKDDKD', 68.3552) 0.0
('DKKDKDDKDDDDKKKKKDDKDKKDKKKKDKKKKKDDDDDDKDDKDKDKDD', 77.7038) 0.66
('KKD

## Charged regions with FCRs

In [21]:
# Five 50-residue block sequences: a run of 5, 10, ..., 25 E followed by K.
seqs = ['E' * n_glu + 'K' * (50 - n_glu) for n_glu in range(5, 26, 5)]

In [24]:
# Labels 'seq1' ... 'seq5' for the five block sequences.
sv_keys = [f'seq{n}' for n in range(1, 6)]

In [26]:
# Two-column table pairing each key with its sequence (one row per entry of
# seqs, in order).
# Built with a single constructor call: DataFrame.append was removed in
# pandas 2.0, and while it existed it copied the whole frame on every row.
df = pd.DataFrame(
    [{'key': sv_keys[i], 'seq': seq} for i, seq in enumerate(seqs)],
    columns=['key', 'seq'],
)

In [30]:
# Write one single-record FASTA file per row into ./testseqs/
# (the directory must already exist).
for key, seq in zip(df['key'], df['seq']):
    # `with` closes the handle even if the write raises.
    with open("./testseqs/" + str(key) + ".fasta", "w") as ofile:
        ofile.write(">" + key + "\n" + seq + "\n")

In [33]:
# Collect the helix fraction of every rank-1 model found for the E/K block
# sequences.
fdir = '/mnt/d/research/drummond-lab/hcrpaper/testseqs_results/'
phelix = []
# Five sequences (seq1 ... seq5) were generated and written out, so the upper
# bound is 6. The previous range(1, 5) never looked for seq5. Files that do
# not exist are still skipped, so this is safe if seq5 was never predicted.
for i in range(1, 6):
    # Probe model indices 1-5 and process whichever rank-1 files exist.
    for j in range(1, 6):
        fpath = fdir + 'seq' + str(i) + '_unrelaxed_rank_1_model_' + str(int(j)) + '.pdb'
        if path.exists(fpath):
            pdb = md.load(fpath)
            p = get_percent_helix(pdb)
            print(read_seq_from_pdb(fpath), p)
            phelix.append(p)

('EEEEEKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK', 98.0536) 0.96
('EEEEEEEEEEKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK', 97.75480000000002) 0.96
('EEEEEEEEEEEEEEEKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK', 96.8962) 0.96
('EEEEEEEEEEEEEEEEEEEEKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK', 95.97059999999999) 0.96


## Reduced fractional charge - poly A

In [14]:
# Ten 50-residue sequences: a run of 5, 10, ..., 50 A followed by K.
seqs = ['A' * n_ala + 'K' * (50 - n_ala) for n_ala in range(5, 51, 5)]

In [16]:
# One 'seqN' label per entry of seqs, numbered from 1.
sv_keys = [f'seq{n}' for n in range(1, len(seqs) + 1)]

In [17]:
# Two-column table pairing each key with its sequence (one row per entry of
# seqs, in order).
# Built with a single constructor call: DataFrame.append was removed in
# pandas 2.0, and while it existed it copied the whole frame on every row.
df = pd.DataFrame(
    [{'key': sv_keys[i], 'seq': seq} for i, seq in enumerate(seqs)],
    columns=['key', 'seq'],
)

In [19]:
# Write one single-record FASTA file per row into ./seqs_AK/
# (the directory must already exist).
for key, seq in zip(df['key'], df['seq']):
    # `with` closes the handle even if the write raises.
    with open("./seqs_AK/" + str(key) + ".fasta", "w") as ofile:
        ofile.write(">" + key + "\n" + seq + "\n")

In [21]:
# Collect the helix fraction of every rank-1 model found for the A/K sequences.
fdir = '/mnt/d/research/drummond-lab/hcrpaper/result_AK/'
phelix = []
for i in range(1, len(df) + 1):
    # Probe model indices 1-5 and process whichever rank-1 files exist.
    for j in range(1, 6):
        fpath = f'{fdir}seq{i}_unrelaxed_rank_1_model_{int(j)}.pdb'
        if not path.exists(fpath):
            continue
        pdb = md.load(fpath)
        p = get_percent_helix(pdb)
        print(read_seq_from_pdb(fpath), p)
        phelix.append(p)

('AAAAAKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK', 98.07879999999999) 0.96
('AAAAAAAAAAKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK', 96.6456) 0.96
('AAAAAAAAAAAAAAAKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK', 96.1434) 0.96
('AAAAAAAAAAAAAAAAAAAAKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK', 94.93) 0.96
('AAAAAAAAAAAAAAAAAAAAAAAAAKKKKKKKKKKKKKKKKKKKKKKKKK', 94.64440000000002) 0.96
('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKKKKKKKKKKKKKKKKKKKK', 93.97140000000002) 0.96
('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKKKKKKKKKKKKKKK', 93.1692) 0.96
('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKKKKKKKKKK', 91.1574) 0.96
('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKKKKK', 92.43860000000001) 0.96
('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', 96.2616) 0.96


## Reduced fractional charge - random uncharged sequence

In [22]:
# One-letter codes to draw from: the 20 standard amino acids minus D, E, K and R.
all_aa = list('HQNSTGMVILFYWPAC')

In [25]:
# Ten 50-residue sequences: 5, 10, ..., 50 residues drawn at random (with
# replacement) from all_aa, followed by K up to length 50.
# A dedicated, seeded generator makes the cell reproducible: re-running it
# regenerates the same sequences instead of silently producing new ones that
# no longer match FASTA files or predictions made from an earlier run.
# NOTE(review): sequences printed from earlier unseeded runs cannot be
# regenerated by this (or any) seed.
rng = random.Random(0)
seqs = []
for i in range(5, 51, 5):
    seq = ''.join(rng.choices(all_aa, k=i)) + (50 - i) * 'K'
    seqs.append(seq)

In [28]:
# One 'seqN' label per entry of seqs, numbered from 1.
sv_keys = [f'seq{n}' for n in range(1, len(seqs) + 1)]

In [29]:
# Two-column table pairing each key with its sequence (one row per entry of
# seqs, in order).
# Built with a single constructor call: DataFrame.append was removed in
# pandas 2.0, and while it existed it copied the whole frame on every row.
df = pd.DataFrame(
    [{'key': sv_keys[i], 'seq': seq} for i, seq in enumerate(seqs)],
    columns=['key', 'seq'],
)

In [30]:
# Write one single-record FASTA file per row into ./seqs_XK/
# (the directory must already exist).
for key, seq in zip(df['key'], df['seq']):
    # `with` closes the handle even if the write raises.
    with open("./seqs_XK/" + str(key) + ".fasta", "w") as ofile:
        ofile.write(">" + key + "\n" + seq + "\n")

In [31]:
# Collect the helix fraction of every rank-1 model found for the random
# uncharged + K sequences.
fdir = '/mnt/d/research/drummond-lab/hcrpaper/result_XK/'
phelix = []
for i in range(1, len(df) + 1):
    # Probe model indices 1-5 and process whichever rank-1 files exist.
    for j in range(1, 6):
        fpath = f'{fdir}seq{i}_unrelaxed_rank_1_model_{int(j)}.pdb'
        if not path.exists(fpath):
            continue
        pdb = md.load(fpath)
        p = get_percent_helix(pdb)
        print(read_seq_from_pdb(fpath), p)
        phelix.append(p)

('VIFVFKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK', 98.1416) 0.96
('FPVSVHPHPWKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK', 95.24299999999998) 0.82
('CWTQATVGYPTSILYKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK', 93.29539999999999) 0.74
('HQNYMCMTALAVPTIWQFGMKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK', 91.04299999999999) 0.7
('SCSITMSYWITNSPWPSVNITIMWAKKKKKKKKKKKKKKKKKKKKKKKKK', 81.52300000000001) 0.84
('WTCGWGCFGQIYGLMQNWISHWQYQAFTTCKKKKKKKKKKKKKKKKKKKK', 79.714) 0.84
('PNTSPNNTCPINHQPPGCLNAWCHFGVNQYFIMTSKKKKKKKKKKKKKKK', 69.87399999999998) 0.48
('HPQVHYHLSWHFFFVHHNQWQSVNQLCMFTMIGTVGTCSPKKKKKKKKKK', 71.373) 0.68
('WCWAWCGCASAPVWHLGCTAHTHVLQTNLMNLSQHYYPTPCGHSMKKKKK', 61.0822) 0.42
('GMAFHMVPQICAAFSNVCICYWVALWYNLSTTFAGLGYGACHYMWVQGII', 80.0964) 0.06
