# Figure 2

In [1]:
## Basics
import numpy as np
import pandas as pd
import glob
import mdtraj as md
import gzip

## Bio functions
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import AlignIO
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

## Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Global figure typography: 14 pt sans-serif text, with Arial as the sans-serif face
plt.rcParams.update({
    'font.size': 14,
    'font.family': 'sans-serif',
    'font.sans-serif': "Arial",
})

## Colors
# Hex colour assigned to each category label used in the figures
cscheme = {
    'pos': '#0177BB',
    'neg': '#CB3627',
    'dis': '#828282',
    'helix': '#009988',
    'sheet': '#EE7733',
    'cr': '#AA4499',
}

In [2]:
import sys
# Extend the module search path before importing the project-local helper module
# (presumably tempparse lives in ../code/scripts, relative to the notebook's
# working directory — verify against the repository layout).
sys.path.append("../code/scripts")
import tempparse as paf
from tempparse import is_gz_file

In [3]:
# Filepaths
# Directory of AlphaFold prediction files. The cells below pass it to
# paf.get_alphafold_pred and glob it for '*.pdb.gz' files.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
path_to_af_data = "/Volumes/DATA/alphafold/UP000002311_559292_YEAST/"

### Local functions

In [4]:
def get_ss_freqs(seq, counts = False, sort_output = True, gaps = False):
    """Tally secondary-structure symbols in a single sequence.

    The three symbols 'C', 'E' and 'H' are always reported, with a value of
    zero when absent from `seq`. Any other symbol found in `seq` is reported
    once, as an additional category.

    Parameters
    ----------
    seq : str or sequence of str
        One secondary-structure symbol per position.
    counts : bool, default False
        If True report raw counts; otherwise report fractions of len(seq).
    sort_output : bool, default True
        If True, values are ordered by sorted symbol and only the values are
        returned. If False, the symbols are returned as well, in the order
        "symbols observed in `seq` (sorted), then missing expected symbols".
    gaps : bool, default False
        Currently unused; accepted so existing calls keep working.

    Returns
    -------
    list of numpy.ndarray
        When `sort_output` is True: a one-element list holding the float
        array of values.
    tuple (list of numpy.ndarray, list of numpy.ndarray)
        When `sort_output` is False: one-element lists of symbols and values.

    Notes
    -----
    An empty `seq` yields zeros for 'C', 'E', 'H' instead of dividing by zero.
    """
    ss_types = np.array(['C', 'E', 'H'])

    seq = list(seq)
    if len(seq) == 0:
        # Nothing to count: report the expected symbols with zero values
        seq_ss = ss_types.copy()
        seq_freqs = np.zeros(ss_types.size)
    else:
        seq_ss, seq_counts = np.unique(seq, return_counts=True)
        # Expected symbols that never occur. setdiff1d (one-sided difference) is
        # required here: a symmetric difference would also return unexpected
        # symbols that ARE present in seq and list them twice.
        missing_ss = np.setdiff1d(ss_types, seq_ss)

        seq_ss = np.append(seq_ss, missing_ss)
        seq_freqs = np.append(seq_counts, np.zeros(missing_ss.size))
        if not counts:
            seq_freqs = seq_freqs / len(seq)

    if sort_output:
        return [seq_freqs[seq_ss.argsort()]]
    return ([seq_ss], [seq_freqs])

### Data

In [5]:
# Load the raw charged-region table; '#' marks comment text the parser skips
df = pd.read_csv('../data/charged_regions/cr_raw.csv', comment="#")

In [7]:
# For each charged region: pLDDT confidence mask, DSSP secondary structure,
# and the fraction of residues scored as disordered.
# NOTE: rows whose AlphaFold file is missing are skipped, so the three result
# lists can be shorter than df and are not index-aligned with it.
hits_disordered_frac = []
hits_conf = []
hits_ss = []

for index, row in df.iterrows():
    orf = row['orf']
    uid = row['uni_id']
    seq = row['region.seq']

    # Read the AlphaFold pLDDT scores covering the region (right bound inclusive)
    af_fp = paf.get_alphafold_pred(uid, path_to_af_data)+'.gz'
    try:
        pLDDTs = paf.read_bfactor_from_pdb(af_fp)[row['left.bound']:row['right.bound']+1]
    except FileNotFoundError:
        # No prediction file for this protein: report the path and move on
        print(af_fp)
        continue

    # Confidence mask: True where pLDDT is strictly above 70
    conf = [bool(score > 70.) for score in pLDDTs]

    # Secondary structure of the region from DSSP (simplified alphabet)
    structure = md.load(af_fp)
    ss = md.compute_dssp(structure, simplified=True)
    region_ss = ss[0][row['left.bound']:row['right.bound']+1]
    hits_ss.append(region_ss)

    # A residue counts as disordered when its pLDDT is below 70 or, failing
    # that, when DSSP assigns it 'C' (coil in mdtraj's simplified alphabet).
    # (Marked "UPDATE" by the original author.)
    disordered = sum(
        1
        for r, resid in enumerate(pLDDTs)
        if resid < 70. or region_ss[r] == 'C'
    )

    # Fraction of the region scored as disordered
    hits_disordered_frac.append(disordered / row['region.len'])
    hits_conf.append(conf)



/Volumes/DATA/alphafold/UP000002311_559292_YEAST/AF-P53288-F1-model_v1.pdb.gz
/Volumes/DATA/alphafold/UP000002311_559292_YEAST/AF-P38811-F1-model_v1.pdb.gz
/Volumes/DATA/alphafold/UP000002311_559292_YEAST/AF-P38811-F1-model_v1.pdb.gz
/Volumes/DATA/alphafold/UP000002311_559292_YEAST/AF-P36022-F1-model_v1.pdb.gz
/Volumes/DATA/alphafold/UP000002311_559292_YEAST/AF-Q12019-F1-model_v1.pdb.gz
/Volumes/DATA/alphafold/UP000002311_559292_YEAST/AF-Q12019-F1-model_v1.pdb.gz
/Volumes/DATA/alphafold/UP000002311_559292_YEAST/AF-Q12019-F1-model_v1.pdb.gz
/Volumes/DATA/alphafold/UP000002311_559292_YEAST/AF-Q12019-F1-model_v1.pdb.gz
/Volumes/DATA/alphafold/UP000002311_559292_YEAST/AF-Q6B0X1-F1-model_v1.pdb.gz
/Volumes/DATA/alphafold/UP000002311_559292_YEAST/AF-Q08428-F1-model_v1.pdb.gz
/Volumes/DATA/alphafold/UP000002311_559292_YEAST/AF-Q12444-F1-model_v1.pdb.gz


In [None]:
# Compare to random regions of the same length
# glob order is filesystem-dependent; sort so the candidate list is stable and
# the seeded draws below pick the same files on every Restart & Run All.
all_af_predictions = sorted(glob.glob(path_to_af_data+'*.pdb.gz'))

# Dedicated seeded generator so the random control is reproducible
rng = np.random.default_rng(0)

rand_disordered_frac = []
rand_conf = []
rand_ss = []


for index, row in df.iterrows():
    region_len = row['region.len']

    # Choose random alphafold output
    rfile = rng.choice(all_af_predictions)
    raf = paf.read_bfactor_from_pdb(rfile)
    # If the length of the protein is less than the region, redraw
    while region_len > len(raf):
        rfile = rng.choice(all_af_predictions)
        raf = paf.read_bfactor_from_pdb(rfile)

    # Start drawn uniformly from every position where a full window still fits
    rstart = int(rng.integers(0, len(raf) - region_len + 1))
    # Slice ends are exclusive, so [rstart:rend] spans exactly region_len
    # residues (the previous extra +1 made the window one residue too long,
    # while the fraction below was still divided by region_len).
    rend = rstart + region_len

    # Get the sequence of this random region
    # NOTE(review): read_seq_from_pdb is not defined or imported in the visible
    # part of this notebook (only `paf` and `is_gz_file` are) — confirm where it
    # comes from; it may need to be paf.read_seq_from_pdb.
    rseq = read_seq_from_pdb(rfile)[rstart:rend]
    if len(rseq) == 0:
        continue
    rpLDDTs = raf[rstart:rend]

    # Confidence mask: True where pLDDT is strictly above 70
    conf = [bool(score > 70.) for score in rpLDDTs]

    # Secondary structure prediction for each region
    structure = md.load(rfile)
    ss = md.compute_dssp(structure, simplified=True)
    region_ss = ss[0][rstart:rend]
    rand_ss.append(region_ss)

    # A residue counts as disordered when its pLDDT is below 70 or, failing
    # that, when DSSP assigns it 'C' (coil in mdtraj's simplified alphabet).
    # (Marked "UPDATE" by the original author.)
    disordered = sum(
        1
        for r, resid in enumerate(rpLDDTs)
        if resid < 70. or region_ss[r] == 'C'
    )

    # Calculate fraction disorder for the region
    rand_disordered_frac.append(disordered / region_len)
    rand_conf.append(conf)