# Predicting accessibility from AlphaFold structures - human proteome

In [1]:
# Adapted from the example notebooks in the original StructureMap GitHub repository:
# https://github.com/MannLabs/structuremap/tree/main/nbs

## Import libraries

In [2]:
# Import structuremap functions
import structuremap.utils
structuremap.utils.set_logger()
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score, annotate_proteins_with_idr_pattern, get_extended_flexible_pattern, get_proximity_pvals, perform_enrichment_analysis, perform_enrichment_analysis_per_protein, evaluate_ptm_colocalization, extract_motifs_in_proteome
from structuremap.plotting import plot_enrichment, plot_ptm_colocalization


In [3]:
# Import general-purpose libraries
import pandas as pd
import numpy as np
import os
import re
import plotly.express as px
import tqdm
import tempfile
import csv

## Set UniProt IDs to Use

In [4]:
# UniProt accessions of the proteins that the rest of the notebook downloads
# and annotates.
uniprotIDs = [
    "P04406",
    "P13639",
    "P11802",
]

## Download AlphaFold data

In [9]:
# Root folder for all downloaded AlphaFold files, given relative to the
# current working directory and resolved to an absolute path.
output_path_str = "../alphafold_data/"
output_path = os.path.abspath(output_path_str)

# Structure files and predicted-aligned-error files live in sibling sub-folders.
cif_dir, pae_dir = (os.path.join(output_path, sub_folder) for sub_folder in ('cif', 'pae'))

# Show the resolved locations, one per line.
print(output_path, cif_dir, pae_dir, sep='\n')

/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/cif
/Users/ritwiksrinivas/Desktop/Projects/methionine-analysis/alphafold_data/pae


In [10]:
# Download the AlphaFold structure (.cif) files for the selected proteins into
# cif_dir. The call returns three values, unpacked here as the valid, invalid
# and already-existing proteins (the logger prints the size of each group).
cif_download_result = download_alphafold_cif(proteins=uniprotIDs, out_folder=cif_dir)
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = cif_download_result

100%|██████████| 3/3 [00:08<00:00,  2.78s/it]

2024-03-04 18:57:35> Valid proteins: 3
2024-03-04 18:57:35> Invalid proteins: 0
2024-03-04 18:57:35> Existing proteins: 0





In [11]:
# Download the AlphaFold predicted aligned error (PAE) files for the selected
# proteins into pae_dir. As with the .cif download, three values come back:
# the valid, invalid and already-existing proteins.
pae_download_result = download_alphafold_pae(proteins=uniprotIDs, out_folder=pae_dir)
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = pae_download_result

100%|██████████| 3/3 [00:07<00:00,  2.52s/it]

2024-03-04 18:58:15> Valid proteins: 3
2024-03-04 18:58:15> Invalid proteins: 0
2024-03-04 18:58:15> Existing proteins: 0





In [18]:
#valid_proteins_pae

list

## Format AlphaFold data input

In [27]:
# Read the downloaded .cif files of the selected proteins from cif_dir and
# collect them into a single table with one row per residue (protein_id, AA,
# position, coordinates, secondary structure, ...).
alphafold_annotation = format_alphafold_data(directory=cif_dir, protein_ids=uniprotIDs)

# Display the resulting table.
alphafold_annotation

  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
  df = df.apply(pd.to_numeric, errors='ignore')
100%|██████████| 3/3 [00:00<00:00, 11.75it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,P04406,1,M,1,57.51,-21.395,-22.332,-21.956,-22.266,17.151,...,0.022,-0.854,-0.755,unstructured,unstructured,0,0,0,0,1
1,P04406,1,G,2,78.10,-19.675,-20.861,,-21.769,15.599,...,3.517,,2.368,HELX_LH_PP_P,HELX,0,1,0,0,0
2,P04406,1,K,3,90.98,-17.941,-17.473,-16.372,-18.607,13.356,...,3.917,4.892,4.012,HELX_LH_PP_P,HELX,0,1,0,0,0
3,P04406,1,V,4,96.86,-17.139,-17.695,-17.245,-17.385,10.596,...,3.880,2.771,3.567,STRN,STRN,0,0,1,0,0
4,P04406,1,K,5,98.58,-16.883,-17.584,-18.836,-17.941,8.028,...,7.358,8.222,6.028,STRN,STRN,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1491,P13639,3,F,854,95.94,23.093,23.256,22.010,23.514,16.446,...,-31.655,-30.808,-31.668,TURN_TY1_P,TURN,0,0,0,1,0
1492,P13639,3,L,855,95.86,23.477,22.265,20.993,22.532,16.990,...,-35.353,-35.855,-33.992,unstructured,unstructured,0,0,0,0,1
1493,P13639,3,D,856,94.80,24.781,25.277,26.306,24.159,15.959,...,-37.595,-37.321,-36.657,unstructured,unstructured,0,0,0,0,1
1494,P13639,3,K,857,90.20,26.147,25.270,25.529,25.456,15.472,...,-41.390,-42.083,-39.933,unstructured,unstructured,0,0,0,0,1


## Annotate pPSE values

In [28]:
# "Full sphere" exposure settings: max_dist=24 with max_angle=180.
# The result carries one value per residue in the column nAA_24_180_pae.
full_sphere_settings = {'max_dist': 24, 'max_angle': 180}

# The PAE files downloaded earlier are supplied through error_dir.
full_sphere_exposure = annotate_accessibility(
    df=alphafold_annotation,
    error_dir=pae_dir,
    **full_sphere_settings)
full_sphere_exposure

100%|██████████| 3/3 [00:00<00:00, 43.90it/s]


Unnamed: 0,protein_id,AA,position,nAA_24_180_pae
0,P04406,M,1,11
1,P04406,G,2,24
2,P04406,K,3,51
3,P04406,V,4,104
4,P04406,K,5,144
...,...,...,...,...
1491,P13639,F,854,121
1492,P13639,L,855,101
1493,P13639,D,856,76
1494,P13639,K,857,47


In [30]:
# Attach the full-sphere exposure column to the structure table. A left join
# on (protein_id, AA, position) keeps every residue of alphafold_annotation.
residue_key = ['protein_id', 'AA', 'position']
alphafold_accessibility = pd.merge(
    alphafold_annotation,
    full_sphere_exposure,
    how='left',
    on=residue_key)
alphafold_accessibility

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae
0,P04406,1,M,1,57.51,-21.395,-22.332,-21.956,-22.266,17.151,...,-0.854,-0.755,unstructured,unstructured,0,0,0,0,1,11
1,P04406,1,G,2,78.10,-19.675,-20.861,,-21.769,15.599,...,,2.368,HELX_LH_PP_P,HELX,0,1,0,0,0,24
2,P04406,1,K,3,90.98,-17.941,-17.473,-16.372,-18.607,13.356,...,4.892,4.012,HELX_LH_PP_P,HELX,0,1,0,0,0,51
3,P04406,1,V,4,96.86,-17.139,-17.695,-17.245,-17.385,10.596,...,2.771,3.567,STRN,STRN,0,0,1,0,0,104
4,P04406,1,K,5,98.58,-16.883,-17.584,-18.836,-17.941,8.028,...,8.222,6.028,STRN,STRN,0,0,1,0,0,144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1491,P13639,3,F,854,95.94,23.093,23.256,22.010,23.514,16.446,...,-30.808,-31.668,TURN_TY1_P,TURN,0,0,0,1,0,121
1492,P13639,3,L,855,95.86,23.477,22.265,20.993,22.532,16.990,...,-35.855,-33.992,unstructured,unstructured,0,0,0,0,1,101
1493,P13639,3,D,856,94.80,24.781,25.277,26.306,24.159,15.959,...,-37.321,-36.657,unstructured,unstructured,0,0,0,0,1,76
1494,P13639,3,K,857,90.20,26.147,25.270,25.529,25.456,15.472,...,-42.083,-39.933,unstructured,unstructured,0,0,0,0,1,47


In [32]:
# "Part sphere" exposure settings: max_dist=12 with max_angle=70.
# The result carries one value per residue in the column nAA_12_70_pae.
part_sphere_settings = {'max_dist': 12, 'max_angle': 70}

# The PAE files downloaded earlier are supplied through error_dir.
part_sphere_exposure = annotate_accessibility(
    df=alphafold_annotation,
    error_dir=pae_dir,
    **part_sphere_settings)
part_sphere_exposure

100%|██████████| 3/3 [00:00<00:00, 94.81it/s]


Unnamed: 0,protein_id,AA,position,nAA_12_70_pae
0,P04406,M,1,0
1,P04406,G,2,0
2,P04406,K,3,3
3,P04406,V,4,6
4,P04406,K,5,5
...,...,...,...,...
1491,P13639,F,854,10
1492,P13639,L,855,6
1493,P13639,D,856,3
1494,P13639,K,857,0


In [33]:
# Combine the structure table with both exposure tables.
#
# The frame is rebuilt from alphafold_annotation instead of merging onto the
# existing alphafold_accessibility. A self-referential merge is not idempotent:
# running the cell a second time would merge nAA_12_70_pae onto a frame that
# already has that column, pandas would rename the pair to nAA_12_70_pae_x /
# nAA_12_70_pae_y, and later cells that read nAA_12_70_pae would fail.
# On a first run the result is identical to the previous two-step merge.
alphafold_accessibility = (
    alphafold_annotation
    .merge(full_sphere_exposure, how='left', on=['protein_id', 'AA', 'position'])
    .merge(part_sphere_exposure, how='left', on=['protein_id', 'AA', 'position'])
)
alphafold_accessibility

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae
0,P04406,1,M,1,57.51,-21.395,-22.332,-21.956,-22.266,17.151,...,-0.755,unstructured,unstructured,0,0,0,0,1,11,0
1,P04406,1,G,2,78.10,-19.675,-20.861,,-21.769,15.599,...,2.368,HELX_LH_PP_P,HELX,0,1,0,0,0,24,0
2,P04406,1,K,3,90.98,-17.941,-17.473,-16.372,-18.607,13.356,...,4.012,HELX_LH_PP_P,HELX,0,1,0,0,0,51,3
3,P04406,1,V,4,96.86,-17.139,-17.695,-17.245,-17.385,10.596,...,3.567,STRN,STRN,0,0,1,0,0,104,6
4,P04406,1,K,5,98.58,-16.883,-17.584,-18.836,-17.941,8.028,...,6.028,STRN,STRN,0,0,1,0,0,144,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1491,P13639,3,F,854,95.94,23.093,23.256,22.010,23.514,16.446,...,-31.668,TURN_TY1_P,TURN,0,0,0,1,0,121,10
1492,P13639,3,L,855,95.86,23.477,22.265,20.993,22.532,16.990,...,-33.992,unstructured,unstructured,0,0,0,0,1,101,6
1493,P13639,3,D,856,94.80,24.781,25.277,26.306,24.159,15.959,...,-36.657,unstructured,unstructured,0,0,0,0,1,76,3
1494,P13639,3,K,857,90.20,26.147,25.270,25.529,25.456,15.472,...,-39.933,unstructured,unstructured,0,0,0,0,1,47,0


In [35]:
# Turn the part-sphere exposure into two binary flags around a cutoff of 5:
#   high_acc_5 = 1 where nAA_12_70_pae <= 5, otherwise 0
#   low_acc_5  = 1 where nAA_12_70_pae >  5, otherwise 0
# Both comparisons are evaluated separately, so a missing value yields 0 in
# both columns.
acc_cutoff = 5
part_sphere_values = alphafold_accessibility['nAA_12_70_pae']

alphafold_accessibility['high_acc_5'] = np.where(part_sphere_values <= acc_cutoff, 1, 0)
alphafold_accessibility['low_acc_5'] = np.where(part_sphere_values > acc_cutoff, 1, 0)
alphafold_accessibility

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,high_acc_5,low_acc_5
0,P04406,1,M,1,57.51,-21.395,-22.332,-21.956,-22.266,17.151,...,unstructured,0,0,0,0,1,11,0,1,0
1,P04406,1,G,2,78.10,-19.675,-20.861,,-21.769,15.599,...,HELX,0,1,0,0,0,24,0,1,0
2,P04406,1,K,3,90.98,-17.941,-17.473,-16.372,-18.607,13.356,...,HELX,0,1,0,0,0,51,3,1,0
3,P04406,1,V,4,96.86,-17.139,-17.695,-17.245,-17.385,10.596,...,STRN,0,0,1,0,0,104,6,0,1
4,P04406,1,K,5,98.58,-16.883,-17.584,-18.836,-17.941,8.028,...,STRN,0,0,1,0,0,144,5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1491,P13639,3,F,854,95.94,23.093,23.256,22.010,23.514,16.446,...,TURN,0,0,0,1,0,121,10,0,1
1492,P13639,3,L,855,95.86,23.477,22.265,20.993,22.532,16.990,...,unstructured,0,0,0,0,1,101,6,0,1
1493,P13639,3,D,856,94.80,24.781,25.277,26.306,24.159,15.959,...,unstructured,0,0,0,0,1,76,3,1,0
1494,P13639,3,K,857,90.20,26.147,25.270,25.529,25.456,15.472,...,unstructured,0,0,0,0,1,47,0,1,0


## Annotate IDRs

In [36]:
# Smooth the full-sphere exposure along each protein. With these arguments the
# result gains the column nAA_24_180_pae_smooth10.
columns_to_smooth = np.array(['nAA_24_180_pae'])
smoothing_windows = [10]  # presumably the window size(s) used for smoothing; confirm in the structuremap docs

alphafold_accessibility_smooth = get_smooth_score(
    alphafold_accessibility,
    columns_to_smooth,
    smoothing_windows)
alphafold_accessibility_smooth

100%|██████████| 3/3 [00:00<00:00, 22.74it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,BEND,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,high_acc_5,low_acc_5,nAA_24_180_pae_smooth10
0,P04406,1,M,1,57.51,-21.395,-22.332,-21.956,-22.266,17.151,...,0,0,0,0,1,11,0,1,0,121.818182
1,P04406,1,G,2,78.10,-19.675,-20.861,,-21.769,15.599,...,0,1,0,0,0,24,0,1,0,126.416667
2,P04406,1,K,3,90.98,-17.941,-17.473,-16.372,-18.607,13.356,...,0,1,0,0,0,51,3,1,0,131.384615
3,P04406,1,V,4,96.86,-17.139,-17.695,-17.245,-17.385,10.596,...,0,0,1,0,0,104,6,0,1,137.428571
4,P04406,1,K,5,98.58,-16.883,-17.584,-18.836,-17.941,8.028,...,0,0,1,0,0,144,5,1,0,141.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,P13639,3,F,854,95.94,23.093,23.256,22.010,23.514,16.446,...,0,0,0,1,0,121,10,0,1,80.866667
854,P13639,3,L,855,95.86,23.477,22.265,20.993,22.532,16.990,...,0,0,0,0,1,101,6,0,1,80.857143
855,P13639,3,D,856,94.80,24.781,25.277,26.306,24.159,15.959,...,0,0,0,0,1,76,3,1,0,82.230769
856,P13639,3,K,857,90.20,26.147,25.270,25.529,25.456,15.472,...,0,0,0,0,1,47,0,1,0,84.833333


In [37]:
# Flag residues as IDR (1) when their smoothed full-sphere exposure is at or
# below the cutoff, otherwise 0.
# NOTE(review): the origin of the 34.27 cutoff is not stated in this notebook;
# it is presumably taken from the StructureMap examples. Please confirm.
idr_cutoff = 34.27
smoothed_exposure = alphafold_accessibility_smooth['nAA_24_180_pae_smooth10']

alphafold_accessibility_smooth['IDR'] = np.where(smoothed_exposure <= idr_cutoff, 1, 0)
alphafold_accessibility_smooth

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,HELX,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,high_acc_5,low_acc_5,nAA_24_180_pae_smooth10,IDR
0,P04406,1,M,1,57.51,-21.395,-22.332,-21.956,-22.266,17.151,...,0,0,0,1,11,0,1,0,121.818182,0
1,P04406,1,G,2,78.10,-19.675,-20.861,,-21.769,15.599,...,1,0,0,0,24,0,1,0,126.416667,0
2,P04406,1,K,3,90.98,-17.941,-17.473,-16.372,-18.607,13.356,...,1,0,0,0,51,3,1,0,131.384615,0
3,P04406,1,V,4,96.86,-17.139,-17.695,-17.245,-17.385,10.596,...,0,1,0,0,104,6,0,1,137.428571,0
4,P04406,1,K,5,98.58,-16.883,-17.584,-18.836,-17.941,8.028,...,0,1,0,0,144,5,1,0,141.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,P13639,3,F,854,95.94,23.093,23.256,22.010,23.514,16.446,...,0,0,1,0,121,10,0,1,80.866667,0
854,P13639,3,L,855,95.86,23.477,22.265,20.993,22.532,16.990,...,0,0,0,1,101,6,0,1,80.857143,0
855,P13639,3,D,856,94.80,24.781,25.277,26.306,24.159,15.959,...,0,0,0,1,76,3,1,0,82.230769,0
856,P13639,3,K,857,90.20,26.147,25.270,25.529,25.456,15.472,...,0,0,0,1,47,0,1,0,84.833333,0


## Annotate short IDRs

In [38]:
# Annotate short IDR patterns; the result gains the column flexible_pattern.
# The two length settings are passed through unchanged.
idr_pattern_settings = dict(
    min_structured_length=80,
    max_unstructured_length=20,
)

alphafold_accessibility_smooth_pattern = annotate_proteins_with_idr_pattern(
    alphafold_accessibility_smooth,
    **idr_pattern_settings)
alphafold_accessibility_smooth_pattern

100%|██████████| 3/3 [00:00<00:00, 915.72it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,STRN,TURN,unstructured,nAA_24_180_pae,nAA_12_70_pae,high_acc_5,low_acc_5,nAA_24_180_pae_smooth10,IDR,flexible_pattern
0,P04406,1,M,1,57.51,-21.395,-22.332,-21.956,-22.266,17.151,...,0,0,1,11,0,1,0,121.818182,0,0
1,P04406,1,G,2,78.10,-19.675,-20.861,,-21.769,15.599,...,0,0,0,24,0,1,0,126.416667,0,0
2,P04406,1,K,3,90.98,-17.941,-17.473,-16.372,-18.607,13.356,...,0,0,0,51,3,1,0,131.384615,0,0
3,P04406,1,V,4,96.86,-17.139,-17.695,-17.245,-17.385,10.596,...,1,0,0,104,6,0,1,137.428571,0,0
4,P04406,1,K,5,98.58,-16.883,-17.584,-18.836,-17.941,8.028,...,1,0,0,144,5,1,0,141.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,P13639,3,F,854,95.94,23.093,23.256,22.010,23.514,16.446,...,0,1,0,121,10,0,1,80.866667,0,0
854,P13639,3,L,855,95.86,23.477,22.265,20.993,22.532,16.990,...,0,0,1,101,6,0,1,80.857143,0,0
855,P13639,3,D,856,94.80,24.781,25.277,26.306,24.159,15.959,...,0,0,1,76,3,1,0,82.230769,0,0
856,P13639,3,K,857,90.20,26.147,25.270,25.529,25.456,15.472,...,0,0,1,47,0,1,0,84.833333,0,0


In [None]:
# Extend the flexible_pattern annotation.
pattern_columns = ['flexible_pattern']
pattern_windows = [5]  # presumably the extension window(s) around each pattern; confirm in the structuremap docs

alphafold_accessibility_smooth_pattern_ext = get_extended_flexible_pattern(
    alphafold_accessibility_smooth_pattern,
    pattern_columns,
    pattern_windows)

In [None]:
# Preview the first three rows of the extended-pattern table.
alphafold_accessibility_smooth_pattern_ext.iloc[0:3]

In [None]:
# List the column names of the extended-pattern table to check which
# annotations are present.
alphafold_accessibility_smooth_pattern_ext.columns


In [None]:
# Write the accessibility table to a CSV file in the current working directory.
# to_csv is called with its defaults, so the DataFrame index is written as the
# first column.
# NOTE(review): this saves alphafold_accessibility, which does not contain the
# smoothed score, IDR or flexible_pattern columns computed in later cells.
# Confirm this is the intended frame. The file name says "hsapiens", but the
# table only holds the proteins listed in uniprotIDs.
accessibility_csv = 'AlphaFoldPredicted_Accessibility_hsapiens.csv'
alphafold_accessibility.to_csv(accessibility_csv)

In [None]:
# Scratch cell: evaluating the bare name only displays the function object's
# repr; the function is not called and no enrichment analysis is run here.
structuremap.processing.perform_enrichment_analysis

In [None]:
# Scratch cell: evaluating the bare name only displays the function object's
# repr; the function is not called and nothing is plotted here.
plot_enrichment