<a href="https://colab.research.google.com/github/rae-gh/colab-analyses/blob/main/Geometry_Intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>Protein Geometry Correlations</h1>
(c) Rachel Alcraft 2024

There are 7 cells to run, you only need to edit the first to change the pdbs and atoms of interest:
<hr/>
</br> 1. RUN: Import the required librarires
</br> 2. EDIT+RUN: Enter structure and atoms
</br> 3. Calculate the pdb objects
</br> 4. Generate a basic 2d correlation
</br> 5. Generate a correlation with additional geoemtric hue
</br> 6. A correlation between CB:O and N:O on various hues
</br> 7. A correlation between CB:O and N:O on amino acid hue
<hr/>

In [None]:
# CELL 1
# PATH TO YOUR SAMPLE DATA (pdbs, results, etc.)
# NOTE if you are using colab there is a deirectory by default called sample_data
# If you are running locally with jupyter notebook choose a directory of your choice

DATADIR = "sample_data/"
IS_GOOGLE_COLAB = True  # Set this to True if you are using Google Colab

In [36]:
# CELL 2
# Need to import libraries
if IS_GOOGLE_COLAB:  
  try:#runtime gets refreshed so reinstall of non standard libraries may be necessary
    import google.colab    
    !pip install maptial        
  except:
    pass
else:
  print("Not using Google Colab, so make sure you have an environment setup with the correct libraries installed, e.g.\n------------------")
  print("mamba create -n maptial-env python=3.12 ipykernel ipython nbformat")
  print("mamba activate maptial-env")
  print("python -m pip install maptial")

In [38]:
# CELL 2 - specify the inputs
pdb_codes = ["1crn", "1ejg", "3u7t", "2fd7", "1cbn", "1cnr", "3nir", "1ab1", "2fd9", "1jxy", "1jxu", "1jxx", "1jxw", "1jxt", "4rek"]
geos = ["N:CA:C:N+1","N:CA:C","N:O","CA:C:N+1","C-1:N:CA:C"]
DATADIR = "sample_data/"

In [39]:
# CELL 3 - Calculate the dataframe of geometric correlations
from prometry import pdbloader as pl
from prometry import pdbgeometry as pg

pobjs = []
for pdb in pdb_codes:    
    pla = pl.PdbLoader(pdb,DATADIR,cif=False)    
    po = pla.load_pdb()
    pobjs.append(po)
gm = pg.GeometryMaker(pobjs)
df_geos = gm.calculateGeometry(geos)
print(df_geos)
#print(df_geos.columns)

     N:CA:C:N+1   N:CA:C       N:O  CA:C:N+1  C-1:N:CA:C pdb_code  resolution  \
0       144.346  108.649  2.761307   114.975    -107.833     1crn        1.50   
1       133.291  106.634  2.742142   119.442    -131.170     1crn        1.50   
2       151.203  109.941  2.766070   116.923    -118.897     1crn        1.50   
3       -18.979  116.739  3.623741   117.547     -76.182     1crn        1.50   
4       166.027  108.065  2.621433   116.324    -157.924     1crn        1.50   
..          ...      ...       ...       ...         ...      ...         ...   
905     -47.438  109.919  3.494409   116.431     -60.893     4rek        0.74   
906     -47.770  110.725  3.498333   115.607     -59.001     4rek        0.74   
907     -49.292  112.234  3.473657   117.160     -67.314     4rek        0.74   
908     -33.562  109.447  3.518229   118.058     -68.258     4rek        0.74   
909     -45.074  109.960  3.493018   116.847     -60.987     4rek        0.74   

      aa chain  rid  ... ri

In [40]:
# CELL 4
# Correlation 1
import plotly.express as px
import plotly.io as pio
x_ax = "N:CA:C:N+1"
y_ax = "N:O"
hue = "resolution"
df_geos.sort_values(by=["resolution"], inplace=True)
fig = px.scatter(df_geos, x=x_ax, y=y_ax, color=hue,title="",width=500, height=500, opacity=0.7,color_continuous_scale=px.colors.sequential.Viridis)
fig.show()


In [41]:
# CELL 5
# Correlation 2
import plotly.express as px
import plotly.io as pio
x_ax = "N:CA:C:N+1"
y_ax = "N:O"
hue = "N:CA:C"
fig = px.scatter(df_geos, x=x_ax, y=y_ax, color=hue,title="",width=500, height=500, opacity=0.7,color_continuous_scale=px.colors.sequential.Inferno)
fig.show()


In [49]:
# 6 A few plots of different hues for CB:O, N:O
from prometry import pdbloader as pl
from prometry import pdbgeometry as pg
import plotly.express as px
import plotly.io as pio

geos = ["N:CA:C:N+1","N:CA:C","N:O","CA:C:N+1","C-1:N:CA:C","CB:O","N:N+1", "CA-1:CA:CA+1"]
# CELL 3 - Calculate the dataframe of geometric correlations
pobjs = []
for pdb in pdb_codes:    
    pla = pl.PdbLoader(pdb,DATADIR,cif=False)    
    po = pla.load_pdb()
    pobjs.append(po)
gm = pg.GeometryMaker(pobjs)
df_geos = gm.calculateGeometry(geos)
print(df_geos)
x_ax = "N:O"
y_ax = "CB:O"
for hue in geos:    
    if hue not in [x_ax, y_ax]:
        fig = px.scatter(df_geos, x=x_ax, y=y_ax, color=hue,title="",width=500, height=500, opacity=0.7,color_continuous_scale=px.colors.sequential.Inferno)
        fig.show()

     N:CA:C:N+1   N:CA:C       N:O  CA:C:N+1  C-1:N:CA:C      CB:O     N:N+1  \
0       144.346  108.649  2.761307   114.975    -107.833  3.254849  3.561015   
1       133.291  106.634  2.742142   119.442    -131.170  3.049633  3.512840   
2       151.203  109.941  2.766070   116.923    -118.897  3.279150  3.607199   
3       -18.979  116.739  3.623741   117.547     -76.182  3.115144  2.803028   
4       166.027  108.065  2.621433   116.324    -157.924  3.381345  3.637472   
..          ...      ...       ...       ...         ...       ...       ...   
693     -36.255  110.915  3.523896   117.942     -65.784  3.243867  2.787210   
694     -47.438  109.919  3.494409   116.431     -60.893  3.346284  2.822171   
695     -47.770  110.725  3.498333   115.607     -59.001  3.357182  2.818065   
696     -49.292  112.234  3.473657   117.160     -67.314  3.328707  2.878748   
697     -33.562  109.447  3.518229   118.058     -68.258  3.234715  2.747430   

     CA-1:CA:CA+1 pdb_code  resolution 

In [50]:
# 7  CB:O N:O against amino acid
x_ax = "N:O"
y_ax = "CB:O"
hue = "motif_CA-1:CA:CA+1"
print(df_geos.columns)
fig = px.scatter(df_geos, x=x_ax, y=y_ax, color=hue,title="",width=500, height=500, opacity=0.7,color_continuous_scale=px.colors.sequential.Inferno)
fig.show()

Index(['N:CA:C:N+1', 'N:CA:C', 'N:O', 'CA:C:N+1', 'C-1:N:CA:C', 'CB:O',
       'N:N+1', 'CA-1:CA:CA+1', 'pdb_code', 'resolution', 'aa', 'chain', 'rid',
       'info_N:CA:C:N+1', 'info_N:CA:C', 'info_N:O', 'info_CA:C:N+1',
       'info_C-1:N:CA:C', 'info_CB:O', 'info_N:N+1', 'info_CA-1:CA:CA+1',
       'occ_N:CA:C:N+1', 'occ_N:CA:C', 'occ_N:O', 'occ_CA:C:N+1',
       'occ_C-1:N:CA:C', 'occ_CB:O', 'occ_N:N+1', 'occ_CA-1:CA:CA+1',
       'bf_N:CA:C:N+1', 'bf_N:CA:C', 'bf_N:O', 'bf_CA:C:N+1', 'bf_C-1:N:CA:C',
       'bf_CB:O', 'bf_N:N+1', 'bf_CA-1:CA:CA+1', 'rid2_N:CA:C:N+1',
       'rid2_N:CA:C', 'rid2_N:O', 'rid2_CA:C:N+1', 'rid2_C-1:N:CA:C',
       'rid2_CB:O', 'rid2_N:N+1', 'rid2_CA-1:CA:CA+1', 'rid3_N:CA:C:N+1',
       'rid3_N:CA:C', 'rid3_N:O', 'rid3_CA:C:N+1', 'rid3_C-1:N:CA:C',
       'rid3_CB:O', 'rid3_N:N+1', 'rid3_CA-1:CA:CA+1', 'rid4_N:CA:C:N+1',
       'rid4_N:CA:C', 'rid4_N:O', 'rid4_CA:C:N+1', 'rid4_C-1:N:CA:C',
       'rid4_CB:O', 'rid4_N:N+1', 'rid4_CA-1:CA:CA+1'],
      d

ValueError: Value of 'color' is not the name of a column in 'data_frame'. Expected one of ['N:CA:C:N+1', 'N:CA:C', 'N:O', 'CA:C:N+1', 'C-1:N:CA:C', 'CB:O', 'N:N+1', 'CA-1:CA:CA+1', 'pdb_code', 'resolution', 'aa', 'chain', 'rid', 'info_N:CA:C:N+1', 'info_N:CA:C', 'info_N:O', 'info_CA:C:N+1', 'info_C-1:N:CA:C', 'info_CB:O', 'info_N:N+1', 'info_CA-1:CA:CA+1', 'occ_N:CA:C:N+1', 'occ_N:CA:C', 'occ_N:O', 'occ_CA:C:N+1', 'occ_C-1:N:CA:C', 'occ_CB:O', 'occ_N:N+1', 'occ_CA-1:CA:CA+1', 'bf_N:CA:C:N+1', 'bf_N:CA:C', 'bf_N:O', 'bf_CA:C:N+1', 'bf_C-1:N:CA:C', 'bf_CB:O', 'bf_N:N+1', 'bf_CA-1:CA:CA+1', 'rid2_N:CA:C:N+1', 'rid2_N:CA:C', 'rid2_N:O', 'rid2_CA:C:N+1', 'rid2_C-1:N:CA:C', 'rid2_CB:O', 'rid2_N:N+1', 'rid2_CA-1:CA:CA+1', 'rid3_N:CA:C:N+1', 'rid3_N:CA:C', 'rid3_N:O', 'rid3_CA:C:N+1', 'rid3_C-1:N:CA:C', 'rid3_CB:O', 'rid3_N:N+1', 'rid3_CA-1:CA:CA+1', 'rid4_N:CA:C:N+1', 'rid4_N:CA:C', 'rid4_N:O', 'rid4_CA:C:N+1', 'rid4_C-1:N:CA:C', 'rid4_CB:O', 'rid4_N:N+1', 'rid4_CA-1:CA:CA+1'] but received: motif_CA-1:CA:CA+1