## PRISMEM15 Matrix

#### `Author: Simon Hackl`
#### `Project: The OMPeome of Treponema pallidum`
#### `Contact: simon.hackl@uni-tuebingen.de`
#### `Date: 15.02.2022`

This _Python_ notebook guides the reader through the steps of generating and evaluating the `PRISMEM15` matrix. The concept is based on the publication "Amino acid interaction preferences in proteins", Jha _et al._, 2010, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2866284/.

The `PRISMEM15` matrix is a scoring matrix to evaluate contacts of pairs of residues, coupled to secondary structure types, of membrane proteins based on their observed frequency in nature.

Like the `BLOSUM` type matrices, the `PRISMEM15` matrix basically yields log-odds ratios, see the formula below.

$$S(r^i_x,r^j_z) = \large \frac{1}{\lambda} ln( \frac{ \frac{ c(r^i_x,r^j_z) }{ C } }{ \frac{ n(r^i_x) }{ R } \cdot \frac{ n(r^j_z) }{ R } } ) $$

- $i,j \in \textsf{3-Letter-Amino-Acid-Alphabet}$.
- $x,z \in \{~\text{Helix},\text{Coil},\text{Sheet}~\}$.
- $R :=$ total number of amino-acids in a dataset with at least one contact.
- $C :=$ total number of contacts in a dataset.
- $n(r^i_x)$ and $n(r^j_z) :=$ number of observed amino acids of type $i, j$ in secondary structure $x, z$ in a dataset with at least one contact.
- $c(r^i_x,r^j_z) :=$ counted contacts of amino acids $r^i_x$ and $r^j_z$ in a dataset.
- $\lambda :=$ scaling factor fitted to the following equation:

$$ \sum_{ r^i_x, r^j_z } \frac{ n(r^i_x) }{ R } \cdot \frac{ n(r^j_z) }{ R } \cdot e^{\lambda \cdot S(r^i_x,r^j_z)} = 1$$

Thereby, a contact is defined as two residues whose side-chain centers of mass are at most $5$ Angstrom apart.

### 1. Imports and Definitions

In [None]:
from Bio.PDB import PDBList, PDBParser

import os
import itertools
import numpy
import warnings
import json
import math
import shutil

import matplotlib.pyplot as plt
plt.rcParams.update({
    "text.usetex": False,
    "font.family": "serif"
})

import seaborn as sns

# Make sure the output directory of this notebook exists.
os.makedirs( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix", exist_ok = True )

def countContacts( directory, counts ) :
    """
    Count the sidechain contacts of one PDB structure and add them to `counts`.

    The structure is parsed from `<directory>/<PDBId>.pdb`, where `<PDBId>` is the base name of
    `directory`; only the first model is used. Residues without hetero flag are always considered,
    hetero residues only if `mapModified` maps their name onto a key of `aa3to1`. Two residues are
    in contact if their sidechain centers of mass are at most 5 Angstrom apart.

    The sidechain center of mass of every residue is computed once up front instead of once per
    residue pair; the resulting counts are the same.

    Relies on the notebook-level names `atomMass`, `aa3to1` and `mapModified`.

    Parameters:
        directory: Path of the directory containing the structure file (without trailing separator).
        counts: Dictionary that is updated in place. It has to provide the keys "totalResidues",
            "totalContacts", "<RES>-<SecStruc>" and "<RES>-<SecStruc>/<RES>-<SecStruc>".

    Returns:
        None. "totalResidues" is increased once per considered residue, "<RES>-<SecStruc>" once per
        residue that has at least one contact, "totalContacts" and the respective pair key once per
        contact (the pair key lists the residue that comes first in the structure first).

    Raises:
        KeyError: If an atom element is missing in `atomMass` or a residue in contact has no entry in `counts`.
    """

    def sidechainCenterOfMass( atomList ) :
        """ Return the mass weighted mean position [ x, y, z ] of the passed atoms. """
        # Store atomList as list (generator object allows only one iteration).
        atomList = list( atomList )
        totalMass = sum( [ atomMass[ atom.element ] for atom in atomList ] )
        return [
            sum( [ atomMass[ atom.element ] * atom.get_coord( )[ axis ] for atom in atomList ] ) / totalMass
            for axis in range( 3 )
        ]

    def getSidechainAtoms( atomList ) :
        """ Return the atoms of a residue that are treated as its sidechain. """
        sidechainAtomList = [ ]
        isSidechain = False
        # Store atomList as list (generator object allows only one iteration).
        atomList = list( atomList )
        for atom in atomList :
            # All atoms listed from the CB atom on are treated as sidechain atoms.
            if atom.id == "CB" :
                isSidechain = True
            if isSidechain :
                # Exclude non-sidechain hydrogen atoms:
                if not atom.name in [ "H", "1H", "2H", "3H", "HA", "1HA", "2HA", "3HA" ] :
                    sidechainAtomList.append( atom )
        # Residues without CB atom (e.g. glycine): use the 2HA hydrogen, or the CA atom if no 2HA is present.
        if len( sidechainAtomList ) == 0 :
            sidechainAtomList = [ atom for atom in atomList if atom.name == "2HA" ]
            if len( sidechainAtomList ) == 0 :
                sidechainAtomList = [ atom for atom in atomList if atom.name == "CA" ]
        return sidechainAtomList

    def extractSecondaryStructures( PDBFile ) :
        """ Return a function ( chain, pos ) -> "Helix" | "Sheet" | "Coil" built from the HELIX and SHEET records. """
        secondaryStructures = { }
        with open( PDBFile, "r" ) as PDBIn :
            for line in PDBIn :
                if line.startswith( "SHEET" ) :
                    sheetStart = int( line[ 22 : 26 ].strip( ) )
                    sheetStop = int( line[ 33 : 37 ] )
                    chain = line[ 21 ]
                    for i in range( sheetStart, sheetStop + 1 ) :
                        secondaryStructures[ chain + str( i ) ] = "Sheet"
                elif line.startswith( "HELIX" ) :
                    helixStart = int( line[ 21 : 25 ].strip( ) )
                    helixStop = int( line[ 33 : 37 ].strip( ) )
                    chain = line[ 19 ].strip( )
                    for i in range( helixStart, helixStop + 1 ) :
                        secondaryStructures[ chain + str( i ) ] = "Helix"
        # Positions that are not covered by a HELIX or SHEET record are reported as coil.
        return lambda chain, pos : secondaryStructures.get( chain + pos, "Coil" )

    # Ignore warnings.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Initialize .pdb parser and parse structure:
        parser = PDBParser( )
        PDBId = os.path.basename( directory )
        PDBFile = directory + "/" + PDBId + ".pdb"
        structure = parser.get_structure( PDBId, PDBFile )
        secondaryStructure = extractSecondaryStructures( PDBFile )
        residues = [ ]
        # Only use the first model:
        model = structure.get_list()[ 0 ]
        for chain in model.get_list():
            for residue in chain.get_list( ) :
                # Keep residues without hetero flag and hetero residues that map to a standard amino acid.
                if residue.get_full_id( )[ 3 ][ 0 ] == ' ' or mapModified( residue.get_resname( ) ) in aa3to1 :
                    residue.secStruc = secondaryStructure( chain.id, str( residue.id[ 1 ] ) )
                    residue.urid = chain.id + str( residue.id[ 1 ] )
                    residues.append( residue )
                    counts[ "totalResidues" ] += 1
        # Compute the sidechain center of mass and the counts key of each residue only once.
        sidechainCOMs = [
            numpy.array( sidechainCenterOfMass( getSidechainAtoms( list( residue.get_atoms( ) ) ) ) )
            for residue in residues
        ]
        residueKeys = [ mapModified( residue.get_resname( ) ) + "-" + residue.secStruc for residue in residues ]
        # For each residue pair.
        residuesNmbr = len( residues )
        totalSteps = int( ( residuesNmbr**2 + residuesNmbr ) / 2 ) - residuesNmbr
        steps = 0
        # Residues that were already counted as "residue with at least one contact".
        visited = set( )
        for i in range( 0, residuesNmbr ) :
            print( PDBId + ", Progress: " + str( steps ) + "/" + str( totalSteps ), end = '\r' )
            for j in range( i + 1, residuesNmbr ) :
                steps += 1
                # Calculate the distance between the sidechains center of mass:
                diff = sidechainCOMs[ i ] - sidechainCOMs[ j ]
                distance = numpy.sqrt( numpy.sum( diff * diff ) )
                if distance <= 5 :
                    for k in ( i, j ) :
                        if not residues[ k ].urid in visited :
                            counts[ residueKeys[ k ] ] += 1
                            visited.add( residues[ k ].urid )
                    counts[ "totalContacts" ] += 1
                    counts[ residueKeys[ i ] + "/" + residueKeys[ j ] ] += 1
        print( PDBId + ", Progress: " + str( steps ) + "/" + str( totalSteps ), end = '\r' )
        
def computeScores( counts, scores = False, pseudoCount = False, l = False ) :
    """
    Compute the log-odds score of every residue/secondary structure pair listed in `scores`.

    Parameters:
        counts: Dictionary with the keys "totalResidues", "totalContacts", "<RES>-<SecStruc>" and
            "<RES>-<SecStruc>/<RES>-<SecStruc>".
        scores: Dictionary whose keys name the pairs to score; it is updated in place. If it is not
            given (or empty), all pairs of `residuesInSecondaryStructure` are scored.
        pseudoCount: If set, this value replaces a contact count of zero.
        l: Scaling factor lambda. If set, scores are multiplied with 1/l and rounded to whole numbers,
            otherwise the unscaled scores are rounded to five decimal places.

    Returns:
        The dictionary of scores.
    """
    if not scores :
        scores = { }
        for first, second in itertools.product( residuesInSecondaryStructure, residuesInSecondaryStructure ) :
            scores[ first + "/" + second ] = 0
    R = counts[ "totalResidues" ]
    C = counts[ "totalContacts" ]
    for entry in list( scores.keys( ) ) :
        parts = entry.split( "/" )
        partI = parts[ 0 ]
        partJ = parts[ 1 ]
        residueI = partI.split( "-" )[ 0 ]
        residueJ = partJ.split( "-" )[ 0 ]
        # Pairs of identical type are counted twice by the symmetric sum below.
        if partI == partJ :
            combFac = 2
        else :
            combFac = 1
        secondaryStructureI = partI.split( "-" )[ 1 ]
        secondaryStructureJ = partJ.split( "-" )[ 1 ]
        # Contacts are stored in order of occurrence, thus both orientations are summed up.
        c_rIX_rJZ = counts[ entry ] + counts[ partJ + "/" + partI ]
        rIX = counts[ residueI  + "-" + secondaryStructureI ]
        rJZ = counts[ residueJ  + "-" + secondaryStructureJ ]
        if c_rIX_rJZ == 0 and pseudoCount :
            c_rIX_rJZ = pseudoCount
        logOdds = numpy.log( ( c_rIX_rJZ / C ) / ( combFac * ( rIX / R ) * ( rJZ / R ) ) )
        if l :
            scores[ entry ] = round( (1/l) * logOdds, 0 )
        else :
            scores[ entry ] = round( logOdds, 5 )
    return scores

def computeScaling( counts, scores, l = 1, iterations = 10 ) :
    """
    Fit the scaling factor lambda to the normalization equation of the scoring matrix.

    The factor `l` is adjusted such that the sum of
        ( n(a) / R ) * ( n(b) / R ) * exp( l * S(a,b) )
    over all pairs a, b of `residuesInSecondaryStructure` approaches 1.

    Search procedure: as long as the sum is above 1, `l` is decreased by the current step width. Once
    the sum drops below 1, the last step is reverted and the step width is divided by 10. The step
    width is only reduced after the sum was observed above 1 at least once. Thus, a start value below
    the solution is first increased with constant step width until it has passed the solution;
    previously the step width was reduced right away in this case, so `l` could not move upwards by
    more than about 0.11 in total. For start values above the solution the search is unchanged.

    Parameters:
        counts: Dictionary with the keys "totalResidues" and "<RES>-<SecStruc>".
        scores: Dictionary of unscaled scores with keys "<RES>-<SecStruc>/<RES>-<SecStruc>".
        l: Start value of the scaling factor.
        iterations: Number of search steps.

    Returns:
        The fitted scaling factor.
    """
    R = counts[ 'totalResidues' ]
    fit = lambda l : sum( [
        ( ( counts[ a ] / R ) * ( counts[ b ] / R ) * math.exp( l * scores[ a + "/" + b ] ) )
        for a, b in itertools.product( residuesInSecondaryStructure, residuesInSecondaryStructure )
    ] )
    e = fit( l )
    step = 0.1
    # Whether the sum was already observed above 1, i.e. the solution lies below a visited value of l.
    passedSolution = False
    for i in range( 0, iterations ) :
        if e > 1 :
            passedSolution = True
            l -= step
            e = fit( l )
        elif e < 1 :
            l += step
            e = fit( l )
            if passedSolution :
                step = step / 10
        else :
            # The equation is met exactly; further iterations would not change l.
            break
    print( "Done:" )
    print( "> Remaining error: " + str( abs( 1 - e ) ) )
    print( "> Scaling factor: " + str( l ) )
    return l

def visualizeScores( scores ) :
    """
    Plot the scores as heatmap, rows and columns ordered as in `residuesInSecondaryStructure`.

    The color range is derived from the finite scores only, so that scores of -inf (pairs without any
    counted contact) do not distort it. `-numpy.inf` / `numpy.isfinite` are used for this purpose as
    the alias `numpy.NINF` is no longer available in NumPy 2. The number of ticks follows the number
    of labels instead of being fixed to 60.

    Parameters:
        scores: Dictionary of scores with keys "<RES>-<SecStruc>/<RES>-<SecStruc>".

    Returns:
        None; the figure is shown via matplotlib.
    """
    labels = residuesInSecondaryStructure
    scores2D = numpy.array( [ [ scores[ i + "/" + j ] for j in labels ] for i in labels ] )
    finiteScores = [ val for val in scores.values( ) if numpy.isfinite( val ) ]
    fig, ax = plt.subplots( figsize = ( 14, 15 ) )
    sns.heatmap( scores2D,
                 cmap = "jet",
                 cbar_kws = {
                     "aspect": 50,
                     "label": "PRISMEM15 Score",
                     "orientation": "horizontal"
                 },
                 center = 0.0,
                 # Without any finite score the limits are left to seaborn.
                 vmin = min( finiteScores ) if finiteScores else None,
                 vmax = max( finiteScores ) if finiteScores else None,
                 linewidths = 3,
                 ax = ax )
    # Place one tick at the center of each cell.
    tickPositions = numpy.arange( 0.5, len( labels ) + 0.5 )
    ax.set_xticks( tickPositions )
    ax.set_yticks( tickPositions )
    # Only the residue name is shown; the secondary structure follows from the block the label is located in.
    ax.set_xticklabels( [ label.split( "-" )[ 0 ] for label in labels ], size = 12 )
    ax.set_yticklabels( [ label.split( "-" )[ 0 ] for label in labels ], size = 12 )
    plt.setp( ax.get_xticklabels( ), rotation = 70, ha = "right", rotation_mode = "anchor")
    plt.setp( ax.get_yticklabels( ), rotation = 0, ha = "right")
    ax.set_title( "PRISMEM15 Matrix", size = 20 )
    # The last axes of the figure is the one of the color bar.
    ax.figure.axes[-1].xaxis.label.set_size( 18 )
    ax.figure.axes[-1].tick_params( labelsize = 14 )
    fig.tight_layout( )
    plt.show( )

### 2. Dataset Acquisition

The following steps describe the acquisition of an appropriate dataset to compute the `PRISMEM15` matrix.

#### 2.1 Selection of PDB Structures

A dataset of $5,224$ PDB structures of curated membrane proteins was selected from the `mpstruc` database (_https://blanco.biomol.uci.edu/mpstruc/_) and a list of the raw PDB ids was stored in the file `./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/mpstruc_PDB_IDs.txt`.

#### 2.2 Culling with PISCES

In order to reduce redundancy and only include high quality structures, the list of all candidate PDB structures was filtered using the `PISCES` webserver (_https://dunbrack.fccc.edu/pisces/_). PISCES requires a list of PDB Identifiers and the respective chain of the protein structure as input (for example given the PDB with the identifier 1XXX and three chains A, B and C the correct input for `PISCES` would be 1XXXA, 1XXXB and 1XXXC). The list of downloaded PDB structures/identifiers had to be pre-processed in order to match this format (code cell below).

In [None]:
# Initialize PDBList and PDBParser instances as well as directory for temporary data.
pdbList = PDBList( )
pdbParser = PDBParser( QUIET=True )
if not os.path.isdir( "./temp" ) :
    os.mkdir( "./temp" )
# For each entry in the MPSTRUC list the pdb entry is downloaded and scanned for its chains.
# The respective pdbID and chainID are appended and stored in a set (thus, only unique entries are contained).
uniqueIDs = set( )
with open( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/mpstruc_PDB_IDs.txt", "r" ) as mpstrucPdbIdsFile :
    # Iterate the file handle itself; the former loop advanced via an undefined name and stopped with a NameError.
    for line in mpstrucPdbIdsFile :
        pdbID = line.strip( )
        if not pdbID :
            # Skip blank lines.
            continue
        # The downloaded file is named after the lower-cased identifier (cf. the download of the culled structures).
        pdbFile = "./temp/pdb" + pdbID.lower( ) + ".ent"
        try :
            pdbList.retrieve_pdb_file( pdbID, pdir = "./temp", file_format = "pdb" )
            pdbStructure = pdbParser.get_structure( pdbID, pdbFile )
            for chain in pdbStructure.get_chains( ) :
                chainLength = len( list( chain.get_residues( ) ) )
                # Prefilter chains by length; chains of exactly 80 residues are skipped as well.
                if chainLength <= 80 :
                    print( pdbID + chain.id + " skipped: Length of " + str( chainLength ) + " below 80." )
                else :
                    uniqueIDs.add( pdbID + chain.id )
            os.remove( pdbFile )
        except OSError as e :
            # E.g. the structure file is missing; report the entry instead of dropping it silently.
            print( pdbID + " skipped: " + str( e ) )
shutil.rmtree( "./temp" )
# NOTE(review): "./obsolete" is presumably created by PDBList; a missing directory must not abort the cell.
shutil.rmtree( "./obsolete", ignore_errors = True )
# Next, the list of unique PDB+Chain identifiers is written to a text file (sorted, to obtain a reproducible file).
mpstrucPdbChainIds = "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/mpstruc_PDB+Chain_IDs.txt"
with open( mpstrucPdbChainIds, "w" ) as mpstrucPdbChainIdsFile :
    for entry in sorted( uniqueIDs ) :
        mpstrucPdbChainIdsFile.write( entry + "\n" )

The obtained list of PDB identifiers with their chains being appended was stored in the file `./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/mpstruc_PDB+Chain_IDs.txt`. Next, this list was culled using `PISCES` and the following parameters:

- Maximum pairwise percent sequence identity: 15 %
- Maximum resolution (X-ray and EM): 1.8 Å
- Maximum R-value (X-ray only): 0.3
- Minimum chain length: 80
- Include cryo-EM entries?	Yes
- Include NMR entries? Yes

The results of the query returned $112$ remaining protein chains (of $100$ proteins) which were stored at `./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/cullpdb_pc15.0_res0.0-1.8_len80-10000_R0.3_Xray+Nmr+EM_d2021_10_12_chains112.txt`

#### 2.3 Download Culled Structures
Each of the protein structures that remained after culling was downloaded and stored in the `./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/mpstrucCulledStructures` directory.

In [None]:
mpstrucCulled = "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/cullpdb_pc15.0_res0.0-1.8_len80-10000_R0.3_Xray+Nmr+EM_d2021_10_12_chains112.txt"
# Collect the unique PDB identifiers of the culled chains.
PDBIdSet = set( )
with open( mpstrucCulled, "r" ) as mpstrucCulledFile :
    mpstrucCulledFile.readline( ) # Skip header line.
    # Iterate the file handle itself; the former loop advanced via an undefined name and stopped with a NameError.
    for line in mpstrucCulledFile :
        # The first column holds the PDB identifier with the chain appended.
        entry = line.split( " " )[ 0 ].strip( )
        if entry :
            PDBIdSet.add( entry[ :-1 ] ) # Remove appended chain.
# Initialize PDBList and output directory.
pdbList = PDBList( )
if not os.path.isdir( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/mpstrucCulledStructures" ) :
    os.mkdir( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/mpstrucCulledStructures" )
for PDBId in sorted( PDBIdSet ) :
    structureDirectory = "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/mpstrucCulledStructures/" + PDBId
    structureFile = structureDirectory + "/" + PDBId + ".pdb"
    # Check for the final file instead of the directory, so that a run that stopped after the
    # directory was created is repeated for this structure.
    if not os.path.isfile( structureFile ) :
        pdbList.retrieve_pdb_file( PDBId, pdir = structureDirectory, file_format = "pdb" )
        # Rename the downloaded file (named after the lower-cased identifier) to <PDBId>.pdb.
        os.rename( structureDirectory + "/pdb" + PDBId.lower( ) + ".ent", structureFile )

### 3. Compute PRISMEM15 Scores
Next the `PRISMEM15` matrix scores were computed from this dataset by executing the code cells below.

In [None]:
"""
Definition of helper functions and constants.
"""
# Mapping of amino acid three-letter codes to one-letter codes.
# The order of the entries is kept as is, since lists derived from the keys follow it.
aa3to1 = {
    'CYS': 'C',
    'ASP': 'D',
    'SER': 'S',
    'GLN': 'Q',
    'LYS': 'K',
    'ILE': 'I',
    'PRO': 'P',
    'THR': 'T',
    'PHE': 'F',
    'ASN': 'N',
    'GLY': 'G',
    'HIS': 'H',
    'LEU': 'L',
    'ARG': 'R',
    'TRP': 'W',
    'ALA': 'A',
    'VAL': 'V',
    'GLU': 'E',
    'TYR': 'Y',
    'MET': 'M'
}
# Function to map modified residues.
def mapModified( resn ) :
    """
    Return the standard residue name for the modified residues KCX, SEC and MSE.

    Any other residue name is returned unchanged.
    """
    for modified, standard in ( ( "KCX", "LYS" ), ( "SEC", "CYS" ), ( "MSE", "MET" ) ) :
        if resn == modified :
            return standard
    return resn
# Atomic masses per element symbol.
atomMass = {
    'C' : 12.0011,
    'N' : 14.0067,
    'O' : 15.9994,
    'S' : 32.06,
    'H' : 1.00797,
    'SE' : 78.96
}
# All residue types coupled to a secondary structure type, e.g. "CYS-Helix".
# The list is grouped by secondary structure; within a group the order of aa3to1 is used.
residuesInSecondaryStructure = [
    residue + "-" + secondaryStructure
    for secondaryStructure in [ "Helix", "Coil", "Sheet" ]
    for residue in list( aa3to1.keys( ) )
]
# Counts storage: totals, one entry per coupled residue type and one entry per ordered pair of these.
counts = { "totalResidues" : 0, "totalContacts" : 0 }
counts.update( { single : 0 for single in residuesInSecondaryStructure } )
counts.update( {
    first + "/" + second : 0
    for first, second in itertools.product( residuesInSecondaryStructure, residuesInSecondaryStructure )
} )

In [None]:
"""
Start the counting process using the prepared dataset.

NOTE: This process can take a long time and may be skipped if the respective counts storage file is present.
"""
# NOTE: countContacts adds to `counts` in place and `counts` is not reset here. Re-run the cell that
# initializes `counts` before running this cell again, otherwise all contacts are counted twice.
# Start the counting process.
structureCounter = 0
# Sorted, so that the structures are processed in a reproducible order.
for subdir in sorted( os.listdir( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/mpstrucCulledStructures/" ) ) :
    # Only directories hold structures; stray files in the dataset directory are skipped.
    if not os.path.isdir( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/mpstrucCulledStructures/" + subdir ) :
        continue
    structureCounter += 1
    print( "Processing structure " + str( structureCounter ) )
    countContacts( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/mpstrucCulledStructures/" + subdir, counts )
    print( )
# Store the counts in a .json file. The output directory exists at this point, as it contains the dataset read above.
with open( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/counts_pc15.0_res0.0-1.8_len80-10000_R0.3_Xray+Nmr+EM_d2021_10_12_chains112.json", "w+" ) as countsFile :
    json.dump( counts, countsFile )

In [None]:
"""
If the counting process was already run, this cell can be run instead of the one above.
"""
# Read the stored counts.
with open( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/counts_pc15.0_res0.0-1.8_len80-10000_R0.3_Xray+Nmr+EM_d2021_10_12_chains112.json", "r" ) as countsFile :
    counts = json.load( countsFile )
"""
Add pseudo count of 0.1 to each pair of structure coupled aminoacids.
"""
# NOTE: The pseudo counts are only added here and are not part of the stored file. If this cell is
# skipped, pairs without any counted contact keep a count of zero.
for first, second in itertools.product( residuesInSecondaryStructure, repeat = 2 ) :
    counts[ first + "/" + second ] += 0.1

In [None]:
"""
Compute the scores from the conducted countig and adjust the scaling parameter with 150 iterations of fitting
lambda to the respective equation (cf. 1.).
"""

# First pass: unscaled scores.
scores = computeScores( counts )
# Fit lambda with the unscaled scores, starting at 1.
l = computeScaling( counts, scores, 1, 150 )
# Second pass: scores scaled with the fitted lambda.
scores = computeScores( counts, scores, l = l )

In [None]:
# Write the computed scores to a .json file; the output directory is created if it is missing.
os.makedirs( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix", exist_ok = True )
with open( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/scores_pc15.0_res0.0-1.8_len80-10000_R0.3_Xray+Nmr+EM_d2021_10_12_chains112.json", "w+" ) as scoresFile :
    json.dump( scores, scoresFile )
# Show the matrix as heatmap.
visualizeScores( scores )

In [None]:
"""
If the score computation was already run, this cell can be run instead of the one above.
"""
# Read the stored scores and show them as heatmap.
with open( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/scores_pc15.0_res0.0-1.8_len80-10000_R0.3_Xray+Nmr+EM_d2021_10_12_chains112.json", "r" ) as scoresFile :
    scores = json.loads( scoresFile.read( ) )
visualizeScores( scores )

---

## PRISMEM15 Matrix Evaluation

#### `Author: Simon Hackl`
#### `Project: The OMPeome of Treponema pallidum`
#### `Contact: simon.hackl@uni-tuebingen.de`
#### `Date: 15.02.2022`

The code cells below guide the reader through the process of evaluating the `PRISMEM15` matrix.

### 1. Evaluation Procedure
In order to benchmark the computed scoring matrix, a dataset of known mutations of human membrane proteins that either lead to a disease or have a neutral effect was considered. The information about these mutations was extracted from the `MutHTP` (https://www.iitm.ac.in/bioinfo/MutHTP/) database. It was queried for missense mutations that lead either to a disease or to a neutral effect. The respective results of the queries were stored at `./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Disease.csv` and `./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Neutral.csv`.

The idea is to score the respective protein interactions of the mutated residues and compare the distribution of these scores using a _Kolmogorov-Smirnov_ test for goodness of fit (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kstest.html).

### 2. Imports and Definitions

In [None]:
from Bio.PDB import PDBList, PDBParser
from matplotlib.lines import Line2D

import os
import itertools
import numpy
import warnings
import json
import math
import shutil

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy as sc

# Mapping of amino acid three-letter codes to one-letter codes.
aa3to1 = {
    'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q',
    'LYS': 'K', 'ILE': 'I', 'PRO': 'P', 'THR': 'T',
    'PHE': 'F', 'ASN': 'N', 'GLY': 'G', 'HIS': 'H',
    'LEU': 'L', 'ARG': 'R', 'TRP': 'W', 'ALA': 'A',
    'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'
}
# Reverse mapping: one-letter codes to three-letter codes.
aa1to3 = { oneLetter : threeLetter for threeLetter, oneLetter in aa3to1.items( ) }
# Read the missense mutation tables (tab separated) as pandas dataframes.
MutHTP_Neutral_Df = pd.read_csv(
    "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Neutral.csv",
    sep = "\t",
    quotechar = '"'
)
MutHTP_Disease_Df = pd.read_csv(
    "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Disease.csv",
    sep = "\t",
    quotechar = '"'
)
# Read the stored PRISMEM15 scores.
with open( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/scores_pc15.0_res0.0-1.8_len80-10000_R0.3_Xray+Nmr+EM_d2021_10_12_chains112.json", "r" ) as scoresFile :
    scores = json.load( scoresFile )

def scoreMutation( PDBId, mutation, scores ) :
    """
    Score the sidechain contacts of a mutated position with its wild type and its mutated residue type.

    The structure is downloaded into a temporary directory and only its first model is evaluated,
    as in the counting of the contacts. All residues whose sidechain center of mass is at most
    5 Angstrom apart from the one of the residue at the mutated position are reported.

    Relies on the notebook-level names `atomMass`, `aa3to1`, `aa1to3` and `mapModified`.

    Parameters:
        PDBId: PDB identifier of the structure.
        mutation: mutation parameter format [WildTypeResidue]>[POS]:[CHAIN]>[MutatedResidue],
            residues given as one-letter codes.
        scores: Dictionary of PRISMEM15 scores with keys "<RES>-<SecStruc>/<RES>-<SecStruc>".

    Returns:
        results format:
        [
            [
                <WT residue type>/<secondary structure>,
                <mutated residue position>,
                <mutated residue chain>,
                <MUT residue type>/<secondary structure>
            ],
            [
                (
                    <contacting residue position>,
                    <contacting residue type>/<contacting residue secondary structure>,
                    <distance in Angstrom>,
                    <PRISMEM15 score with WT residue>,
                    <PRISMEM15 score with MUT residue>
                ),
                ...
            ]
        ]

    Raises:
        ValueError: If the structure contains no residue at the given position and chain.
        KeyError: If a residue code, an atom element or a residue pair is unknown.
    """

    def sidechainCenterOfMass( atomList ) :
        """ Return the mass weighted mean position [ x, y, z ] of the passed atoms. """
        # Store atomList as list (generator object allows only one iteration).
        atomList = list( atomList )
        totalMass = sum( [ atomMass[ atom.element ] for atom in atomList ] )
        return [
            sum( [ atomMass[ atom.element ] * atom.get_coord( )[ axis ] for atom in atomList ] ) / totalMass
            for axis in range( 3 )
        ]

    def getSidechainAtoms( atomList ) :
        """ Return the atoms of a residue that are treated as its sidechain. """
        sidechainAtomList = [ ]
        isSidechain = False
        # Store atomList as list (generator object allows only one iteration).
        atomList = list( atomList )
        for atom in atomList :
            # All atoms listed from the CB atom on are treated as sidechain atoms.
            if atom.id == "CB" :
                isSidechain = True
            if isSidechain :
                # Exclude non-sidechain hydrogen atoms:
                if not atom.name in [ "H", "1H", "2H", "3H", "HA", "1HA", "2HA", "3HA" ] :
                    sidechainAtomList.append( atom )
        # Residues without CB atom (e.g. glycine): use the 2HA hydrogen, or the CA atom if no 2HA is present.
        if len( sidechainAtomList ) == 0 :
            sidechainAtomList = [ atom for atom in atomList if atom.name == "2HA" ]
            if len( sidechainAtomList ) == 0 :
                sidechainAtomList = [ atom for atom in atomList if atom.name == "CA" ]
        return sidechainAtomList

    def extractSecondaryStructures( PDBFile ) :
        """ Return a function ( chain, pos ) -> "Helix" | "Sheet" | "Coil" built from the HELIX and SHEET records. """
        secondaryStructures = { }
        with open( PDBFile, "r" ) as PDBIn :
            for line in PDBIn :
                if line.startswith( "SHEET" ) :
                    sheetStart = int( line[ 22 : 26 ].strip( ) )
                    sheetStop = int( line[ 33 : 37 ] )
                    chain = line[ 21 ]
                    for i in range( sheetStart, sheetStop + 1 ) :
                        secondaryStructures[ chain + str( i ) ] = "Sheet"
                elif line.startswith( "HELIX" ) :
                    helixStart = int( line[ 21 : 25 ].strip( ) )
                    helixStop = int( line[ 33 : 37 ].strip( ) )
                    chain = line[ 19 ].strip( )
                    for i in range( helixStart, helixStop + 1 ) :
                        secondaryStructures[ chain + str( i ) ] = "Helix"
        # Positions that are not covered by a HELIX or SHEET record are reported as coil.
        return lambda chain, pos : secondaryStructures.get( chain + pos, "Coil" )

    # Parse the mutation description first, so that a malformed description fails before any download.
    mutationParts = mutation.split( ">" )
    mutationPos = mutationParts[ 1 ].split( ":" )[ 0 ]
    mutationChain = mutationParts[ 1 ].split( ":" )[ 1 ]
    mutationRes = aa1to3[ mutationParts[ 2 ] ]
    wildTypeRes = aa1to3[ mutationParts[ 0 ] ]
    # Initialize variable to store results.
    results = [ [ ], [ ] ]
    try :
        # Generate temporary working directory.
        if not os.path.isdir( "./temp" ) :
            os.mkdir( "./temp" )
        # Initialize PDBList and PDBParser instances.
        pdbList = PDBList( verbose=False )
        pdbParser = PDBParser( QUIET=True )
        # Download corresponding .pdb file and parse structure.
        # The downloaded file is named after the lower-cased identifier.
        pdbFile = "./temp/pdb" + PDBId.lower( ) + ".ent"
        pdbList.retrieve_pdb_file( PDBId, pdir = "./temp", file_format = "pdb" )
        pdbStructure = pdbParser.get_structure( PDBId, pdbFile )
        secondaryStructure = extractSecondaryStructures( pdbFile )
        residues = [ ]
        mutatedResidue = None
        # Only use the first model; iterating all models would report each contact once per model.
        model = pdbStructure.get_list( )[ 0 ]
        for chain in model.get_list( ) :
            for residue in chain.get_list( ) :
                if chain.id == mutationChain and str( residue.id[ 1 ] ) == mutationPos :
                    residue.secStruc = secondaryStructure( chain.id, str( residue.id[ 1 ] ) )
                    mutatedResidue = residue
                # Keep residues without hetero flag and hetero residues that map to a standard amino acid.
                elif residue.get_full_id( )[ 3 ][ 0 ] == ' ' or mapModified( residue.get_resname( ) ) in aa3to1 :
                    residue.secStruc = secondaryStructure( chain.id, str( residue.id[ 1 ] ) )
                    residues.append( residue )
        if mutatedResidue is None :
            raise ValueError( "No residue at position " + mutationPos + " of chain " + mutationChain + " in structure " + PDBId )
        results[ 0 ] = [ wildTypeRes + "/" + mutatedResidue.secStruc, mutationPos, mutationChain, mutationRes + "/" + mutatedResidue.secStruc ]
        mutatedResidueSidechainAtoms = getSidechainAtoms( list( mutatedResidue.get_atoms( ) ) )
        mutatedResidueSidechainCOM = numpy.array( sidechainCenterOfMass( mutatedResidueSidechainAtoms ) )
        # For each other residue:
        for residue in residues :
            sidechainAtoms = getSidechainAtoms( list( residue.get_atoms( ) ) )
            sidechainCOM = sidechainCenterOfMass( sidechainAtoms )
            # Calculate the distance between the sidechains center of mass:
            diff = mutatedResidueSidechainCOM - numpy.array( sidechainCOM )
            distance = numpy.sqrt( numpy.sum( diff * diff ) )
            if distance <= 5 :
                contactKey = mapModified( residue.get_resname( ) ) + "-" + residue.secStruc
                mutationScore = scores[ mutationRes + "-" + mutatedResidue.secStruc + "/" + contactKey ]
                wildTypeScore = scores[ wildTypeRes + "-" + mutatedResidue.secStruc + "/" + contactKey ]
                results[ 1 ].append(
                    ( residue.id[ 1 ], mapModified( residue.get_resname( ) ) + "/" +  residue.secStruc, round( distance, 3 ), wildTypeScore, mutationScore )
                )
    finally :
        # Remove temporary directories, also if the scoring failed.
        shutil.rmtree( "./temp", ignore_errors = True )
        shutil.rmtree( "./obsolete", ignore_errors = True )
    return results
    
def extractMutationInfo( dfRow ) :
    """
    Derive the PDB identifier and a mutation descriptor from one data frame row.

    Reads the attributes `pdbid` and `protein_mut` of `dfRow`. Only the part of
    `pdbid` in front of the first '#' is used; it is split at '_' into the PDB
    identifier and a comma separated site list. The first site is split at ':'
    into a residue/position field (its first character is dropped to obtain the
    position) and the chain identifier. The wild-type and mutated residue are
    the first and last character of `protein_mut`.

    Returns the tuple ( pdbId, "<wild-type>><position>:<chain>><mutated>" ).
    """
    entryFields = dfRow.pdbid.split( "#" )[ 0 ].split( "_" )
    pdbId = entryFields[ 0 ]
    proteinMutation = dfRow.protein_mut
    wildTypeResidue = proteinMutation[ 0 ]
    mutationResidue = proteinMutation[ -1 ]
    # Only the first listed site of the entry is considered.
    siteFields = entryFields[ 1 ].split( "," )[ 0 ].split( ":" )
    chain = siteFields[ 1 ]
    position = siteFields[ 0 ][ 1 : ]
    return pdbId, ">".join( ( wildTypeResidue, position + ":" + chain, mutationResidue ) )

### 3. Run Evaluation
The evaluation was run by selecting $1{,}000$ random mutations from each of the two sets (disease-inducing and neutral). The resulting lists of scores were stored in the following files:
`./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Disease_PRISMEM15Scores_WT.txt`
`./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Disease_PRISMEM15Scores_MUT.txt`
`./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Neutral_PRISMEM15Scores_WT.txt`
`./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Neutral_PRISMEM15Scores_MUT.txt`

In [None]:
"""
Compute and extract scores for disease mutations.
"""
resultsDisease = [ ]
selectedDisease = set( )
selectedDiseaseCount = 0
# Visit the rows once in random order instead of re-sampling with replacement:
# the selection is equally random, but the loop is guaranteed to terminate even
# if the table holds fewer than 1000 distinct, scorable mutations.
for _, selectDiseaseRow in MutHTP_Disease_Df.sample( frac = 1 ).iterrows( ) :
    if selectedDiseaseCount >= 1000 :
        break
    # Label used in the skip message; replaced once the row has been parsed, so
    # the message never refers to an unbound name or to a previous row.
    diseaseLabel = "<unparsed row>"
    try :
        diseasePDBId, diseaseMutation = extractMutationInfo( selectDiseaseRow )
        diseaseLabel = diseasePDBId + "|" + diseaseMutation
        # Each PDB-id/mutation combination is processed at most once.
        if diseaseLabel in selectedDisease :
            continue
        # Registered before scoring, so a failing mutation is not tried again.
        selectedDisease.add( diseaseLabel )
        print( "Processing disease missense mutation " + diseaseLabel + " (" + str( selectedDiseaseCount + 1 ) + "/1000)" )
        resultsDisease.append( scoreMutation( diseasePDBId, diseaseMutation, scores ) )
        selectedDiseaseCount += 1
    except Exception as e :
        # Best effort: report the reason and carry on with the next row.
        print( "Error during computation on " + diseaseLabel + " [SKIP]: " + repr( e ) )
if selectedDiseaseCount < 1000 :
    print( "Only " + str( selectedDiseaseCount ) + " disease missense mutations could be scored." )
# The second entry of each result lists the contacts of the mutated position;
# index 3 of a contact is its wild-type score, index 4 its score after mutation.
scoresDiseaseWT = [ interaction[ 3 ] for resultDisease in resultsDisease for interaction in resultDisease[ 1 ] ]
scoresDiseaseMUT = [ interaction[ 4 ] for resultDisease in resultsDisease for interaction in resultDisease[ 1 ] ]
# Store one score per line.
with open( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Disease_PRISMEM15Scores_WT.txt", "w+" ) as scoresDiseaseWTFile :
    scoresDiseaseWTFile.writelines( [ str( score ) + "\n" for score in scoresDiseaseWT ] )
with open( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Disease_PRISMEM15Scores_MUT.txt", "w+" ) as scoresDiseaseMUTFile :
    scoresDiseaseMUTFile.writelines( [ str( score ) + "\n" for score in scoresDiseaseMUT ] )

In [None]:
"""
Compute and extract scores for neutral mutations.
"""
print( )
resultsNeutral = [ ]
selectedNeutral = set( )
selectedNeutralCount = 0
# Visit the rows once in random order instead of re-sampling with replacement:
# the selection is equally random, but the loop is guaranteed to terminate even
# if the table holds fewer than 1000 distinct, scorable mutations.
for _, selectNeutralRow in MutHTP_Neutral_Df.sample( frac = 1 ).iterrows( ) :
    if selectedNeutralCount >= 1000 :
        break
    # Label used in the skip message; replaced once the row has been parsed, so
    # the message never refers to an unbound name or to a previous row.
    neutralLabel = "<unparsed row>"
    try :
        # Score neutral mutation.
        neutralPDBId, neutralMutation = extractMutationInfo( selectNeutralRow )
        neutralLabel = neutralPDBId + "|" + neutralMutation
        # Each PDB-id/mutation combination is processed at most once.
        if neutralLabel in selectedNeutral :
            continue
        # Registered before scoring, so a failing mutation is not tried again.
        selectedNeutral.add( neutralLabel )
        print( "Processing neutral missense mutation " + neutralLabel + " (" + str( selectedNeutralCount + 1 ) + "/1000)" )
        resultsNeutral.append( scoreMutation( neutralPDBId, neutralMutation, scores ) )
        selectedNeutralCount += 1
    except Exception as e :
        # Best effort: report the reason and carry on with the next row.
        print( "Error during computation on " + neutralLabel + " [SKIP]: " + repr( e ) )
if selectedNeutralCount < 1000 :
    print( "Only " + str( selectedNeutralCount ) + " neutral missense mutations could be scored." )
# The second entry of each result lists the contacts of the mutated position;
# index 3 of a contact is its wild-type score, index 4 its score after mutation.
scoresNeutralWT = [ interaction[ 3 ] for resultNeutral in resultsNeutral for interaction in resultNeutral[ 1 ] ]
scoresNeutralMUT = [ interaction[ 4 ] for resultNeutral in resultsNeutral for interaction in resultNeutral[ 1 ] ]
# Store one score per line.
with open( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Neutral_PRISMEM15Scores_WT.txt", "w+" ) as scoresNeutralWTFile :
    scoresNeutralWTFile.writelines( [ str( score ) + "\n" for score in scoresNeutralWT ] )
with open( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Neutral_PRISMEM15Scores_MUT.txt", "w+" ) as scoresNeutralMUTFile :
    scoresNeutralMUTFile.writelines( [ str( score ) + "\n" for score in scoresNeutralMUT ] )

### 4. Visualize and Test Score Distributions

In [None]:
# Local imports keep this cell runnable on its own; the notebook's first import
# cell binds numpy only under its full name and does not import Line2D.
import numpy as np
from matplotlib.lines import Line2D

def _readScores( path ) :
    """
    Read the scores stored one per line in the file at `path`.

    Blank lines are skipped. Returns the scores as a list of floats.
    """
    with open( path, "r" ) as scoresFile :
        return [ float( line ) for line in scoresFile if line.strip( ) ]

def _drawScorePanel( axis, values, title, color, binEdges ) :
    """
    Draw the relative frequency histogram of `values` into `axis`.

    The bar heights are the counts per bin divided by the total number of
    values, i.e. values outside of `binEdges` are not drawn but still count
    towards n. Mean and median are marked by vertical lines and reported in
    the legend of the panel.
    """
    counts, bins = np.histogram( values, bins = binEdges )
    axis.hist( bins[ :-1 ], bins = binEdges, weights = counts / len( values ), rwidth = 0.9, color = color )
    axis.set_title( title + " ($n=$" + str( len( values ) ) + ")", size = 18 )
    axis.tick_params( labelsize = 14 )
    mean = np.mean( values )
    median = np.median( values )
    # The lines extend beyond the y-limits set below and thus span the panel.
    axis.plot( [ mean, mean ], [ -0.5, 0.5 ], color = 'silver', lw = 1 )
    axis.plot( [ median, median ], [ -0.5, 0.5 ], color = 'dimgrey', linestyle = 'dashed', lw = 1 )
    legendHandles = [ Line2D( [ 0 ], [ 0 ], color = 'silver', lw = 1 ),
                      Line2D( [ 0 ], [ 0 ], color = 'dimgrey', linestyle = 'dashed', lw = 1 ) ]
    axis.legend( legendHandles, [ 'Mean = ' + str( round( mean, 2 ) ), 'Median = ' + str( round( median, 2 ) ) ], ncol = 1, fontsize = 16, loc = 'upper left' )
    axis.set_ylim( 0.0, 0.3 )

# Parse scores from files.
scoresDiseaseWT = _readScores( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Disease_PRISMEM15Scores_WT.txt" )
scoresDiseaseMUT = _readScores( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Disease_PRISMEM15Scores_MUT.txt" )
scoresNeutralWT = _readScores( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Neutral_PRISMEM15Scores_WT.txt" )
scoresNeutralMUT = _readScores( "./R2_TPOMPeome_Hackl2022_PRISMEM15Matrix/MutHTP_Missense_Neutral_PRISMEM15Scores_MUT.txt" )

# Visualize the value distributions as a 2 x 2 grid of histograms
# (rows: disease/neutral, columns: wild-type/mutated).
fig, ax = plt.subplots( 2, 2, figsize = ( 14, 8 ), sharey = True, sharex = True )
binEdges = np.arange( -6, 9, 1 )
panels = [
    ( ax[ 0 ][ 0 ], scoresDiseaseWT, "Disease Inducing, Wild-Type", "lightcoral" ),
    ( ax[ 0 ][ 1 ], scoresDiseaseMUT, "Disease Inducing, Mutated", "indianred" ),
    ( ax[ 1 ][ 0 ], scoresNeutralWT, "Neutral Effect, Wild-Type", "mediumseagreen" ),
    ( ax[ 1 ][ 1 ], scoresNeutralMUT, "Neutral Effect, Mutated", "seagreen" ),
]
for axis, values, title, color in panels :
    _drawScorePanel( axis, values, title, color, binEdges )

# The axes are shared: label the y-axis on the left column and the x-axis on the bottom row only.
for leftAxis in ( ax[ 0 ][ 0 ], ax[ 1 ][ 0 ] ) :
    leftAxis.set_ylabel( "$\\frac{Count}{n}$", size = 18 )
for bottomAxis in ax[ 1 ] :
    bottomAxis.set_xlabel( "PRISMEM15 Score", size = 16 )
    bottomAxis.set_xticks( binEdges )

plt.suptitle( "Histograms of PRISMEM15 Scores of Interactions with\nResidues of Different Induced Effects and Types", size = 20, y = 1.01 )

plt.tight_layout( )
plt.show( )

In [None]:
# Local imports keep this cell runnable on its own; the notebook's first import
# cell does not bind `sc`, and importing scipy.stats explicitly makes sure the
# submodule is loaded.
import scipy as sc
import scipy.stats

def _prettyPValue( pValue ) :
    """
    Format `pValue` as a LaTeX math string "$<mantissa> \\cdot 10^{ <exponent> }$".

    The mantissa has one decimal place; the exponent is written without a plus
    sign and without leading zeros. Values without an exponent part (nan, inf)
    are returned as "$nan$" or "$inf$", respectively.
    """
    text = "%.1e" % pValue
    if "e" not in text :
        return '$' + text + '$'
    mantissa, exponent = text.split( "e" )
    return '$' + mantissa + ' \\cdot 10^{ ' + str( int( exponent ) ) + ' }$'

# Compute KS test Disease WT vs Disease MUT
_, pValue_disWT_disMUT = sc.stats.kstest( scoresDiseaseWT, scoresDiseaseMUT )
pValue_disWT_disMUT_pretty = _prettyPValue( pValue_disWT_disMUT )
print( "D.WT vs. D.MUT: " + pValue_disWT_disMUT_pretty )

# Compute KS test Neutral WT vs Neutral MUT
_, pValue_neuWT_neuMUT = sc.stats.kstest( scoresNeutralWT, scoresNeutralMUT )
pValue_neuWT_neuMUT_pretty = _prettyPValue( pValue_neuWT_neuMUT )
print( "N.WT vs. N.MUT: " + pValue_neuWT_neuMUT_pretty )

# Compute KS test Neutral WT vs Disease WT
_, pValue_neuWT_disWT = sc.stats.kstest( scoresNeutralWT, scoresDiseaseWT )
pValue_neuWT_disWT_pretty = _prettyPValue( pValue_neuWT_disWT )
print( "N.WT vs. D.WT: " + pValue_neuWT_disWT_pretty )

# Compute KS test Neutral MUT vs Disease MUT
_, pValue_neuMUT_disMUT = sc.stats.kstest( scoresNeutralMUT, scoresDiseaseMUT )
pValue_neuMUT_disMUT_pretty = _prettyPValue( pValue_neuMUT_disMUT )
print( "N.MUT vs. D.MUT: " + pValue_neuMUT_disMUT_pretty )