# Imports

In [1]:
import Bio.PDB
import pandas as pd
import numpy as np
import os
import re
import requests as r
from io import StringIO
from Bio import SeqIO

# Ramachandran Analysis - Helper/Prelim Section

In [None]:
# Set correct pathing
curr_dir_path_str = "./"
curr_dir_path = os.path.abspath(curr_dir_path_str)

global_data_path_str = "../global_data"
global_data_path = os.path.abspath(global_data_path_str)

print("Current Directory: " + curr_dir_path)
print("Global Data Directory: " + global_data_path)

In [2]:
# Helper function to extract phi/psi angles from a .cif file 
def get_phi_and_psi(Protein_ID, CIF_file_path, verbose=False):
    """Extracts the psi and phi angle for each residue in protein's 
    alphafold structure.

    Args:
      Protein_ID: uniprot id of protein.
      CIF_file_path: file path to proteins CIF file.

    Returns:
      Dataframe that stores the dihedral angles of each residue in the 
      specified protein.

    """

    df = pd.DataFrame()
    for model in Bio.PDB.MMCIFParser().get_structure(Protein_ID, CIF_file_path):
        for chain in model:
            polypeptides = Bio.PDB.PPBuilder().build_peptides(chain)
            res_index_list = []
            res_name_list = []
            phi_list = []
            psi_list = []
            for poly_index, poly in enumerate(polypeptides):
                if verbose:
                  print("Model %s Chain %s" % (str(model.id), str(chain.id)))
                  print ("(part %i of %i)" % (poly_index+1, len(polypeptides)))
                  print ("length %i" % (len(poly)))
                  print ("from %s%i" % (poly[0].resname, poly[0].id[1]))
                  print ("to %s%i" % (poly[-1].resname, poly[-1].id[1]))
                phi_psi = poly.get_phi_psi_list()
                res_index_sublist = []
                res_name_sublist = []
                phi_sublist = []
                psi_sublist = []
                for res_index, residue in enumerate(poly) :
                    res_name = "%s%i" % (residue.resname, residue.id[1])
                    # print(res_name, phi_psi[res_index])
                    res_index_sublist.append(residue.id[1])
                    res_name_sublist.append(residue.resname)
                    phi_sublist.append(phi_psi[res_index][0])
                    psi_sublist.append(phi_psi[res_index][1])
                res_index_list.extend(res_index_sublist)
                res_name_list.extend(res_name_sublist)
                phi_list.extend(phi_sublist)
                psi_list.extend(psi_sublist)
            df['Protein ID'] = [Protein_ID] * len(res_index_list)
            df['Residue Name'] = res_name_list
            df['Residue Position'] = res_index_list
            df['PHI'] = phi_list
            df['PSI'] = psi_list
    return df
    


In [3]:
directory = '../alphafold_data/cif'
file_paths = []

for root, directories, files in os.walk(directory):
    for file in files:
        file_path = os.path.join(root, file)
        file_paths.append(file_path)

print(file_paths)

['../alphafold_data/cif/Q9D404.cif', '../alphafold_data/cif/P62829.cif', '../alphafold_data/cif/Q96PK6.cif', '../alphafold_data/cif/Q9Z0X1.cif', '../alphafold_data/cif/O60814.cif', '../alphafold_data/cif/Q99MB2.cif', '../alphafold_data/cif/P07900.cif', '../alphafold_data/cif/Q8C6I2.cif', '../alphafold_data/cif/Q8QZT1.cif', '../alphafold_data/cif/Q9CQ92.cif', '../alphafold_data/cif/P47897.cif', '../alphafold_data/cif/Q9CQN1.cif', '../alphafold_data/cif/P38919.cif', '../alphafold_data/cif/Q920A5.cif', '../alphafold_data/cif/Q9NXV6.cif', '../alphafold_data/cif/Q8C1W2.cif', '../alphafold_data/cif/P27144.cif', '../alphafold_data/cif/P42125.cif', '../alphafold_data/cif/P50247.cif', '../alphafold_data/cif/P12074.cif', '../alphafold_data/cif/O75821.cif', '../alphafold_data/cif/Q5HZI9.cif', '../alphafold_data/cif/P24539.cif', '../alphafold_data/cif/Q9CPQ3.cif', '../alphafold_data/cif/Q9UN86.cif', '../alphafold_data/cif/Q8IUD2.cif', '../alphafold_data/cif/P56391.cif', '../alphafold_data/cif/Q923

In [4]:
pattern = r'/([^/]+)\.cif$'
protein_ids =[]

for file_path in file_paths:
    match = re.search(pattern, file_path)
    desired_substring = match.group(1)
    protein_ids.append(desired_substring)

print(protein_ids)

['Q9D404', 'P62829', 'Q96PK6', 'Q9Z0X1', 'O60814', 'Q99MB2', 'P07900', 'Q8C6I2', 'Q8QZT1', 'Q9CQ92', 'P47897', 'Q9CQN1', 'P38919', 'Q920A5', 'Q9NXV6', 'Q8C1W2', 'P27144', 'P42125', 'P50247', 'P12074', 'O75821', 'Q5HZI9', 'P24539', 'Q9CPQ3', 'Q9UN86', 'Q8IUD2', 'P56391', 'Q923K4', 'Q9CRD0', 'O14950', 'P35637', 'Q9Y3U8', 'A2ATU0', 'P62753', 'Q9HD42', 'Q9CWV0', 'O14776', 'P14174', 'Q16777', 'P07108', 'Q6YN16', 'Q99LP6', 'P09496', 'Q8WXI9', 'Q9CW42', 'Q9UNZ5', 'Q8BHE8', 'Q8C2E4', 'Q9D773', 'Q9CQC7', 'Q8BWF0', 'Q9CZS1', 'Q62425', 'Q9CXJ1', 'P52294', 'P05455', 'P50454', 'Q60597', 'Q7L4I2', 'P85094', 'Q99M87', 'Q9BYJ9', 'Q9UDY2', 'Q9CQY9', 'Q9CZ13', 'Q8R404', 'Q9QXX4', 'P55060', 'Q3U8Y1', 'Q14980', 'P46778', 'Q8JZQ2', 'O35143', 'Q05682', 'Q9Y5A9', 'Q5BKZ1', 'Q9BPW8', 'Q9D6K5', 'P27635', 'P42126', 'Q15056', 'Q3UG70', 'Q3U5Q7', 'Q7TNL9', 'Q9CY73', 'Q921H9', 'Q9CWB7', 'Q8BJ03', 'P09669', 'Q9NYF8', 'P56379', 'Q08211', 'Q96AE4', 'Q66GT5', 'P62750', 'Q9CPQ1', 'Q07889', 'P08238', 'Q7Z5L9', 'P23526',

In [5]:
list_of_dfs = []

for cif_file, prot_id in zip(file_paths, protein_ids):
    list_of_dfs.append(get_phi_and_psi(prot_id, cif_file))

In [6]:
concat_dihedrals = pd.concat(list_of_dfs)
concat_dihedrals['PSI degrees'] = np.rad2deg(concat_dihedrals['PSI'])
concat_dihedrals['PHI degrees'] = np.rad2deg(concat_dihedrals['PHI'])
concat_dihedrals

Unnamed: 0,Protein ID,Residue Name,Residue Position,PHI,PSI,PSI degrees,PHI degrees
0,Q9D404,MET,1,,0.387817,22.220252,
1,Q9D404,LEU,2,-2.157366,0.274497,15.727492,-123.607961
2,Q9D404,SER,3,-1.962920,0.385322,22.077320,-112.467009
3,Q9D404,LYS,4,-2.134724,-0.040241,-2.305665,-122.310672
4,Q9D404,CYS,5,-1.705889,-0.189105,-10.834934,-97.740259
...,...,...,...,...,...,...,...
251,Q8C3X2,PHE,252,-1.180788,-0.653133,-37.421783,-67.654193
252,Q8C3X2,TRP,253,-1.329307,-0.276255,-15.828235,-76.163670
253,Q8C3X2,LYS,254,-1.518433,-0.312795,-17.921836,-86.999812
254,Q8C3X2,GLU,255,-1.663209,0.034239,1.961770,-95.294845


In [7]:
#concat_dihedrals.to_csv(os.path.join(global_data_path, "dihedral_angles.csv"), index=False)

Sanity Check: Here, we verify that the AA sequences extracted from the AlphaFold database match those found in the UniProt database

In [8]:
concat_dihedrals = pd.read_csv(os.path.join(global_data_path, "dihedral_angles.csv"))
concat_dihedrals

Unnamed: 0,Protein ID,Residue Name,Residue Position,PHI,PSI,PSI degrees,PHI degrees
0,Q9D404,MET,1,,0.387817,22.220252,
1,Q9D404,LEU,2,-2.157366,0.274497,15.727492,-123.607961
2,Q9D404,SER,3,-1.962920,0.385322,22.077320,-112.467009
3,Q9D404,LYS,4,-2.134724,-0.040241,-2.305665,-122.310672
4,Q9D404,CYS,5,-1.705889,-0.189105,-10.834934,-97.740259
...,...,...,...,...,...,...,...
538890,Q8C3X2,PHE,252,-1.180788,-0.653133,-37.421783,-67.654193
538891,Q8C3X2,TRP,253,-1.329307,-0.276255,-15.828235,-76.163670
538892,Q8C3X2,LYS,254,-1.518433,-0.312795,-17.921836,-86.999812
538893,Q8C3X2,GLU,255,-1.663209,0.034239,1.961770,-95.294845


In [9]:
len(concat_dihedrals['Protein ID'].unique())

1110

In [10]:
# Helper function to get full amino acid sequence for a protein
def get_complete_sequence(cID):
    baseUrl="http://www.uniprot.org/uniprot/"
    currentUrl=baseUrl+cID+".fasta"
    response = r.post(currentUrl)
    cData=''.join(response.text)
    
    Seq=StringIO(cData)
    pSeq=list(SeqIO.parse(Seq,"fasta"))

    return str(pSeq[0].seq)

In [11]:
uniprot_protein_sequences = pd.read_csv('../global_data/complete_sequence_cache.csv')

In [12]:
amino_acid_map = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLU": "E",
    "GLN": "Q",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    "PHE": "F",
    "PRO": "P",
    "SER": "S",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
    "SEC": "U",
    "PYL": "O"
}

In [13]:
# Helper function to verify sequences
def verify_sequences(df_residues, df_sequences, verbose=False):
    mismatches = pd.DataFrame(
        columns=['Protein ID', 'Residue Position', 'AlphaFold Residue', 'UniProt Residue'])

    missing_sequences = 0
    for protein_id in df_residues['Protein ID'].unique():
        # Get the complete sequence for the current Protein ID
        if protein_id in df_sequences['Protein ID'].values:
            complete_seq = df_sequences[df_sequences['Protein ID'] == protein_id]['Complete Sequence'].values[0]
        else:
            missing_sequences += 1
            print(f'{protein_id} is not one of the completed sequences we queried from UniProt', missing_sequences)
            continue

        # Filter residues for the current Protein ID
        residues = df_residues[df_residues['Protein ID'] == protein_id]

        for _, row in residues.iterrows():
            residue_name = row['Residue Name']
            residue_position = row['Residue Position']
            
            # Get the expected residue from the complete sequence
            if (residue_position - 1) <  len(complete_seq):
                expected_residue = complete_seq[residue_position - 1]  # position - 1 for zero-based indexing
                actual_residue = amino_acid_map[residue_name]
            else:
                expected_residue = 'X'
                actual_residue = amino_acid_map[residue_name]
            
            if expected_residue != actual_residue:
                if verbose:
                    print(protein_id)
                mismatches = mismatches._append({'Protein ID':protein_id,
                        'Residue Position':residue_position,
                        'AlphaFold Residue':actual_residue,
                        'UniProt Residue': expected_residue
                        }, ignore_index = True)
                #mismatches.loc[len(mismatches.index)] = [protein_id, residue_position, actual_residue, expected_residue]
                #mismatches.append((protein_id, residue_position, actual_residue, expected_residue))

    return mismatches

In [14]:
mismatches = verify_sequences(concat_dihedrals, uniprot_protein_sequences)
mismatches

Q99J99 is not one of the completed sequences we queried from UniProt 1
Q8R5C0 is not one of the completed sequences we queried from UniProt 2


Unnamed: 0,Protein ID,Residue Position,AlphaFold Residue,UniProt Residue
0,P27635,202,N,S
1,Q9NX55,2,R,A
2,Q9NX55,3,R,T
3,Q9NX55,4,R,E
4,Q9NX55,6,E,D
...,...,...,...,...
396,Q8R0F8,223,K,X
397,Q8R0F8,224,R,X
398,Q8R0F8,225,S,X
399,Q8R0F8,226,E,X


In [15]:
unique_mismatches = mismatches['Protein ID'].unique()
print(unique_mismatches)
print(len(unique_mismatches))

['P27635' 'Q9NX55' 'Q9ULT8' 'P62861' 'O94851' 'Q00341' 'O75396' 'Q8R0F8']
8


In [18]:
# Helper function to get psi/phi angles for an entire dataset
def extract_PSI_and_PHI(residue_df, psi_and_phi_df, verbose=False):
    df = pd.DataFrame(columns=['PSI Radians', 'PHI Radians', 'PSI Degrees', 'PHI Degrees'])
    for _, row in residue_df.iterrows():
        protein_id = row['Protein ID']
        methionine_posn = row['Methionine Location'] + 1
        row_of_interest = psi_and_phi_df.loc[(psi_and_phi_df['Protein ID'] == protein_id) & (psi_and_phi_df['Residue Position'] == methionine_posn)]
        if verbose:
            print(row_of_interest)
        new_row = {'PSI Radians': float('nan'), 'PHI Radians':float('nan'), 'PSI Degrees': float('nan'), 'PHI Degrees': float('nan')}
        if row_of_interest.shape[0] == 1 and row_of_interest['Residue Name'].iloc[0] == 'MET':
            new_row = {'PSI Radians': row_of_interest['PSI'].iloc[0], 'PHI Radians':row_of_interest['PHI'].iloc[0], 'PSI Degrees': row_of_interest['PSI degrees'].iloc[0], 'PHI Degrees': row_of_interest['PHI degrees'].iloc[0]}
        df = df._append(new_row, ignore_index = True)
    return df

# Ramachandran Analysis - MsrAKD Dataset

In [23]:
MsrAKD_with_alphafold = pd.read_csv('../MsrKD/MsrAKD_with_alphafold.csv').drop(columns=['Unnamed: 0'])
MsrAKD_with_alphafold

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,DHFEEAM[649.3660]R,DHFEEAM[655.3735]R,,,,,,,-5.758896,,...,2.238095,3.380952,4.190476,5.047619,5.761905,13.714286,38.714286,67.380952,3.190476,0.0
1,LRHSEREM[649.3660]R,LRHSEREM[655.3735]R,-5.516297,,,,,-6.230529,,,...,2.047619,4.238095,6.142857,7.904762,8.000000,14.666667,35.619048,52.333333,3.952381,0.0
2,VIAHTQM[649.3660]R,VIAHTQM[655.3735]R,,,,,,-5.784255,,,...,2.380952,3.428571,4.571429,5.952381,7.666667,22.476190,65.476190,125.333333,5.857143,0.0
3,TTGFGM[649.3660]IYDSLDYAK,TTGFGM[655.3735]IYDSLDYAK,,,-5.731299,,-5.257074,,,,...,2.476190,3.476190,4.714286,6.000000,7.333333,16.857143,42.238095,63.095238,2.619048,0.0
4,QM[649.3660]QVLHPAAR,QM[655.3735]QVLHPAAR,,,,-5.202635,,,,,...,2.095238,3.571429,4.333333,5.428571,5.904762,16.952381,55.571429,105.619048,5.380952,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651,TWM[649.3660]WWHNFR,TWM[655.3735]WWHNFR,,,-0.384153,,,,,-0.379949,...,2.238095,3.761905,4.571429,5.714286,6.190476,16.047619,50.666667,105.571429,4.619048,0.0
652,M[649.3660]EELHNQEVQK,M[655.3735]EELHNQEVQK,-2.289166,-2.214471,,-2.123250,3.901065,4.142890,-1.931592,3.755303,...,2.428571,5.047619,5.857143,7.619048,7.619048,11.904762,18.952381,26.047619,2.238095,1.0
653,LAM[649.3660]QLEEQASR,LAM[655.3735]QLEEQASR,,0.426347,,,,,,-0.483441,...,2.095238,4.761905,5.333333,7.095238,7.333333,13.714286,31.571429,51.142857,3.047619,0.0
654,QNFHM[649.3660]EQLK,QNFHM[655.3735]EQLK,,1.544593,,,-0.553917,,0.962793,-2.636370,...,2.619048,4.238095,5.666667,6.666667,7.285714,11.333333,18.714286,24.904762,2.428571,1.0


In [24]:
# Some of the proteins in the MsrAKD dataset have sequence mismatches - drop these
MsrAKD_with_alphafold[MsrAKD_with_alphafold['Protein ID'].isin(unique_mismatches)]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
229,RRM[649.3660]QYNR,RRM[655.3735]QYNR,-3.506345,-3.704064,,-4.418432,-3.873315,,,,...,,,,,,,,,,
365,FNADEFEDM[649.3660]VAEKR,FNADEFEDM[655.3735]VAEKR,,,,-2.716301,,,,,...,2.380952,3.190476,4.285714,5.52381,6.619048,13.714286,36.571429,61.52381,2.904762,0.0
497,IM[649.3660]VANIEEVLQR,IM[655.3735]VANIEEVLQR,-2.040931,-1.316275,,,-1.904369,-1.83446,-1.749982,-1.672267,...,2.047619,2.571429,2.809524,3.428571,3.619048,8.142857,22.666667,46.52381,1.190476,0.0
634,KEDLELIM[649.3660]TEMEISR,KEDLELIM[655.3735]TEMEISR,,,,,,-0.557871,,,...,2.0,2.952381,3.666667,4.285714,4.47619,9.761905,22.142857,30.428571,2.238095,1.0


In [25]:
MsrAKD_with_alphafold_wo_mismatches = MsrAKD_with_alphafold.drop(MsrAKD_with_alphafold[MsrAKD_with_alphafold['Protein ID'].isin(unique_mismatches)].index).reset_index(drop = True)
MsrAKD_with_alphafold_wo_mismatches

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,DHFEEAM[649.3660]R,DHFEEAM[655.3735]R,,,,,,,-5.758896,,...,2.238095,3.380952,4.190476,5.047619,5.761905,13.714286,38.714286,67.380952,3.190476,0.0
1,LRHSEREM[649.3660]R,LRHSEREM[655.3735]R,-5.516297,,,,,-6.230529,,,...,2.047619,4.238095,6.142857,7.904762,8.000000,14.666667,35.619048,52.333333,3.952381,0.0
2,VIAHTQM[649.3660]R,VIAHTQM[655.3735]R,,,,,,-5.784255,,,...,2.380952,3.428571,4.571429,5.952381,7.666667,22.476190,65.476190,125.333333,5.857143,0.0
3,TTGFGM[649.3660]IYDSLDYAK,TTGFGM[655.3735]IYDSLDYAK,,,-5.731299,,-5.257074,,,,...,2.476190,3.476190,4.714286,6.000000,7.333333,16.857143,42.238095,63.095238,2.619048,0.0
4,QM[649.3660]QVLHPAAR,QM[655.3735]QVLHPAAR,,,,-5.202635,,,,,...,2.095238,3.571429,4.333333,5.428571,5.904762,16.952381,55.571429,105.619048,5.380952,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647,TWM[649.3660]WWHNFR,TWM[655.3735]WWHNFR,,,-0.384153,,,,,-0.379949,...,2.238095,3.761905,4.571429,5.714286,6.190476,16.047619,50.666667,105.571429,4.619048,0.0
648,M[649.3660]EELHNQEVQK,M[655.3735]EELHNQEVQK,-2.289166,-2.214471,,-2.123250,3.901065,4.142890,-1.931592,3.755303,...,2.428571,5.047619,5.857143,7.619048,7.619048,11.904762,18.952381,26.047619,2.238095,1.0
649,LAM[649.3660]QLEEQASR,LAM[655.3735]QLEEQASR,,0.426347,,,,,,-0.483441,...,2.095238,4.761905,5.333333,7.095238,7.333333,13.714286,31.571429,51.142857,3.047619,0.0
650,QNFHM[649.3660]EQLK,QNFHM[655.3735]EQLK,,1.544593,,,-0.553917,,0.962793,-2.636370,...,2.619048,4.238095,5.666667,6.666667,7.285714,11.333333,18.714286,24.904762,2.428571,1.0


In [26]:
MsrAKD_psi_and_psi_map = extract_PSI_and_PHI(MsrAKD_with_alphafold_wo_mismatches, concat_dihedrals)
MsrAKD_psi_and_psi_map

  df = df._append(new_row, ignore_index = True)


Unnamed: 0,PSI Radians,PHI Radians,PSI Degrees,PHI Degrees
0,-0.555659,-1.146523,-31.836928,-65.690956
1,-0.870103,-1.057627,-49.853215,-60.597577
2,-0.491176,-1.322379,-28.142283,-75.766714
3,2.271450,-2.178688,130.144505,-124.829609
4,2.297773,-1.229620,131.652676,-70.452048
...,...,...,...,...
647,-0.668824,-1.106320,-38.320821,-63.387457
648,-0.724554,-1.152084,-41.513904,-66.009532
649,-0.693338,-1.070541,-39.725343,-61.337507
650,-0.650127,-1.307328,-37.249556,-74.904401


In [27]:
sum(MsrAKD_psi_and_psi_map['PHI Degrees'].isna())

0

In [28]:
MsrAKD_with_PSI_and_PHI = pd.concat([MsrAKD_with_alphafold_wo_mismatches, MsrAKD_psi_and_psi_map], axis=1)
MsrAKD_with_PSI_and_PHI

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrA_KD_1 Log2 Ratio HL,MsrA_KD_10 Log2 Ratio HL,MsrA_KD_11 Log2 Ratio HL,MsrA_KD_12 Log2 Ratio HL,MsrA_KD_2 Log2 Ratio HL,MsrA_KD_3 Log2 Ratio HL,MsrA_KD_4 Log2 Ratio HL,MsrA_KD_5 Log2 Ratio HL,...,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR,PSI Radians,PHI Radians,PSI Degrees,PHI Degrees
0,DHFEEAM[649.3660]R,DHFEEAM[655.3735]R,,,,,,,-5.758896,,...,5.761905,13.714286,38.714286,67.380952,3.190476,0.0,-0.555659,-1.146523,-31.836928,-65.690956
1,LRHSEREM[649.3660]R,LRHSEREM[655.3735]R,-5.516297,,,,,-6.230529,,,...,8.000000,14.666667,35.619048,52.333333,3.952381,0.0,-0.870103,-1.057627,-49.853215,-60.597577
2,VIAHTQM[649.3660]R,VIAHTQM[655.3735]R,,,,,,-5.784255,,,...,7.666667,22.476190,65.476190,125.333333,5.857143,0.0,-0.491176,-1.322379,-28.142283,-75.766714
3,TTGFGM[649.3660]IYDSLDYAK,TTGFGM[655.3735]IYDSLDYAK,,,-5.731299,,-5.257074,,,,...,7.333333,16.857143,42.238095,63.095238,2.619048,0.0,2.271450,-2.178688,130.144505,-124.829609
4,QM[649.3660]QVLHPAAR,QM[655.3735]QVLHPAAR,,,,-5.202635,,,,,...,5.904762,16.952381,55.571429,105.619048,5.380952,0.0,2.297773,-1.229620,131.652676,-70.452048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647,TWM[649.3660]WWHNFR,TWM[655.3735]WWHNFR,,,-0.384153,,,,,-0.379949,...,6.190476,16.047619,50.666667,105.571429,4.619048,0.0,-0.668824,-1.106320,-38.320821,-63.387457
648,M[649.3660]EELHNQEVQK,M[655.3735]EELHNQEVQK,-2.289166,-2.214471,,-2.123250,3.901065,4.142890,-1.931592,3.755303,...,7.619048,11.904762,18.952381,26.047619,2.238095,1.0,-0.724554,-1.152084,-41.513904,-66.009532
649,LAM[649.3660]QLEEQASR,LAM[655.3735]QLEEQASR,,0.426347,,,,,,-0.483441,...,7.333333,13.714286,31.571429,51.142857,3.047619,0.0,-0.693338,-1.070541,-39.725343,-61.337507
650,QNFHM[649.3660]EQLK,QNFHM[655.3735]EQLK,,1.544593,,,-0.553917,,0.962793,-2.636370,...,7.285714,11.333333,18.714286,24.904762,2.428571,1.0,-0.650127,-1.307328,-37.249556,-74.904401


In [29]:
#MsrAKD_with_PSI_and_PHI.to_csv('MsrAKD_with_PSI_and_PHI.csv')

# Ramachandran Analysis - MsrBKD Dataset

In [30]:
MsrBKD_with_alphafold = pd.read_csv('../MsrKD/MsrB2KD_with_alphafold.csv').drop(columns=['Unnamed: 0'])
MsrBKD_with_alphafold

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,2.571429,3.571429,5.333333,6.571429,8.333333,26.047619,71.952381,142.619048,6.809524,0.0
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,2.047619,3.238095,3.809524,4.857143,5.714286,12.904762,38.190476,60.285714,3.523810,0.0
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,2.285714,4.809524,5.571429,7.190476,7.380952,11.857143,18.476190,25.095238,2.047619,1.0
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,2.476190,5.047619,6.380952,7.904762,7.904762,12.095238,20.142857,29.714286,2.523810,1.0
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,2.619048,4.523810,5.857143,7.428571,8.142857,23.523810,60.000000,109.238095,7.190476,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,2.095238,2.619048,3.190476,4.000000,4.904762,13.142857,29.952381,49.190476,1.952381,0.0
738,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,2.333333,3.714286,5.095238,6.809524,7.857143,22.428571,67.000000,123.000000,5.809524,0.0
739,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,2.714286,5.000000,6.142857,7.523810,7.523810,11.904762,19.571429,26.333333,2.333333,1.0
740,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,2.619048,4.142857,6.047619,7.666667,8.428571,28.761905,87.285714,172.428571,8.761905,0.0


In [31]:
# Some of the proteins in the MsrAKD dataset have sequence mismatches - drop these
MsrBKD_with_alphafold[MsrBKD_with_alphafold['Protein ID'].isin(unique_mismatches)]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
266,FNADEFEDM[649.3660]VAEKR,FNADEFEDM[655.3735]VAEKR,,2.620562,,2.630807,,3.185583,4.025693,3.236847,...,2.380952,3.190476,4.285714,5.52381,6.619048,13.714286,36.571429,61.52381,2.904762,0.0
295,RRM[649.3660]QYNR,RRM[655.3735]QYNR,1.998028,2.366995,2.286614,2.303103,2.031332,1.902578,,2.012649,...,,,,,,,,,,
310,IM[649.3660]VANIEEVLQR,IM[655.3735]VANIEEVLQR,1.820905,2.000746,1.601487,1.987363,1.789481,2.344528,2.025588,,...,2.047619,2.571429,2.809524,3.428571,3.619048,8.142857,22.666667,46.52381,1.190476,0.0
508,LSMVM[649.3660]YLSK,LSMVM[655.3735]YLSK,,0.943869,,,,,,,...,2.190476,3.47619,4.952381,5.857143,6.190476,15.380952,47.095238,80.571429,4.285714,0.0
627,TNATNNM[649.3660]NLSR,TNATNNM[655.3735]NLSR,,,-1.202873,,-1.711192,,,,...,1.904762,2.0,2.0,2.0,2.0,4.0,8.142857,10.952381,0.0,1.0
737,DQGLSIM[649.3660]VSGK,DQGLSIM[655.3735]VSGK,,2.936602,,,,,,,...,2.095238,2.619048,3.190476,4.0,4.904762,13.142857,29.952381,49.190476,1.952381,0.0


In [32]:
MsrBKD_with_alphafold_wo_mismatches = MsrBKD_with_alphafold.drop(MsrBKD_with_alphafold[MsrBKD_with_alphafold['Protein ID'].isin(unique_mismatches)].index).reset_index(drop = True)
MsrBKD_with_alphafold_wo_mismatches

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,2.571429,3.571429,5.333333,6.571429,8.333333,26.047619,71.952381,142.619048,6.809524,0.0
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,2.047619,3.238095,3.809524,4.857143,5.714286,12.904762,38.190476,60.285714,3.523810,0.0
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,2.285714,4.809524,5.571429,7.190476,7.380952,11.857143,18.476190,25.095238,2.047619,1.0
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,2.476190,5.047619,6.380952,7.904762,7.904762,12.095238,20.142857,29.714286,2.523810,1.0
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,2.619048,4.523810,5.857143,7.428571,8.142857,23.523810,60.000000,109.238095,7.190476,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,NM[649.3660]SIIDAFK,NM[655.3735]SIIDAFK,,,,3.080768,,,1.972472,,...,2.000000,2.000000,2.000000,2.142857,2.380952,5.000000,8.047619,11.619048,0.285714,1.0
732,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,2.333333,3.714286,5.095238,6.809524,7.857143,22.428571,67.000000,123.000000,5.809524,0.0
733,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,2.714286,5.000000,6.142857,7.523810,7.523810,11.904762,19.571429,26.333333,2.333333,1.0
734,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,2.619048,4.142857,6.047619,7.666667,8.428571,28.761905,87.285714,172.428571,8.761905,0.0


In [33]:
MsrBKD_psi_and_psi_map = extract_PSI_and_PHI(MsrBKD_with_alphafold_wo_mismatches, concat_dihedrals)
MsrBKD_psi_and_psi_map

  df = df._append(new_row, ignore_index = True)


Unnamed: 0,PSI Radians,PHI Radians,PSI Degrees,PHI Degrees
0,2.171959,-1.189476,124.444077,-68.151944
1,-0.818135,-1.241461,-46.875675,-71.130470
2,-0.730723,-1.061428,-41.867347,-60.815364
3,-0.665294,-1.089169,-38.118560,-62.404803
4,-0.516803,-1.130868,-29.610647,-64.793967
...,...,...,...,...
731,2.407488,-1.480153,137.938905,-84.806547
732,-0.487434,-1.169793,-27.927937,-67.024224
733,-0.653646,-1.338789,-37.451150,-76.706966
734,-0.905823,-1.087117,-51.899856,-62.287222


In [34]:
sum(MsrBKD_psi_and_psi_map['PHI Degrees'].isna())

0

In [35]:
MsrBKD_with_PSI_and_PHI = pd.concat([MsrBKD_with_alphafold_wo_mismatches, MsrBKD_psi_and_psi_map], axis=1)
MsrBKD_with_PSI_and_PHI

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,MsrB2_KD_1 Log2 Ratio HL,MsrB2_KD_10 Log2 Ratio HL,MsrB2_KD_11 Log2 Ratio HL,MsrB2_KD_12 Log2 Ratio HL,MsrB2_KD_2 Log2 Ratio HL,MsrB2_KD_3 Log2 Ratio HL,MsrB2_KD_4 Log2 Ratio HL,MsrB2_KD_5 Log2 Ratio HL,...,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR,PSI Radians,PHI Radians,PSI Degrees,PHI Degrees
0,FAGLHFFNPVPVM[649.3660]K,FAGLHFFNPVPVM[655.3735]K,-4.205662,-3.548803,-4.189766,-4.160031,-4.436141,-3.632403,-3.887475,-4.186961,...,8.333333,26.047619,71.952381,142.619048,6.809524,0.0,2.171959,-1.189476,124.444077,-68.151944
1,IGM[649.3660]SVNAIR,IGM[655.3735]SVNAIR,,,,-4.399577,,-3.417119,-3.768051,,...,5.714286,12.904762,38.190476,60.285714,3.523810,0.0,-0.818135,-1.241461,-46.875675,-71.130470
2,KMEM[649.3660]EMEQVFEMK,KMEM[655.3735]EMEQVFEMK,-2.274200,-2.855665,-2.947620,-2.942285,-3.161963,-2.854557,-2.779239,-2.276797,...,7.380952,11.857143,18.476190,25.095238,2.047619,1.0,-0.730723,-1.061428,-41.867347,-60.815364
3,LRLEVNLQAM[649.3660]K,LRLEVNLQAM[655.3735]K,,,,,,,-2.033939,,...,7.904762,12.095238,20.142857,29.714286,2.523810,1.0,-0.665294,-1.089169,-38.118560,-62.404803
4,AASDIAM[649.3660]TELPPTHPIR,AASDIAM[655.3735]TELPPTHPIR,-2.630824,,-2.145969,-1.169837,,-2.919210,,,...,8.142857,23.523810,60.000000,109.238095,7.190476,0.0,-0.516803,-1.130868,-29.610647,-64.793967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,NM[649.3660]SIIDAFK,NM[655.3735]SIIDAFK,,,,3.080768,,,1.972472,,...,2.380952,5.000000,8.047619,11.619048,0.285714,1.0,2.407488,-1.480153,137.938905,-84.806547
732,EAM[649.3660]NHPGHLK,EAM[655.3735]NHPGHLK,,,,,,,,,...,7.857143,22.428571,67.000000,123.000000,5.809524,0.0,-0.487434,-1.169793,-27.927937,-67.024224
733,ALEEAM[649.3660]EQK,ALEEAM[655.3735]EQK,2.683003,,3.269254,,,,,,...,7.523810,11.904762,19.571429,26.333333,2.333333,1.0,-0.653646,-1.338789,-37.451150,-76.706966
734,VTM[649.3660]LFLGLHNVR,VTM[655.3735]LFLGLHNVR,,,,,,,2.743130,,...,8.428571,28.761905,87.285714,172.428571,8.761905,0.0,-0.905823,-1.087117,-51.899856,-62.287222


In [36]:
#MsrBKD_with_PSI_and_PHI.to_csv('MsrBKD_with_PSI_and_PHI.csv', index=False)

# End