# Imports

In [1]:
import Bio.PDB
import pandas as pd
import os
import re
import requests as r
from io import StringIO
from Bio import SeqIO

# Ramachandran Analysis - Helper/Prelim Section

In [2]:
# Helper function to extract phi/psi angles from a .cif file 
def get_phi_and_psi(Protein_ID, CIF_file_path, verbose=False):
    """Extracts the psi and phi angle for each residue in protein's 
    alphofild structure.

    Args:
      Protein_ID: uniprot id of protein.
      CIF_file_path: file path to proteins mmCIF file.

    Returns:
      Dataframe that stores the dihedral angles of each residue in the 
      specified protein.

    """

    df = pd.DataFrame()
    for model in Bio.PDB.MMCIFParser().get_structure(Protein_ID, CIF_file_path):
        for chain in model:
            polypeptides = Bio.PDB.PPBuilder().build_peptides(chain)
            res_index_list = []
            res_name_list = []
            phi_list = []
            psi_list = []
            for poly_index, poly in enumerate(polypeptides):
                if verbose:
                  print("Model %s Chain %s" % (str(model.id), str(chain.id)))
                  print ("(part %i of %i)" % (poly_index+1, len(polypeptides)))
                  print ("length %i" % (len(poly)))
                  print ("from %s%i" % (poly[0].resname, poly[0].id[1]))
                  print ("to %s%i" % (poly[-1].resname, poly[-1].id[1]))
                phi_psi = poly.get_phi_psi_list()
                res_index_sublist = []
                res_name_sublist = []
                phi_sublist = []
                psi_sublist = []
                for res_index, residue in enumerate(poly) :
                    res_name = "%s%i" % (residue.resname, residue.id[1])
                    # print(res_name, phi_psi[res_index])
                    res_index_sublist.append(residue.id[1])
                    res_name_sublist.append(residue.resname)
                    phi_sublist.append(phi_psi[res_index][0])
                    psi_sublist.append(phi_psi[res_index][1])
                res_index_list.extend(res_index_sublist)
                res_name_list.extend(res_name_sublist)
                phi_list.extend(phi_sublist)
                psi_list.extend(psi_sublist)
            df['Protein ID'] = [Protein_ID] * len(res_index_list)
            df['Residue Name'] = res_name_list
            df['Residue Position'] = res_index_list
            df['PHI'] = phi_list
            df['PSI'] = psi_list
    return df
    


In [3]:
directory = '../alphafold_data/cif'
file_paths = []

for root, directories, files in os.walk(directory):
    for file in files:
        file_path = os.path.join(root, file)
        file_paths.append(file_path)

print(file_paths)

['../alphafold_data/cif/Q9D404.cif', '../alphafold_data/cif/P62829.cif', '../alphafold_data/cif/Q96PK6.cif', '../alphafold_data/cif/Q9Z0X1.cif', '../alphafold_data/cif/O60814.cif', '../alphafold_data/cif/Q99MB2.cif', '../alphafold_data/cif/P07900.cif', '../alphafold_data/cif/Q8C6I2.cif', '../alphafold_data/cif/Q8QZT1.cif', '../alphafold_data/cif/Q9CQ92.cif', '../alphafold_data/cif/P47897.cif', '../alphafold_data/cif/Q9CQN1.cif', '../alphafold_data/cif/P38919.cif', '../alphafold_data/cif/Q920A5.cif', '../alphafold_data/cif/Q9NXV6.cif', '../alphafold_data/cif/Q8C1W2.cif', '../alphafold_data/cif/P27144.cif', '../alphafold_data/cif/P42125.cif', '../alphafold_data/cif/P50247.cif', '../alphafold_data/cif/P12074.cif', '../alphafold_data/cif/O75821.cif', '../alphafold_data/cif/Q5HZI9.cif', '../alphafold_data/cif/P24539.cif', '../alphafold_data/cif/Q9CPQ3.cif', '../alphafold_data/cif/Q9UN86.cif', '../alphafold_data/cif/Q8IUD2.cif', '../alphafold_data/cif/P56391.cif', '../alphafold_data/cif/Q923

In [4]:
pattern = r'/([^/]+)\.cif$'
protein_ids =[]

for file_path in file_paths:
    match = re.search(pattern, file_path)
    desired_substring = match.group(1)
    protein_ids.append(desired_substring)

print(protein_ids)

['Q9D404', 'P62829', 'Q96PK6', 'Q9Z0X1', 'O60814', 'Q99MB2', 'P07900', 'Q8C6I2', 'Q8QZT1', 'Q9CQ92', 'P47897', 'Q9CQN1', 'P38919', 'Q920A5', 'Q9NXV6', 'Q8C1W2', 'P27144', 'P42125', 'P50247', 'P12074', 'O75821', 'Q5HZI9', 'P24539', 'Q9CPQ3', 'Q9UN86', 'Q8IUD2', 'P56391', 'Q923K4', 'Q9CRD0', 'O14950', 'P35637', 'Q9Y3U8', 'A2ATU0', 'P62753', 'Q9HD42', 'Q9CWV0', 'O14776', 'P14174', 'Q16777', 'P07108', 'Q6YN16', 'Q99LP6', 'P09496', 'Q8WXI9', 'Q9CW42', 'Q9UNZ5', 'Q8BHE8', 'Q8C2E4', 'Q9D773', 'Q9CQC7', 'Q8BWF0', 'Q9CZS1', 'Q62425', 'Q9CXJ1', 'P52294', 'P05455', 'P50454', 'Q60597', 'Q7L4I2', 'P85094', 'Q99M87', 'Q9BYJ9', 'Q9UDY2', 'Q9CQY9', 'Q9CZ13', 'Q8R404', 'Q9QXX4', 'P55060', 'Q3U8Y1', 'Q14980', 'P46778', 'Q8JZQ2', 'O35143', 'Q05682', 'Q9Y5A9', 'Q5BKZ1', 'Q9BPW8', 'Q9D6K5', 'P27635', 'P42126', 'Q15056', 'Q3UG70', 'Q3U5Q7', 'Q7TNL9', 'Q9CY73', 'Q921H9', 'Q9CWB7', 'Q8BJ03', 'P09669', 'Q9NYF8', 'P56379', 'Q08211', 'Q96AE4', 'Q66GT5', 'P62750', 'Q9CPQ1', 'Q07889', 'P08238', 'Q7Z5L9', 'P23526',

In [5]:
list_of_dfs = []

for cif_file, prot_id in zip(file_paths, protein_ids):
    list_of_dfs.append(get_phi_and_psi(prot_id, cif_file))

In [6]:
concat_dihedrals = pd.concat(list_of_dfs)
concat_dihedrals

Unnamed: 0,Protein ID,Residue Name,Residue Position,PHI,PSI
0,Q9D404,MET,1,,0.387817
1,Q9D404,LEU,2,-2.157366,0.274497
2,Q9D404,SER,3,-1.962920,0.385322
3,Q9D404,LYS,4,-2.134724,-0.040241
4,Q9D404,CYS,5,-1.705889,-0.189105
...,...,...,...,...,...
251,Q8C3X2,PHE,252,-1.180788,-0.653133
252,Q8C3X2,TRP,253,-1.329307,-0.276255
253,Q8C3X2,LYS,254,-1.518433,-0.312795
254,Q8C3X2,GLU,255,-1.663209,0.034239


In [7]:
concat_dihedrals.to_csv('dihedral_angles.csv', index=False)

Sanity Check: Here, we verify that the AA sequences extracted from the AlphaFold database match those found in the UniProt database

In [8]:
concat_dihedrals = pd.read_csv('dihedral_angles.csv')
concat_dihedrals

Unnamed: 0,Protein ID,Residue Name,Residue Position,PHI,PSI
0,Q9D404,MET,1,,0.387817
1,Q9D404,LEU,2,-2.157366,0.274497
2,Q9D404,SER,3,-1.962920,0.385322
3,Q9D404,LYS,4,-2.134724,-0.040241
4,Q9D404,CYS,5,-1.705889,-0.189105
...,...,...,...,...,...
538890,Q8C3X2,PHE,252,-1.180788,-0.653133
538891,Q8C3X2,TRP,253,-1.329307,-0.276255
538892,Q8C3X2,LYS,254,-1.518433,-0.312795
538893,Q8C3X2,GLU,255,-1.663209,0.034239


In [9]:
len(concat_dihedrals['Protein ID'].unique())

1110

In [10]:
# Helper function to get full amino acid sequence for a protein
def get_complete_sequence(cID):
    baseUrl="http://www.uniprot.org/uniprot/"
    currentUrl=baseUrl+cID+".fasta"
    response = r.post(currentUrl)
    cData=''.join(response.text)
    
    Seq=StringIO(cData)
    pSeq=list(SeqIO.parse(Seq,"fasta"))

    return str(pSeq[0].seq)

In [11]:
#uniprot_protein_sequences = pd.DataFrame({'Protein ID':concat_dihedrals['Protein ID'].unique()})
#uniprot_protein_sequences['Complete Sequence'] = uniprot_protein_sequences['Protein ID'].apply(get_complete_sequence)
#uniprot_protein_sequences.to_csv('uniprot_protein_sequences.csv', index=False)

In [12]:
#uniprot_protein_sequences = pd.read_csv('../global_data/uniprot_protein_sequences.csv')
uniprot_protein_sequences = pd.read_csv('../global_data/complete_sequence_cache.csv')

In [13]:
amino_acid_map = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLU": "E",
    "GLN": "Q",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    "PHE": "F",
    "PRO": "P",
    "SER": "S",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
    "SEC": "U",
    "PYL": "O"
}

In [14]:
# Function to verify sequences
def verify_sequences(df_residues, df_sequences, verbose=False):
    mismatches = pd.DataFrame(
        columns=['Protein ID', 'Residue Position', 'AlphaFold Residue', 'UniProt Residue'])

    missing_sequences = 0
    for protein_id in df_residues['Protein ID'].unique():
        # Get the complete sequence for the current Protein ID
        if protein_id in df_sequences['Protein ID'].values:
            complete_seq = df_sequences[df_sequences['Protein ID'] == protein_id]['Complete Sequence'].values[0]
        else:
            missing_sequences += 1
            print(f'{protein_id} is not one of the completed sequences we queried from UniProt', missing_sequences)
            continue

        # Filter residues for the current Protein ID
        residues = df_residues[df_residues['Protein ID'] == protein_id]

        for _, row in residues.iterrows():
            residue_name = row['Residue Name']
            residue_position = row['Residue Position']
            
            # Get the expected residue from the complete sequence
            if (residue_position - 1) <  len(complete_seq):
                expected_residue = complete_seq[residue_position - 1]  # position - 1 for zero-based indexing
                actual_residue = amino_acid_map[residue_name]
            else:
                expected_residue = 'X'
                actual_residue = amino_acid_map[residue_name]
            
            if expected_residue != actual_residue:
                if verbose:
                    print(protein_id)
                mismatches = mismatches._append({'Protein ID':protein_id,
                        'Residue Position':residue_position,
                        'AlphaFold Residue':actual_residue,
                        'UniProt Residue': expected_residue
                        }, ignore_index = True)
                #mismatches.loc[len(mismatches.index)] = [protein_id, residue_position, actual_residue, expected_residue]
                #mismatches.append((protein_id, residue_position, actual_residue, expected_residue))

    return mismatches

In [15]:
mismatches = verify_sequences(concat_dihedrals, uniprot_protein_sequences)
mismatches

Q99J99 is not one of the completed sequences we queried from UniProt 1
Q8R5C0 is not one of the completed sequences we queried from UniProt 2


Unnamed: 0,Protein ID,Residue Position,AlphaFold Residue,UniProt Residue
0,P27635,202,N,S
1,Q9NX55,2,R,A
2,Q9NX55,3,R,T
3,Q9NX55,4,R,E
4,Q9NX55,6,E,D
...,...,...,...,...
396,Q8R0F8,223,K,X
397,Q8R0F8,224,R,X
398,Q8R0F8,225,S,X
399,Q8R0F8,226,E,X


In [16]:
unique_mismatches = mismatches['Protein ID'].unique()
print(unique_mismatches)
print(len(unique_mismatches))

['P27635' 'Q9NX55' 'Q9ULT8' 'P62861' 'O94851' 'Q00341' 'O75396' 'Q8R0F8']
8


# Ramachandran Analysis - RvsS Dataset

In [17]:
RvsS = pd.read_csv('../RvsS/RvsS_peptides_with_alphafold.csv').drop(columns = ['Unnamed: 0'])
pd.set_option('display.max_columns', None)
display(RvsS)
pd.reset_option('display.max_columns')

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,4 Log2 Ratio HL,5 Log2 Ratio HL,6 Log2 Ratio HL,7 Log2 Ratio HL,8 Log2 Ratio HL,9 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,p-value,neglogp,Log2HL avg,label,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,y_coord_ca,y_coord_cb,y_coord_n,z_coord_c,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_2_180_pae,nAA_3_180_pae,nAA_4_180_pae,nAA_4.5_180_pae,nAA_5_180_pae,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae,nAA_2_180_pae_smooth10,nAA_3_180_pae_smooth10,nAA_4_180_pae_smooth10,nAA_4.5_180_pae_smooth10,nAA_5_180_pae_smooth10,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,,3.310961,,,,,,sp|Q8C196|CPSM_MOUSE,Q8C196,CPSM_MOUSE,Cps1,"Carbamoyl-phosphate synthase [ammonia], mitoch...",4.058191e-03,2.391668,3.289988,green,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575,12,AADTIGYPV,9,584,FAVESMEDALKAADTIGYPV,IRSAYALGGLGSGICPNKET,Q8C196,66,M,584,96.55,-14.238,-12.741,-11.922,-12.356,-21.547,-21.776,-20.583,-22.969,-8.140,-7.953,-8.470,-8.702,STRN,STRN,0,0,1,0,0,0,0,0,0,2,3,3,4,7,8,11,31,85,144,5,0.0,0.0,0.095238,0.095238,2.000000,2.142857,2.380952,3.666667,4.857143,6.190476,7.857143,22.190476,59.952381,120.857143,6.285714,0
1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,,,3.309030,2.577856,2.251824,,,sp|Q07417|ACADS_MOUSE,Q07417,ACADS_MOUSE,Acads,"Short-chain specific acyl-CoA dehydrogenase, m...",9.647931e-06,5.015566,2.789195,green,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...,262,10,IA,2,264,DCRIPKENLLGEPGMGFKIA,QTLDMGRIGIASQALGIAQA,Q07417,47,M,264,98.03,-3.998,-3.218,-3.663,-1.765,5.685,7.004,7.867,6.799,-16.050,-16.071,-17.261,-16.097,HELX_RH_AL_P,HELX,0,1,0,0,0,0,0,0,0,2,2,2,5,6,8,8,19,90,173,1,0.0,0.0,0.000000,0.000000,2.047619,2.047619,2.761905,4.857143,6.285714,8.047619,9.047619,30.571429,90.857143,176.571429,8.285714,0
2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,2.383482,2.727931,,,,,,sp|Q91YI0|ARLY_MOUSE,Q91YI0,ARLY_MOUSE,Asl,Argininosuccinate lyase,4.283587e-02,1.368192,2.555706,green,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...,12,11,FVGAVDPI,8,20,MASESGKLWGGRFVGAVDPI,EKFNSSISYDRHLWNVDVQG,Q91YI0,79,M,20,93.06,0.281,1.383,0.792,2.350,4.013,3.236,2.284,4.136,34.874,35.596,36.649,36.227,HELX_RH_AL_P,HELX,0,1,0,0,0,0,0,0,0,2,2,3,6,7,7,7,9,24,52,2,0.0,0.0,0.000000,0.000000,1.952381,2.000000,2.238095,3.285714,4.380952,5.238095,5.523810,11.238095,29.571429,61.952381,2.380952,0
3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,2.412394,,,2.559564,,2.839492,2.607501,sp|P50247|SAHH_MOUSE,P50247,SAHH_MOUSE,Ahcy,Adenosylhomocysteinase,8.561078e-10,9.067472,2.538998,green,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...,412,20,QAQYLG,6,418,LGKLNVKLTKLTEKQAQYLG,PINGPFKPDHYRY,P50247,26,M,418,96.03,-31.883,-31.393,-30.160,-32.451,-1.725,-2.820,-2.363,-3.286,-2.198,-3.157,-3.955,-4.067,BEND,BEND,1,0,0,0,0,0,0,0,0,2,2,2,3,4,5,6,15,24,39,5,0.0,0.0,0.095238,0.095238,2.000000,2.095238,2.095238,3.238095,4.428571,5.380952,5.952381,13.380952,27.047619,48.190476,3.047619,0
4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,2.450718,2.078252,,,,,,sp|P33267|CP2F2_MOUSE,P33267,CP2F2_MOUSE,Cyp2f2,Cytochrome P450 2F2,2.032232e-04,3.692027,2.389057,green,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...,358,12,FADVIP,6,364,SMPYTDAVIHEVQRFADVIP,NLPHRVTRDTPFRGFLIPKG,P33267,20,M,364,96.29,1.166,0.040,0.595,-0.973,1.621,2.637,4.050,2.668,-9.734,-9.982,-10.213,-8.915,TURN_TY1_P,TURN,0,0,0,1,0,0,0,0,0,2,2,3,3,6,7,9,27,104,207,6,0.0,0.0,0.000000,0.000000,2.000000,2.285714,2.857143,4.238095,6.095238,7.380952,8.904762,27.523810,85.285714,171.619048,7.809524,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,-1.407199,-3.203481,-2.025144,-1.873531,-1.746197,-1.613004,-1.760150,sp|Q9QXF8|GNMT_MOUSE,Q9QXF8,GNMT_MOUSE,Gnmt,Glycine N-methyltransferase,1.880219e-08,7.725792,-1.857135,yellow,MVDSVYRTRSLGVAAEGLPDQYADGEAARVWQLYIGDTRSRTAEYK...,213,17,AH,2,215,KSDLTKDITTSVLTVNNKAH,VTLDYTVQVPGTGRDGSPGF,Q9QXF8,104,M,215,95.79,17.127,17.414,18.275,18.120,-2.387,-2.142,-0.893,-3.274,-3.437,-1.957,-1.736,-1.371,STRN,STRN,0,0,1,0,0,0,0,0,0,2,2,3,4,6,9,10,25,60,104,2,0.0,0.0,0.000000,0.000000,2.000000,2.142857,2.523810,3.380952,4.428571,6.095238,7.666667,16.476190,42.238095,79.904762,2.380952,0
199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,-1.988603,,-1.791248,-1.912515,-1.928952,-1.891596,-1.881011,sp|Q8K3J1|NDUS8_MOUSE,Q8K3J1,NDUS8_MOUSE,Ndufs8,NADH dehydrogenase [ubiquinone] iron-sulfur pr...,7.034064e-14,13.152794,-1.871741,yellow,MYRLSSSMLPRALAQAMRTGHLNGQSLHSSAVAATYKYVNKKEQES...,41,10,KEQESEVD,8,49,SAVAATYKYVNKKEQESEVD,KSATDNAARILMWTELIRGL,Q8K3J1,72,M,49,82.35,-36.502,-36.519,-37.547,-36.824,10.719,11.823,11.524,13.124,50.492,51.559,52.662,50.954,HELX_RH_AL_P,HELX,0,1,0,0,0,0,0,0,0,1,1,2,2,2,2,2,7,11,19,0,0.0,0.0,0.000000,0.000000,1.333333,1.333333,2.000000,2.904762,3.238095,4.000000,4.238095,7.190476,12.523810,19.428571,0.857143,1
200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,-2.094407,-1.994187,-1.873064,-2.015018,-1.664986,-1.829820,-1.935236,sp|P63038|CH60_MOUSE,P63038,CH60_MOUSE,Hspd1,"60 kDa heat shock protein, mitochondrial",7.289026e-11,10.137330,-1.950460,yellow,MLRLPTVLRQMRPVSRALAPHLTRAYAKDVKFGADARALMLQGVDL...,141,15,RGV,3,144,KEGFEKISKGANPVEIRRGV,LAVDAVIAELKKQSKPVTTP,P63038,38,M,144,97.14,19.270,18.162,17.695,18.579,-5.132,-5.161,-6.600,-4.520,-13.015,-14.075,-14.333,-15.329,HELX_RH_AL_P,HELX,0,1,0,0,0,0,0,0,0,2,2,3,5,6,8,8,26,71,128,2,0.0,0.0,0.000000,0.000000,2.047619,2.142857,2.857143,5.285714,6.238095,8.476190,8.952381,26.095238,62.571429,115.380952,7.571429,0
201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,,-2.131793,-2.980788,-2.198685,,-2.326194,-2.205174,sp|Q8BMS1|ECHA_MOUSE,Q8BMS1,ECHA_MOUSE,Hadha,"Trifunctional enzyme subunit alpha, mitochondrial",5.582467e-09,8.253174,-2.270072,yellow,MVASRAIGSLSRFSAFRILRSRGCICRSFTTSSALLTRTHINYGVK...,505,11,,0,505,AVSKRPEKVIGMHYFSPVDK,QLLEIITTDKTSKDTTASAV,Q8BMS1,65,M,505,95.41,2.614,3.983,5.051,3.903,-3.420,-4.006,-3.584,-5.460,-5.193,-4.857,-5.870,-4.770,unstructured,unstructured,0,0,0,0,1,0,0,0,0,2,2,2,2,5,6,7,15,55,158,8,0.0,0.0,0.000000,0.095238,2.000000,2.095238,2.571429,3.428571,4.904762,6.761905,8.190476,25.952381,80.380952,173.428571,7.190476,0


In [18]:
# None of the proteins in the RvsS dataset have mismatched sequences
RvsS[RvsS['Protein ID'].isin(unique_mismatches)]

Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,3 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR


In [19]:
def extract_PSI_and_PHI(residue_df, psi_and_phi_df, verbose=False):
    df = pd.DataFrame(columns=['PSI Radians', 'PHI Radians', 'PSI Degrees', 'PHI Degrees'])
    for _, row in residue_df.iterrows():
        protein_id = row['Protein ID']
        methionine_posn = row['Methionine Location'] + 1
        row_of_interest = psi_and_phi_df.loc[(psi_and_phi_df['Protein ID'] == protein_id) & (psi_and_phi_df['Residue Position'] == methionine_posn)]
        if verbose:
            print(row_of_interest)
        new_row = {'PSI Radians': float('nan'), 'PHI Radians':float('nan'), 'PSI Degrees': float('nan'), 'PHI Degrees': float('nan')}
        if row_of_interest.shape[0] == 1 and row_of_interest['Residue Name'].iloc[0] == 'MET':
            new_row = {'PSI Radians': row_of_interest['PSI'].iloc[0], 'PHI Radians':row_of_interest['PHI'].iloc[0], 'PSI Degrees': row_of_interest['PSI degrees'].iloc[0], 'PHI Degrees': row_of_interest['PHI degrees'].iloc[0]}
        df = df._append(new_row, ignore_index = True)
    return df

In [20]:
RvsS_psi_phi_map = extract_PSI_and_PHI(RvsS, concat_dihedrals)

KeyError: 'PSI degrees'

In [None]:
RvsS_psi_phi_map

In [None]:
sum(RvsS_psi_phi_map['PHI Degrees'].isna())

In [None]:
RvsS_with_PSI_and_PHI = pd.concat([RvsS, RvsS_psi_phi_map], axis=1)
RvsS_with_PSI_and_PHI

In [None]:
RvsS_with_PSI_and_PHI.to_csv('RvsS_with_PSI_and_PHI.csv', index=False)

# Ramachandran Analysis - MsrAKD Dataset

In [None]:
MsrAKD_with_alphafold = pd.read_csv('../MsrKD/MsrAKD_with_alphafold.csv').drop(columns=['Unnamed: 0'])
MsrAKD_with_alphafold

In [None]:
# Some of the proteins in the MsrAKD dataset have sequence mismatches - drop these
MsrAKD_with_alphafold[MsrAKD_with_alphafold['Protein ID'].isin(unique_mismatches)]

In [None]:
MsrAKD_with_alphafold_wo_mismatches = MsrAKD_with_alphafold.drop(MsrAKD_with_alphafold[MsrAKD_with_alphafold['Protein ID'].isin(unique_mismatches)].index).reset_index(drop = True)
MsrAKD_with_alphafold_wo_mismatches

In [None]:
MsrAKD_psi_and_psi_map = extract_PSI_and_PHI(MsrAKD_with_alphafold_wo_mismatches, concat_dihedrals)

In [None]:
sum(MsrAKD_psi_and_psi_map['PHI Degrees'].isna())

In [None]:
MsrAKD_psi_and_psi_map

In [None]:
MsrAKD_with_PSI_and_PHI = pd.concat([MsrAKD_with_alphafold_wo_mismatches, MsrAKD_psi_and_psi_map], axis=1)
MsrAKD_with_PSI_and_PHI

In [None]:
MsrAKD_with_PSI_and_PHI.to_csv('MsrAKD_with_PSI_and_PHI.csv')

# Ramachandran Analysis - MsrBKD Dataset

In [None]:
MsrBKD_with_alphafold = pd.read_csv('../MsrKD/MsrB2KD_with_alphafold.csv').drop(columns=['Unnamed: 0'])
MsrBKD_with_alphafold

In [None]:
# Some of the proteins in the MsrAKD dataset have sequence mismatches - drop these
MsrBKD_with_alphafold[MsrBKD_with_alphafold['Protein ID'].isin(unique_mismatches)]

In [None]:
MsrBKD_with_alphafold_wo_mismatches = MsrBKD_with_alphafold.drop(MsrBKD_with_alphafold[MsrBKD_with_alphafold['Protein ID'].isin(unique_mismatches)].index).reset_index(drop = True)
MsrBKD_with_alphafold_wo_mismatches

In [None]:
MsrBKD_psi_and_psi_map = extract_PSI_and_PHI(MsrBKD_with_alphafold_wo_mismatches, concat_dihedrals)

In [None]:
sum(MsrBKD_psi_and_psi_map['PHI Degrees'].isna())

In [None]:
MsrBKD_psi_and_psi_map

In [None]:
MsrBKD_with_PSI_and_PHI = pd.concat([MsrBKD_with_alphafold_wo_mismatches, MsrBKD_psi_and_psi_map], axis=1)
MsrBKD_with_PSI_and_PHI

In [None]:
MsrBKD_with_PSI_and_PHI.to_csv('MsrBKD_with_PSI_and_PHI.csv', index=False)

# End