# Imports

In [1]:
import Bio.PDB
import pandas as pd
import numpy as np
import os
import re
import requests as r
from io import StringIO
from Bio import SeqIO

# Ramachandran Analysis - Helper/Prelim Section

In [2]:
# Set correct pathing
curr_dir_path_str = "./"
curr_dir_path = os.path.abspath(curr_dir_path_str)

global_data_path_str = "../global_data"
global_data_path = os.path.abspath(global_data_path_str)

print("Current Directory: " + curr_dir_path)
print("Global Data Directory: " + global_data_path)

Current Directory: /Users/amralazali/MetML/ChURRO_revisions
Global Data Directory: /Users/amralazali/MetML/global_data


In [3]:
# Helper function to extract phi/psi angles from a .cif file 
def get_phi_and_psi(Protein_ID, CIF_file_path, verbose=False):
    """Extracts the psi and phi angle for each residue in protein's 
    alphofold structure.

    Args:
      Protein_ID: uniprot id of protein.
      CIF_file_path: file path to proteins mmCIF file.

    Returns:
      Dataframe that stores the dihedral angles of each residue in the 
      specified protein.

    """

    df = pd.DataFrame()
    for model in Bio.PDB.MMCIFParser().get_structure(Protein_ID, CIF_file_path):
        for chain in model:
            polypeptides = Bio.PDB.PPBuilder().build_peptides(chain)
            res_index_list = []
            res_name_list = []
            phi_list = []
            psi_list = []
            for poly_index, poly in enumerate(polypeptides):
                if verbose:
                  print("Model %s Chain %s" % (str(model.id), str(chain.id)))
                  print ("(part %i of %i)" % (poly_index+1, len(polypeptides)))
                  print ("length %i" % (len(poly)))
                  print ("from %s%i" % (poly[0].resname, poly[0].id[1]))
                  print ("to %s%i" % (poly[-1].resname, poly[-1].id[1]))
                phi_psi = poly.get_phi_psi_list()
                res_index_sublist = []
                res_name_sublist = []
                phi_sublist = []
                psi_sublist = []
                for res_index, residue in enumerate(poly) :
                    res_name = "%s%i" % (residue.resname, residue.id[1])
                    # print(res_name, phi_psi[res_index])
                    res_index_sublist.append(residue.id[1])
                    res_name_sublist.append(residue.resname)
                    phi_sublist.append(phi_psi[res_index][0])
                    psi_sublist.append(phi_psi[res_index][1])
                res_index_list.extend(res_index_sublist)
                res_name_list.extend(res_name_sublist)
                phi_list.extend(phi_sublist)
                psi_list.extend(psi_sublist)
            df['Protein ID'] = [Protein_ID] * len(res_index_list)
            df['Residue Name'] = res_name_list
            df['Residue Position'] = res_index_list
            df['PHI'] = phi_list
            df['PSI'] = psi_list
    return df
    


In [4]:
directory = '../alphafold_data/cif'
file_paths = []

for root, directories, files in os.walk(directory):
    for file in files:
        file_path = os.path.join(root, file)
        file_paths.append(file_path)

print(file_paths)

['../alphafold_data/cif/O14617.cif', '../alphafold_data/cif/Q9D404.cif', '../alphafold_data/cif/P19525.cif', '../alphafold_data/cif/P62829.cif', '../alphafold_data/cif/Q96PK6.cif', '../alphafold_data/cif/Q9Z0X1.cif', '../alphafold_data/cif/O60814.cif', '../alphafold_data/cif/Q99MB2.cif', '../alphafold_data/cif/P07900.cif', '../alphafold_data/cif/Q8C6I2.cif', '../alphafold_data/cif/Q86YV0.cif', '../alphafold_data/cif/Q5JQC4.cif', '../alphafold_data/cif/Q8QZT1.cif', '../alphafold_data/cif/Q9CQ92.cif', '../alphafold_data/cif/P47897.cif', '../alphafold_data/cif/Q14203.cif', '../alphafold_data/cif/O75955.cif', '../alphafold_data/cif/Q9Y5B6.cif', '../alphafold_data/cif/Q9CQN1.cif', '../alphafold_data/cif/P15121.cif', '../alphafold_data/cif/Q969T9.cif', '../alphafold_data/cif/P38919.cif', '../alphafold_data/cif/Q920A5.cif', '../alphafold_data/cif/Q9NXV6.cif', '../alphafold_data/cif/Q9P2K3.cif', '../alphafold_data/cif/Q53HL2.cif', '../alphafold_data/cif/Q96E09.cif', '../alphafold_data/cif/Q8C1

In [5]:
pattern = r'/([^/]+)\.cif$'
protein_ids =[]

for file_path in file_paths:
    match = re.search(pattern, file_path)
    desired_substring = match.group(1)
    protein_ids.append(desired_substring)

print(protein_ids)

['O14617', 'Q9D404', 'P19525', 'P62829', 'Q96PK6', 'Q9Z0X1', 'O60814', 'Q99MB2', 'P07900', 'Q8C6I2', 'Q86YV0', 'Q5JQC4', 'Q8QZT1', 'Q9CQ92', 'P47897', 'Q14203', 'O75955', 'Q9Y5B6', 'Q9CQN1', 'P15121', 'Q969T9', 'P38919', 'Q920A5', 'Q9NXV6', 'Q9P2K3', 'Q53HL2', 'Q96E09', 'Q8C1W2', 'Q1ED39', 'P27144', 'P42125', 'Q13418', 'P50247', 'P12074', 'O75821', 'Q9Y6Q5', 'Q08945', 'Q5HZI9', 'P21333', 'P24539', 'O60784', 'O14777', 'Q9CPQ3', 'O43175', 'Q9UN86', 'Q9Y6I3', 'Q8IUD2', 'P56391', 'Q9NRY2', 'Q923K4', 'Q9CRD0', 'O14950', 'P35637', 'Q9BQ04', 'Q9Y3U8', 'A2ATU0', 'P39019', 'P62753', 'Q9HD42', 'Q9BVA1', 'P54578', 'Q9CWV0', 'Q99536', 'P12277', 'O14776', 'Q9UI36', 'P14174', 'Q16777', 'Q8N5G2', 'P07108', 'Q14160', 'Q6YN16', 'O75175', 'P17858', 'Q99LP6', 'P09496', 'P56192', 'Q8WXI9', 'Q9NYP7', 'Q9CW42', 'Q15054', 'Q9UNZ5', 'O00267', 'Q8BHE8', 'Q8C2E4', 'Q9UL46', 'Q9D773', 'Q9CQC7', 'Q13586', 'Q15691', 'Q8BWF0', 'Q9CZS1', 'Q62425', 'Q9CXJ1', 'P52294', 'Q9P2E9', 'P05455', 'P50454', 'Q60597', 'Q7L4I2',

In [6]:
list_of_dfs = []

for cif_file, prot_id in zip(file_paths, protein_ids):
    list_of_dfs.append(get_phi_and_psi(prot_id, cif_file))

In [7]:
concat_dihedrals = pd.concat(list_of_dfs)
concat_dihedrals['PSI degrees'] = np.rad2deg(concat_dihedrals['PSI'])
concat_dihedrals['PHI degrees'] = np.rad2deg(concat_dihedrals['PHI'])
concat_dihedrals

Unnamed: 0,Protein ID,Residue Name,Residue Position,PHI,PSI,PSI degrees,PHI degrees
0,O14617,MET,1,,-1.015463,-58.181772,
1,O14617,ALA,2,-0.954294,-0.783044,-44.865122,-54.677005
2,O14617,LEU,3,-1.047323,-0.604341,-34.626186,-60.007208
3,O14617,LYS,4,-1.189739,-0.662262,-37.944790,-68.167023
4,O14617,MET,5,-1.213749,-0.669750,-38.373822,-69.542706
...,...,...,...,...,...,...,...
591,P41743,ALA,592,-1.460363,-0.203497,-11.659505,-83.672639
592,P41743,GLU,593,-1.886433,-0.425161,-24.359948,-108.084665
593,P41743,GLU,594,-1.403320,-0.176936,-10.137675,-80.404323
594,P41743,CYS,595,-1.599052,0.315555,18.079959,-91.618920


In [8]:
# concat_dihedrals.to_csv(os.path.join(global_data_path, "revised_dihedral_angles.csv"), index=False)

Sanity Check: Here, we verify that the AA sequences extracted from the AlphaFold database match those found in the UniProt database

In [9]:
concat_dihedrals = pd.read_csv(os.path.join(global_data_path, "revised_dihedral_angles.csv"))
concat_dihedrals

Unnamed: 0,Protein ID,Residue Name,Residue Position,PHI,PSI,PSI degrees,PHI degrees
0,O14617,MET,1,,-1.015463,-58.181772,
1,O14617,ALA,2,-0.954294,-0.783044,-44.865122,-54.677005
2,O14617,LEU,3,-1.047323,-0.604341,-34.626186,-60.007208
3,O14617,LYS,4,-1.189739,-0.662262,-37.944790,-68.167023
4,O14617,MET,5,-1.213749,-0.669750,-38.373822,-69.542706
...,...,...,...,...,...,...,...
1090026,P41743,ALA,592,-1.460363,-0.203497,-11.659505,-83.672639
1090027,P41743,GLU,593,-1.886433,-0.425161,-24.359948,-108.084665
1090028,P41743,GLU,594,-1.403320,-0.176936,-10.137675,-80.404323
1090029,P41743,CYS,595,-1.599052,0.315555,18.079959,-91.618920


In [11]:
len(concat_dihedrals['Protein ID'].unique())

1984

In [12]:
# Helper function to get full amino acid sequence for a protein
def get_complete_sequence(cID):
    baseUrl="http://www.uniprot.org/uniprot/"
    currentUrl=baseUrl+cID+".fasta"
    response = r.post(currentUrl)
    cData=''.join(response.text)
    
    Seq=StringIO(cData)
    pSeq=list(SeqIO.parse(Seq,"fasta"))

    return str(pSeq[0].seq)

In [21]:
uniprot_protein_sequences = pd.read_csv('../global_data/complete_sequence_cache.csv')

In [23]:
uniprot_protein_sequences

Unnamed: 0.1,Unnamed: 0,Protein ID,Complete Sequence
0,0,Q8C196,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...
1,1,Q07417,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...
2,2,Q91YI0,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...
3,3,P50247,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...
4,4,P33267,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...
...,...,...,...
2131,1,P22392,MANLERTFIAIKPDGVQRGLVGEIIKRFEQKGFRLVAMKFLRASEE...
2132,2,O00116,MAEAAAAAGGTGLGAGASYGSAADRDRDPDPDRAGRRLRVLSGHLL...
2133,3,P04792,MTERRVPFSLLRGPSWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWL...
2134,4,Q9NR28,MAALKSWLSRSVTSFFRYRQCLCVPVVANFKKRCFSELIRPWHKTV...


In [24]:
amino_acid_map = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLU": "E",
    "GLN": "Q",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    "PHE": "F",
    "PRO": "P",
    "SER": "S",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
    "SEC": "U",
    "PYL": "O"
}

In [25]:
# Function to verify sequences
def verify_sequences(df_residues, df_sequences, verbose=False):
    mismatches = pd.DataFrame(
        columns=['Protein ID', 'Residue Position', 'AlphaFold Residue', 'UniProt Residue'])

    missing_sequences = 0
    for protein_id in df_residues['Protein ID'].unique():
        # Get the complete sequence for the current Protein ID
        if protein_id in df_sequences['Protein ID'].values:
            complete_seq = df_sequences[df_sequences['Protein ID'] == protein_id]['Complete Sequence'].values[0]
        else:
            missing_sequences += 1
            print(f'{protein_id} is not one of the completed sequences we queried from UniProt', missing_sequences)
            continue

        # Filter residues for the current Protein ID
        residues = df_residues[df_residues['Protein ID'] == protein_id]

        for _, row in residues.iterrows():
            residue_name = row['Residue Name']
            residue_position = row['Residue Position']
            
            # Get the expected residue from the complete sequence
            if (residue_position - 1) <  len(complete_seq):
                expected_residue = complete_seq[residue_position - 1]  # position - 1 for zero-based indexing
                actual_residue = amino_acid_map[residue_name]
            else:
                expected_residue = 'X'
                actual_residue = amino_acid_map[residue_name]
            
            if expected_residue != actual_residue:
                if verbose:
                    print(protein_id)
                mismatches = mismatches._append({'Protein ID':protein_id,
                        'Residue Position':residue_position,
                        'AlphaFold Residue':actual_residue,
                        'UniProt Residue': expected_residue
                        }, ignore_index = True)
                #mismatches.loc[len(mismatches.index)] = [protein_id, residue_position, actual_residue, expected_residue]
                #mismatches.append((protein_id, residue_position, actual_residue, expected_residue))

    return mismatches

In [26]:
mismatches = verify_sequences(concat_dihedrals, uniprot_protein_sequences)
mismatches

Q99J99 is not one of the completed sequences we queried from UniProt 1
Q8R5C0 is not one of the completed sequences we queried from UniProt 2


Unnamed: 0,Protein ID,Residue Position,AlphaFold Residue,UniProt Residue
0,Q14160,674,V,E
1,P27635,202,N,S
2,Q8NFD5,3,H,A
3,Q8NFD5,4,N,R
4,Q8NFD5,6,G,A
...,...,...,...,...
4981,Q8R0F8,224,R,X
4982,Q8R0F8,225,S,X
4983,Q8R0F8,226,E,X
4984,Q8R0F8,227,Y,X


In [27]:
unique_mismatches = mismatches['Protein ID'].unique()
print(unique_mismatches)
print(len(unique_mismatches))

['Q14160' 'P27635' 'Q8NFD5' 'Q9NX55' 'Q9Y2D5' 'Q9ULT8' 'Q8NFV4' 'Q92616'
 'P62861' 'P49411' 'Q10567' 'P26599' 'O94851' 'Q00341' 'A0A2R8Y4L2'
 'O14929' 'Q9NX58' 'O75396' 'Q9UJX3' 'Q8R0F8' 'Q96BZ8']
21


# Ramachandran Analysis - ChURRO_1 Dataset

In [45]:
CHURRO_1_df = pd.read_csv('./ChURRO_1_with_alphafold.csv').drop(columns = ['Unnamed: 0'])
pd.set_option('display.max_columns', None)
display(CHURRO_1_df)
pd.reset_option('display.max_columns')

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,Protein,Protein ID,Entry Name,Gene,Protein Description,pvalue,avg ratio,neglogpval,Site,Label,Color,Complete Sequence,Peptide Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,y_coord_ca,y_coord_cb,y_coord_n,z_coord_c,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,nAA_2_180_pae,nAA_3_180_pae,nAA_4_180_pae,nAA_4.5_180_pae,nAA_5_180_pae,nAA_5.5_180_pae,nAA_6_180_pae,nAA_6.5_180_pae,nAA_7_180_pae,nAA_7.5_180_pae,nAA_8_180_pae,nAA_12_180_pae,nAA_18_180_pae,nAA_24_180_pae,nAA_12_70_pae,nAA_2_180_pae_smooth10,nAA_3_180_pae_smooth10,nAA_4_180_pae_smooth10,nAA_4.5_180_pae_smooth10,nAA_5_180_pae_smooth10,nAA_5.5_180_pae_smooth10,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,sp|P11142|HSP7C_HUMAN,P11142,HSP7C_HUMAN,HSPA8,Heat shock cognate 71 kDa protein,5.529095e-04,4.131862,3.257346,M549,HSP7C_M549,red,MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...,NSLESYAFNMK,539,11,NSLESYAFN,9,548,DEKQRDKVSSKNSLESYAFN,KATVEDEKLQGKINDEDKQK,P11142,99.0,M,548.0,89.69,37.293,36.147,36.471,34.907,18.565,18.150,16.869,17.958,13.623,14.548,15.317,13.811,HELX_RH_AL_P,HELX,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,3.0,6.0,6.0,8.0,8.0,20.0,54.0,74.0,9.0,0.0,0.0,0.000000,0.000000,2.000000,2.000000,2.095238,4.380952,5.285714,6.714286,6.952381,15.666667,40.666667,61.238095,4.380952,0.0
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,sp|Q07065|CKAP4_HUMAN,Q07065,CKAP4_HUMAN,CKAP4,Cytoskeleton-associated protein 4,1.337810e-07,3.424534,6.873606,M423,CKAP4_M423,red,MPSAKQRGSKGGHGAASPSEKGAHPSGGADDVAKKPPPAPQQPPPP...,LQHVEDGVLSMQVASAR,412,17,LQHVEDGVLS,10,422,QQKSQGLDSRLQHVEDGVLS,QVASARQTESLESLLSKSQE,Q07065,306.0,M,422.0,71.71,-75.225,-76.396,-77.149,-77.368,-23.796,-23.479,-22.216,-24.573,41.658,40.716,41.163,40.667,HELX_RH_AL_P,HELX,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,4.0,6.0,8.0,8.0,12.0,27.0,46.0,3.0,0.0,0.0,0.000000,0.000000,2.000000,2.000000,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0.0
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,sp|Q9Y490|TLN1_HUMAN,Q9Y490,TLN1_HUMAN,TLN1,Talin-1,1.377828e-03,2.946435,2.860805,M1759,TLN1_M1759,red,MVALSLKISIGNVVKTMQFEPSTMVYDACRIIRERIPEAPAGPPSD...,TLSHPQQMALLDQTK,1751,15,TLSHPQQ,7,1758,EPLTLAAVGAASKTLSHPQQ,ALLDQTKTLAESALQLLYTA,Q9Y490,550.0,M,1758.0,89.36,-10.855,-10.915,-10.721,-9.924,22.782,24.283,24.555,25.018,-10.557,-10.846,-12.345,-10.051,HELX_RH_AL_P,HELX,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,5.0,7.0,8.0,8.0,16.0,49.0,92.0,2.0,0.0,0.0,0.000000,0.000000,2.000000,2.000000,2.380952,3.714286,5.238095,6.619048,7.047619,18.047619,57.952381,95.047619,5.380952,0.0
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,sp|P99999|CYC_HUMAN,P99999,CYC_HUMAN,CYCS,Cytochrome c,2.391139e-03,2.876139,2.621395,M66,CYC_M66,red,MGDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...,GIIWGEDTLMEYLENPK,56,17,GIIWGEDTL,9,65,GYSYTAANKNKGIIWGEDTL,EYLENPKKYIPGTKMIFVGI,P99999,288.0,M,65.0,98.52,-3.727,-4.683,-6.135,-4.359,-5.119,-3.953,-4.466,-3.242,2.159,1.908,1.881,0.670,HELX_RH_AL_P,HELX,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,4.0,6.0,8.0,9.0,26.0,61.0,94.0,1.0,0.0,0.0,0.000000,0.000000,2.000000,2.047619,2.380952,3.857143,6.000000,7.285714,8.190476,20.333333,51.047619,87.857143,4.619048,0.0
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,sp|P00374|DYR_HUMAN,P00374,DYR_HUMAN,DHFR,Dihydrofolate reductase,1.270312e-02,2.831927,1.896090,M112,DYR_M112,red,MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEG...,VDMVWIVGGSSVYK,109,14,VD,2,111,RSLDDALKLTEQPELANKVD,VWIVGGSSVYKEAMNHPGHL,P00374,66.0,M,111.0,96.12,-9.211,-8.992,-7.786,-8.817,5.026,6.276,7.064,5.967,7.772,8.629,8.105,10.054,STRN,STRN,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,3.0,3.0,3.0,5.0,7.0,27.0,70.0,123.0,8.0,0.0,0.0,0.047619,0.047619,2.000000,2.000000,2.476190,3.333333,4.857143,6.666667,8.095238,21.428571,59.619048,108.380952,5.523810,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1022,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,sp|Q9H444|CHM4B_HUMAN,Q9H444,CHM4B_HUMAN,CHMP4B,Charged multivesicular body protein 4b,9.760989e-01,-0.002733,0.010506,M35,CHM4B_M35,grey,MSVFGKLFGAGGGKAGKGGPTPQEAIQRLRDTEEMLSKKQEFLEKK...,LRDTEEMLSK,28,10,LRDTEE,6,34,AGKGGPTPQEAIQRLRDTEE,LSKKQEFLEKKIEQELTAAK,Q9H444,481.0,M,34.0,97.87,11.882,11.620,12.417,10.198,9.090,9.850,9.267,9.861,-2.216,-0.910,0.265,-0.558,HELX_RH_AL_P,HELX,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,5.0,6.0,8.0,8.0,16.0,35.0,55.0,4.0,0.0,0.0,0.000000,0.000000,2.000000,2.000000,2.428571,5.095238,6.047619,8.380952,8.476190,18.142857,36.428571,52.190476,4.619048,0.0
1023,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,sp|Q15233|NONO_HUMAN,Q15233,NONO_HUMAN,NONO,Non-POU domain-containing octamer-binding protein,9.809736e-01,0.002278,0.008343,M441,NONO_M441,grey,MQSNKTFNLEKQNHTPRKHHQHHHQQQHHQQQQQQPPPPPIPANGQ...,FGQAATMEGIGAIGGTPPAFNR,434,22,FGQAAT,6,440,PDGTLGLTPPTTERFGQAAT,EGIGAIGGTPPAFNRAAPGA,Q15233,361.0,M,440.0,46.92,-38.877,-40.364,-40.886,-40.628,-17.279,-17.076,-15.706,-17.258,-25.729,-25.385,-25.856,-23.949,unstructured,unstructured,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,4.0,6.0,10.0,0.0,0.0,0.0,0.000000,0.190476,1.095238,1.285714,2.000000,2.000000,2.000000,2.000000,2.000000,4.000000,6.285714,10.000000,0.095238,1.0
1024,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,sp|Q86UP2|KTN1_HUMAN,Q86UP2,KTN1_HUMAN,KTN1,Kinectin,9.825290e-01,-0.008675,0.007655,M406,KTN1_M406,grey,MEFYESAYFIVLIPSIVITVIFLFFWLFMKETLYDEVLAKQKREQK...,IHVSYQETQQMQMK,393,14,IHVSYQETQQMQ,12,405,EHNVFQNKIHVSYQETQQMQ,KFQQVREQMEAEIAHLKQEN,Q86UP2,402.0,M,405.0,85.68,4.533,3.747,4.416,2.340,27.322,28.247,28.263,27.838,9.502,10.447,11.832,10.583,HELX_RH_AL_P,HELX,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,4.0,5.0,8.0,8.0,12.0,20.0,27.0,2.0,0.0,0.0,0.000000,0.000000,2.000000,2.000000,2.571429,5.285714,6.095238,7.952381,8.000000,12.047619,19.904762,27.476190,2.142857,1.0
1025,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,sp|P51148|RAB5C_HUMAN,P51148,RAB5C_HUMAN,RAB5C,Ras-related protein Rab-5C,9.893231e-01,-0.012026,0.004662,M89,RAB5C_M89,grey,MAGRGGAARPNGPAAGNKICQFKLVLLGESAVGKSSLVLRFVKGQF...,YHSLAPMYYR,82,10,YHSLAP,6,88,TVKFEIWDTAGQERYHSLAP,YYRGAQAAIVVYDITNTDTF,P51148,221.0,M,88.0,94.05,-0.736,0.581,0.992,1.668,11.471,11.845,13.287,10.953,10.816,11.513,11.189,11.114,HELX_RH_AL_P,HELX,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,4.0,5.0,5.0,6.0,9.0,46.0,78.0,2.0,0.0,0.0,0.000000,0.000000,2.000000,2.000000,2.523810,3.428571,4.619048,5.952381,6.904762,18.904762,55.714286,94.666667,4.857143,0.0


In [46]:
# 9 of the proteins in the CHURRO_1 dataset have mismatched sequences, so we remove them
clean_CHURRO_1 = CHURRO_1_df[CHURRO_1_df['Protein ID'].isin(unique_mismatches) == False].reset_index(drop=True)
clean_CHURRO_1

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,...,2.095238,4.380952,5.285714,6.714286,6.952381,15.666667,40.666667,61.238095,4.380952,0.0
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0.0
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,...,2.380952,3.714286,5.238095,6.619048,7.047619,18.047619,57.952381,95.047619,5.380952,0.0
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,...,2.380952,3.857143,6.000000,7.285714,8.190476,20.333333,51.047619,87.857143,4.619048,0.0
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,...,2.476190,3.333333,4.857143,6.666667,8.095238,21.428571,59.619048,108.380952,5.523810,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,...,2.428571,5.095238,6.047619,8.380952,8.476190,18.142857,36.428571,52.190476,4.619048,0.0
1014,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,...,2.000000,2.000000,2.000000,2.000000,2.000000,4.000000,6.285714,10.000000,0.095238,1.0
1015,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,...,2.571429,5.285714,6.095238,7.952381,8.000000,12.047619,19.904762,27.476190,2.142857,1.0
1016,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,...,2.523810,3.428571,4.619048,5.952381,6.904762,18.904762,55.714286,94.666667,4.857143,0.0


In [50]:
def extract_PSI_and_PHI(residue_df, psi_and_phi_df, verbose=False):
    df = pd.DataFrame(columns=['PSI Radians', 'PHI Radians', 'PSI Degrees', 'PHI Degrees'])
    for _, row in residue_df.iterrows():
        protein_id = row['Protein ID']
        methionine_posn = row['Methionine Location'] + 1
        row_of_interest = psi_and_phi_df.loc[(psi_and_phi_df['Protein ID'] == protein_id) & (psi_and_phi_df['Residue Position'] == methionine_posn)]
        if verbose:
            print(row_of_interest)
        new_row = {'PSI Radians': float('nan'), 'PHI Radians':float('nan'), 'PSI Degrees': float('nan'), 'PHI Degrees': float('nan')}
        if row_of_interest.shape[0] == 1 and row_of_interest['Residue Name'].iloc[0] == 'MET':
            new_row = {'PSI Radians': row_of_interest['PSI'].iloc[0], 'PHI Radians':row_of_interest['PHI'].iloc[0], 'PSI Degrees': row_of_interest['PSI degrees'].iloc[0], 'PHI Degrees': row_of_interest['PHI degrees'].iloc[0]}
        df = df._append(new_row, ignore_index = True)
    return df

In [51]:
CHURRO_1_psi_phi_map = extract_PSI_and_PHI(clean_CHURRO_1, concat_dihedrals, verbose=True)
CHURRO_1_psi_phi_map

       Protein ID Residue Name  Residue Position      PHI       PSI  \
194614     P11142          MET               549 -1.18129 -0.742821   

        PSI degrees  PHI degrees  
194614   -42.560482   -67.682903  
        Protein ID Residue Name  Residue Position       PHI       PSI  \
1028092     Q07065          MET               423 -1.069864 -0.781354   

         PSI degrees  PHI degrees  
1028092   -44.768303   -61.298705  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
509116     Q9Y490          MET              1759 -1.174102 -0.943332   

        PSI degrees  PHI degrees  
509116   -54.048956    -67.27111  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
355500     P99999          MET                66 -1.018011 -0.697202   

        PSI degrees  PHI degrees  
355500   -39.946739   -58.327732  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
321337     P00374          MET               112 -1.846402  2.6001

  df = df._append(new_row, ignore_index = True)


       Protein ID Residue Name  Residue Position      PHI       PSI  \
502239     P07437          MET               257 -1.85679 -0.428587   

        PSI degrees  PHI degrees  
502239   -24.556212  -106.386219  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
464491     Q15233          MET               326 -1.152084 -0.724554   

        PSI degrees  PHI degrees  
464491   -41.513904   -66.009532  
       Protein ID Residue Name  Residue Position       PHI      PSI  \
298915     P13693          MET               115 -1.110935 -0.60763   

        PSI degrees  PHI degrees  
298915   -34.814649   -63.651885  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
842559     P62820          MET               176 -1.242656 -0.431342   

        PSI degrees  PHI degrees  
842559   -24.714072   -71.198956  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
152869     P39023          MET               181 -2.245469  2.673912   


Unnamed: 0,PSI Radians,PHI Radians,PSI Degrees,PHI Degrees
0,-0.742821,-1.181290,-42.560482,-67.682903
1,-0.781354,-1.069864,-44.768303,-61.298705
2,-0.943332,-1.174102,-54.048956,-67.271110
3,-0.697202,-1.018011,-39.946739,-58.327732
4,2.600141,-1.846402,148.977115,-105.791046
...,...,...,...,...
1013,-0.761189,-1.120597,-43.612912,-64.205451
1014,2.177622,-2.618134,124.768543,-150.008032
1015,-0.762902,-1.032733,-43.711045,-59.171241
1016,-0.675357,-1.130696,-38.695083,-64.784116


In [52]:
sum(CHURRO_1_psi_phi_map['PHI Degrees'].isna())

0

In [53]:
CHURRO_1_with_PSI_and_PHI = pd.concat([clean_CHURRO_1, CHURRO_1_psi_phi_map], axis=1)
CHURRO_1_with_PSI_and_PHI

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR,PSI Radians,PHI Radians,PSI Degrees,PHI Degrees
0,NSLESYAFNM[577.3085]K,NSLESYAFNM[583.3160]K,,,3.408509,,4.603527,4.107654,,4.407757,...,6.952381,15.666667,40.666667,61.238095,4.380952,0.0,-0.742821,-1.181290,-42.560482,-67.682903
1,LQHVEDGVLSM[577.3085]QVASAR,LQHVEDGVLSM[583.3160]QVASAR,3.190389,3.374976,3.092695,2.789313,3.451202,4.105873,3.316610,4.075215,...,7.523810,11.952381,23.952381,43.190476,2.333333,0.0,-0.781354,-1.069864,-44.768303,-61.298705
2,TLSHPQQM[577.3085]ALLDQTK,TLSHPQQM[583.3160]ALLDQTK,2.766721,2.927959,3.144626,,,,,,...,7.047619,18.047619,57.952381,95.047619,5.380952,0.0,-0.943332,-1.174102,-54.048956,-67.271110
3,GIIWGEDTLM[577.3085]EYLENPK,GIIWGEDTLM[583.3160]EYLENPK,2.820631,2.664639,,3.143147,,,,,...,8.190476,20.333333,51.047619,87.857143,4.619048,0.0,-0.697202,-1.018011,-39.946739,-58.327732
4,VDM[577.3085]VWIVGGSSVYK,VDM[583.3160]VWIVGGSSVYK,,,2.888442,,,2.775411,,,...,8.095238,21.428571,59.619048,108.380952,5.523810,0.0,2.600141,-1.846402,148.977115,-105.791046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,LRDTEEM[577.3085]LSK,LRDTEEM[583.3160]LSK,0.138583,0.354235,-0.005030,0.084883,-0.302264,-0.280001,-0.009539,,...,8.476190,18.142857,36.428571,52.190476,4.619048,0.0,-0.761189,-1.120597,-43.612912,-64.205451
1014,FGQAATM[577.3085]EGIGAIGGTPPAFNR,FGQAATM[583.3160]EGIGAIGGTPPAFNR,0.094201,0.207053,0.114255,0.243672,0.047130,-0.064950,-0.586272,-0.036869,...,2.000000,4.000000,6.285714,10.000000,0.095238,1.0,2.177622,-2.618134,124.768543,-150.008032
1015,IHVSYQETQQM[15.9949]QM[577.3085]K,IHVSYQETQQM[15.9949]QM[583.3160]K,0.835447,0.089325,,0.549177,,-0.204260,-1.313063,,...,8.000000,12.047619,19.904762,27.476190,2.142857,1.0,-0.762902,-1.032733,-43.711045,-59.171241
1016,YHSLAPM[577.3085]YYR,YHSLAPM[583.3160]YYR,,,-0.729042,,,,0.704989,,...,6.904762,18.904762,55.714286,94.666667,4.857143,0.0,-0.675357,-1.130696,-38.695083,-64.784116


In [54]:
CHURRO_1_with_PSI_and_PHI.to_csv('ChURRO_1_with_alphafold_and_PSI_and_PHI_angles.csv', index=False)

# Ramachandran Analysis - ChURRO_2 Dataset

In [55]:
CHURRO_2_df = pd.read_csv('./ChURRO_2_with_alphafold.csv').drop(columns=['Unnamed: 0'])
CHURRO_2_df

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,...,2.761905,4.857143,6.238095,7.523810,9.095238,24.238095,59.190476,107.904762,6.238095,0
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,...,2.428571,3.952381,4.476190,5.523810,6.761905,18.714286,54.571429,106.047619,5.333333,0
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,...,2.523810,3.761905,4.666667,6.380952,7.238095,22.333333,70.333333,127.857143,6.619048,0
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,...,2.571429,5.380952,6.142857,8.238095,8.523810,19.190476,50.190476,111.476190,5.380952,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,TDFFIGGEEGM[649.3660]AEK,TDFFIGGEEGM[655.3735]AEK,0.008080,-0.318884,0.097640,-0.137602,0.183818,-0.011744,0.116157,0.026499,...,2.000000,3.285714,4.571429,5.571429,6.000000,11.476190,23.333333,37.857143,2.476190,0
610,EDIERM[649.3660]VQEAEK,EDIERM[655.3735]VQEAEK,-0.909764,,,,,,,,...,2.666667,5.000000,5.285714,6.761905,6.809524,12.857143,36.238095,81.238095,2.857143,0
611,SM[649.3660]PWNVDTLSK,SM[655.3735]PWNVDTLSK,0.276155,-0.230254,0.361472,0.135673,-0.067806,-0.089783,0.210097,-0.544967,...,2.333333,3.476190,3.523810,4.380952,4.857143,9.238095,22.095238,40.380952,1.619048,0
612,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,...,1.809524,1.809524,2.142857,2.666667,3.000000,8.047619,14.857143,20.857143,1.142857,1


In [58]:
# Some of the proteins in the ChURRO_2 dataset have sequence mismatches - drop these
CHURRO_2_df[CHURRO_2_df['Protein ID'].isin(unique_mismatches)]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
139,DLINRM[649.3660]DYVEINIDHK,DLINRM[655.3735]DYVEINIDHK,,,,0.569772,0.80832,0.473233,,0.347552,...,2.142857,2.619048,3.285714,3.952381,4.380952,10.571429,29.047619,54.857143,1.857143,0
340,AQAALQAVNSVQSGNLALAASAAAVDAGM[649.3660]AMAGQSPVLR,AQAALQAVNSVQSGNLALAASAAAVDAGM[655.3735]AMAGQSPVLR,0.519305,,,,,0.441238,,,...,2.0,2.142857,2.428571,2.619048,2.904762,7.333333,17.333333,27.142857,1.0,1
399,VADGLPLAASM[649.3660]QEDEQSGR,VADGLPLAASM[655.3735]QEDEQSGR,,,,,,0.298995,0.408374,,...,2.095238,2.761905,3.333333,4.238095,4.47619,12.333333,33.333333,60.52381,2.333333,0
570,ITLEGPTEDVNVAQEQIEGM[649.3660]VK,ITLEGPTEDVNVAQEQIEGM[655.3735]VK,,,0.432476,0.575591,,0.36963,-0.490661,-1.065496,...,2.333333,3.428571,4.380952,5.380952,6.0,12.190476,34.095238,58.761905,2.904762,0


In [60]:
clean_CHURRO_2 = CHURRO_2_df[CHURRO_2_df['Protein ID'].isin(unique_mismatches) == False].reset_index(drop=True)
clean_CHURRO_2

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,...,2.761905,4.857143,6.238095,7.523810,9.095238,24.238095,59.190476,107.904762,6.238095,0
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,...,2.428571,3.952381,4.476190,5.523810,6.761905,18.714286,54.571429,106.047619,5.333333,0
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,...,2.523810,3.761905,4.666667,6.380952,7.238095,22.333333,70.333333,127.857143,6.619048,0
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,...,2.571429,5.380952,6.142857,8.238095,8.523810,19.190476,50.190476,111.476190,5.380952,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,TDFFIGGEEGM[649.3660]AEK,TDFFIGGEEGM[655.3735]AEK,0.008080,-0.318884,0.097640,-0.137602,0.183818,-0.011744,0.116157,0.026499,...,2.000000,3.285714,4.571429,5.571429,6.000000,11.476190,23.333333,37.857143,2.476190,0
606,EDIERM[649.3660]VQEAEK,EDIERM[655.3735]VQEAEK,-0.909764,,,,,,,,...,2.666667,5.000000,5.285714,6.761905,6.809524,12.857143,36.238095,81.238095,2.857143,0
607,SM[649.3660]PWNVDTLSK,SM[655.3735]PWNVDTLSK,0.276155,-0.230254,0.361472,0.135673,-0.067806,-0.089783,0.210097,-0.544967,...,2.333333,3.476190,3.523810,4.380952,4.857143,9.238095,22.095238,40.380952,1.619048,0
608,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,...,1.809524,1.809524,2.142857,2.666667,3.000000,8.047619,14.857143,20.857143,1.142857,1


In [61]:
CHURRO_psi_and_psi_map = extract_PSI_and_PHI(clean_CHURRO_2, concat_dihedrals, verbose=True)

        Protein ID Residue Name  Residue Position       PHI       PSI  \
1028092     Q07065          MET               423 -1.069864 -0.781354   

         PSI degrees  PHI degrees  
1028092   -44.768303   -61.298705  
       Protein ID Residue Name  Residue Position      PHI       PSI  \
103535     P25786          MET                26 -1.10221 -0.587313   

        PSI degrees  PHI degrees  
103535   -33.650528   -63.151991  
      Protein ID Residue Name  Residue Position       PHI       PSI  \
93190     P08238          MET               466 -1.566903  2.594126   

       PSI degrees  PHI degrees  
93190    148.63248   -89.776905  
      Protein ID Residue Name  Residue Position       PHI       PSI  \
11080     P15121          MET               169 -1.016269 -0.789571   

       PSI degrees  PHI degrees  
11080   -45.239094   -58.227902  
      Protein ID Residue Name  Residue Position       PHI       PSI  \
74120     P34932          MET               604 -1.157256 -0.710617   

   

  df = df._append(new_row, ignore_index = True)


        Protein ID Residue Name  Residue Position       PHI       PSI  \
1079429     P12694          MET               391 -1.120475 -0.678972   

         PSI degrees  PHI degrees  
1079429   -38.902245   -64.198474  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
771481     Q15366          MET               251 -2.690206  1.563798   

        PSI degrees  PHI degrees  
771481    89.599047  -154.137425  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
929251     P14625          MET               154 -2.288282  2.397712   

        PSI degrees  PHI degrees  
929251   137.378791  -131.108894  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
616229     P15374          MET               213 -1.135298 -0.657131   

        PSI degrees  PHI degrees  
616229   -37.650809   -65.047766  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
707786     Q9H078          MET               298 -1.370159 -0.11

In [64]:
CHURRO_psi_and_psi_map

Unnamed: 0,PSI Radians,PHI Radians,PSI Degrees,PHI Degrees
0,-0.781354,-1.069864,-44.768303,-61.298705
1,-0.587313,-1.102210,-33.650528,-63.151991
2,2.594126,-1.566903,148.632480,-89.776905
3,-0.789571,-1.016269,-45.239094,-58.227902
4,-0.710617,-1.157256,-40.715336,-66.305895
...,...,...,...,...
605,-0.692984,-1.066275,-39.705080,-61.093071
606,-0.622047,-1.144587,-35.640643,-65.580007
607,2.254328,-1.198355,129.163484,-68.660701
608,-0.646178,-1.162867,-37.023279,-66.627367


In [62]:
sum(CHURRO_psi_and_psi_map['PHI Degrees'].isna())

0

In [63]:
CHURRO_2_with_PSI_and_PHI = pd.concat([clean_CHURRO_2, CHURRO_psi_and_psi_map], axis=1)
CHURRO_2_with_PSI_and_PHI

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR,PSI Radians,PHI Radians,PSI Degrees,PHI Degrees
0,LQHVEDGVLSM[649.3660]QVASAR,LQHVEDGVLSM[655.3735]QVASAR,,,,7.233420,6.853712,6.550813,,5.946325,...,7.523810,11.952381,23.952381,43.190476,2.333333,0,-0.781354,-1.069864,-44.768303,-61.298705
1,IHQIEYAM[649.3660]EAVK,IHQIEYAM[655.3735]EAVK,7.437563,,,5.649850,,7.006431,5.829334,,...,9.095238,24.238095,59.190476,107.904762,6.238095,0,-0.587313,-1.102210,-33.650528,-63.151991
2,YHTSQSGDEM[649.3660]TSLSEYVSR,YHTSQSGDEM[655.3735]TSLSEYVSR,,3.404261,3.587883,4.018111,3.963818,4.068696,3.961067,3.689166,...,6.761905,18.714286,54.571429,106.047619,5.333333,0,2.594126,-1.566903,148.632480,-89.776905
3,AIGISNFNHLQVEM[649.3660]ILNKPGLK,AIGISNFNHLQVEM[655.3735]ILNKPGLK,2.572725,2.859067,5.304034,2.905442,2.890081,2.809590,2.710625,2.643557,...,7.238095,22.333333,70.333333,127.857143,6.619048,0,-0.789571,-1.016269,-45.239094,-58.227902
4,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[649.3660]IMQDK,TSTVDLPIENQLLWQIDREMLNLYIENEGKM[655.3735]IMQDK,,,,,2.259787,,,2.211374,...,8.523810,19.190476,50.190476,111.476190,5.380952,0,-0.710617,-1.157256,-40.715336,-66.305895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,TDFFIGGEEGM[649.3660]AEK,TDFFIGGEEGM[655.3735]AEK,0.008080,-0.318884,0.097640,-0.137602,0.183818,-0.011744,0.116157,0.026499,...,6.000000,11.476190,23.333333,37.857143,2.476190,0,-0.692984,-1.066275,-39.705080,-61.093071
606,EDIERM[649.3660]VQEAEK,EDIERM[655.3735]VQEAEK,-0.909764,,,,,,,,...,6.809524,12.857143,36.238095,81.238095,2.857143,0,-0.622047,-1.144587,-35.640643,-65.580007
607,SM[649.3660]PWNVDTLSK,SM[655.3735]PWNVDTLSK,0.276155,-0.230254,0.361472,0.135673,-0.067806,-0.089783,0.210097,-0.544967,...,4.857143,9.238095,22.095238,40.380952,1.619048,0,2.254328,-1.198355,129.163484,-68.660701
608,LSLQEQDAAIVKNM[649.3660]K,LSLQEQDAAIVKNM[655.3735]K,,,,,,0.222761,,,...,3.000000,8.047619,14.857143,20.857143,1.142857,1,-0.646178,-1.162867,-37.023279,-66.627367


In [65]:
CHURRO_2_with_PSI_and_PHI.to_csv('ChURRO_2_with_alphafold_and_PSI_and_PHI_angles.csv')

# Ramachandran Analysis - ChURRO_3 Dataset

In [66]:
CHURRO_3_df = pd.read_csv('./ChURRO_3_with_alphafold.csv').drop(columns=['Unnamed: 0'])
CHURRO_3_df

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,SGEHDFGAAFDGDGDRNM[661.3660]ILGK,SGEHDFGAAFDGDGDRNM[667.3735]ILGK,,,4.117889,,,,,4.727480,...,3.238095,4.666667,6.809524,8.047619,10.047619,35.095238,102.000000,202.285714,9.095238,0
1,M[661.3660]LESYLHAK,M[667.3735]LESYLHAK,,,,,,,,1.845600,...,2.666667,4.571429,5.523810,6.809524,7.904762,25.904762,86.714286,172.285714,8.285714,0
2,AHSIQIM[661.3660]K,AHSIQIM[667.3735]K,,1.517974,1.695327,,,,,,...,2.285714,3.095238,4.285714,5.333333,6.666667,17.952381,49.857143,82.714286,4.000000,0
3,GFQQILAGEYDHLPEQAFYM[661.3660]VGPIEEAVAK,GFQQILAGEYDHLPEQAFYM[667.3735]VGPIEEAVAK,,1.587855,1.706229,,,,,1.419860,...,2.238095,4.095238,5.190476,6.000000,6.904762,19.380952,55.857143,99.571429,5.047619,0
4,NLKPIKPM[661.3660]QFLGDEETVRK,NLKPIKPM[667.3735]QFLGDEETVRK,1.806439,1.527960,1.332013,,,,1.576976,1.474839,...,2.285714,3.476190,4.428571,5.952381,6.619048,15.095238,42.142857,81.523810,2.904762,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170,LVSDGQALPEM[661.3660]EIHLQTNAEK,LVSDGQALPEM[667.3735]EIHLQTNAEK,,0.549687,0.592261,,,,-0.680663,-0.187379,...,2.619048,3.190476,4.428571,5.714286,6.714286,15.904762,41.190476,79.857143,2.809524,0
1171,SSIHNFM[661.3660]THPEFR,SSIHNFM[667.3735]THPEFR,,-0.278141,,,,,,0.282283,...,2.000000,2.000000,2.000000,2.000000,2.047619,4.095238,7.285714,10.857143,0.000000,1
1172,AAFTVSLDPGPLEQFPHSM[661.3660]EPQLR,AAFTVSLDPGPLEQFPHSM[667.3735]EPQLR,,0.408145,,,,,,-0.412637,...,2.285714,3.523810,4.761905,5.809524,6.476190,14.714286,38.761905,57.476190,3.047619,0
1173,AIGVLTSGGDAQGM[661.3660]NAAVR,AIGVLTSGGDAQGM[667.3735]NAAVR,,0.003958,-0.003922,,,,,,...,2.571429,4.476190,6.428571,7.904762,9.666667,35.047619,96.285714,192.095238,10.952381,0


In [68]:
# Some of the proteins in the ChURRO_3 dataset have sequence mismatches - drop these
CHURRO_3_df[CHURRO_3_df['Protein ID'].isin(unique_mismatches)]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
31,NSFGLAPAAPLQVHAPLSPNQTVEISLPLSTVGSVM[661.3660]K,NSFGLAPAAPLQVHAPLSPNQTVEISLPLSTVGSVM[667.3735]K,,0.676336,0.548487,,,,,0.316254,...,2.190476,2.428571,3.095238,4.285714,5.238095,13.761905,40.333333,78.142857,2.285714,0
81,IVGELEQM[661.3660]VSEDVPLDHR,IVGELEQM[667.3735]VSEDVPLDHR,,-0.321336,-0.263063,,,,,-0.326161,...,2.095238,3.047619,3.52381,4.47619,5.142857,11.380952,33.904762,60.142857,2.095238,0
287,DLINRM[661.3660]DYVEINIDHK,DLINRM[667.3735]DYVEINIDHK,,0.033372,0.297264,,,,1.049806,0.619457,...,2.142857,2.619048,3.285714,3.952381,4.380952,10.571429,29.047619,54.857143,1.857143,0
299,ITLEGPTEDVNVAQEQIEGM[661.3660]VK,ITLEGPTEDVNVAQEQIEGM[667.3735]VK,,-0.240073,-0.067886,,,,,-0.003078,...,2.333333,3.428571,4.380952,5.380952,6.0,12.190476,34.095238,58.761905,2.904762,0
689,IM[661.3660]VANIEEVLQR,IM[667.3735]VANIEEVLQR,,,0.262925,,,,,0.02347,...,2.047619,2.571429,2.809524,3.428571,3.619048,8.142857,22.666667,46.52381,1.190476,0
700,KM[661.3660]KLPEHPEGGEPEDDEAPAK,KM[667.3735]KLPEHPEGGEPEDDEAPAK,,-0.356401,0.180117,,,,,-0.286201,...,1.952381,1.952381,2.0,2.0,2.0,3.904762,5.952381,9.714286,0.047619,1
715,VADGLPLAASM[661.3660]QEDEQSGR,VADGLPLAASM[667.3735]QEDEQSGR,,-0.551733,-0.737575,,,,,0.370644,...,2.095238,2.761905,3.333333,4.238095,4.47619,12.333333,33.333333,60.52381,2.333333,0
725,VSQM[661.3660]LILTPFQGQGHGAQLLETVHR,VSQM[667.3735]LILTPFQGQGHGAQLLETVHR,,,-0.077975,,,,,-2.009381,...,2.571429,3.285714,4.714286,6.571429,7.52381,21.190476,63.809524,131.571429,5.333333,0
860,VHIGQVIM[661.3660]SIR,VHIGQVIM[667.3735]SIR,,0.021698,0.228543,,,,,-0.78527,...,2.619048,3.666667,5.47619,6.904762,8.380952,22.52381,62.380952,102.52381,4.52381,0
1060,EEFAIM[661.3660]QTPAGELYDK,EEFAIM[667.3735]QTPAGELYDK,,,-0.389314,,,,,0.260292,...,2.142857,2.904762,3.0,3.761905,4.190476,11.809524,32.47619,56.095238,2.333333,0


In [71]:
clean_CHURRO_3 = CHURRO_3_df[CHURRO_3_df['Protein ID'].isin(unique_mismatches)==False].reset_index(drop = True)
clean_CHURRO_3

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,SGEHDFGAAFDGDGDRNM[661.3660]ILGK,SGEHDFGAAFDGDGDRNM[667.3735]ILGK,,,4.117889,,,,,4.727480,...,3.238095,4.666667,6.809524,8.047619,10.047619,35.095238,102.000000,202.285714,9.095238,0
1,M[661.3660]LESYLHAK,M[667.3735]LESYLHAK,,,,,,,,1.845600,...,2.666667,4.571429,5.523810,6.809524,7.904762,25.904762,86.714286,172.285714,8.285714,0
2,AHSIQIM[661.3660]K,AHSIQIM[667.3735]K,,1.517974,1.695327,,,,,,...,2.285714,3.095238,4.285714,5.333333,6.666667,17.952381,49.857143,82.714286,4.000000,0
3,GFQQILAGEYDHLPEQAFYM[661.3660]VGPIEEAVAK,GFQQILAGEYDHLPEQAFYM[667.3735]VGPIEEAVAK,,1.587855,1.706229,,,,,1.419860,...,2.238095,4.095238,5.190476,6.000000,6.904762,19.380952,55.857143,99.571429,5.047619,0
4,NLKPIKPM[661.3660]QFLGDEETVRK,NLKPIKPM[667.3735]QFLGDEETVRK,1.806439,1.527960,1.332013,,,,1.576976,1.474839,...,2.285714,3.476190,4.428571,5.952381,6.619048,15.095238,42.142857,81.523810,2.904762,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1160,LVSDGQALPEM[661.3660]EIHLQTNAEK,LVSDGQALPEM[667.3735]EIHLQTNAEK,,0.549687,0.592261,,,,-0.680663,-0.187379,...,2.619048,3.190476,4.428571,5.714286,6.714286,15.904762,41.190476,79.857143,2.809524,0
1161,SSIHNFM[661.3660]THPEFR,SSIHNFM[667.3735]THPEFR,,-0.278141,,,,,,0.282283,...,2.000000,2.000000,2.000000,2.000000,2.047619,4.095238,7.285714,10.857143,0.000000,1
1162,AAFTVSLDPGPLEQFPHSM[661.3660]EPQLR,AAFTVSLDPGPLEQFPHSM[667.3735]EPQLR,,0.408145,,,,,,-0.412637,...,2.285714,3.523810,4.761905,5.809524,6.476190,14.714286,38.761905,57.476190,3.047619,0
1163,AIGVLTSGGDAQGM[661.3660]NAAVR,AIGVLTSGGDAQGM[667.3735]NAAVR,,0.003958,-0.003922,,,,,,...,2.571429,4.476190,6.428571,7.904762,9.666667,35.047619,96.285714,192.095238,10.952381,0


In [73]:
CHURRO_3_psi_and_psi_map = extract_PSI_and_PHI(clean_CHURRO_3, concat_dihedrals,verbose=True)
CHURRO_3_psi_and_psi_map

        Protein ID Residue Name  Residue Position       PHI       PSI  \
1077074     P36871          MET               295 -1.758809  2.442435   

         PSI degrees  PHI degrees  
1077074   139.941233  -100.772305  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
281960     Q86X55          MET               268 -1.334826 -0.454934   

        PSI degrees  PHI degrees  
281960   -26.065813    -76.47989  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
568543     Q02543          MET               127 -1.599069 -0.818994   

        PSI degrees  PHI degrees  
568543   -46.924893   -91.619911  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
157161     P06576          MET               509  0.907058  1.027626   

        PSI degrees  PHI degrees  
157161    58.878618    51.970601  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
828644     P18669          MET               230 -0.946585  2.45

  df = df._append(new_row, ignore_index = True)


       Protein ID Residue Name  Residue Position       PHI      PSI  \
612689     Q969G3          MET               235 -1.084539 -0.68864   

        PSI degrees  PHI degrees  
612689   -39.456183    -62.13948  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
589269     Q9NPH2          MET               527 -0.968368  1.627369   

        PSI degrees  PHI degrees  
589269     93.24137   -55.483424  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
540023     P35241          MET               467 -2.914886 -2.971617   

        PSI degrees  PHI degrees  
540023  -170.261095  -167.010638  
       Protein ID Residue Name  Residue Position      PHI       PSI  \
444523     P10809          MET               145 -1.11825 -0.711062   

        PSI degrees  PHI degrees  
444523   -40.740826   -64.071029  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
368410     P07814          MET               258 -1.108333 -0.550293   


Unnamed: 0,PSI Radians,PHI Radians,PSI Degrees,PHI Degrees
0,2.442435,-1.758809,139.941233,-100.772305
1,-0.454934,-1.334826,-26.065813,-76.479890
2,-0.818994,-1.599069,-46.924893,-91.619911
3,1.027626,0.907058,58.878618,51.970601
4,2.457769,-0.946585,140.819803,-54.235342
...,...,...,...,...
1160,1.828180,-1.895589,104.746982,-108.609264
1161,1.526607,-3.103464,87.468132,-177.815396
1162,-0.218514,-1.480004,-12.519931,-84.797997
1163,-0.723987,-0.939829,-41.481381,-53.848240


In [74]:
sum(CHURRO_3_psi_and_psi_map['PHI Degrees'].isna())

0

In [75]:
CHURRO_3_with_PSI_and_PHI = pd.concat([clean_CHURRO_3, CHURRO_3_psi_and_psi_map], axis=1)
CHURRO_3_with_PSI_and_PHI

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR,PSI Radians,PHI Radians,PSI Degrees,PHI Degrees
0,SGEHDFGAAFDGDGDRNM[661.3660]ILGK,SGEHDFGAAFDGDGDRNM[667.3735]ILGK,,,4.117889,,,,,4.727480,...,10.047619,35.095238,102.000000,202.285714,9.095238,0,2.442435,-1.758809,139.941233,-100.772305
1,M[661.3660]LESYLHAK,M[667.3735]LESYLHAK,,,,,,,,1.845600,...,7.904762,25.904762,86.714286,172.285714,8.285714,0,-0.454934,-1.334826,-26.065813,-76.479890
2,AHSIQIM[661.3660]K,AHSIQIM[667.3735]K,,1.517974,1.695327,,,,,,...,6.666667,17.952381,49.857143,82.714286,4.000000,0,-0.818994,-1.599069,-46.924893,-91.619911
3,GFQQILAGEYDHLPEQAFYM[661.3660]VGPIEEAVAK,GFQQILAGEYDHLPEQAFYM[667.3735]VGPIEEAVAK,,1.587855,1.706229,,,,,1.419860,...,6.904762,19.380952,55.857143,99.571429,5.047619,0,1.027626,0.907058,58.878618,51.970601
4,NLKPIKPM[661.3660]QFLGDEETVRK,NLKPIKPM[667.3735]QFLGDEETVRK,1.806439,1.527960,1.332013,,,,1.576976,1.474839,...,6.619048,15.095238,42.142857,81.523810,2.904762,0,2.457769,-0.946585,140.819803,-54.235342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1160,LVSDGQALPEM[661.3660]EIHLQTNAEK,LVSDGQALPEM[667.3735]EIHLQTNAEK,,0.549687,0.592261,,,,-0.680663,-0.187379,...,6.714286,15.904762,41.190476,79.857143,2.809524,0,1.828180,-1.895589,104.746982,-108.609264
1161,SSIHNFM[661.3660]THPEFR,SSIHNFM[667.3735]THPEFR,,-0.278141,,,,,,0.282283,...,2.047619,4.095238,7.285714,10.857143,0.000000,1,1.526607,-3.103464,87.468132,-177.815396
1162,AAFTVSLDPGPLEQFPHSM[661.3660]EPQLR,AAFTVSLDPGPLEQFPHSM[667.3735]EPQLR,,0.408145,,,,,,-0.412637,...,6.476190,14.714286,38.761905,57.476190,3.047619,0,-0.218514,-1.480004,-12.519931,-84.797997
1163,AIGVLTSGGDAQGM[661.3660]NAAVR,AIGVLTSGGDAQGM[667.3735]NAAVR,,0.003958,-0.003922,,,,,,...,9.666667,35.047619,96.285714,192.095238,10.952381,0,-0.723987,-0.939829,-41.481381,-53.848240


In [76]:
CHURRO_3_with_PSI_and_PHI.to_csv('ChURRO_3_with_alphafold_and_PSI_and_PHI_angles.csv', index=False)

# Ramachandran Analysis - ChURRO_4 Dataset

In [77]:
CHURRO_4_df = pd.read_csv('./ChURRO_4_with_alphafold.csv').drop(columns=['Unnamed: 0'])
CHURRO_4_df

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,LQHVEDGVLSM[716.3718]QVASAR,LQHVEDGVLSM[722.3793]QVASAR,,4.656273,,,4.133012,4.551815,,4.242820,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0
1,NLKPIKPM[716.3718]QFLGDEETVRK,NLKPIKPM[722.3793]QFLGDEETVRK,3.769576,4.084958,3.440390,3.120784,3.223466,3.237076,3.317560,3.738827,...,2.285714,3.476190,4.428571,5.952381,6.619048,15.095238,42.142857,81.523810,2.904762,0
2,EEELKDIQNM[716.3718]NFLLK,EEELKDIQNM[722.3793]NFLLK,3.013929,3.156209,2.977990,3.132468,2.965553,3.016646,2.986863,3.034743,...,3.142857,5.285714,6.333333,7.714286,7.714286,12.000000,19.809524,26.952381,2.047619,1
3,NLLHVTDTGVGM[716.3718]TREELVK,NLLHVTDTGVGM[722.3793]TREELVK,2.985369,3.111739,,,2.774224,,,,...,2.666667,4.000000,6.000000,7.238095,8.809524,23.428571,67.428571,128.190476,5.476190,0
4,M[716.3718]HTTFEHDIQALGTQVR,M[722.3793]HTTFEHDIQALGTQVR,,1.828016,,,2.890468,2.740659,3.560620,1.773127,...,2.238095,4.190476,4.761905,5.761905,6.238095,12.476190,34.809524,55.095238,2.476190,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,SGEGEVSGLM[716.3718]R,SGEGEVSGLM[722.3793]R,-0.297660,-0.280111,-0.458942,,-0.265283,0.388757,0.583342,0.346406,...,2.000000,2.000000,2.000000,2.000000,2.047619,4.047619,7.142857,10.380952,0.000000,1
689,FLRLM[716.3718]GAGK,FLRLM[722.3793]GAGK,,,,0.125250,,,,-0.121592,...,2.142857,3.285714,3.857143,4.666667,4.761905,8.619048,16.476190,22.238095,1.666667,1
690,SEIEYYAM[716.3718]LAK,SEIEYYAM[722.3793]LAK,-0.315793,,,,0.323492,,,,...,2.333333,3.809524,5.285714,6.619048,7.238095,15.857143,45.571429,73.904762,4.238095,0
691,EYWM[716.3718]DPEGEMKPGRK,EYWM[722.3793]DPEGEMKPGRK,,0.781025,,,,-0.132193,,-0.640560,...,2.476190,3.571429,4.380952,6.142857,7.476190,14.619048,28.476190,42.333333,1.761905,0


In [78]:
# Some of the proteins in the ChURRO_4 dataset have sequence mismatches - drop these
CHURRO_4_df[CHURRO_4_df['Protein ID'].isin(unique_mismatches)]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
68,DLINRM[716.3718]DYVEINIDHK,DLINRM[722.3793]DYVEINIDHK,0.556827,-0.225588,0.686367,-0.072403,0.140975,0.599879,0.380257,0.48282,...,2.142857,2.619048,3.285714,3.952381,4.380952,10.571429,29.047619,54.857143,1.857143,0
71,VADGLPLAASM[716.3718]QEDEQSGR,VADGLPLAASM[722.3793]QEDEQSGR,0.156336,0.42264,0.224529,0.500254,0.24556,,0.246935,,...,2.095238,2.761905,3.333333,4.238095,4.47619,12.333333,33.333333,60.52381,2.333333,0
161,FNADEFEDM[716.3718]VAEK,FNADEFEDM[722.3793]VAEK,-0.010885,-0.212205,,,-0.189462,-0.352105,-0.274088,-0.494603,...,2.380952,3.190476,4.285714,5.52381,6.619048,13.714286,36.571429,61.52381,2.904762,0
224,KM[716.3718]KLPEHPEGGEPEDDEAPAK,KM[722.3793]KLPEHPEGGEPEDDEAPAK,-0.070958,-0.612453,-0.258176,-0.674844,-0.142179,-0.868376,-0.587499,-0.79214,...,1.952381,1.952381,2.0,2.0,2.0,3.904762,5.952381,9.714286,0.047619,1
444,EEFAIM[716.3718]QTPAGELYDK,EEFAIM[722.3793]QTPAGELYDK,,,-0.271974,,,,-0.173128,-0.009217,...,2.142857,2.904762,3.0,3.761905,4.190476,11.809524,32.47619,56.095238,2.333333,0
501,RELLELASRM[716.3718]ENER,RELLELASRM[722.3793]ENER,,-0.084248,,,,-0.211251,,,...,2.095238,2.857143,3.333333,4.142857,4.714286,10.428571,29.904762,51.333333,1.904762,0


In [79]:
clean_CHURRO_4 = CHURRO_4_df[CHURRO_4_df['Protein ID'].isin(unique_mismatches)==False].reset_index(drop = True)
clean_CHURRO_4

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,LQHVEDGVLSM[716.3718]QVASAR,LQHVEDGVLSM[722.3793]QVASAR,,4.656273,,,4.133012,4.551815,,4.242820,...,2.428571,4.476190,5.666667,7.476190,7.523810,11.952381,23.952381,43.190476,2.333333,0
1,NLKPIKPM[716.3718]QFLGDEETVRK,NLKPIKPM[722.3793]QFLGDEETVRK,3.769576,4.084958,3.440390,3.120784,3.223466,3.237076,3.317560,3.738827,...,2.285714,3.476190,4.428571,5.952381,6.619048,15.095238,42.142857,81.523810,2.904762,0
2,EEELKDIQNM[716.3718]NFLLK,EEELKDIQNM[722.3793]NFLLK,3.013929,3.156209,2.977990,3.132468,2.965553,3.016646,2.986863,3.034743,...,3.142857,5.285714,6.333333,7.714286,7.714286,12.000000,19.809524,26.952381,2.047619,1
3,NLLHVTDTGVGM[716.3718]TREELVK,NLLHVTDTGVGM[722.3793]TREELVK,2.985369,3.111739,,,2.774224,,,,...,2.666667,4.000000,6.000000,7.238095,8.809524,23.428571,67.428571,128.190476,5.476190,0
4,M[716.3718]HTTFEHDIQALGTQVR,M[722.3793]HTTFEHDIQALGTQVR,,1.828016,,,2.890468,2.740659,3.560620,1.773127,...,2.238095,4.190476,4.761905,5.761905,6.238095,12.476190,34.809524,55.095238,2.476190,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
682,SGEGEVSGLM[716.3718]R,SGEGEVSGLM[722.3793]R,-0.297660,-0.280111,-0.458942,,-0.265283,0.388757,0.583342,0.346406,...,2.000000,2.000000,2.000000,2.000000,2.047619,4.047619,7.142857,10.380952,0.000000,1
683,FLRLM[716.3718]GAGK,FLRLM[722.3793]GAGK,,,,0.125250,,,,-0.121592,...,2.142857,3.285714,3.857143,4.666667,4.761905,8.619048,16.476190,22.238095,1.666667,1
684,SEIEYYAM[716.3718]LAK,SEIEYYAM[722.3793]LAK,-0.315793,,,,0.323492,,,,...,2.333333,3.809524,5.285714,6.619048,7.238095,15.857143,45.571429,73.904762,4.238095,0
685,EYWM[716.3718]DPEGEMKPGRK,EYWM[722.3793]DPEGEMKPGRK,,0.781025,,,,-0.132193,,-0.640560,...,2.476190,3.571429,4.380952,6.142857,7.476190,14.619048,28.476190,42.333333,1.761905,0


In [80]:
CHURRO_4_psi_and_psi_map = extract_PSI_and_PHI(clean_CHURRO_4, concat_dihedrals, verbose=True)
CHURRO_4_psi_and_psi_map

        Protein ID Residue Name  Residue Position       PHI       PSI  \
1028092     Q07065          MET               423 -1.069864 -0.781354   

         PSI degrees  PHI degrees  
1028092   -44.768303   -61.298705  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
828644     P18669          MET               230 -0.946585  2.457769   

        PSI degrees  PHI degrees  
828644   140.819803   -54.235342  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
780318     Q86UP2          MET               643 -1.148097 -0.668732   

        PSI degrees  PHI degrees  
780318   -38.315541   -65.781106  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
929251     P14625          MET               154 -2.288282  2.397712   

        PSI degrees  PHI degrees  
929251   137.378791  -131.108894  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
386828     Q01082          MET              1845 -1.111664 -0.59

  df = df._append(new_row, ignore_index = True)


       Protein ID Residue Name  Residue Position       PHI       PSI  \
145674     P04406          MET               105 -0.997507 -0.722755   

        PSI degrees  PHI degrees  
145674   -41.410823   -57.152918  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
157161     P06576          MET               509  0.907058  1.027626   

        PSI degrees  PHI degrees  
157161    58.878618    51.970601  
       Protein ID Residue Name  Residue Position      PHI       PSI  \
260501     Q9Y237          MET                62 -1.19499  2.443023   

        PSI degrees  PHI degrees  
260501   139.974879   -68.467876  
        Protein ID Residue Name  Residue Position       PHI       PSI  \
1059855     P11940          MET               584 -1.143756 -0.749642   

         PSI degrees  PHI degrees  
1059855   -42.951337   -65.532414  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
721056     Q9Y4L1          MET               417 -1.171078 -0.7038

Unnamed: 0,PSI Radians,PHI Radians,PSI Degrees,PHI Degrees
0,-0.781354,-1.069864,-44.768303,-61.298705
1,2.457769,-0.946585,140.819803,-54.235342
2,-0.668732,-1.148097,-38.315541,-65.781106
3,2.397712,-2.288282,137.378791,-131.108894
4,-0.593221,-1.111664,-33.989066,-63.693630
...,...,...,...,...
682,2.148378,-1.089016,123.092976,-62.396023
683,0.036523,-1.320989,2.092631,-75.687091
684,-0.839279,-1.031756,-48.087119,-59.115286
685,2.227214,-1.584730,127.609944,-90.798340


In [81]:
sum(CHURRO_4_psi_and_psi_map['PHI Degrees'].isna())

0

In [82]:
CHURRO_4_with_PSI_and_PHI = pd.concat([clean_CHURRO_4, CHURRO_4_psi_and_psi_map], axis=1)
CHURRO_4_with_PSI_and_PHI

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,1_4 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,2_4 Log2 Ratio HL,...,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR,PSI Radians,PHI Radians,PSI Degrees,PHI Degrees
0,LQHVEDGVLSM[716.3718]QVASAR,LQHVEDGVLSM[722.3793]QVASAR,,4.656273,,,4.133012,4.551815,,4.242820,...,7.523810,11.952381,23.952381,43.190476,2.333333,0,-0.781354,-1.069864,-44.768303,-61.298705
1,NLKPIKPM[716.3718]QFLGDEETVRK,NLKPIKPM[722.3793]QFLGDEETVRK,3.769576,4.084958,3.440390,3.120784,3.223466,3.237076,3.317560,3.738827,...,6.619048,15.095238,42.142857,81.523810,2.904762,0,2.457769,-0.946585,140.819803,-54.235342
2,EEELKDIQNM[716.3718]NFLLK,EEELKDIQNM[722.3793]NFLLK,3.013929,3.156209,2.977990,3.132468,2.965553,3.016646,2.986863,3.034743,...,7.714286,12.000000,19.809524,26.952381,2.047619,1,-0.668732,-1.148097,-38.315541,-65.781106
3,NLLHVTDTGVGM[716.3718]TREELVK,NLLHVTDTGVGM[722.3793]TREELVK,2.985369,3.111739,,,2.774224,,,,...,8.809524,23.428571,67.428571,128.190476,5.476190,0,2.397712,-2.288282,137.378791,-131.108894
4,M[716.3718]HTTFEHDIQALGTQVR,M[722.3793]HTTFEHDIQALGTQVR,,1.828016,,,2.890468,2.740659,3.560620,1.773127,...,6.238095,12.476190,34.809524,55.095238,2.476190,0,-0.593221,-1.111664,-33.989066,-63.693630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
682,SGEGEVSGLM[716.3718]R,SGEGEVSGLM[722.3793]R,-0.297660,-0.280111,-0.458942,,-0.265283,0.388757,0.583342,0.346406,...,2.047619,4.047619,7.142857,10.380952,0.000000,1,2.148378,-1.089016,123.092976,-62.396023
683,FLRLM[716.3718]GAGK,FLRLM[722.3793]GAGK,,,,0.125250,,,,-0.121592,...,4.761905,8.619048,16.476190,22.238095,1.666667,1,0.036523,-1.320989,2.092631,-75.687091
684,SEIEYYAM[716.3718]LAK,SEIEYYAM[722.3793]LAK,-0.315793,,,,0.323492,,,,...,7.238095,15.857143,45.571429,73.904762,4.238095,0,-0.839279,-1.031756,-48.087119,-59.115286
685,EYWM[716.3718]DPEGEMKPGRK,EYWM[722.3793]DPEGEMKPGRK,,0.781025,,,,-0.132193,,-0.640560,...,7.476190,14.619048,28.476190,42.333333,1.761905,0,2.227214,-1.584730,127.609944,-90.798340


In [83]:
CHURRO_4_with_PSI_and_PHI.to_csv('ChURRO_4_with_alphafold_and_PSI_and_PHI_angles.csv', index=False)

# Ramachandran Analysis - ChURRO_5 Dataset

In [84]:
CHURRO_5_df = pd.read_csv('./ChURRO_5_with_alphafold.csv').drop(columns=['Unnamed: 0'])
CHURRO_5_df

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,GQYISPFHDIPIYADKDVFHM[695.3503]VVEVPR,GQYISPFHDIPIYADKDVFHM[701.3578]VVEVPR,,,,,3.997861,4.300817,,,...,2.809524,4.238095,5.952381,7.285714,9.142857,25.238095,74.333333,148.714286,5.857143,0
1,MMVDKDGDVTVTNDGATILSMM[695.3503]DVDHQIAK,MMVDKDGDVTVTNDGATILSM[701.3578]MDVDHQIAK,,,,,,,,3.390691,...,2.190476,3.619048,4.190476,5.380952,5.714286,16.571429,55.904762,108.142857,5.190476,0
2,EEELKDIQNM[695.3503]NFLLK,EEELKDIQNM[701.3578]NFLLK,,,3.025504,1.850107,2.063453,1.286034,5.001484,4.197276,...,3.142857,5.285714,6.333333,7.714286,7.714286,12.000000,19.809524,26.952381,2.047619,1
3,LGEM[695.3503]WSEQSAK,LGEM[701.3578]WSEQSAK,,,,1.829404,1.759502,1.939710,,,...,2.095238,3.761905,4.666667,6.000000,6.190476,11.904762,28.761905,42.190476,2.476190,0
4,AVENYLIQM[695.3503]AR,AVENYLIQM[701.3578]AR,,2.517243,2.418565,2.137920,,,,2.337744,...,2.095238,3.809524,4.238095,5.238095,5.285714,10.619048,29.285714,51.714286,2.238095,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698,KGDIVDIKGM[695.3503]GTVQK,KGDIVDIKGM[701.3578]GTVQK,,0.612230,-0.372074,-0.465165,0.065857,0.175049,,,...,2.285714,2.904762,4.190476,5.523810,7.047619,18.714286,50.857143,79.476190,4.047619,0
699,AITGASLADIM[695.3503]AK,AITGASLADIM[701.3578]AK,,0.878992,,0.613909,-0.488093,-0.495052,-0.533030,,...,2.000000,2.809524,3.380952,4.000000,4.428571,8.904762,17.000000,23.714286,1.095238,1
700,QM[695.3503]QSSFTSSEQELER,QM[701.3578]QSSFTSSEQELER,,,,-0.213786,,0.209838,,,...,2.714286,5.047619,6.190476,7.904762,7.904762,12.142857,20.190476,27.333333,1.904762,1
701,RDHFEEAM[695.3503]R,RDHFEEAM[701.3578]R,,,,-1.053555,0.704664,0.357943,,,...,2.238095,3.380952,4.190476,5.047619,5.761905,13.714286,38.714286,67.380952,3.190476,0


In [85]:
# Some of the proteins in the ChURRO_5 dataset have sequence mismatches - drop these
CHURRO_5_df[CHURRO_5_df['Protein ID'].isin(unique_mismatches)]

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
23,DLINRM[695.3503]DYVEINIDHK,DLINRM[701.3578]DYVEINIDHK,,,1.098093,0.673753,1.24563,0.7719,,1.35997,...,2.142857,2.619048,3.285714,3.952381,4.380952,10.571429,29.047619,54.857143,1.857143,0
217,FNADEFEDM[695.3503]VAEK,FNADEFEDM[701.3578]VAEK,,,-0.700671,-0.490154,,,-0.704055,-1.730357,...,2.380952,3.190476,4.285714,5.52381,6.619048,13.714286,36.571429,61.52381,2.904762,0
327,VHIGQVIM[695.3503]SIR,VHIGQVIM[701.3578]SIR,,,-0.714431,0.40922,-0.927942,-0.584113,,,...,2.619048,3.666667,5.47619,6.904762,8.380952,22.52381,62.380952,102.52381,4.52381,0
492,KM[695.3503]KLPEHPEGGEPEDDEAPAK,KM[701.3578]KLPEHPEGGEPEDDEAPAK,,,,0.39079,,0.11146,,,...,1.952381,1.952381,2.0,2.0,2.0,3.904762,5.952381,9.714286,0.047619,1
515,ITLEGPTEDVNVAQEQIEGM[695.3503]VK,ITLEGPTEDVNVAQEQIEGM[701.3578]VK,,0.5575,-0.041178,0.127462,0.10288,-0.022903,-0.694505,-0.511367,...,2.333333,3.428571,4.380952,5.380952,6.0,12.190476,34.095238,58.761905,2.904762,0
518,AQAALQAVNSVQSGNLALAASAAAVDAGM[695.3503]AMAGQSPVLR,AQAALQAVNSVQSGNLALAASAAAVDAGM[701.3578]AMAGQSPVLR,,,0.31377,0.809926,-0.119063,-0.113959,,,...,2.0,2.142857,2.428571,2.619048,2.904762,7.333333,17.333333,27.142857,1.0,1


In [87]:
clean_CHURRO_5 = CHURRO_5_df[CHURRO_5_df['Protein ID'].isin(unique_mismatches) == False].reset_index(drop=True)
clean_CHURRO_5

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_6_180_pae_smooth10,nAA_6.5_180_pae_smooth10,nAA_7_180_pae_smooth10,nAA_7.5_180_pae_smooth10,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR
0,GQYISPFHDIPIYADKDVFHM[695.3503]VVEVPR,GQYISPFHDIPIYADKDVFHM[701.3578]VVEVPR,,,,,3.997861,4.300817,,,...,2.809524,4.238095,5.952381,7.285714,9.142857,25.238095,74.333333,148.714286,5.857143,0
1,MMVDKDGDVTVTNDGATILSMM[695.3503]DVDHQIAK,MMVDKDGDVTVTNDGATILSM[701.3578]MDVDHQIAK,,,,,,,,3.390691,...,2.190476,3.619048,4.190476,5.380952,5.714286,16.571429,55.904762,108.142857,5.190476,0
2,EEELKDIQNM[695.3503]NFLLK,EEELKDIQNM[701.3578]NFLLK,,,3.025504,1.850107,2.063453,1.286034,5.001484,4.197276,...,3.142857,5.285714,6.333333,7.714286,7.714286,12.000000,19.809524,26.952381,2.047619,1
3,LGEM[695.3503]WSEQSAK,LGEM[701.3578]WSEQSAK,,,,1.829404,1.759502,1.939710,,,...,2.095238,3.761905,4.666667,6.000000,6.190476,11.904762,28.761905,42.190476,2.476190,0
4,AVENYLIQM[695.3503]AR,AVENYLIQM[701.3578]AR,,2.517243,2.418565,2.137920,,,,2.337744,...,2.095238,3.809524,4.238095,5.238095,5.285714,10.619048,29.285714,51.714286,2.238095,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692,KGDIVDIKGM[695.3503]GTVQK,KGDIVDIKGM[701.3578]GTVQK,,0.612230,-0.372074,-0.465165,0.065857,0.175049,,,...,2.285714,2.904762,4.190476,5.523810,7.047619,18.714286,50.857143,79.476190,4.047619,0
693,AITGASLADIM[695.3503]AK,AITGASLADIM[701.3578]AK,,0.878992,,0.613909,-0.488093,-0.495052,-0.533030,,...,2.000000,2.809524,3.380952,4.000000,4.428571,8.904762,17.000000,23.714286,1.095238,1
694,QM[695.3503]QSSFTSSEQELER,QM[701.3578]QSSFTSSEQELER,,,,-0.213786,,0.209838,,,...,2.714286,5.047619,6.190476,7.904762,7.904762,12.142857,20.190476,27.333333,1.904762,1
695,RDHFEEAM[695.3503]R,RDHFEEAM[701.3578]R,,,,-1.053555,0.704664,0.357943,,,...,2.238095,3.380952,4.190476,5.047619,5.761905,13.714286,38.714286,67.380952,3.190476,0


In [88]:
CHURRO_5_psi_and_psi_map = extract_PSI_and_PHI(clean_CHURRO_5, concat_dihedrals, verbose=True)
CHURRO_5_psi_and_psi_map

       Protein ID Residue Name  Residue Position      PHI       PSI  \
280921     Q15181          MET                46 -2.14499  2.355836   

        PSI degrees  PHI degrees  
280921   134.979488  -122.898859  
       Protein ID Residue Name  Residue Position       PHI      PSI  \
379125     P48643          MET                81 -1.305379  2.10845   

        PSI degrees  PHI degrees  
379125   120.805315   -74.792719  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
780318     Q86UP2          MET               643 -1.148097 -0.668732   

        PSI degrees  PHI degrees  
780318   -38.315541   -65.781106  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
684132     P26583          MET               132 -1.011894 -0.714445   

        PSI degrees  PHI degrees  
684132   -40.934669   -57.977265  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
659472     O14737          MET                77 -1.005026 -0.674473   


  df = df._append(new_row, ignore_index = True)


       Protein ID Residue Name  Residue Position       PHI       PSI  \
442847     Q9UBF2          MET               576 -1.020662 -0.385914   

        PSI degrees  PHI degrees  
442847   -22.111222   -58.479627  
       Protein ID Residue Name  Residue Position       PHI     PSI  \
654871     Q14684          MET               486 -2.009258  1.8041   

        PSI degrees  PHI degrees  
654871    103.36734  -115.121978  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
828620     P18669          MET               206 -1.179164 -0.483346   

        PSI degrees  PHI degrees  
828620    -27.69367   -67.561137  
       Protein ID Residue Name  Residue Position       PHI       PSI  \
631302     Q13263          MET               796 -1.173366 -0.835218   

        PSI degrees  PHI degrees  
631302    -47.85448    -67.22892  
      Protein ID Residue Name  Residue Position       PHI       PSI  \
67804     P35580          MET               909 -1.056535 -0.703234   

 

Unnamed: 0,PSI Radians,PHI Radians,PSI Degrees,PHI Degrees
0,2.355836,-2.144990,134.979488,-122.898859
1,2.108450,-1.305379,120.805315,-74.792719
2,-0.668732,-1.148097,-38.315541,-65.781106
3,-0.714445,-1.011894,-40.934669,-57.977265
4,-0.674473,-1.005026,-38.644440,-57.583767
...,...,...,...,...
692,2.430323,-2.171364,139.247273,-124.409990
693,-0.750828,-1.166838,-43.019297,-66.854912
694,-0.704603,-1.016828,-40.370798,-58.259925
695,-0.555659,-1.146523,-31.836928,-65.690956


In [89]:
sum(CHURRO_5_psi_and_psi_map['PHI Degrees'].isna())

0

In [90]:
CHURRO_5_with_PSI_and_PHI = pd.concat([clean_CHURRO_5, CHURRO_5_psi_and_psi_map], axis=1)
CHURRO_5_with_PSI_and_PHI

Unnamed: 0,Light Modified Peptide,Heavy Modified Peptide,1_1 Log2 Ratio HL,1_2 Log2 Ratio HL,1_3 Log2 Ratio HL,2_1 Log2 Ratio HL,2_2 Log2 Ratio HL,2_3 Log2 Ratio HL,3_1 Log2 Ratio HL,3_2 Log2 Ratio HL,...,nAA_8_180_pae_smooth10,nAA_12_180_pae_smooth10,nAA_18_180_pae_smooth10,nAA_24_180_pae_smooth10,nAA_12_70_pae_smooth10,IDR,PSI Radians,PHI Radians,PSI Degrees,PHI Degrees
0,GQYISPFHDIPIYADKDVFHM[695.3503]VVEVPR,GQYISPFHDIPIYADKDVFHM[701.3578]VVEVPR,,,,,3.997861,4.300817,,,...,9.142857,25.238095,74.333333,148.714286,5.857143,0,2.355836,-2.144990,134.979488,-122.898859
1,MMVDKDGDVTVTNDGATILSMM[695.3503]DVDHQIAK,MMVDKDGDVTVTNDGATILSM[701.3578]MDVDHQIAK,,,,,,,,3.390691,...,5.714286,16.571429,55.904762,108.142857,5.190476,0,2.108450,-1.305379,120.805315,-74.792719
2,EEELKDIQNM[695.3503]NFLLK,EEELKDIQNM[701.3578]NFLLK,,,3.025504,1.850107,2.063453,1.286034,5.001484,4.197276,...,7.714286,12.000000,19.809524,26.952381,2.047619,1,-0.668732,-1.148097,-38.315541,-65.781106
3,LGEM[695.3503]WSEQSAK,LGEM[701.3578]WSEQSAK,,,,1.829404,1.759502,1.939710,,,...,6.190476,11.904762,28.761905,42.190476,2.476190,0,-0.714445,-1.011894,-40.934669,-57.977265
4,AVENYLIQM[695.3503]AR,AVENYLIQM[701.3578]AR,,2.517243,2.418565,2.137920,,,,2.337744,...,5.285714,10.619048,29.285714,51.714286,2.238095,0,-0.674473,-1.005026,-38.644440,-57.583767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692,KGDIVDIKGM[695.3503]GTVQK,KGDIVDIKGM[701.3578]GTVQK,,0.612230,-0.372074,-0.465165,0.065857,0.175049,,,...,7.047619,18.714286,50.857143,79.476190,4.047619,0,2.430323,-2.171364,139.247273,-124.409990
693,AITGASLADIM[695.3503]AK,AITGASLADIM[701.3578]AK,,0.878992,,0.613909,-0.488093,-0.495052,-0.533030,,...,4.428571,8.904762,17.000000,23.714286,1.095238,1,-0.750828,-1.166838,-43.019297,-66.854912
694,QM[695.3503]QSSFTSSEQELER,QM[701.3578]QSSFTSSEQELER,,,,-0.213786,,0.209838,,,...,7.904762,12.142857,20.190476,27.333333,1.904762,1,-0.704603,-1.016828,-40.370798,-58.259925
695,RDHFEEAM[695.3503]R,RDHFEEAM[701.3578]R,,,,-1.053555,0.704664,0.357943,,,...,5.761905,13.714286,38.714286,67.380952,3.190476,0,-0.555659,-1.146523,-31.836928,-65.690956


In [94]:
CHURRO_5_with_PSI_and_PHI.to_csv('ChURRO_5_with_alphafold_and_PSI_and_PHI_angles.csv', index=False)

# End