In [16]:
import Bio.PDB
import pandas as pd
import os
import re




In [23]:
def get_phi_and_psi(Protein_ID, CIF_file_path):
    df = pd.DataFrame()
    for model in Bio.PDB.MMCIFParser().get_structure(Protein_ID, CIF_file_path) :
        for chain in model :
            polypeptides = Bio.PDB.PPBuilder().build_peptides(chain)
            res_index_list = []
            res_name_list = []
            phi_list = []
            psi_list = []
            for poly_index, poly in enumerate(polypeptides) :
                print("Model %s Chain %s" % (str(model.id), str(chain.id)))
                print ("(part %i of %i)" % (poly_index+1, len(polypeptides)))
                print ("length %i" % (len(poly)))
                print ("from %s%i" % (poly[0].resname, poly[0].id[1]))
                print ("to %s%i" % (poly[-1].resname, poly[-1].id[1]))
                phi_psi = poly.get_phi_psi_list()
                res_index_sublist = []
                res_name_sublist = []
                phi_sublist = []
                psi_sublist = []
                for res_index, residue in enumerate(poly) :
                    res_name = "%s%i" % (residue.resname, residue.id[1])
                    # print(res_name, phi_psi[res_index])
                    res_index_sublist.append(residue.id[1])
                    res_name_sublist.append(residue.resname)
                    phi_sublist.append(phi_psi[res_index][0])
                    psi_sublist.append(phi_psi[res_index][1])
                res_index_list.extend(res_index_sublist)
                res_name_list.extend(res_name_sublist)
                phi_list.extend(phi_sublist)
                psi_list.extend(psi_sublist)
            df['Protein ID'] = [Protein_ID] * len(res_index_list)
            df['Residue Name'] = res_name_list
            df['Residue Position'] = res_index_list
            df['PHI'] = phi_list
            df['PSI'] = psi_list
    return df
    


In [8]:
CIF_file_path = '../alphafold_data/cif/A0MZ66.cif'
Protein_ID = 'A0MZ66'

In [9]:
example_df = get_phi_and_psi(Protein_ID, CIF_file_path)

Model 0 Chain A
(part 1 of 1)
length 631
from MET1
to CYS631
MET1 (None, -0.41924193091742024)
ASN2 (-1.2563680154935646, -0.3478143078050237)
SER3 (-1.1301186996978088, -0.43179000062903455)
SER4 (-1.3932408231752358, -0.5569298886325121)
ASP5 (-1.3990875781240113, -0.6153789529636478)
GLU6 (-1.0732327532226462, -0.6942234989393498)
GLU7 (-1.1281981705840554, -0.7407819320122645)
LYS8 (-1.114846171253197, -0.7060186520664944)
GLN9 (-1.1209231896545704, -0.7349413192532175)
LEU10 (-1.0753693733913043, -0.7433458148467411)
GLN11 (-1.102470942726099, -0.7653005891387816)
LEU12 (-1.0980756756239562, -0.7920545155490558)
ILE13 (-1.0852565232413844, -0.679495824939244)
THR14 (-1.140503814168088, -0.80273898333612)
SER15 (-1.0945978955211637, -0.7663913620755732)
LEU16 (-1.095165069052245, -0.7141728355612355)
LYS17 (-1.096642465794401, -0.7748727391772476)
GLU18 (-1.163354440860692, -0.7317109700535894)
GLN19 (-1.0519605582376574, -0.8291372772761164)
ALA20 (-1.1275273271123547, -0.56364471

In [10]:
example_df

Unnamed: 0,Protein ID,Residue Name,Residue Position,PHI,PSI
0,A0MZ66,MET,1,,-0.419242
1,A0MZ66,ASN,2,-1.256368,-0.347814
2,A0MZ66,SER,3,-1.130119,-0.431790
3,A0MZ66,SER,4,-1.393241,-0.556930
4,A0MZ66,ASP,5,-1.399088,-0.615379
...,...,...,...,...,...
626,A0MZ66,ASP,627,-2.789701,1.913404
627,A0MZ66,SER,628,0.573859,2.097280
628,A0MZ66,SER,629,1.593444,2.238103
629,A0MZ66,ASN,630,1.072666,1.765689


In [12]:
directory = '../alphafold_data/cif'

# Initialize an empty list to store file paths
file_paths = []

# Walk through the directory and its subdirectories
for root, directories, files in os.walk(directory):
    # Iterate over each file in the current directory
    for file in files:
        # Construct the full path of the file
        file_path = os.path.join(root, file)
        # Append the file path to the list
        file_paths.append(file_path)

# Now, file_paths contains the paths of all files in the directory and its subdirectories
print(file_paths)

['../alphafold_data/cif/O14617.cif', '../alphafold_data/cif/Q9D404.cif', '../alphafold_data/cif/P19525.cif', '../alphafold_data/cif/P62829.cif', '../alphafold_data/cif/Q96PK6.cif', '../alphafold_data/cif/Q9Z0X1.cif', '../alphafold_data/cif/Q99MB2.cif', '../alphafold_data/cif/Q6PDF3.cif', '../alphafold_data/cif/P07900.cif', '../alphafold_data/cif/Q8C6I2.cif', '../alphafold_data/cif/Q86YV0.cif', '../alphafold_data/cif/Q5JQC4.cif', '../alphafold_data/cif/Q8QZT1.cif', '../alphafold_data/cif/Q9CQ92.cif', '../alphafold_data/cif/P47897.cif', '../alphafold_data/cif/Q14203.cif', '../alphafold_data/cif/O75955.cif', '../alphafold_data/cif/Q9Y5B6.cif', '../alphafold_data/cif/P41216.cif', '../alphafold_data/cif/Q9CQN1.cif', '../alphafold_data/cif/P15121.cif', '../alphafold_data/cif/P38919.cif', '../alphafold_data/cif/Q920A5.cif', '../alphafold_data/cif/Q9NXV6.cif', '../alphafold_data/cif/Q9P2K3.cif', '../alphafold_data/cif/Q53HL2.cif', '../alphafold_data/cif/Q96E09.cif', '../alphafold_data/cif/Q8C1

In [19]:
len(file_paths)

1617

In [17]:
pattern = r'/([^/]+)\.cif$'
protein_ids =[]
for file_path in file_paths:
# Search for the pattern in the string
    match = re.search(pattern, file_path)

    # Extract the desired substring from the matched pattern
    desired_substring = match.group(1)
    protein_ids.append(desired_substring)


In [18]:
protein_ids

['O14617',
 'Q9D404',
 'P19525',
 'P62829',
 'Q96PK6',
 'Q9Z0X1',
 'Q99MB2',
 'Q6PDF3',
 'P07900',
 'Q8C6I2',
 'Q86YV0',
 'Q5JQC4',
 'Q8QZT1',
 'Q9CQ92',
 'P47897',
 'Q14203',
 'O75955',
 'Q9Y5B6',
 'P41216',
 'Q9CQN1',
 'P15121',
 'P38919',
 'Q920A5',
 'Q9NXV6',
 'Q9P2K3',
 'Q53HL2',
 'Q96E09',
 'Q8C1W2',
 'Q1ED39',
 'P27144',
 'P42125',
 'Q13418',
 'P50247',
 'O75821',
 'Q9Y6Q5',
 'Q08945',
 'Q5HZI9',
 'P21333',
 'P24539',
 'O14777',
 'Q9CPQ3',
 'Q9UN86',
 'Q8IUD2',
 'P56391',
 'Q923K4',
 'Q9CRD0',
 'P35637',
 'Q9Y3U8',
 'A2ATU0',
 'P39019',
 'P62753',
 'Q9HD42',
 'P54578',
 'Q9CWV0',
 'O14776',
 'P14174',
 'P07108',
 'Q14160',
 'Q6YN16',
 'O75175',
 'P17858',
 'Q99LP6',
 'P09496',
 'Q8WXI9',
 'Q9CW42',
 'Q9UNZ5',
 'O00267',
 'Q8BHE8',
 'Q8C2E4',
 'Q9UL46',
 'Q9D773',
 'Q9CQC7',
 'Q8BWF0',
 'Q9CZS1',
 'Q62425',
 'Q9CXJ1',
 'P05455',
 'P50454',
 'Q60597',
 'Q7L4I2',
 'O75940',
 'P28074',
 'P85094',
 'Q9CQZ5',
 'Q99M87',
 'Q9BYJ9',
 'Q9CQY9',
 'Q9CZ13',
 'Q8R404',
 'Q9QXX4',
 'P55060',

In [24]:
list_of_dfs = []
for cif_file, prot_id in zip(file_paths, protein_ids):
    list_of_dfs.append(get_phi_and_psi(prot_id, cif_file))


Model 0 Chain A
(part 1 of 1)
length 1153
from MET1
to CYS1153
Model 0 Chain A
(part 1 of 1)
length 459
from MET1
to MET459
Model 0 Chain A
(part 1 of 1)
length 551
from MET1
to CYS551
Model 0 Chain A
(part 1 of 1)
length 140
from MET1
to ALA140
Model 0 Chain A
(part 1 of 1)
length 669
from MET1
to MET669
Model 0 Chain A
(part 1 of 1)
length 612
from MET1
to ASP612
Model 0 Chain A
(part 1 of 1)
length 328
from MET1
to SER328
Model 0 Chain A
(part 1 of 1)
length 494
from MET1
to LYS494
Model 0 Chain A
(part 1 of 1)
length 732
from MET1
to ASP732
Model 0 Chain A
(part 1 of 1)
length 164
from MET1
to HIS164
Model 0 Chain A
(part 1 of 1)
length 1011
from MET1
to THR1011
Model 0 Chain A
(part 1 of 1)
length 288
from MET1
to THR288
Model 0 Chain A
(part 1 of 1)
length 424
from MET1
to LEU424
Model 0 Chain A
(part 1 of 1)
length 152
from MET1
to SER152
Model 0 Chain A
(part 1 of 1)
length 775
from MET1
to VAL775
Model 0 Chain A
(part 1 of 1)
length 1278
from MET1
to SER1278
Model 0 Chain A
(p

In [25]:
concat_dihedrals = pd.concat(list_of_dfs)

In [26]:
concat_dihedrals

Unnamed: 0,Protein ID,Residue Name,Residue Position,PHI,PSI
0,O14617,MET,1,,-1.015463
1,O14617,ALA,2,-0.954294,-0.783044
2,O14617,LEU,3,-1.047323,-0.604341
3,O14617,LYS,4,-1.189739,-0.662262
4,O14617,MET,5,-1.213749,-0.669750
...,...,...,...,...,...
251,Q8C3X2,PHE,252,-1.180788,-0.653133
252,Q8C3X2,TRP,253,-1.329307,-0.276255
253,Q8C3X2,LYS,254,-1.518433,-0.312795
254,Q8C3X2,GLU,255,-1.663209,0.034239


In [27]:
concat_dihedrals.to_csv('dihedral_angles.csv')

In [29]:
RvsS_df = pd.read_csv('../datasets/RvsS_peptides_completed_sequence_with_thresholds.csv')

In [33]:
RvsS_df

Unnamed: 0.1,Unnamed: 0,Peptide Sequence,Modified Peptide,Light Modified Peptide,Heavy Modified Peptide,1 Log2 Ratio HL,10 Log2 Ratio HL,11 Log2 Ratio HL,12 Log2 Ratio HL,2 Log2 Ratio HL,...,Log2HL avg,label,Complete Sequence,Sequence Location,Sequence Length,Left Prefix,Left Prefix Length,Methionine Location,Left 20,Right 20
0,0,AADTIGYPVMIR,AADTIGYPVMIR,AADTIGYPVM[649.3660]IR,AADTIGYPVM[655.3735]IR,,,3.269016,,,...,3.289988,green,MTRILTACKVVKTLKSGFGFANVTTKRQWDFSRPGIRLLSVKAKTA...,575,12,AADTIGYPV,9,584,FAVESMEDALKAADTIGYPV,IRSAYALGGLGSGICPNKET
1,1,IAMQTLDMGR,IAMQTLDMGR,IAM[649.3660]QTLDMGR,IAM[655.3735]QTLDMGR,,2.783695,3.114945,2.697822,,...,2.789195,green,MAAALLARARGPLRRALGVRDWRRLHTVYQSVELPETHQMLRQTCR...,262,10,IA,2,264,DCRIPKENLLGEPGMGFKIA,QTLDMGRIGIASQALGIAQA
2,2,FVGAVDPIMEK,FVGAVDPIMEK,FVGAVDPIM[649.3660]EK,FVGAVDPIM[655.3735]EK,,,,,,...,2.555706,green,MASESGKLWGGRFVGAVDPIMEKFNSSISYDRHLWNVDVQGSKAYS...,12,11,FVGAVDPI,8,20,MASESGKLWGGRFVGAVDPI,EKFNSSISYDRHLWNVDVQG
3,3,QAQYLGMPINGPFKPDHYRY,QAQYLGMPINGPFKPDHYRY,QAQYLGM[649.3660]PINGPFKPDHYRY,QAQYLGM[655.3735]PINGPFKPDHYRY,2.394458,2.380664,2.682897,,2.435014,...,2.538998,green,MSDKLPYKVADIGLAAWGRKALDIAENEMPGLMRMREMYSASKPLK...,412,20,QAQYLG,6,418,LGKLNVKLTKLTEKQAQYLG,PINGPFKPDHYRY
4,4,FADVIPMNLPHR,FADVIPMNLPHR,FADVIPM[649.3660]NLPHR,FADVIPM[655.3735]NLPHR,,,2.580647,,2.446614,...,2.389057,green,MDGVSTAILLLLLAVISLSLTFSSRGKGQLPPGPKPLPILGNLLQL...,358,12,FADVIP,6,364,SMPYTDAVIHEVQRFADVIP,NLPHRVTRDTPFRGFLIPKG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,198,AHMVTLDYTVQVPGTGR,AHMVTLDYTVQVPGTGR,AHM[649.3660]VTLDYTVQVPGTGR,AHM[655.3735]VTLDYTVQVPGTGR,-1.822067,-1.687501,-1.706146,-1.647231,-1.793966,...,-1.857135,yellow,MVDSVYRTRSLGVAAEGLPDQYADGEAARVWQLYIGDTRSRTAEYK...,213,17,AH,2,215,KSDLTKDITTSVLTVNNKAH,VTLDYTVQVPGTGRDGSPGF
199,199,KEQESEVDMK,KEQESEVDMK,KEQESEVDM[649.3660]K,KEQESEVDM[655.3735]K,-1.759694,,-1.821773,-1.960428,-1.781592,...,-1.871741,yellow,MYRLSSSMLPRALAQAMRTGHLNGQSLHSSAVAATYKYVNKKEQES...,41,10,KEQESEVD,8,49,SAVAATYKYVNKKEQESEVD,KSATDNAARILMWTELIRGL
200,200,RGVMLAVDAVIAELK,RGVMLAVDAVIAELK,RGVM[649.3660]LAVDAVIAELK,RGVM[655.3735]LAVDAVIAELK,-1.812349,-1.980371,-2.307386,-2.492858,-1.405837,...,-1.950460,yellow,MLRLPTVLRQMRPVSRALAPHLTRAYAKDVKFGADARALMLQGVDL...,141,15,RGV,3,144,KEGFEKISKGANPVEIRRGV,LAVDAVIAELKKQSKPVTTP
201,201,MQLLEIITTDK,MQLLEIITTDK,M[649.3660]QLLEIITTDK,M[655.3735]QLLEIITTDK,-2.286954,-2.700322,-1.941018,-1.860306,-2.069488,...,-2.270072,yellow,MVASRAIGSLSRFSAFRILRSRGCICRSFTTSSALLTRTHINYGVK...,505,11,,0,505,AVSKRPEKVIGMHYFSPVDK,QLLEIITTDKTSKDTTASAV


In [None]:
# testing the positons of the phi and psi angles 


In [None]:
concat_dihedrals[]