# Extracted the Sequences and SMILE string from Peptoid Data Bank

#### Use the same conda environment as the one run the Peptoid Data Bank 

In [16]:
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt

## Extract data from url

In [17]:
#The link from RESTful API
#Change according to one's own server address

pep_url = 'http://127.0.0.1:5000/api/peptoids'
res_url = 'http://127.0.0.1:5000/api/residues'

In [18]:
#Sparce the json data

peptoids = requests.get(pep_url).json()
residues = requests.get(res_url).json()

#Save them into .json file

with open('data/peptoids.json', 'w') as pep_file:
    json.dump(peptoids, pep_file, indent=4)

with open('data/residues.json', 'w') as res_file:
    json.dump(residues, res_file, indent=4)

In [19]:
#Make residues dictionary 
flat_residues = {k: v for dictionary in residues for k, v in dictionary.items()} #flatten the residues dictionary 
df_res = pd.DataFrame(flat_residues.values(), index=flat_residues.keys()) #make a data frame
df_res.to_csv('data/residues.csv', index=False)
print(df_res)

                                                        SMILES  \
(S)-N-(1-cyclohexylethyl)glycine  *N([C@@H](C)C1CCCCC1)CC(=O)*   
(R)-N-(1-cyclohexylethyl)glycine   *N([C@H](C)C1CCCCC1)CC(=O)*   
(S)-N-(2-butyl)glycine                  *N([C@@H](C)CC)CC(=O)*   
(R)-N-(2-butyl)glycine                   *N([C@H](C)CC)CC(=O)*   
(S)-N-(1-phenylethyl)glycine      *N([C@@H](C)c1ccccc1)CC(=O)*   
...                                                        ...   
N-(cyclohexyl)methylglycine               *N(CC1CCCCC1)CC(=O)*   
N-pentylglycine                               *N(CCCCC)CC(=O)*   
N-hexylglycine                               *N(CCCCCC)CC(=O)*   
Glycine                                              *NCC(=O)*   
beta-Alanine                                        *NCCC(=O)*   

                                                 full_nomenclature  \
(S)-N-(1-cyclohexylethyl)glycine  (S)-N-(1-cyclohexylethyl)glycine   
(R)-N-(1-cyclohexylethyl)glycine  (R)-N-(1-cyclohexylethyl)glycine 

In [20]:
#Make peptoids dictionary 
peptoids_list = []
for peptoid in peptoids:
    for peptoid_code, peptoid_info in peptoid.items():
        peptoid_info['peptoid_code'] = peptoid_code #add a column for peptoid_code
        df = pd.json_normalize(peptoid_info) #create a df for individual peptoid, Flatten the JSON data using pd.json_normalize
        peptoids_list.append(df) 

df_pep = pd.concat(peptoids_list, axis=0) #stack all the df vertically 
df_pep.to_csv('data/peptoids.csv', index=False)   

print(df_pep)

   c_term                                           citation cyc_points  \
0      *N  \n\n\n@article{Wu_2003,\n\tdoi = {10.1021/ja03...              
0      *N  \n\n\n@article{Huang_2006,\n\tdoi = {10.1021/j...              
0      *2  \n\n\n@article{Shin_2007,\n\tdoi = {10.1021/ja...              
0      *2  \n\n\n@article{Shin_2007,\n\tdoi = {10.1021/ja...              
0      *1  \n\n\n@article{Roy_2008,\n\tdoi = {10.1021/ol7...              
..    ...                                                ...        ...   
0      *N  \n\n\n@article{Ghosh_2020,\n\tdoi = {10.1002/c...       None   
0      *N  \n\n\n@article{Ghosh_2020,\n\tdoi = {10.1002/c...       None   
0      *N  \n\n\n@article{Ghosh_2020,\n\tdoi = {10.1002/c...       None   
0      *N  \n\n\n@article{Ghosh_2020,\n\tdoi = {10.1002/c...       None   
0      *2  \n\n\n@article{SHIMIZU_2009,\n\tdoi = {10.1111...       None   

           experiment n_term                             pub_doi  \
0   X-Ray Diffraction      *   

## Make a sequence dictionary

In [21]:
#create a csv with each indivual peptoid code as a column, and the sequence list in peptoid data bank and input
sequences_dict = {}
experimental_method = 'Solution NMR' #select the experimental method 

for peptoid in peptoids:
    for peptoid_code, data in peptoid.items():
        if data['experiment'] == experimental_method:
            sequences = data['sequence'].split("ine,")
            sequences = [
                seq.strip() + "ine" if i < len(sequences) - 1 else seq.strip() #making sure the sequence list are splited as desired 
                for i, seq in enumerate(sequences)
            ]
            sequences_dict[peptoid_code] = sequences

# Convert sequences_dict to DataFrame
df_seq = pd.DataFrame.from_dict(sequences_dict, orient='index').transpose()
print(df_seq)
df_seq.to_csv('data/sequences.csv', index=False)

                      06AA1-9-A                       13AC1-6-A  \
0  (S)-N-(1-phenylethyl)glycine   N-(3,5-dimethylphenyl)glycine   
1  (S)-N-(1-phenylethyl)glycine  (S)-N-(1-naphthylethyl)glycine   
2  (S)-N-(1-phenylethyl)glycine   N-(2,6-dimethylphenyl)glycine   
3  (S)-N-(1-phenylethyl)glycine  (S)-N-(1-naphthylethyl)glycine   
4  (S)-N-(1-phenylethyl)glycine       N-(4-fluorophenyl)glycine   
5  (S)-N-(1-phenylethyl)glycine  (S)-N-(1-naphthylethyl)glycine   
6  (S)-N-(1-phenylethyl)glycine                            None   
7  (S)-N-(1-phenylethyl)glycine                            None   
8  (S)-N-(1-phenylethyl)glycine                            None   
9                          None                            None   

                        17AB1-5-A                       17AB2-8-A  \
0  (S)-N-(1-naphthylethyl)glycine   N-(2,6-dimethylphenyl)glycine   
1   N-(3,5-dimethylphenyl)glycine  (R)-N-(1-naphthylethyl)glycine   
2  (R)-N-(1-naphthylethyl)glycine       N-(4-fluorophen

## Make a SMILE dictionary

In [22]:

smiles_dict = {}

# Iterate over the columns of the DataFrame
for col in df_seq.columns:
    smiles_dict[col] = []  # Initialize the list for storing SMILES
    
    # Iterate over each sequence in the column
    for seq in df_seq[col]:
        # Iterate over each row in res_df to find the corresponding SMILES
        for index, row in df_res.iterrows():
            if seq == row['full_nomenclature']:
                smile = row['SMILES']
                smiles_dict[col].append(smile)
                break  # Exit the inner loop once a match is found

df_smiles = pd.DataFrame.from_dict(smiles_dict, orient='index').transpose()
print(df_smiles)

df_smiles.to_csv('data/smiles.csv', index=False)

                      06AA1-9-A                           13AC1-6-A  \
0  *N([C@@H](C)c1ccccc1)CC(=O)*           *N(c1cc(C)cc(C)c1)CC(=O)*   
1  *N([C@@H](C)c1ccccc1)CC(=O)*  *N([C@@H](C)c1cccc2ccccc21)CC(=O)*   
2  *N([C@@H](C)c1ccccc1)CC(=O)*           *N(c1c(C)cccc1(C))CC(=O)*   
3  *N([C@@H](C)c1ccccc1)CC(=O)*  *N([C@@H](C)c1cccc2ccccc21)CC(=O)*   
4  *N([C@@H](C)c1ccccc1)CC(=O)*              *N(c1ccc(F)cc1)CC(=O)*   
5  *N([C@@H](C)c1ccccc1)CC(=O)*  *N([C@@H](C)c1cccc2ccccc21)CC(=O)*   
6  *N([C@@H](C)c1ccccc1)CC(=O)*                                None   
7  *N([C@@H](C)c1ccccc1)CC(=O)*                                None   
8  *N([C@@H](C)c1ccccc1)CC(=O)*                                None   
9                          None                                None   

                            17AB1-5-A                           17AB2-8-A  \
0  *N([C@@H](C)c1cccc2ccccc21)CC(=O)*           *N(c1c(C)cccc1(C))CC(=O)*   
1           *N(c1cc(C)cc(C)c1)CC(=O)*   *N([C@H](C)c1cccc2ccccc2

In [23]:
#Capped all the smile for generating the mol2 file
capped_smiles_dict = {}

# Iterate over the columns of the DataFrame
for col in df_seq.columns:
    capped_smiles_dict[col] = []  # Initialize the list for storing SMILES
    
    # Iterate over each sequence in the column
    for seq in df_seq[col]:
        # Iterate over each row in res_df to find the corresponding SMILES
        for index, row in df_res.iterrows():
            if seq == row['full_nomenclature']:
                smile = row['SMILES']
                clean_smile = smile.replace('*','')
                capped_smile = f"CC(=O){clean_smile}N(C)C"
                capped_smiles_dict[col].append(capped_smile)
                break  # Exit the inner loop once a match is found

capped_smiles_df = pd.DataFrame.from_dict(capped_smiles_dict, orient='index').transpose()
print(capped_smiles_df)

capped_smiles_df.to_csv('data/capped_smiles.csv', index=False)

                               06AA1-9-A  \
0  CC(=O)N([C@@H](C)c1ccccc1)CC(=O)N(C)C   
1  CC(=O)N([C@@H](C)c1ccccc1)CC(=O)N(C)C   
2  CC(=O)N([C@@H](C)c1ccccc1)CC(=O)N(C)C   
3  CC(=O)N([C@@H](C)c1ccccc1)CC(=O)N(C)C   
4  CC(=O)N([C@@H](C)c1ccccc1)CC(=O)N(C)C   
5  CC(=O)N([C@@H](C)c1ccccc1)CC(=O)N(C)C   
6  CC(=O)N([C@@H](C)c1ccccc1)CC(=O)N(C)C   
7  CC(=O)N([C@@H](C)c1ccccc1)CC(=O)N(C)C   
8  CC(=O)N([C@@H](C)c1ccccc1)CC(=O)N(C)C   
9                                   None   

                                     13AC1-6-A  \
0           CC(=O)N(c1cc(C)cc(C)c1)CC(=O)N(C)C   
1  CC(=O)N([C@@H](C)c1cccc2ccccc21)CC(=O)N(C)C   
2           CC(=O)N(c1c(C)cccc1(C))CC(=O)N(C)C   
3  CC(=O)N([C@@H](C)c1cccc2ccccc21)CC(=O)N(C)C   
4              CC(=O)N(c1ccc(F)cc1)CC(=O)N(C)C   
5  CC(=O)N([C@@H](C)c1cccc2ccccc21)CC(=O)N(C)C   
6                                         None   
7                                         None   
8                                         None   
9              