# Extracted the Sequences and SMILE string from Peptoid Data Bank

#### Use the same conda environment as the one run the Peptoid Data Bank 

In [1]:
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt

## Extract data from url

In [2]:
#The link from RESTful API
#Change according to one's own server address

pep_url = 'http://127.0.0.1:5000/api/peptoids'
res_url = 'http://127.0.0.1:5000/api/residues'

In [28]:
#Sparce the json data

peptoids = requests.get(pep_url).json()
residues = requests.get(res_url).json()

#Save them into .json file

with open('peptoids.json', 'w') as pep_file:
    json.dump(peptoids, pep_file, indent=4)

with open('residues.json', 'w') as res_file:
    json.dump(residues, res_file, indent=4)

In [33]:
df = pd.read_json(pep_url, orient='columns')

print(df)

                                            03AA1-5-A  \
0   {'_links': {'2d_image': '/static/peptoids/03AA...   
1                                                 NaN   
2                                                 NaN   
3                                                 NaN   
4                                                 NaN   
..                                                ...   
83                                                NaN   
84                                                NaN   
85                                                NaN   
86                                                NaN   
87                                                NaN   

                                            06AA1-9-A  \
0                                                 NaN   
1   {'_links': {'2d_image': '/static/peptoids/06AA...   
2                                                 NaN   
3                                                 NaN   
4                             

In [31]:
df_pep = pd.read_json(peptoids)
df_rec = pd.read_json('residues.json')
print(df_pep)

ValueError: Invalid file path or buffer object type: <class 'list'>

In [12]:
#Make residues dictionary 
flat_residues = {k: v for dictionary in residues for k, v in dictionary.items()} #flatten the residues dictionary 
res_df = pd.DataFrame(flat_residues.values(), index=flat_residues.keys()) #make a data frame
res_df.to_csv('residues.csv', index=False)
print(res_df)

                                                        SMILES  \
(S)-N-(1-cyclohexylethyl)glycine  *N([C@@H](C)C1CCCCC1)CC(=O)*   
(R)-N-(1-cyclohexylethyl)glycine   *N([C@H](C)C1CCCCC1)CC(=O)*   
(S)-N-(2-butyl)glycine                  *N([C@@H](C)CC)CC(=O)*   
(R)-N-(2-butyl)glycine                   *N([C@H](C)CC)CC(=O)*   
(S)-N-(1-phenylethyl)glycine      *N([C@@H](C)c1ccccc1)CC(=O)*   
...                                                        ...   
N-(cyclohexyl)methylglycine               *N(CC1CCCCC1)CC(=O)*   
N-pentylglycine                               *N(CCCCC)CC(=O)*   
N-hexylglycine                               *N(CCCCCC)CC(=O)*   
Glycine                                              *NCC(=O)*   
beta-Alanine                                        *NCCC(=O)*   

                                                 full_nomenclature  \
(S)-N-(1-cyclohexylethyl)glycine  (S)-N-(1-cyclohexylethyl)glycine   
(R)-N-(1-cyclohexylethyl)glycine  (R)-N-(1-cyclohexylethyl)glycine 

In [54]:
#Make peptoids dictionary 

# Flatten the JSON data using pd.json_normalize
peptoids_list = []
for peptoid in peptoids:
    for peptoid_code, peptoid_info in peptoid.items():
        peptoid_info['peptoid_code'] = peptoid_code #add a column for peptoid_code
        df = pd.json_normalize(peptoid_info) #create a df for individual peptoid
        peptoids_list.append(df) 

df_pep = pd.concat(peptoids_list, axis=0) #stack all the df vertically 
df_pep.to_csv('peptoids.csv', index=False)        

## Make a sequence dictionary

In [18]:

sequences_dict = {}
experimental_method = 'Solution NMR'

for peptoid in peptoids:
    for peptoid_code, data in peptoid.items():
        if data['experiment'] == experimental_method:
            sequences = data['sequence'].split("ine,")
            sequences = [
                seq.strip() + "ine" if i < len(sequences) - 1 else seq.strip()
                for i, seq in enumerate(sequences)
            ]
            sequences_dict[peptoid_code] = sequences

# Convert sequences_dict to DataFrame
seq_df = pd.DataFrame.from_dict(sequences_dict, orient='index').transpose()
print(seq_df)
seq_df.to_csv('sequences1.csv', index=False)

                      06AA1-9-A                       13AC1-6-A  \
0  (S)-N-(1-phenylethyl)glycine   N-(3,5-dimethylphenyl)glycine   
1  (S)-N-(1-phenylethyl)glycine  (S)-N-(1-naphthylethyl)glycine   
2  (S)-N-(1-phenylethyl)glycine   N-(2,6-dimethylphenyl)glycine   
3  (S)-N-(1-phenylethyl)glycine  (S)-N-(1-naphthylethyl)glycine   
4  (S)-N-(1-phenylethyl)glycine       N-(4-fluorophenyl)glycine   
5  (S)-N-(1-phenylethyl)glycine  (S)-N-(1-naphthylethyl)glycine   
6  (S)-N-(1-phenylethyl)glycine                            None   
7  (S)-N-(1-phenylethyl)glycine                            None   
8  (S)-N-(1-phenylethyl)glycine                            None   
9                          None                            None   

                        17AB1-5-A                       17AB2-8-A  \
0  (S)-N-(1-naphthylethyl)glycine   N-(2,6-dimethylphenyl)glycine   
1   N-(3,5-dimethylphenyl)glycine  (R)-N-(1-naphthylethyl)glycine   
2  (R)-N-(1-naphthylethyl)glycine       N-(4-fluorophen

## Make a SMILE dictionary

In [12]:
smiles_dict = {}

# Iterate over the columns of the DataFrame
for col in seq_df.columns:
    smiles_dict[col] = []  # Initialize the list for storing SMILES
    
    # Iterate over each sequence in the column
    for seq in seq_df[col]:
        # Iterate over each row in res_df to find the corresponding SMILES
        for index, row in res_df.iterrows():
            if seq == row['full_nomenclature']:
                smile = row['SMILES']
                smiles_dict[col].append(smile)
                break  # Exit the inner loop once a match is found

smiles_df = pd.DataFrame.from_dict(smiles_dict, orient='index').transpose()
print(smiles_df)

smiles_df.to_csv('smiles.csv', index=False)

                      06AA1-9-A                           13AC1-6-A  \
0  *N([C@@H](C)c1ccccc1)CC(=O)*           *N(c1cc(C)cc(C)c1)CC(=O)*   
1  *N([C@@H](C)c1ccccc1)CC(=O)*  *N([C@@H](C)c1cccc2ccccc21)CC(=O)*   
2  *N([C@@H](C)c1ccccc1)CC(=O)*           *N(c1c(C)cccc1(C))CC(=O)*   
3  *N([C@@H](C)c1ccccc1)CC(=O)*  *N([C@@H](C)c1cccc2ccccc21)CC(=O)*   
4  *N([C@@H](C)c1ccccc1)CC(=O)*              *N(c1ccc(F)cc1)CC(=O)*   
5  *N([C@@H](C)c1ccccc1)CC(=O)*  *N([C@@H](C)c1cccc2ccccc21)CC(=O)*   
6  *N([C@@H](C)c1ccccc1)CC(=O)*                                None   
7  *N([C@@H](C)c1ccccc1)CC(=O)*                                None   
8  *N([C@@H](C)c1ccccc1)CC(=O)*                                None   
9                          None                                None   

                            17AB1-5-A                           17AB2-8-A  \
0  *N([C@@H](C)c1cccc2ccccc21)CC(=O)*           *N(c1c(C)cccc1(C))CC(=O)*   
1           *N(c1cc(C)cc(C)c1)CC(=O)*   *N([C@H](C)c1cccc2ccccc2

In [11]:
#Clean all the smile for generating the mol2 file
clean_smiles_dict = {}

# Iterate over the columns of the DataFrame
for col in seq_df.columns:
    clean_smiles_dict[col] = []  # Initialize the list for storing SMILES
    
    # Iterate over each sequence in the column
    for seq in seq_df[col]:
        # Iterate over each row in res_df to find the corresponding SMILES
        for index, row in res_df.iterrows():
            if seq == row['full_nomenclature']:
                smile = row['SMILES']
                clean_smile = smile.replace('*','')
                clean_smiles_dict[col].append(clean_smile)
                break  # Exit the inner loop once a match is found

clean_smiles_df = pd.DataFrame.from_dict(clean_smiles_dict, orient='index').transpose()
print(clean_smiles_df)

clean_smiles_df.to_csv('clean_smiles.csv', index=False)

                    06AA1-9-A                         13AC1-6-A  \
0  N([C@@H](C)c1ccccc1)CC(=O)           N(c1cc(C)cc(C)c1)CC(=O)   
1  N([C@@H](C)c1ccccc1)CC(=O)  N([C@@H](C)c1cccc2ccccc21)CC(=O)   
2  N([C@@H](C)c1ccccc1)CC(=O)           N(c1c(C)cccc1(C))CC(=O)   
3  N([C@@H](C)c1ccccc1)CC(=O)  N([C@@H](C)c1cccc2ccccc21)CC(=O)   
4  N([C@@H](C)c1ccccc1)CC(=O)              N(c1ccc(F)cc1)CC(=O)   
5  N([C@@H](C)c1ccccc1)CC(=O)  N([C@@H](C)c1cccc2ccccc21)CC(=O)   
6  N([C@@H](C)c1ccccc1)CC(=O)                              None   
7  N([C@@H](C)c1ccccc1)CC(=O)                              None   
8  N([C@@H](C)c1ccccc1)CC(=O)                              None   
9                        None                              None   

                          17AB1-5-A                         17AB2-8-A  \
0  N([C@@H](C)c1cccc2ccccc21)CC(=O)           N(c1c(C)cccc1(C))CC(=O)   
1           N(c1cc(C)cc(C)c1)CC(=O)   N([C@H](C)c1cccc2ccccc21)CC(=O)   
2   N([C@H](C)c1cccc2ccccc21)CC(=O)        