# Installed Python libraries

In [2]:
! pip install pubchempy



# Find chemical data for lonicerae flos

## The chemicals are recorded in a text file. 
These are reported in a chinese medicine article about the chemicals found in the herb through some analysis.


In [100]:
file_path = 'lonicerae_clean.txt'

# Open the file and read the lines into a list
with open(file_path, 'r') as file:
    lines = file.readlines()

# Strip newline characters from each line. Also remove empty lines. Also change′ to '
lines = [line.strip().replace('′','\'') for line in lines if line.strip()]
lines = [line.replace('β','beta') for line in lines]

#lines


In [73]:
def getCompounds(names):
    data = {}
    for chemical_name in names:
        compounds = pcp.get_compounds(chemical_name, 'name')
        data[chemical_name] = compounds
    return data


In [78]:
data = getCompounds(lines)

## Save object to file. It take time to load 100 chemical data from the PubChem. So we save it locally and reuse them later

In [75]:
import pickle

# File path to save the serialized object
file_path = 'data.pkl'

# Open the file in binary write mode and dump the object
with open(file_path, 'wb') as file:
    pickle.dump(data, file)

print("Object data has been serialized and saved to file.")

Object data has been serialized and saved to file.


In [35]:
import pickle

# File path to load the serialized object from
file_path = 'data.pkl'

# Open the file in binary read mode and load the object
with open(file_path, 'rb') as file:
    data = pickle.load(file)

print("Object has been deserialized and loaded from file.")


Object has been deserialized and loaded from file.


In [76]:
print(len(data.items()))
for name, compounds in data.items():
    print(name, compounds)


112
5,7-Dihydroxyflavone [Compound(5281607)]
5-Hydroxy-7,4'-dimethoxyflavone [Compound(5281601)]
5-Hydroxy-7,3',4'-trimethoxyflavone [Compound(5272653)]
5-Hydroxy-7,3',4',5'-tetramethoxyflavone []
3',4',5',5,7-Pentamethoxyflavone [Compound(493376)]
Luteolin [Compound(5280445)]
3'-Methoxyluteolin []
5,3'-Dimethoxyluteolin []
Luteolin-7-O-beta-D-glucoside []
Luteolin-7-O-beta-D-galactoside []
Luteolin-5-O-beta-D-glucoside [Compound(5317471)]
Luteolin-3'-O-L-rhamnoside []
Jaceosidin [Compound(5379096)]
Jaceosidin-7-O-neohesperidoside []
Lonicerin [Compound(5282152)]
Kaempferol-3-O-beta-D-glucoside []
Apigenin-7-O-α-L-rhamnoside []
Quercetin [Compound(5280343)]
Quercetin-3-O-beta-D-glucoside [Compound(25203368)]
Quercetin-7-O-beta-D-glucoside []
Hyperoside [Compound(5281643)]
Medioresinol-7-O-beta-D-glucoside []
Medioresinol-7-O-neohesperidoside []
Flavo-yadorinin-B []
Rhoifolin [Compound(5282150)]
Rutin [Compound(5280805)]
Loganin [Compound(87691)]
7-Epiloganin []
8-Epiloganin [Compound(1

### The following compounds, we could not find them in pubchem. 
I plan to ignore them for now. I imagine there should be other ways to find the data, beyond pubchem.

In [80]:
[key for key, compounds in data.items() if len(compounds)==0]


["5-Hydroxy-7,3',4',5'-tetramethoxyflavone",
 "3'-Methoxyluteolin",
 "5,3'-Dimethoxyluteolin",
 'Luteolin-7-O-beta-D-glucoside',
 'Luteolin-7-O-beta-D-galactoside',
 "Luteolin-3'-O-L-rhamnoside",
 'Jaceosidin-7-O-neohesperidoside',
 'Kaempferol-3-O-beta-D-glucoside',
 'Apigenin-7-O-α-L-rhamnoside',
 'Quercetin-7-O-beta-D-glucoside',
 'Medioresinol-7-O-beta-D-glucoside',
 'Medioresinol-7-O-neohesperidoside',
 'Flavoyadorinin B',
 '7-Epiloganin',
 'Ketologanin',
 '7-O-Ethyl Sweroside',
 'Secologanoside-7-Methyl Ester',
 '7α-Morroniside',
 '7beta-Morroniside',
 'Dehydromorroniside',
 'Secoxyloganin 7-Butyl Ester',
 'Secologanoside A',
 'Adinoside A',
 'Stryspinoside',
 'Loniceracetalide B',
 'L-Phenylalaninosecologanin',
 '7-O-(4-beta-D-Glucopyranosyloxy-3-Methoxy Benzoyl) Secologanolic Acid',
 "6'-O-(7α-Hydroxyswerosyloxy) Loganin",
 'Lonijaposide O',
 'Lonijaposide P',
 'Lonijaposide Q',
 'Lonijaposide R',
 'Lonijaposide S',
 'Lonijaposide T',
 'Lonijaposide U',
 'Lonijaposide V',
 'Lon

# Now get the known acne medicines

I search "acne" on PubChem website and found 37 medicines. PubChem allows me to download them as csv.

In [97]:
import csv

# the medicine list is downloaded from pubchem after searching "acne". It is stored in a csv
acne_filename = 'PubChem_compound_text_acne.csv'

# Reading from a CSV file with headers
with open(acne_filename, newline='') as csvfile:
    csvreader = csv.DictReader(csvfile)
    
    # check the fields. Get the header names
    headers = csvreader.fieldnames
    print("Headers:", headers) 
    
    # Check one line of data
    for row in csvreader:
        print(row)
        break
    csvreader

    




Headers: ['\ufeff cid', 'cmpdname', 'cmpdsynonym', 'mw', 'mf', 'polararea', 'complexity', 'xlogp', 'heavycnt', 'hbonddonor', 'hbondacc', 'rotbonds', 'inchi', 'isosmiles', 'canonicalsmiles', 'inchikey', 'iupacname', 'exactmass', 'monoisotopicmass', 'charge', 'covalentunitcnt', 'isotopeatomcnt', 'totalatomstereocnt', 'definedatomstereocnt', 'undefinedatomstereocnt', 'totalbondstereocnt', 'definedbondstereocnt', 'undefinedbondstereocnt', 'pclidcnt', 'gpidcnt', 'gpfamilycnt', 'neighbortype', 'meshheadings', 'annothits', 'annothitcnt', 'aids', 'cidcdate', 'sidsrcname', 'depcatg', 'annotation']
{'\ufeff cid': '5743', 'cmpdname': 'Dexamethasone', 'cmpdsynonym': 'dexamethasone|50-02-2|Decadron|Maxidex|Dexamethazone|Hexadecadrol|Dexasone|Hexadrol|Aeroseb-Dex|Fluormethylprednisolone|Desametasone|Superprednol|Visumetazone|Cortisumman|Decaderm|Decaspray|Dexacortal|Dexacortin|Gammacorten|Millicorten|Oradexon|Auxiron|Calonat|Dexason|Dexone|Dectancyl|Deltafluorene|Desamethasone|Desameton|Fortecortin|

# It seems that we need to collect features

Chemical properties: molecular weight, solubility, structure, functional groups, and other chemical properties.
Then we can build a model to predict if a model can help with acne based on these features

# Garbage old code bank

In [71]:
import pubchempy as pcp

# Search for a compound by name
chemical_name = "Apigenin-7-O-alpha-L-rhamnoside" #'aspirin'

compounds = pcp.get_compounds(chemical_name, 'name')

# Print information about the found compounds
for compound in compounds:
    print(f"IUPAC Name: {compound.iupac_name}")
    print(f"Molecular Formula: {compound.molecular_formula}")
    print(f"Molecular Weight: {compound.molecular_weight}")
    print(f"Canonical SMILES: {compound.canonical_smiles}")
    print(f"Isomeric SMILES: {compound.isomeric_smiles}")
    print(f"InChI: {compound.inchi}")
    print(f"InChI Key: {compound.inchikey}")
    print(f"CID: {compound.cid}")
    print("-" * 30)

In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
import pandas as np

# Sample SMILES data
smiles_list = ["CCO", "CC(=O)O", "CCC", "CCN"]

# Convert SMILES to RDKit molecule objects
molecules = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]

# Calculate molecular descriptors
def calculate_descriptors(mol):
    descriptors = {
        'MolWt': Descriptors.MolWt(mol),
        'LogP': Descriptors.MolLogP(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol)
    }
    return descriptors

descriptor_list = [calculate_descriptors(mol) for mol in molecules]

# Create a DataFrame for features
df = pd.DataFrame(descriptor_list)

# Example labels (binary classification)
labels = [0, 1, 0, 1]

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.2, random_state=42)

# Train a Random Forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate model using cross-validation
scores = cross_val_score(model, X_test, y_test, cv=5)
print("Model Accuracy: ", np.mean(scores))