**DSA103_DataProject: derive_chemistry in Python**

In [None]:
#Libraries
import os
from pathlib import Path

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

In [6]:
#reading input data
# The data directory can be overridden with the DSA103_DATA_DIR environment variable.
# When it is unset, the original hard-coded location is used, so existing setups keep working.
DATA_DIR = Path(os.environ.get(
    "DSA103_DATA_DIR",
    "/Users/riccarda/Desktop/DSA103_DataProject_RW/data"))
# Tab-separated annotation table; one row per annotated feature/structure candidate.
df_tropical = pd.read_csv(DATA_DIR / "mtbs_tropical_annotations.tsv", sep="\t")
df_tropical.head()

Unnamed: 0,feature_id,component_id,libname,structure_inchikey,structure_smiles,structure_molecular_formula,structure_taxonomy_npclassifier_01pathway,structure_taxonomy_npclassifier_02superclass,structure_taxonomy_npclassifier_03class
0,64000,2173,MS1_match,SRBFZHDQGSBBOR-HWQSCIPKSA-N,OC1OC[C@H](O)[C@H](O)[C@H]1O,C5H10O5,Carbohydrates,Saccharides,Monosaccharides
1,64000,2173,MS1_match,PYMYPHUHKUWMLA-WDCZJNDASA-N,O=C[C@@H](O)[C@H](O)[C@H](O)CO,C5H10O5,Carbohydrates,Saccharides,Monosaccharides
2,63994,2138,MS1_match,GLDOVTGHNKAZLK-UHFFFAOYSA-N,CCCCCCCCCCCCCCCCCCO,C18H38O,Fatty acids,Fatty acyls,Fatty alcohols
3,63993,47,ISDB,FZFFGBOPCQADGY-UHFFFAOYSA-N,CC(C)=CCCC(C)=CCC1(CC=C(C)C)C(=O)C(O)=Cc2oc3cc...,C28H32O6,Shikimates and Phenylpropanoids,Xanthones,Plant xanthones
4,63991,664,MS1_match,KFXIUXCXSKTCNK-KLGAAMDDSA-N,C=C1C2=Nc3ccccc3[C@@]23CCN2C/C(=C/C)[C@H]1C[C@...,C19H20N2,Terpenoids,Diterpenoids,Valparane diterpenoids


In [7]:
#Keep only the rows that carry a usable SMILES string
# A row is rejected when its SMILES is missing (NaN) or is the empty string.
mask = ~(df_tropical["structure_smiles"].isna()
         | df_tropical["structure_smiles"].eq(""))

compounds_ok = df_tropical.loc[mask]

In [8]:
#returns the most frequently occurring non-missing value
def most_common(series):
    """Return the most frequent non-missing value in ``series``.

    Missing values are excluded from the count. If no non-missing
    value remains (empty or all-missing input), ``None`` is returned.
    """
    frequencies = series.value_counts(dropna=True)
    return frequencies.idxmax() if len(frequencies) != 0 else None

In [9]:
#One row per unique SMILES, labelled with the pathway seen most often for that structure
compReady = (
    compounds_ok
    .groupby("structure_smiles")["structure_taxonomy_npclassifier_01pathway"]
    .agg(most_common)
    .reset_index()
)

#Synthetic structure ID: "S" followed by the 1-based row position (S1, S2, ...)
compReady["SID"] = (compReady.index + 1).map("S{}".format)
compReady.head()

Unnamed: 0,structure_smiles,structure_taxonomy_npclassifier_01pathway,SID
0,C#C/C=C\CCCC#C/C=C/CCCCCCC/C=C\C#C,Fatty acids,S1
1,C#C/C=C\CCCC#CCCCCCCCCCCC#C,Fatty acids,S2
2,C#C/C=C\CCCCC#CCCCCC#CCCCC#CCO,Fatty acids,S3
3,C#CC#CC/C=C/CCCCC/C=C/C(=O)N1CCCCC1,Alkaloids,S4
4,C#CC#CC=CC=CC=CCCO,Fatty acids,S5


In [10]:
# 1-based positions, within Descriptors.descList, of the descriptors treated as redundant
# NOTE(review): these positions presumably match the descList order of one specific
# RDKit release - confirm they still point at the intended descriptors after an upgrade.
redundant_raw = {2, 7, 8, 11, 15, 17, 18, 20, 21, 24, 29, 41}
redundant_raw |= set(range(33, 39)) | set(range(43, 46))
# Shift to the 0-based positions produced by enumerate()
redundant = set(position - 1 for position in redundant_raw)
# Keep every (name, function) entry whose position is not flagged as redundant
desc_used = [
    descriptor
    for position, descriptor in enumerate(Descriptors.descList)
    if position not in redundant
]

#Parse every SMILES string into an RDKit molecule object, one per row
compReady["mol"] = [
    Chem.MolFromSmiles(smiles) for smiles in compReady["structure_smiles"]
]

In [13]:
#calculates a set of chemical descriptors for each molecule
#and stores the results in a new DataFrame (one row per molecule, one column per descriptor)
# Chem.MolFromSmiles yields None for a SMILES string it cannot parse, and the descriptor
# functions cannot be evaluated on None. Such molecules get an all-NaN row instead of
# aborting the whole loop; emitting exactly one row per molecule keeps describe_df
# positionally aligned with compReady.
rows = []
for mol in compReady["mol"]:
    if mol is None:
        rows.append({name: np.nan for name, _ in desc_used})
    else:
        rows.append({name: func(mol) for name, func in desc_used})

describe_df = pd.DataFrame(rows)

In [14]:
#Place the descriptor table (describe_df) next to the molecule table (compReady),
#matching rows by position, to obtain one combined DataFrame
compOut = pd.concat(
    objs=[compReady.reset_index(drop=True), describe_df],
    axis="columns",
)
compOut.head()


Unnamed: 0,structure_smiles,structure_taxonomy_npclassifier_01pathway,SID,mol,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,ExactMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,C#C/C=C\CCCC#C/C=C/CCCCCCC/C=C\C#C,Fatty acids,S1,<rdkit.Chem.rdchem.Mol object at 0x160d99e00>,5.138392,0.937871,0.937871,0.332032,10.636364,292.219101,...,0,0,0,2,0,0,0,0,8,0
1,C#C/C=C\CCCC#CCCCCCCCCCCC#C,Fatty acids,S2,<rdkit.Chem.rdchem.Mol object at 0x160d99ee0>,5.223961,0.943318,0.943318,0.309396,9.809524,282.234751,...,0,0,0,2,0,0,0,0,11,0
2,C#C/C=C\CCCCC#CCCCCC#CCCCC#CCO,Fatty acids,S3,<rdkit.Chem.rdchem.Mol object at 0x160d99930>,8.490439,0.043231,-0.043231,0.492037,9.043478,308.214016,...,0,0,0,1,0,0,0,0,8,0
3,C#CC#CC/C=C/CCCCC/C=C/C(=O)N1CCCCC1,Alkaloids,S4,<rdkit.Chem.rdchem.Mol object at 0x160d99c40>,11.886906,0.190474,0.190474,0.285907,14.772727,297.209264,...,0,0,0,1,0,0,0,0,4,0
4,C#CC#CC=CC=CC=CCCO,Fatty acids,S5,<rdkit.Chem.rdchem.Mol object at 0x160d9a0a0>,8.439692,0.188318,0.188318,0.504961,10.461538,172.088815,...,0,0,0,1,0,0,0,0,0,0


In [15]:
#saves the combined dataset as a CSV file named classifier_pathway.csv
# The "mol" column holds in-memory RDKit objects; written to CSV they become
# "<rdkit.Chem.rdchem.Mol object at 0x...>" strings that carry no information and
# change on every run, so the column is left out of the export. The structure itself
# remains available through "structure_smiles". The row index is still written,
# as before.
compOut.drop(columns="mol").to_csv('classifier_pathway.csv')