<a href="https://colab.research.google.com/github/rajendranast012-stack/in-silico-lab1/blob/main/scaffold_finding_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# =====================================================
# Scaffold-based Word report for rare flavonoids
# =====================================================

!pip install rdkit
!pip install python-docx


import pandas as pd
import os
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem import Draw
from docx import Document
from docx.shared import Inches
from collections import Counter

# --------------------------
# Step 1: Load your existing dataframe
# --------------------------
# Assuming df has 'SMILES', 'common', 'Cluster', 'Sub-cluster' (optional), 'Avg_Tanimoto'
# If not, you can load from Excel:
# df = pd.read_excel("/content/flavonoid_cluster_full.xlsx")

# Keep only valid molecules
df['mol'] = df['SMILES'].apply(lambda x: Chem.MolFromSmiles(x) if x else None)
df = df[df['mol'].notnull()].reset_index(drop=True)

# --------------------------
# Step 2: Compute Murcko scaffolds
# --------------------------
def get_scaffold(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        return Chem.MolToSmiles(scaffold)
    return None

df['scaffold'] = df['SMILES'].apply(get_scaffold)

# --------------------------
# Step 3: Count scaffold frequency
# --------------------------
scaffold_counts = Counter(df['scaffold'])
df['scaffold_freq'] = df['scaffold'].apply(lambda x: scaffold_counts[x])

# Rare scaffolds (frequency <=2)
rare_df = df[df['scaffold_freq'] <= 2].reset_index(drop=True)
print(f"Total rare scaffolds: {len(rare_df)}")

# --------------------------
# Step 4: Create folder for images
# --------------------------
img_dir = "rare_scaffold_images"
os.makedirs(img_dir, exist_ok=True)

# --------------------------
# Step 5: Create Word document
# --------------------------
doc = Document()
doc.add_heading("Rare Scaffold Flavonoids Report", level=1)
doc.add_paragraph(f"Total rare scaffolds: {len(rare_df)}")

# 3 molecules per row
table = doc.add_table(rows=0, cols=3)

for idx, row in rare_df.iterrows():
    if idx % 3 == 0:
        row_cells = table.add_row().cells

    mol = row['mol']
    img_path = f"{img_dir}/rare_mol_{idx}.png"
    Draw.MolToFile(mol, img_path, size=(250,250))

    cell = row_cells[idx % 3]

    # Add molecule info
    cluster_text = f"Cluster: {row['Cluster']}" if 'Cluster' in row else ""
    subcluster_text = f"Sub-cluster: {row['Sub-cluster']}" if 'Sub-cluster' in row else ""
    avg_tanimoto_text = f"Avg Tanimoto: {row['Avg_Tanimoto']:.3f}" if 'Avg_Tanimoto' in row else ""

    cell.paragraphs[0].add_run(
        f"{row['common']}\n{cluster_text} {subcluster_text}\n{avg_tanimoto_text}\n"
    ).bold = True

    cell.add_paragraph(f"SMILES:\n{row['SMILES']}")
    cell.add_paragraph().add_run().add_picture(img_path, width=Inches(1.5))

# --------------------------
# Step 6: Save Word file
# --------------------------
output_file = "rare_scaffold_flavonoids.docx"
doc.save(output_file)
print(f"Word report saved: {output_file}")


Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0
Total rare scaffolds: 47
Word report saved: rare_scaffold_flavonoids.docx


**pseudo QSAR**



In [9]:
# =====================================================
# Flavonoid novelty + pseudo-QSAR + Word report
# =====================================================

!pip install python-docx pandas scikit-learn

import pandas as pd
import numpy as np
import os
from rdkit import Chem, DataStructs
from rdkit.Chem import MACCSkeys, Draw
from rdkit.Chem.Scaffolds import MurckoScaffold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from docx import Document
from docx.shared import Inches
from collections import Counter

# --------------------------
# Step 1: Load dataset
# --------------------------
input_file = "/content/flavonoid_smile.xlsx"
df = pd.read_excel(input_file)

# Convert SMILES to RDKit molecules safely
df['mol'] = df['SMILES'].apply(lambda x: Chem.MolFromSmiles(x) if x else None)
df = df[df['mol'].notnull()].reset_index(drop=True)

# --------------------------
# Step 2: Generate MACCS fingerprints
# --------------------------
fps = [MACCSkeys.GenMACCSKeys(mol) for mol in df['mol']]
fps_array = np.zeros((len(fps), fps[0].GetNumBits()), dtype=int)
for i, fp in enumerate(fps):
    DataStructs.ConvertToNumpyArray(fp, fps_array[i])

# --------------------------
# Step 3: Pseudo-activity labeling
# --------------------------
known_active_smiles = [
    "COC1=CC=C(C=C1)C2=COC3=C(C2=O)C=CC(=C3)O",  # example
    "CC1=CC=C(C=C1)C2=OCC3=C(C2=O)C=CC(=C3)O"     # add more literature actives
]
# Convert SMILES to RDKit molecules safely and filter out invalid ones
known_active_mols = [Chem.MolFromSmiles(s) for s in known_active_smiles]
known_active_mols = [mol for mol in known_active_mols if mol is not None]

known_active_fps = [MACCSkeys.GenMACCSKeys(mol) for mol in known_active_mols]

# Compute max Tanimoto similarity to known actives
pseudo_scores = []
for fp in fps:
    # Ensure there are known_active_fps to compare against
    if known_active_fps:
        sims = [DataStructs.TanimotoSimilarity(fp, kfp) for kfp in known_active_fps]
        pseudo_scores.append(max(sims))
    else:
        pseudo_scores.append(0.0) # Assign a default score if no valid known actives
df['Pseudo_Activity'] = pseudo_scores

# Convert to binary label (optional)
threshold = 0.6
df['Pseudo_Label'] = (df['Pseudo_Activity'] >= threshold).astype(int)

# --------------------------
# Step 4: Train Random Forest QSAR
# --------------------------
X = fps_array
y = df['Pseudo_Label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(n_estimators=500, random_state=42)
rf.fit(X_train, y_train)

# --------------------------
# Step 5: Detect Murcko scaffolds
# --------------------------
def get_scaffold(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        return Chem.MolToSmiles(scaffold)
    return None

df['scaffold'] = df['SMILES'].apply(get_scaffold)

# Compute scaffold frequency
scaffold_counts = Counter(df['scaffold'])
df['scaffold_freq'] = df['scaffold'].apply(lambda x: scaffold_counts[x])

# Rare scaffolds (frequency <= 2)
df_rare = df[df['scaffold_freq'] <= 2].reset_index(drop=True)

# --------------------------
# Step 6: Predict pseudo-activity for rare scaffolds
# --------------------------
rare_fps_array = np.zeros((len(df_rare), fps[0].GetNumBits()), dtype=int)
for i, mol in enumerate(df_rare['mol']):
    fp = MACCSkeys.GenMACCSKeys(mol)
    DataStructs.ConvertToNumpyArray(fp, rare_fps_array[i])

df_rare['Predicted_Activity'] = rf.predict(rare_fps_array)

# --------------------------
# Step 7: Generate Word report with images
# --------------------------
img_dir = "rare_scaffold_images"
os.makedirs(img_dir, exist_ok=True)

doc = Document()
doc.add_heading("Rare Scaffold Flavonoids with Predicted Activity", level=1)
doc.add_paragraph(f"Total rare scaffolds: {len(df_rare)}")

table = doc.add_table(rows=0, cols=3)

for idx, row in df_rare.iterrows():
    if idx % 3 == 0:
        row_cells = table.add_row().cells

    mol = row['mol']
    img_path = f"{img_dir}/rare_mol_{idx}.png"
    Draw.MolToFile(mol, img_path, size=(250,250))

    cell = row_cells[idx % 3]

    cluster_text = f"Cluster: {row['Cluster']}" if 'Cluster' in row else ""
    subcluster_text = f"Sub-cluster: {row['Sub-cluster']}" if 'Sub-cluster' in row else ""
    avg_tanimoto_text = f"Avg Tanimoto: {row['Avg_Tanimoto']:.3f}" if 'Avg_Tanimoto' in row else ""

    cell.paragraphs[0].add_run(
        f"{row['common']}\n{cluster_text} {subcluster_text}\n{avg_tanimoto_text}\n"
        f"Predicted Activity: {row['Predicted_Activity']}\n"
    ).bold = True

    cell.add_paragraph(f"SMILES:\n{row['SMILES']}")
    cell.add_paragraph().add_run().add_picture(img_path, width=Inches(1.5))

output_file = "rare_scaffold_flavonoids_report.docx"
doc.save(output_file)
print(f"Word report saved: {output_file}")



[19:18:04] Explicit valence for atom # 8 O, 3, is greater than permitted


Word report saved: rare_scaffold_flavonoids_report.docx
