<a href="https://colab.research.google.com/github/rajendranast012-stack/in-silico-lab1/blob/main/scaffold_finding_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# =====================================================
# Scaffold-based Word report for rare flavonoids
# =====================================================

!pip install rdkit
!pip install python-docx


import pandas as pd
import os
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem import Draw
from docx import Document
from docx.shared import Inches
from collections import Counter

# --------------------------
# Step 1: Load your existing dataframe
# --------------------------
# Assuming df has 'SMILES', 'common', 'Cluster', 'Sub-cluster' (optional), 'Avg_Tanimoto'
# If not, you can load from Excel:
# df = pd.read_excel("/content/flavonoid_cluster_full.xlsx")

# Keep only valid molecules
df['mol'] = df['SMILES'].apply(lambda x: Chem.MolFromSmiles(x) if x else None)
df = df[df['mol'].notnull()].reset_index(drop=True)

# --------------------------
# Step 2: Compute Murcko scaffolds
# --------------------------
def get_scaffold(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        return Chem.MolToSmiles(scaffold)
    return None

df['scaffold'] = df['SMILES'].apply(get_scaffold)

# --------------------------
# Step 3: Count scaffold frequency
# --------------------------
scaffold_counts = Counter(df['scaffold'])
df['scaffold_freq'] = df['scaffold'].apply(lambda x: scaffold_counts[x])

# Rare scaffolds (frequency <=2)
rare_df = df[df['scaffold_freq'] <= 2].reset_index(drop=True)
print(f"Total rare scaffolds: {len(rare_df)}")

# --------------------------
# Step 4: Create folder for images
# --------------------------
img_dir = "rare_scaffold_images"
os.makedirs(img_dir, exist_ok=True)

# --------------------------
# Step 5: Create Word document
# --------------------------
doc = Document()
doc.add_heading("Rare Scaffold Flavonoids Report", level=1)
doc.add_paragraph(f"Total rare scaffolds: {len(rare_df)}")

# 3 molecules per row
table = doc.add_table(rows=0, cols=3)

for idx, row in rare_df.iterrows():
    if idx % 3 == 0:
        row_cells = table.add_row().cells

    mol = row['mol']
    img_path = f"{img_dir}/rare_mol_{idx}.png"
    Draw.MolToFile(mol, img_path, size=(250,250))

    cell = row_cells[idx % 3]

    # Add molecule info
    cluster_text = f"Cluster: {row['Cluster']}" if 'Cluster' in row else ""
    subcluster_text = f"Sub-cluster: {row['Sub-cluster']}" if 'Sub-cluster' in row else ""
    avg_tanimoto_text = f"Avg Tanimoto: {row['Avg_Tanimoto']:.3f}" if 'Avg_Tanimoto' in row else ""

    cell.paragraphs[0].add_run(
        f"{row['common']}\n{cluster_text} {subcluster_text}\n{avg_tanimoto_text}\n"
    ).bold = True

    cell.add_paragraph(f"SMILES:\n{row['SMILES']}")
    cell.add_paragraph().add_run().add_picture(img_path, width=Inches(1.5))

# --------------------------
# Step 6: Save Word file
# --------------------------
output_file = "rare_scaffold_flavonoids.docx"
doc.save(output_file)
print(f"Word report saved: {output_file}")


Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0
Total rare scaffolds: 47
Word report saved: rare_scaffold_flavonoids.docx
