## AI-Powered Scientific Field Classifier

We can replace our current keyword-based `_identify_research_field` function with a much more robust machine learning classifier. This will provide more accurate and nuanced categorization of research articles.

### Scientific Purpose:

- *Accurate Thematic Grouping*: Automatically and accurately classify articles into specific scientific fields (e.g., 'Neuroscience', 'Molecular Biology', 'Immunology'), enabling better content organization and trend analysis.
- *Automated Style Adaptation*: Use the classification to automatically select the most appropriate podcast script template, voice style (e.g., Kore for firm, technical content), and illustration style for that specific field.
- *Content Discovery*: Identify and flag interdisciplinary research that bridges multiple classified fields.

In [1]:
# 🧠 Scientific Field Classifier using Embeddings
print("=" * 80)
print("🔬 ENHANCEMENT 1: AI-Powered Scientific Field Classifier")
print("=" * 80)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# This functionality would ideally be in its own module, e.g., `src/analysis/classifier.py`

class ScientificFieldClassifier:
    """
    A classifier to predict the scientific field of an article using its embedding.
    Inspired by Google's `Classify_text_with_embeddings.ipynb` cookbook example.
    """
    def __init__(self, embedding_provider):
        self.model = LogisticRegression(class_weight='balanced', max_iter=1000)
        self.embedder = embedding_provider
        self.is_trained = False
        self.class_names = []
        print("✅ ScientificFieldClassifier initialized.")

    async def train_from_articles(self, articles: list[dict]):
        """
        Trains the classifier on a list of articles with 'abstract' and 'field' keys.
        """
        print(f"🧠 Training classifier on {len(articles)} articles...")
        df = pd.DataFrame(articles)
        
        # Generate embeddings for training data
        print("   Generating embeddings for training data...")
        embeddings = []
        for text in df['abstract']:
            # Use the text provider to generate embeddings
            response = await self.embedder.client.models.embed_content_async(
                model="text-embedding-004", # A model optimized for embeddings
                content=text,
                task_type="CLASSIFICATION"
            )
            embeddings.append(response.embedding.values)
        
        X = np.array(embeddings)
        y = df['field'].astype('category').cat.codes
        self.class_names = df['field'].astype('category').cat.categories
        
        print(f"   Training data shape: {X.shape}")
        print(f"   Found {len(self.class_names)} unique fields: {list(self.class_names)}")
        
        # Train the logistic regression model
        self.model.fit(X, y)
        self.is_trained = True
        print("✅ Classifier training complete.")

    async def predict(self, article: dict) -> str:
        """Predicts the scientific field for a single new article."""
        if not self.is_trained:
            raise RuntimeError("Classifier must be trained before making predictions.")
            
        # Generate embedding for the new article's abstract
        response = await self.embedder.client.models.embed_content_async(
            model="text-embedding-004",
            content=article['abstract'],
            task_type="CLASSIFICATION"
        )
        embedding = np.array(response.embedding.values).reshape(1, -1)
        
        # Predict the class index and get the class name
        prediction_idx = self.model.predict(embedding)[0]
        return self.class_names[prediction_idx]

# --- Example Usage ---
async def test_classifier():
    if 'google_provider' in locals() and google_provider:
        # 1. Create dummy training data (in a real scenario, this would be a curated dataset)
        training_data = [
            {'abstract': 'This study explores neural pathways and brain activity using fMRI.', 'field': 'Neuroscience'},
            {'abstract': 'We investigated synaptic plasticity in the hippocampus.', 'field': 'Neuroscience'},
            {'abstract': 'Our research focuses on tumor suppressor genes and cell cycle regulation.', 'field': 'Cancer Research'},
            {'abstract': 'We analyzed mutations in the BRCA1 gene related to breast cancer.', 'field': 'Cancer Research'},
            {'abstract': 'This paper details the role of T-cells and cytokines in immune response.', 'field': 'Immunology'},
            {'abstract': 'We developed a new vaccine targeting viral antigens.', 'field': 'Immunology'},
        ]
        
        # 2. Initialize and train the classifier
        classifier = ScientificFieldClassifier(google_provider)
        await classifier.train_from_articles(training_data)
        
        # 3. Use the trained classifier on our test article
        if 'test_article' in locals() and test_article:
            print(f"\n🔍 Classifying article: '{test_article['title'][:50]}...'")
            predicted_field = await classifier.predict(test_article)
            print(f"   ➡️ Predicted Scientific Field: {predicted_field}")
            
            # This predicted_field can now be used to drive other pipeline steps
            print("\n💡 This classification can now automatically:")
            print("   - Select a field-specific script template.")
            print("   - Choose an appropriate voice (e.g., 'Kore' for technical fields).")
            print("   - Guide the style of the generated illustration.")

# Run the test
await test_classifier()

🔬 ENHANCEMENT 1: AI-Powered Scientific Field Classifier
