In [1]:
# Setup imports and path
import sys
import os
sys.path.append('../src')

import asyncio
import pandas as pd
from pathlib import Path


# Import our scraper - Fixed import paths
import sys
import os
from pathlib import Path

# Add src directory to path for imports
notebook_dir = Path().resolve()
src_dir = notebook_dir.parent / "src"
sys.path.insert(0, str(src_dir))

print(f"Notebook directory: {notebook_dir}")
print(f"Source directory: {src_dir}")
print(f"Source exists: {src_dir.exists()}")

# Now import our modules
from scrapers.ifc_scraper import IFCPublicationScraper
from utils.config import load_config
from utils.logger import setup_logger, get_logger

# Setup logging
setup_logger(level="INFO")
logger = get_logger(__name__)

print("✅ All imports successful!")

Notebook directory: /home/santi/Projects/UBMI-IFC-Podcast/notebooks
Source directory: /home/santi/Projects/UBMI-IFC-Podcast/src
Source exists: True
✅ All imports successful!


In [2]:
# Load configuration
config = load_config()
print("Configuration loaded:")
print(f"Base URL: {config['ifc']['base_url']}")
print(f"Years range: {config['ifc']['years_range']}")
print(f"Rate limit delay: {config['ifc']['rate_limit_delay']}s")

Configuration loaded:
Base URL: https://www.ifc.unam.mx
Years range: {'start': 2021, 'end': 2025}
Rate limit delay: 1.0s


In [3]:
# Initialize scraper
scraper = IFCPublicationScraper(config)
print("Scraper initialized successfully")

Scraper initialized successfully


In [None]:
# Test saving publications
output_dir = Path("../data/raw")
output_dir.mkdir(parents=True, exist_ok=True)

try:
    all_publications = await scraper.scrape_all_years(2021, 2025)
    print(f"Successfully scraped {len(all_publications)} total publications")
    
    # Save all data
    scraper.save_publications(all_publications, output_dir / "all_ifc_publications.json")
    
    # Analysis
    df = pd.DataFrame([{
        'title': pub.title,
        'authors': pub.authors,
        'journal': pub.journal,
        'yeear': pub.year,
        'has_abstract': bool(pub.abstract)
    } for pub in all_publications])
    
    print("\nData summary:")
    print(df.groupby('year').size())
    print(f"\nArticles with abstracts: {df['has_abstract'].sum()}/{len(df)}")
    
except Exception as e:
    print(f"Multi-year scraping failed: {e}")

print("Multi-year test commented out - uncomment when single year works")

# Verify saved data
import json
with open(output_dir / "test_ifc_publications.json", 'r') as f:
    saved_data = json.load(f)
    
print(f"Saved {len(saved_data)} publications to file")
print("Sample saved data:")
print(json.dumps(saved_data[0], indent=2, ensure_ascii=False))

[32m2025-09-19 13:54:34[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36mscrape_publications_by_year[0m:[36m61[0m - [1mScraping publications for year 2021[0m
[32m2025-09-19 13:54:36[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36m_parse_publications_page[0m:[36m97[0m - [1mFound 149 potential publication links[0m
[32m2025-09-19 13:54:36[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36m_parse_publications_page[0m:[36m158[0m - [1mSuccessfully parsed 131 publications[0m


CancelledError: 

> update scraper test

In [None]:
# Cell: Test scraping ALL available years
import asyncio
import pandas as pd
from pathlib import Path

async def scrape_all_available_publications():
    """Scrape publications from all available years (2020-2024)"""
    print("🚀 Starting comprehensive scraping of IFC publications...")
    
    all_publications = []
    years_to_scrape = range(2020, 2025)  # Adjust range as needed
    
    for year in years_to_scrape:
        print(f"\n📅 Scraping year {year}...")
        try:
            publications = await scraper.scrape_publications_by_year(year)
            if publications:
                all_publications.extend(publications)
                print(f"   ✅ Found {len(publications)} publications for {year}")
            else:
                print(f"   ⚠️ No publications found for {year}")
                
            # Rate limiting
            await asyncio.sleep(2)
            
        except Exception as e:
            print(f"   ❌ Error scraping {year}: {e}")
            continue
    
    print(f"\n🎉 Total publications collected: {len(all_publications)}")
    
    # Save raw data
    output_dir = Path("../data/raw")
    output_dir.mkdir(parents=True, exist_ok=True)
    
    scraper.save_publications(all_publications, output_dir / "all_ifc_publications.json")
    
    # Create summary DataFrame
    df = pd.DataFrame([{
        'title': pub.title,
        'authors': pub.authors,
        'journal': pub.journal,
        'year': pub.year,
        'doi': pub.doi,
        'abstract': pub.abstract[:200] if pub.abstract else 'No abstract',
        'has_doi': bool(pub.doi),
        'has_abstract': bool(pub.abstract)
    } for pub in all_publications])
    
    print("\n📊 Data Summary:")
    print(f"   Publications by year:")
    print(df['year'].value_counts().sort_index())
    print(f"\n   Publications with DOI: {df['has_doi'].sum()}/{len(df)}")
    print(f"   Publications with abstract: {df['has_abstract'].sum()}/{len(df)}")
    
    return all_publications, df

# Run comprehensive scraping
all_pubs, summary_df = await scrape_all_available_publications()

[32m2025-09-19 13:59:03[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36mscrape_publications_by_year[0m:[36m61[0m - [1mScraping publications for year 2020[0m


🚀 Starting comprehensive scraping of IFC publications...

📅 Scraping year 2020...


[32m2025-09-19 13:59:05[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36m_parse_publications_page[0m:[36m97[0m - [1mFound 141 potential publication links[0m
[32m2025-09-19 13:59:05[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36m_parse_publications_page[0m:[36m158[0m - [1mSuccessfully parsed 128 publications[0m


   ✅ Found 128 publications for 2020


[32m2025-09-19 14:05:29[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36mscrape_publications_by_year[0m:[36m61[0m - [1mScraping publications for year 2021[0m



📅 Scraping year 2021...


[32m2025-09-19 14:05:31[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36m_parse_publications_page[0m:[36m97[0m - [1mFound 149 potential publication links[0m
[32m2025-09-19 14:05:31[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36m_parse_publications_page[0m:[36m158[0m - [1mSuccessfully parsed 131 publications[0m


   ✅ Found 131 publications for 2021


[32m2025-09-19 14:13:58[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36mscrape_publications_by_year[0m:[36m61[0m - [1mScraping publications for year 2022[0m



📅 Scraping year 2022...


[32m2025-09-19 14:14:00[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36m_parse_publications_page[0m:[36m97[0m - [1mFound 122 potential publication links[0m
[32m2025-09-19 14:14:00[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36m_parse_publications_page[0m:[36m158[0m - [1mSuccessfully parsed 26 publications[0m


   ✅ Found 26 publications for 2022


[32m2025-09-19 14:15:20[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36mscrape_publications_by_year[0m:[36m61[0m - [1mScraping publications for year 2023[0m



📅 Scraping year 2023...


[32m2025-09-19 14:15:22[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36m_parse_publications_page[0m:[36m97[0m - [1mFound 120 potential publication links[0m
[32m2025-09-19 14:15:22[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36m_parse_publications_page[0m:[36m158[0m - [1mSuccessfully parsed 32 publications[0m


   ✅ Found 32 publications for 2023


[32m2025-09-19 14:17:00[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36mscrape_publications_by_year[0m:[36m61[0m - [1mScraping publications for year 2024[0m



📅 Scraping year 2024...


[32m2025-09-19 14:17:02[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36m_parse_publications_page[0m:[36m97[0m - [1mFound 125 potential publication links[0m
[32m2025-09-19 14:17:02[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36m_parse_publications_page[0m:[36m158[0m - [1mSuccessfully parsed 87 publications[0m


   ✅ Found 87 publications for 2024


[32m2025-09-19 14:21:24[0m | [1mINFO[0m | [36mscrapers.ifc_scraper[0m:[36msave_publications[0m:[36m267[0m - [1mSaved 404 publications to ../data/raw/all_ifc_publications.json[0m



🎉 Total publications collected: 404

📊 Data Summary:
   Publications by year:
year
2018      1
2019      1
2020    130
2021    128
2022     25
2023     35
2024     83
2025      1
Name: count, dtype: int64

   Publications with DOI: 402/404
   Publications with abstract: 404/404


## Embeddings and ChromaDB

In [4]:
# Cell 7: Fixed Publication embedding system
"""
Publication embedding system using ChromaDB and UMAP
"""

import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import json
import hashlib
import uuid

# Embedding and ML libraries
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import umap
from sklearn.preprocessing import StandardScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Fixed imports for notebook usage
from utils.logger import get_logger
from utils.config import load_config, get_data_dir
from scrapers.ifc_scraper import Publication


class PublicationEmbeddingSystem:
    """System for creating and managing publication embeddings"""
    
    def __init__(self, config: Dict = None):
        self.config = config or load_config()
        self.logger = get_logger(__name__)
        
        # Initialize embedding model
        model_name = 'all-MiniLM-L6-v2'
        self.logger.info(f"Loading embedding model: {model_name}")
        self.embedding_model = SentenceTransformer(model_name)
        
        # Initialize ChromaDB
        self.data_dir = get_data_dir()
        self.chroma_dir = self.data_dir / "chromadb"
        self.chroma_dir.mkdir(parents=True, exist_ok=True)
        
        self.logger.info(f"Initializing ChromaDB at: {self.chroma_dir}")
        self.chroma_client = chromadb.PersistentClient(
            path=str(self.chroma_dir),
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True
            )
        )
        
        # Collection for publications
        self.collection_name = "ifc_publications"
        
    def create_publication_embeddings(self, publications: List[Publication]) -> Dict:
        """
        Create embeddings for all publications
        
        Args:
            publications: List of Publication objects
            
        Returns:
            Dictionary with embeddings and metadata
        """
        self.logger.info(f"Creating embeddings for {len(publications)} publications...")
        
        # Prepare texts for embedding
        texts, metadata, ids = self._prepare_texts_and_metadata(publications)
        
        # Generate embeddings
        self.logger.info("Generating embeddings with SentenceTransformer...")
        embeddings = self.embedding_model.encode(texts, show_progress_bar=True)
        
        # Store in ChromaDB
        self.logger.info("Storing embeddings in ChromaDB...")
        self._store_in_chromadb(ids, embeddings, texts, metadata)
        
        # Create UMAP projection
        self.logger.info("Creating UMAP projection...")
        umap_embeddings = self._create_umap_projection(embeddings)
        
        result = {
            'embeddings': embeddings,
            'umap_embeddings': umap_embeddings,
            'texts': texts,
            'metadata': metadata,
            'ids': ids,
            'publications': publications
        }
        
        # Save results
        self._save_embedding_results(result)
        
        self.logger.info("✅ Embedding creation complete!")
        return result
    
    def _prepare_texts_and_metadata(self, publications: List[Publication]) -> Tuple[List[str], List[Dict], List[str]]:
        """Prepare texts and metadata for embedding"""
        texts = []
        metadata = []
        ids = []
        
        for i, pub in enumerate(publications):
            # Create rich text for embedding
            text_parts = []
            
            if pub.title:
                text_parts.append(f"Title: {pub.title}")
            
            if pub.abstract:
                text_parts.append(f"Abstract: {pub.abstract}")
            else:
                # Fallback: use authors and journal if no abstract
                if pub.authors:
                    text_parts.append(f"Authors: {pub.authors}")
                if pub.journal:
                    text_parts.append(f"Journal: {pub.journal}")
            
            if pub.keywords:
                text_parts.append(f"Keywords: {', '.join(pub.keywords)}")
            
            combined_text = " ".join(text_parts)
            texts.append(combined_text)
            
            # Create unique ID
            pub_id = f"pub_{i}_{uuid.uuid4().hex[:8]}"
            ids.append(pub_id)
            
            # Metadata for ChromaDB (must be JSON serializable)
            meta = {
                'title': pub.title or '',
                'authors': pub.authors or '',
                'journal': pub.journal or '',
                'year': int(pub.year),
                'doi': pub.doi or '',
                'pubmed_id': pub.pubmed_id or '',
                'ifc_url': pub.ifc_url or '',
                'has_abstract': bool(pub.abstract),
                'text_length': len(combined_text),
                'index': i
            }
            metadata.append(meta)
        
        return texts, metadata, ids
    
    def _store_in_chromadb(self, ids: List[str], embeddings: np.ndarray, 
                          texts: List[str], metadata: List[Dict]):
        """Store embeddings in ChromaDB"""
        try:
            # Delete existing collection if it exists
            try:
                self.chroma_client.delete_collection(self.collection_name)
                self.logger.info(f"Deleted existing collection: {self.collection_name}")
            except Exception:
                pass  # Collection might not exist
            
            # Create new collection
            collection = self.chroma_client.create_collection(
                name=self.collection_name,
                metadata={"description": "IFC-UNAM Publications Embeddings"}
            )
            
            # Add documents in batches
            batch_size = 100
            total_added = 0
            
            for i in range(0, len(ids), batch_size):
                end_idx = min(i + batch_size, len(ids))
                
                batch_ids = ids[i:end_idx]
                batch_embeddings = embeddings[i:end_idx].tolist()
                batch_texts = texts[i:end_idx]
                batch_metadata = metadata[i:end_idx]
                
                collection.add(
                    ids=batch_ids,
                    embeddings=batch_embeddings,
                    documents=batch_texts,
                    metadatas=batch_metadata
                )
                
                total_added += len(batch_ids)
                self.logger.info(f"Added batch {i//batch_size + 1}, total: {total_added}/{len(ids)}")
            
            self.logger.info(f"✅ Stored {len(ids)} embeddings in ChromaDB collection '{self.collection_name}'")
            
        except Exception as e:
            self.logger.error(f"Error storing in ChromaDB: {e}")
            raise
    
    def _create_umap_projection(self, embeddings: np.ndarray) -> np.ndarray:
        """Create 2D UMAP projection of embeddings"""
        if len(embeddings) < 15:
            self.logger.warning("Too few embeddings for optimal UMAP. Using smaller n_neighbors.")
            n_neighbors = max(2, len(embeddings) - 1)
        else:
            n_neighbors = 15
        
        # Standardize embeddings
        scaler = StandardScaler()
        embeddings_scaled = scaler.fit_transform(embeddings)
        
        # UMAP projection
        umap_reducer = umap.UMAP(
            n_components=2,
            n_neighbors=n_neighbors,
            min_dist=0.1,
            metric='cosine',
            random_state=42
        )
        
        umap_embeddings = umap_reducer.fit_transform(embeddings_scaled)
        
        # Save UMAP model for future use
        import pickle
        models_dir = self.data_dir / "models"
        models_dir.mkdir(parents=True, exist_ok=True)
        
        with open(models_dir / "umap_model.pkl", 'wb') as f:
            pickle.dump(umap_reducer, f)
        
        with open(models_dir / "scaler.pkl", 'wb') as f:
            pickle.dump(scaler, f)
        
        return umap_embeddings
    
    def _save_embedding_results(self, results: Dict):
        """Save embedding results to disk"""
        output_dir = self.data_dir / "processed"
        output_dir.mkdir(parents=True, exist_ok=True)
        
        # Save embeddings and UMAP coordinates
        np.save(output_dir / "publication_embeddings.npy", results['embeddings'])
        np.save(output_dir / "umap_coordinates.npy", results['umap_embeddings'])
        
        # Save metadata as DataFrame with UMAP coordinates
        df = pd.DataFrame(results['metadata'])
        df['umap_x'] = results['umap_embeddings'][:, 0]
        df['umap_y'] = results['umap_embeddings'][:, 1]
        df['id'] = results['ids']
        df.to_csv(output_dir / "publications_with_coordinates.csv", index=False)
        
        self.logger.info(f"Saved embedding results to {output_dir}")
    
    def visualize_embeddings(self, results: Dict, save_plots: bool = True):
        """Create visualizations of the embeddings"""
        self.logger.info("Creating embedding visualizations...")
        
        # Prepare data for plotting
        df = pd.DataFrame(results['metadata'])
        df['umap_x'] = results['umap_embeddings'][:, 0]
        df['umap_y'] = results['umap_embeddings'][:, 1]
        
        # Create visualizations
        figs = []
        
        # 1. Basic scatter plot colored by year
        fig1 = self._plot_basic_scatter(df)
        figs.append(('basic_scatter', fig1))
        
        # 2. Interactive plot with hover information
        fig2 = self._plot_interactive(df)
        figs.append(('interactive', fig2))
        
        # 3. Journal clustering (if we have multiple journals)
        if len(df['journal'].unique()) > 1:
            fig3 = self._plot_journal_clusters(df)
            figs.append(('journal_clusters', fig3))
        
        # 4. Year evolution (if we have multiple years)
        if len(df['year'].unique()) > 1:
            fig4 = self._plot_year_evolution(df)
            figs.append(('year_evolution', fig4))
        
        if save_plots:
            self._save_plots(figs)
        
        return figs
    
    def _plot_basic_scatter(self, df: pd.DataFrame):
        """Basic scatter plot of UMAP coordinates"""
        fig, ax = plt.subplots(figsize=(12, 8))
        
        scatter = ax.scatter(
            df['umap_x'], df['umap_y'], 
            c=df['year'], 
            cmap='viridis', 
            alpha=0.7, 
            s=60
        )
        
        ax.set_xlabel('UMAP Dimension 1')
        ax.set_ylabel('UMAP Dimension 2')
        ax.set_title('IFC Publications Embedding Space (UMAP Projection)')
        ax.grid(True, alpha=0.3)
        
        cbar = plt.colorbar(scatter, ax=ax)
        cbar.set_label('Publication Year')
        
        plt.tight_layout()
        return fig
    
    def _plot_interactive(self, df: pd.DataFrame):
        """Interactive plotly visualization"""
        # Truncate long titles for hover display
        df_plot = df.copy()
        df_plot['title_short'] = df_plot['title'].apply(lambda x: x[:60] + '...' if len(x) > 60 else x)
        df_plot['authors_short'] = df_plot['authors'].apply(lambda x: x[:40] + '...' if len(x) > 40 else x)
        
        fig = px.scatter(
            df_plot, 
            x='umap_x', 
            y='umap_y',
            color='year',
            hover_data=['title_short', 'authors_short', 'journal'],
            title='Interactive IFC Publications Map',
            labels={'umap_x': 'UMAP Dimension 1', 'umap_y': 'UMAP Dimension 2'},
            color_continuous_scale='viridis'
        )
        
        fig.update_traces(marker=dict(size=8, opacity=0.7))
        fig.update_layout(width=1000, height=700)
        
        return fig
    
    def _plot_journal_clusters(self, df: pd.DataFrame):
        """Plot clustering by journal"""
        # Get top journals (limit to avoid overcrowding)
        top_journals = df['journal'].value_counts().head(10).index
        df_filtered = df[df['journal'].isin(top_journals)]
        
        fig = px.scatter(
            df_filtered,
            x='umap_x',
            y='umap_y',
            color='journal',
            title='Publications Clustered by Journal (Top 10)',
            labels={'umap_x': 'UMAP Dimension 1', 'umap_y': 'UMAP Dimension 2'}
        )
        
        fig.update_traces(marker=dict(size=10, opacity=0.8))
        fig.update_layout(width=1200, height=800)
        
        return fig
    
    def _plot_year_evolution(self, df: pd.DataFrame):
        """Plot showing evolution over years"""
        years = sorted(df['year'].unique())
        n_years = len(years)
        
        if n_years <= 4:
            cols = 2
            rows = 2
        else:
            cols = 3
            rows = (n_years + 2) // 3
        
        fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 4*rows))
        if rows == 1 and cols == 1:
            axes = [axes]
        elif rows == 1 or cols == 1:
            axes = axes.flatten()
        else:
            axes = axes.flatten()
        
        colors = plt.cm.viridis(np.linspace(0, 1, n_years))
        
        for i, year in enumerate(years):
            if i < len(axes):
                year_data = df[df['year'] == year]
                axes[i].scatter(year_data['umap_x'], year_data['umap_y'], 
                              alpha=0.7, s=50, color=colors[i])
                axes[i].set_title(f'Publications {year} (n={len(year_data)})')
                axes[i].set_xlabel('UMAP Dimension 1')
                axes[i].set_ylabel('UMAP Dimension 2')
                axes[i].grid(True, alpha=0.3)
        
        # Hide empty subplots
        for i in range(len(years), len(axes)):
            axes[i].set_visible(False)
        
        plt.tight_layout()
        return fig
    
    def _save_plots(self, figs: List[Tuple[str, any]]):
        """Save plots to disk"""
        plots_dir = self.data_dir / "plots"
        plots_dir.mkdir(parents=True, exist_ok=True)
        
        for name, fig in figs:
            try:
                if hasattr(fig, 'write_html'):  # Plotly figure
                    fig.write_html(plots_dir / f"{name}.html")
                    self.logger.info(f"Saved {name}.html")
                else:  # Matplotlib figure
                    fig.savefig(plots_dir / f"{name}.png", dpi=300, bbox_inches='tight')
                    fig.savefig(plots_dir / f"{name}.pdf", bbox_inches='tight')
                    self.logger.info(f"Saved {name}.png and {name}.pdf")
                    plt.close(fig)  # Close to free memory
            except Exception as e:
                self.logger.error(f"Error saving plot {name}: {e}")
    
    def search_similar_publications(self, query: str, n_results: int = 5) -> Dict:
        """Search for similar publications using vector similarity"""
        try:
            collection = self.chroma_client.get_collection(self.collection_name)
            results = collection.query(
                query_texts=[query],
                n_results=n_results
            )
            return results
        except Exception as e:
            self.logger.error(f"Error searching publications: {e}")
            return {'error': str(e)}


def load_publications_from_file(file_path: str) -> List[Publication]:
    """Load publications from JSON file"""
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    publications = []
    for item in data:
        pub = Publication(
            title=item.get('title', ''),
            authors=item.get('authors', ''),
            journal=item.get('journal', ''),
            year=item.get('year', 2024),
            doi=item.get('doi'),
            pubmed_id=item.get('pubmed_id'),
            ifc_url=item.get('ifc_url'),
            abstract=item.get('abstract'),
            keywords=item.get('keywords')
        )
        publications.append(pub)
    
    return publications

print("✅ PublicationEmbeddingSystem class loaded successfully!")

✅ PublicationEmbeddingSystem class loaded successfully!


## Create embedding pipeline

In [10]:
# Cell 9: Load publications and create embedding system
# Load the scraped publications data
data_file = Path("../data/raw/all_ifc_publications.json")

if data_file.exists():
    print(f"✅ Found publication data: {data_file}")
    
    # Load publications using the function from Cell 7
    publications = load_publications_from_file(str(data_file))
    print(f"📚 Loaded {len(publications)} publications")
    
    # Quick stats
    years = [pub.year for pub in publications]
    print(f"📅 Year range: {min(years)} - {max(years)}")
    print(f"📄 With abstracts: {sum(1 for pub in publications if pub.abstract)}")
    
    # Initialize the embedding system (using class from Cell 7)
    print("\n🔧 Initializing Publication Embedding System...")
    embedding_system = PublicationEmbeddingSystem()
    print("✅ Embedding system initialized!")
    
else:
    print("❌ No publication data found. Please run the scraper first.")
    publications = []
    embedding_system = None

print("🚀 Ready to create embeddings!")

[32m2025-09-19 14:33:57[0m | [1mINFO[0m | [36m__main__[0m:[36m__init__[0m:[36m42[0m - [1mLoading embedding model: all-MiniLM-L6-v2[0m


✅ Found publication data: ../data/raw/all_ifc_publications.json
📚 Loaded 404 publications
📅 Year range: 2018 - 2025
📄 With abstracts: 404

🔧 Initializing Publication Embedding System...


[32m2025-09-19 14:33:59[0m | [1mINFO[0m | [36m__main__[0m:[36m__init__[0m:[36m50[0m - [1mInitializing ChromaDB at: /home/santi/Projects/UBMI-IFC-Podcast/data/chromadb[0m


✅ Embedding system initialized!
🚀 Ready to create embeddings!


In [11]:
# Cell 10: Create embeddings and visualizations
if publications and embedding_system:
    print("🔄 Creating publication embeddings...")
    print("This will take a few minutes...")
    
    # Create embeddings (this does everything: embeddings, ChromaDB, UMAP, saving)
    results = embedding_system.create_publication_embeddings(publications)
    
    print(f"\n✅ Created embeddings for {len(results['publications'])} publications")
    print(f"📊 Embedding shape: {results['embeddings'].shape}")
    print(f"🗺️  UMAP coordinates shape: {results['umap_embeddings'].shape}")
    
    # Create visualizations
    print("\n🎨 Creating visualizations...")
    figs = embedding_system.visualize_embeddings(results, save_plots=True)
    print(f"📈 Created {len(figs)} visualizations")
    
    # Show basic stats
    df = pd.DataFrame(results['metadata'])
    print(f"\n📋 Data Summary:")
    print(f"   Years: {df['year'].min()} - {df['year'].max()}")
    print(f"   Unique journals: {df['journal'].nunique()}")
    print(f"   With abstracts: {df['has_abstract'].sum()}/{len(df)}")
    
else:
    print("⚠️ Skipping embedding creation - no data or system not initialized")

[32m2025-09-19 14:34:26[0m | [1mINFO[0m | [36m__main__[0m:[36mcreate_publication_embeddings[0m:[36m72[0m - [1mCreating embeddings for 404 publications...[0m
[32m2025-09-19 14:34:26[0m | [1mINFO[0m | [36m__main__[0m:[36mcreate_publication_embeddings[0m:[36m78[0m - [1mGenerating embeddings with SentenceTransformer...[0m


🔄 Creating publication embeddings...
This will take a few minutes...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

[32m2025-09-19 14:34:28[0m | [1mINFO[0m | [36m__main__[0m:[36mcreate_publication_embeddings[0m:[36m82[0m - [1mStoring embeddings in ChromaDB...[0m
[32m2025-09-19 14:34:28[0m | [1mINFO[0m | [36m__main__[0m:[36m_store_in_chromadb[0m:[36m190[0m - [1mAdded batch 1, total: 100/404[0m
[32m2025-09-19 14:34:28[0m | [1mINFO[0m | [36m__main__[0m:[36m_store_in_chromadb[0m:[36m190[0m - [1mAdded batch 2, total: 200/404[0m
[32m2025-09-19 14:34:28[0m | [1mINFO[0m | [36m__main__[0m:[36m_store_in_chromadb[0m:[36m190[0m - [1mAdded batch 3, total: 300/404[0m
[32m2025-09-19 14:34:28[0m | [1mINFO[0m | [36m__main__[0m:[36m_store_in_chromadb[0m:[36m190[0m - [1mAdded batch 4, total: 400/404[0m
[32m2025-09-19 14:34:28[0m | [1mINFO[0m | [36m__main__[0m:[36m_store_in_chromadb[0m:[36m190[0m - [1mAdded batch 5, total: 404/404[0m
[32m2025-09-19 14:34:28[0m | [1mINFO[0m | [36m__main__[0m:[36m_store_in_chromadb[0m:[36m192[0m - [1m✅ Sto


✅ Created embeddings for 404 publications
📊 Embedding shape: (404, 384)
🗺️  UMAP coordinates shape: (404, 2)

🎨 Creating visualizations...


[32m2025-09-19 14:34:37[0m | [1mINFO[0m | [36m__main__[0m:[36m_save_plots[0m:[36m405[0m - [1mSaved basic_scatter.png and basic_scatter.pdf[0m
[32m2025-09-19 14:34:37[0m | [1mINFO[0m | [36m__main__[0m:[36m_save_plots[0m:[36m401[0m - [1mSaved interactive.html[0m
[32m2025-09-19 14:34:37[0m | [1mINFO[0m | [36m__main__[0m:[36m_save_plots[0m:[36m401[0m - [1mSaved journal_clusters.html[0m
[32m2025-09-19 14:34:38[0m | [1mINFO[0m | [36m__main__[0m:[36m_save_plots[0m:[36m405[0m - [1mSaved year_evolution.png and year_evolution.pdf[0m


📈 Created 4 visualizations

📋 Data Summary:
   Years: 2018 - 2025
   Unique journals: 375
   With abstracts: 404/404


In [12]:
# Cell 11: Test similarity search
if embedding_system:
    print("🔍 Testing similarity search...")
    
    test_queries = [
        "neural mechanisms",
        "cardiac physiology", 
        "protein structure",
        "memory consolidation",
        "synaptic plasticity"
    ]
    
    for query in test_queries:
        print(f"\n🔎 Query: '{query}'")
        try:
            results = embedding_system.search_similar_publications(query, n_results=3)
            
            if 'error' not in results:
                docs = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                
                for i, (doc, meta, dist) in enumerate(zip(docs, metadatas, distances)):
                    print(f"   {i+1}. {meta['title'][:60]}...")
                    print(f"      Authors: {meta['authors'][:40]}...")
                    print(f"      Year: {meta['year']}, Distance: {dist:.3f}")
            else:
                print(f"   Error: {results['error']}")
        except Exception as e:
            print(f"   Error searching: {e}")
    
    print("\n✅ Similarity search test completed!")
else:
    print("⚠️ No embedding system available for search")

🔍 Testing similarity search...

🔎 Query: 'neural mechanisms'


/home/santi/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:14<00:00, 5.63MiB/s]


   1. Editorial: Mechanisms of Neuronal Recovery in the Central Ne...
      Authors: Tovar-y-Romo, L. B., Guemez-Gamboa, A., ...
      Year: 2021, Distance: 0.922
   2. A tale of two leeches: Toward the understanding of the evolu...
      Authors: Kuo, D., De?Miguel, F. F., Heath?Heckman...
      Year: 2020, Distance: 0.938
   3. The Neurobehavioral State hypothesis...
      Authors: Ontiveros-Araiza, L. F....
      Year: 2025, Distance: 0.950

🔎 Query: 'cardiac physiology'
   1. ?Funny? channels in cardiac mitochondria modulate membrane p...
      Authors: Padilla-Flores, T., López-González, Z., ...
      Year: 2020, Distance: 1.158
   2. Crucial role for sensory nerves and Na/H exchanger inhibitio...
      Authors: Forrester, E. A., Benítez-Angeles, M., R...
      Year: 2024, Distance: 1.266
   3. Sex Differences in the Physiological Network of Healthy Youn...
      Authors: Barajas-Martínez, A., Ibarra-Coronado, E...
      Year: 2021, Distance: 1.302

🔎 Query: 'protein structure'
  

In [13]:
# Cell 12: Display embedding visualizations
if 'results' in locals() and results:
    print("🎨 Displaying embedding visualizations...")
    
    # Display the basic scatter plot
    print("\n📊 Basic UMAP Scatter Plot:")
    plt.show()
    
    # Display interactive plot if available
    if 'figs' in locals() and figs:
        for name, fig in figs:
            print(f"\n📈 Showing {name}:")
            
            if hasattr(fig, 'show'):  # Plotly figure
                fig.show()
            else:  # Matplotlib figure
                plt.figure()
                plt.show()
    
    # Create a quick summary plot of the embedding space
    print("\n🗺️  UMAP Embedding Space Summary:")
    
    # Load the data
    df = pd.DataFrame(results['metadata'])
    df['umap_x'] = results['umap_embeddings'][:, 0]
    df['umap_y'] = results['umap_embeddings'][:, 1]
    
    # Create a comprehensive view
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # Plot 1: All points colored by year
    scatter1 = axes[0, 0].scatter(df['umap_x'], df['umap_y'], c=df['year'], 
                                  cmap='viridis', alpha=0.7, s=50)
    axes[0, 0].set_title('Publications by Year')
    axes[0, 0].set_xlabel('UMAP Dimension 1')
    axes[0, 0].set_ylabel('UMAP Dimension 2')
    axes[0, 0].grid(True, alpha=0.3)
    plt.colorbar(scatter1, ax=axes[0, 0], label='Year')
    
    # Plot 2: Points colored by whether they have abstracts
    colors = ['red' if not has_abs else 'blue' for has_abs in df['has_abstract']]
    axes[0, 1].scatter(df['umap_x'], df['umap_y'], c=colors, alpha=0.7, s=50)
    axes[0, 1].set_title('Publications: Red=No Abstract, Blue=Has Abstract')
    axes[0, 1].set_xlabel('UMAP Dimension 1')
    axes[0, 1].set_ylabel('UMAP Dimension 2')
    axes[0, 1].grid(True, alpha=0.3)
    
    # Plot 3: Density plot
    axes[1, 0].hexbin(df['umap_x'], df['umap_y'], gridsize=20, cmap='Blues')
    axes[1, 0].set_title('Publication Density')
    axes[1, 0].set_xlabel('UMAP Dimension 1')
    axes[1, 0].set_ylabel('UMAP Dimension 2')
    
    # Plot 4: Year distribution histogram
    axes[1, 1].hist(df['year'], bins=len(df['year'].unique()), 
                    alpha=0.7, color='skyblue', edgecolor='black')
    axes[1, 1].set_title('Publications by Year')
    axes[1, 1].set_xlabel('Year')
    axes[1, 1].set_ylabel('Count')
    axes[1, 1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Print some statistics about the embedding space
    print(f"\n📈 Embedding Space Statistics:")
    print(f"   • Total publications: {len(df)}")
    print(f"   • Year range: {df['year'].min()} - {df['year'].max()}")
    print(f"   • Publications with abstracts: {df['has_abstract'].sum()}/{len(df)} ({df['has_abstract'].mean():.1%})")
    print(f"   • Unique journals: {df['journal'].nunique()}")
    print(f"   • UMAP X range: {df['umap_x'].min():.2f} to {df['umap_x'].max():.2f}")
    print(f"   • UMAP Y range: {df['umap_y'].min():.2f} to {df['umap_y'].max():.2f}")
    
    # Show top journals
    print(f"\n📚 Top 5 Journals:")
    top_journals = df['journal'].value_counts().head(5)
    for journal, count in top_journals.items():
        print(f"   • {journal}: {count} publications")

else:
    print("⚠️ No embedding results available to display")

🎨 Displaying embedding visualizations...

📊 Basic UMAP Scatter Plot:

📈 Showing basic_scatter:

📈 Showing interactive:



FigureCanvasAgg is non-interactive, and thus cannot be shown




📈 Showing journal_clusters:



📈 Showing year_evolution:

🗺️  UMAP Embedding Space Summary:


KeyError: 'metadata'