In [1]:
# --- 1. Setup and Configuration ---
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm
from typing import Optional
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# filepath: article_downloader.ipynb (first cell)
import sys
import os

# Make the local "app" package directory importable (utils, pdf_downloader, html_parser).
# The membership check keeps the cell idempotent: re-running it no longer appends
# a duplicate entry to sys.path each time.
_app_dir = os.path.abspath("app")
if _app_dir not in sys.path:
    sys.path.append(_app_dir)

In [9]:
# Import modules and config from utils.py
from utils import config, create_data_directories, logger
from pdf_downloader import download_pdf
from html_parser import parse_article_html

In [None]:
# Create data directories if they don't exist.
# Delegates to create_data_directories (imported from utils); which directories
# are created is defined there, not in this notebook.
create_data_directories()

In [None]:
# --- 2. Load Input Data ---
# Read the article list and keep only rows that carry a DOI; any load failure is
# logged and then re-raised so the notebook stops here.
input_csv_filepath = 'articles.csv'  # Replace with your CSV path
try:
    articles_df = (
        pd.read_csv(input_csv_filepath)
        .dropna(subset=['doi'])  # rows without a DOI are discarded up front
    )
    logger.info(f"Successfully loaded articles data from: {input_csv_filepath}. Shape: {articles_df.shape}")
except FileNotFoundError:
    # Missing input file: log a dedicated message, then propagate.
    logger.error(f"Input CSV file not found: {input_csv_filepath}")
    raise
except Exception as e:
    # Anything else (parse errors, missing 'doi' column, ...): log with traceback, then propagate.
    logger.error(f"Error loading input CSV file: {e}", exc_info=True)
    raise

In [None]:
# Display the first few rows of the loaded articles as a quick sanity check.
# Rows with a missing 'doi' were already dropped when the CSV was loaded.
articles_df.head()

In [None]:
# --- 3. Embedding Generation (Placeholder - To be implemented) ---
def generate_embeddings(text: str) -> Optional[list]:
    """
    Stub embedding generator for article content.

    The `text` argument is currently ignored: the function only logs a warning
    and returns None. Replace the body with a real embedding call
    (e.g., using OpenAI, Gemini) when one is available.
    """
    placeholder_embedding = None  # No embedding backend is wired in yet
    logger.warning("Embedding generation is a placeholder - returning None.")
    return placeholder_embedding

In [None]:
# --- 4. Article Processing and Dataframe Population ---
def fetch_article_content(doi: str, title: str) -> dict:
    """
    Fetches article content (Markdown and PDF) using DOI and title.

    Args:
        doi: Article DOI; used to build the https://doi.org/ URL and passed to download_pdf.
        title: Article title; copied into the result unchanged.

    Returns:
        A dict with the keys 'doi', 'title', 'full_text_markdown', 'pdf_filepath',
        'retrieval_method', 'download_success' and 'cluster_id', plus
        'abstract_embedding' when config.use_clustering_in_pipeline is truthy.

    Exceptions raised by parse_article_html, generate_embeddings or download_pdf
    are not caught here; they propagate to the caller.
    """
    article_data = {
        'doi': doi,
        'title': title,
        'full_text_markdown': None,
        'pdf_filepath': None,
        'retrieval_method': None,
        'download_success': False,
        'cluster_id': None # Will be added later if clustering is enabled
    }
    article_url = f"https://doi.org/{doi}"

    parsed_content = parse_article_html(article_url) # Use BeautifulSoup parser initially
    if parsed_content:
        article_data['full_text_markdown'] = parsed_content['content']
        article_data['retrieval_method'] = parsed_content['metadata'].get('parser', 'BeautifulSoup+html2text') # Capture parser info
        logger.info(f"Successfully parsed article HTML for DOI: {doi} using {article_data['retrieval_method']}")
    else:
        article_data['retrieval_method'] = 'HTML Parsing Failed'
        logger.warning(f"HTML parsing failed for DOI: {doi}")

    clustering_enabled = config.use_clustering_in_pipeline
    if clustering_enabled: # Only generate embeddings if clustering is enabled in pipeline
        # Embed the Markdown content, or an empty string when parsing failed.
        article_data['abstract_embedding'] = generate_embeddings(article_data['full_text_markdown'] or '')

    # 'cluster_id' is always present in article_data (initialised to None), so
    # dict.get('cluster_id', 0) would return None rather than 0. Fall back to the
    # default cluster explicitly so download_pdf never receives None.
    cluster_id = article_data['cluster_id']
    if cluster_id is None:
        cluster_id = 0

    # The PDF is downloaded whether or not clustering is enabled; only the log wording differs.
    pdf_path = download_pdf(doi, cluster_id)
    if pdf_path:
        article_data['pdf_filepath'] = pdf_path
        article_data['download_success'] = True
        if clustering_enabled:
            logger.info(f"PDF download successful for DOI: {doi}, saved to: {pdf_path}")
        else:
            logger.info(f"PDF download successful for DOI: {doi}, saved to default cluster: {pdf_path}")
    elif clustering_enabled:
        logger.warning(f"PDF download failed for DOI: {doi}")
    else:
        logger.warning(f"PDF download failed for DOI: {doi} (default cluster)")

    return article_data

In [None]:
# Fetch content for every input row. Each row yields exactly one record: the
# original columns merged with either the fetched data or an 'error' message.
output_data = []
article_rows = tqdm(articles_df.iterrows(), total=len(articles_df), desc="Processing articles")
for _, row in article_rows:
    doi, title = row['doi'], row['title']
    try:
        fetched = fetch_article_content(doi, title)
        record = {**row.to_dict(), **fetched}  # fetched keys win over same-named CSV columns
    except Exception as e:
        # Best-effort pipeline: log the failure with traceback and keep going.
        logger.error(f"Error processing article with DOI: {doi}. Error: {e}", exc_info=True)
        record = {**row.to_dict(), 'error': str(e)}
    output_data.append(record)

In [None]:
# Build the results frame once from the accumulated per-article records.
processed_articles_df = pd.DataFrame(output_data)

# Records for failed articles only carry the original CSV columns plus 'error'.
# If every article failed (or the input was empty), the columns read by the later
# cells would be missing and those cells would raise KeyError, so make sure they
# exist (filled with None) before moving on.
for expected_column in ('retrieval_method', 'download_success', 'cluster_id'):
    if expected_column not in processed_articles_df.columns:
        processed_articles_df[expected_column] = None

In [None]:
# --- 5. Clustering (Conditional - if enabled in config) ---
# Runs K-Means over the rows that have an embedding and writes the resulting label
# into processed_articles_df['cluster_id'] (-1 marks rows without an embedding).
# Defines embeddings_array and valid_embedding_indices for the visualization cell.
if config.use_clustering_in_pipeline:
    logger.info("Clustering pipeline enabled. Starting clustering...")

    # Rows whose processing failed have no 'abstract_embedding' key, and pandas
    # stores that as NaN, not None, so an `is not None` test would let them
    # through. Accept only real, non-empty vectors instead.
    if 'abstract_embedding' in processed_articles_df.columns:
        embedding_column = processed_articles_df['abstract_embedding']
    else:
        embedding_column = pd.Series(None, index=processed_articles_df.index, dtype=object)
    has_embedding = embedding_column.map(
        lambda value: isinstance(value, (list, tuple, np.ndarray)) and len(value) > 0
    ).astype(bool)
    valid_embeddings = embedding_column[has_embedding]
    valid_embedding_indices = valid_embeddings.index.tolist()  # Row labels, in the same order as embeddings_array
    embeddings_array = np.array(valid_embeddings.tolist())

    if embeddings_array.size > 0: # Proceed only if embeddings were generated
        logger.info(f"Embeddings array shape for clustering: {embeddings_array.shape}")
        n_clusters_optimal = config.n_clusters # Use configured number of clusters
        n_samples = embeddings_array.shape[0]
        if n_clusters_optimal > n_samples:
            # KMeans raises when asked for more clusters than samples; clamp instead of crashing.
            logger.warning(f"Configured n_clusters={n_clusters_optimal} exceeds the {n_samples} available embeddings; using {n_samples} clusters.")
            n_clusters_optimal = n_samples
        kmeans = KMeans(n_clusters=n_clusters_optimal, random_state=42, n_init=10) # Explicitly set n_init
        clusters = kmeans.fit_predict(embeddings_array)
        processed_articles_df['cluster_id'] = -1 # Default to -1 (unassigned)
        # clusters[i] belongs to valid_embedding_indices[i]; assign all labels in one step.
        processed_articles_df.loc[valid_embedding_indices, 'cluster_id'] = clusters
        logger.info(f"K-Means clustering completed with {n_clusters_optimal} clusters.")
    else:
        logger.warning("No embeddings found for clustering. Skipping clustering step.")
else:
    logger.info("Clustering pipeline disabled in config.")

In [None]:
# --- 6. Organize Clusters (PDFs) and Save Results ---
# Prints the cluster size distribution, makes sure each assigned cluster has a
# directory, and logs a few sample titles per cluster.
# NOTE(review): this cell only creates directories; it does not move any PDF that
# was downloaded earlier under the default cluster id - confirm whether that is intended.
if config.use_clustering_in_pipeline:
    cluster_counts = processed_articles_df['cluster_id'].value_counts().sort_index()
    print("Cluster Distribution:")
    print(cluster_counts)

    # Drop null ids first: when clustering was skipped (no embeddings) or a row
    # failed, cluster_id is None/NaN, which would otherwise pass the "!= -1"
    # check and be handed to config.get_cluster_dir.
    assigned_cluster_ids = processed_articles_df['cluster_id'].dropna().unique()
    for cluster_id in assigned_cluster_ids:
        if cluster_id == -1: # Skip unassigned cluster (-1)
            continue
        cluster_dir = config.get_cluster_dir(cluster_id)
        os.makedirs(cluster_dir, exist_ok=True) # Ensure cluster directories exist
        cluster_df = processed_articles_df[processed_articles_df['cluster_id'] == cluster_id]
        logger.info(f"Cluster {cluster_id}: {len(cluster_df)} articles. Sample titles: {cluster_df['title'].head(3).tolist()}")
else:
    logger.info("Cluster organization (PDFs) skipped as clustering is disabled.")

In [None]:
# Persist the processed articles as JSON Lines (one record per line), then report
# the destination both in the log and in the cell output.
output_json_filepath = 'processed_articles_fulltext.json'  # Choose output filename
processed_articles_df.to_json(
    output_json_filepath,
    orient='records',
    lines=True,
)
for report in (logger.info, print):
    report(f"Processed data saved to: {output_json_filepath}")

In [None]:
# --- 7. Basic Analysis and Summary (Optional) ---
# Each entry is (heading, column, value_counts options); the heading is printed
# before its column is read, one summary after the other.
summary_specs = (
    ("\nRetrieval Method Distribution:", 'retrieval_method', {}),
    ("\nDownload Success Rate:", 'download_success', {'normalize': True}),
)
for heading, column, count_options in summary_specs:
    print(heading)
    print(processed_articles_df[column].value_counts(**count_options))

In [None]:
# --- 8. Visualization (Optional - Clustering Results if enabled) ---
# Projects the clustered embeddings to 2-D with t-SNE and colours the points by
# cluster id. Relies on embeddings_array and valid_embedding_indices from the
# clustering cell; failures are logged instead of stopping the notebook.
if config.use_clustering_in_pipeline and embeddings_array.size > 0:
    try:
        import inspect
        from sklearn.manifold import TSNE # Import here, only if needed

        n_samples = embeddings_array.shape[0]
        if n_samples < 2:
            logger.warning("t-SNE visualization skipped: at least two embeddings are required.")
        else:
            # t-SNE requires perplexity < n_samples, so a fixed 30 fails on small
            # article sets; clamp it to the largest valid value.
            perplexity = min(30, n_samples - 1)
            # Newer scikit-learn renamed `n_iter` to `max_iter`; pick whichever
            # keyword the installed version accepts.
            tsne_parameters = inspect.signature(TSNE.__init__).parameters
            iterations_kwarg = 'max_iter' if 'max_iter' in tsne_parameters else 'n_iter'
            tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, **{iterations_kwarg: 300})
            tsne_results = tsne.fit_transform(embeddings_array)

            # Plain array so the labels pair with the t-SNE rows by position,
            # independent of the DataFrame index.
            cluster_labels = processed_articles_df.loc[valid_embedding_indices, 'cluster_id'].to_numpy()

            plt.figure(figsize=(12, 8))
            sns.scatterplot(x=tsne_results[:, 0], y=tsne_results[:, 1], hue=cluster_labels, palette='viridis', legend='full')
            plt.title('Article Clusters Visualized with t-SNE')
            plt.xlabel('TSNE Dimension 1')
            plt.ylabel('TSNE Dimension 2')
            plt.show()
    except ImportError:
        logger.warning("t-SNE visualization requires scikit-learn and matplotlib. Please install them to visualize clusters.")
    except Exception as e:
        logger.error(f"Error during t-SNE visualization: {e}", exc_info=True)
else:
    logger.info("t-SNE visualization skipped as clustering is disabled or no embeddings available.")