In [1]:
# --- 1. Setup and Configuration ---
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm
from typing import Optional
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# filepath: article_downloader.ipynb (first cell)
import sys
import os

# Make the local "app" directory importable. The path is resolved against the
# current working directory at the time this cell runs.
# The membership check keeps the cell idempotent: re-running it does not append
# the same entry to sys.path again.
APP_DIR = os.path.abspath("app")
if APP_DIR not in sys.path:
    sys.path.append(APP_DIR)

In [3]:
# Import modules and config from utils.py
from utils import config, create_data_directories, logger
from pdf_downloader import download_pdf
from html_parser import parse_article_html

In [4]:
# Create data directories if they don't exist.
# `create_data_directories` is the helper imported from the project's utils module;
# the message it emits reports the directory it created or found.
create_data_directories()

Data directories created or already exist at: ./data/pdfs


In [5]:
import os
import glob
import pandas as pd

# --- 2. Load Input Data ---
# Directory that holds the reference CSV exports. The TSI_REFERENCES_DIR
# environment variable overrides the default, so the notebook is not tied to a
# single machine's absolute path; without it the original location is used.
references_dir = os.environ.get(
    "TSI_REFERENCES_DIR",
    "/Users/max/Documents/Code/tsi-sota-ai/data/references",
)
csv_pattern = os.path.join(references_dir, "*.csv")
# glob.glob returns matches in arbitrary order. Sorting makes the row order of
# the combined frame reproducible, which matters for positional lookups
# (e.g. .iloc[...]) in later cells.
csv_files = sorted(glob.glob(csv_pattern))

if not csv_files:
    logger.error(f"No CSV files found in the directory: {references_dir}")
    raise FileNotFoundError(f"No CSV files found in: {references_dir}")

data_frames = []
for csv_file in csv_files:
    try:
        # Rows without a DOI are dropped: the DOI is what later cells use to
        # build the article URL.
        df = pd.read_csv(csv_file).dropna(subset=['doi'])
        logger.info(f"Successfully loaded {csv_file}. Shape: {df.shape}")
        data_frames.append(df)
    except Exception as e:
        # Log with traceback, then abort: a partially loaded reference set
        # would silently shrink the corpus.
        logger.error(f"Error loading CSV file {csv_file}: {e}", exc_info=True)
        raise

# Combine all dataframes into one, with a fresh 0..n-1 index
articles_df = pd.concat(data_frames, ignore_index=True)
logger.info(f"Combined articles data shape: {articles_df.shape}")

In [6]:
# Display the first few rows of the combined references table as a quick
# sanity check of the loaded columns.
articles_df.head()

Unnamed: 0,date,title,doi,authors,journal,short_journal,volume,year,publisher,issue,page,abstract
0,2020-07-25,Predictive big data analytics for supply chain...,10.1186/s40537-020-00329-2,"[{'author_name': 'Seyedeh Mahya Seyedan', 'aut...",Journal of Big Data,J Big Data,7.0,2020,Springer Science and Business Media LLC,1.0,,Big data analytics (BDA) in supply chain manag...
1,2020-02-26,Overcoming Barriers in Supply Chain Analytics—...,10.3390/logistics4010005,"[{'author_name': 'Tino T. Herden', 'author_slu...",Logistics,Logistics,4.0,2020,MDPI AG,1.0,5.0,While supply chain analytics shows promise reg...
2,2022-11-30,Decision support system for handling control d...,10.1186/s40537-022-00653-9,"[{'author_name': 'Dimah Alahmadi', 'author_slu...",Journal of Big Data,J Big Data,9.0,2022,Springer Science and Business Media LLC,1.0,,Abstract\n Background\n ...
3,2021-09-08,A Systematic Investigation of the Integration ...,10.3390/logistics5030062,"[{'author_name': 'Meike Schroeder', 'author_sl...",Logistics,Logistics,5.0,2021,MDPI AG,3.0,62.0,The main objective of the paper is to analyze ...
4,2021-07-05,"Disorders, Vulnerabilities and Resilience in t...",10.3390/logistics5030048,"[{'author_name': 'Catarina Ferreira', 'author_...",Logistics,Logistics,5.0,2021,MDPI AG,3.0,48.0,The economic and social environment caused by ...


In [24]:
import google.generativeai as genai

# Smoke test of the embedding endpoint: embed a single sentence with the
# text-embedding-004 model and print the returned vector. A 10-value vector is
# requested here, which keeps the printed output short.
# NOTE(review): no genai.configure(...) call is visible in this notebook, so the
# API key is presumably supplied elsewhere (environment or utils) — confirm.
text = "The present study presents a knowledge-based DSS framework for supporting the decision-maker and handling control decisions related to supply chains!"
result = genai.embed_content(
    model="models/text-embedding-004", content=text, output_dimensionality=10
)
print(result["embedding"])

[0.02527787, -0.031934384, -0.039232098, 0.044021856, -0.04235415, 0.03968052, 0.027044622, 0.026097119, 0.02612152, 0.0033844442]


In [27]:
# --- 3. Embedding Generation  ---
import google.generativeai as genai
import numpy as np  # Import numpy to handle embeddings as arrays

def generate_embeddings(text: str) -> Optional[list]:
    """
    Generates an embedding for the given text using the Gemini API (text-embedding-004 model).

    Args:
        text (str): The text to encode. Values that are not a non-blank string
            (for example the float NaN pandas uses for a missing abstract) are
            rejected without calling the API.

    Returns:
        Optional[list]: The embedding vector as a list (768 dimensions are requested),
        or None if the input was unusable or the API call failed.
    """
    model_name = "models/text-embedding-004"  # Embedding model to query

    # Reject unusable input up front. Without this guard a NaN abstract raises
    # inside the log-preview slice below and is reported as an API error with a
    # full traceback.
    if not isinstance(text, str) or not text.strip():
        logger.warning("Skipping embedding generation: input is not a non-empty string.")
        return None

    try:
        # Only the first 50 characters are logged to keep log lines short.
        logger.info("Generating Gemini embedding for text: %s...", text[:50])

        response = genai.embed_content(model=model_name, content=text, output_dimensionality=768)
        embedding_list = response.get('embedding')
        if not isinstance(embedding_list, list):
            # Routed through the handler below so the caller still just gets None.
            raise ValueError("Embedding not returned as a list")

        logger.info("Gemini embedding generated successfully.")
        return embedding_list

    except Exception as e:
        # Best-effort contract: log the failure and signal it with None.
        logger.error(f"Error generating Gemini embedding: {e}", exc_info=True)
        return None

In [28]:
# --- 3.1. Test Embedding Generation (NEW SECTION) ---
print("Test result")
test_abstract_index = 2 # Third row (index 2)
test_abstract = articles_df['abstract'].iloc[test_abstract_index] # Get abstract from DataFrame

# A missing abstract is NaN (a float) and cannot be sliced, so fall back to a
# placeholder for the preview.
if isinstance(test_abstract, str):
    abstract_preview = test_abstract[:200] # First 200 chars of abstract for brevity
else:
    abstract_preview = "<no abstract available>"

# The original f-string contained an escaped "{{}}" and printed a literal "{}";
# the row position is what identifies the abstract being tested.
print(f"For abstract (row {test_abstract_index}):\n{abstract_preview}...")

calculated_embedding = generate_embeddings(test_abstract) # Call embedding function

if calculated_embedding is not None:
    # Preview the first 5 elements and report the full vector length
    embedding_preview = (
        f"{calculated_embedding[:5]}... (truncated, full embedding vector has length "
        f"{len(calculated_embedding)})"
    )
    print(f"Calculated embedding is: \n{embedding_preview}")
else:
    print("Calculated embedding is: Error generating embedding (None returned)")

Test result
For abstract: {}
Abstract
                Background
                The present study presents a knowledge-based DSS framework for supporting the decision-maker and handling control decisions related to supply chains...
Calculated embedding is: 
[0.014291704, 0.006687503, -0.04357887, 0.039561834, -0.01886462]... (truncated, full embedding vector has length 768)


In [29]:
# --- 4. Article Processing and Dataframe Population ---
def fetch_article_content(doi: str, title: str) -> dict:
    """
    Fetches an article's full text (Markdown) by resolving its DOI.

    Only full-text parsing happens here; embedding, clustering and PDF download
    are out of scope for this function.

    Args:
        doi (str): DOI of the article; the page is requested via https://doi.org/<doi>.
        title (str): Article title, copied into the result unchanged.

    Returns:
        dict: Keys 'doi', 'title', 'full_text_markdown' (None when parsing
        failed) and 'retrieval_method' (the parser name reported in the parse
        result's metadata, or 'HTML Parsing Failed').
    """
    article_data = {
        'doi': doi,
        'title': title,
        'full_text_markdown': None,
        'retrieval_method': None,
    }
    article_url = f"https://doi.org/{doi}"

    # The config flag selects between BeautifulSoup and the Jina Reader API inside the parser.
    parsed_content = parse_article_html(article_url, use_jina_reader_api_config=config.use_jina_reader_api_config)
    if not parsed_content:
        article_data['retrieval_method'] = 'HTML Parsing Failed'
        logger.warning(f"HTML parsing failed for DOI: {doi}")
        return article_data

    article_data['full_text_markdown'] = parsed_content['content']
    # Tolerate a result without metadata instead of raising KeyError.
    # NOTE(review): assumes the parse result is a dict — confirm against html_parser.
    metadata = parsed_content.get('metadata') or {}
    # The fallback label is spelled like the label already present in the
    # retrieval_method column ('BeautifulSoup + html2text'), so value counts are
    # not split across two spellings of the same parser.
    article_data['retrieval_method'] = metadata.get('parser', 'BeautifulSoup + html2text')
    logger.info(f"Successfully parsed article HTML for DOI: {doi} using {article_data['retrieval_method']}")

    return article_data

In [30]:
import time
from tqdm import tqdm

# One merged record (original CSV columns + fetch result) is collected per article.
output_data = []
article_rows = tqdm(articles_df.iterrows(), total=len(articles_df), desc="Processing articles")
for _, article in article_rows:
    doi, title = article['doi'], article['title']
    try:
        fetched_fields = fetch_article_content(doi, title)
    except Exception as e:
        # Keep the row, but record why the fetch failed instead of the content.
        logger.error(f"Error processing article with DOI: {doi}. Error: {e}", exc_info=True)
        fetched_fields = {'error': str(e)}
    # Fetched fields win over same-named original columns ('doi', 'title').
    output_data.append({**article.to_dict(), **fetched_fields})
    # Throttle: wait one second after every request to respect rate limiting.
    time.sleep(1)

Processing articles: 100%|██████████| 593/593 [31:38<00:00,  3.20s/it]  


In [10]:
# Inspect the first merged record: the original CSV columns plus the fields
# added by fetch_article_content.
print(output_data[0])

{'date': '2020-07-25', 'title': 'Predictive big data analytics for supply chain demand forecasting: methods, applications, and research opportunities', 'doi': '10.1186/s40537-020-00329-2', 'authors': "[{'author_name': 'Seyedeh Mahya Seyedan', 'author_slug': 'seyedeh-mahya-seyedan-OjaaAj', 'author_sequence_number': '1', 'affiliation': None, 'affiliation_slug': None}, {'author_name': 'Fereshteh Mafakheri', 'author_slug': 'fereshteh-mafakheri-A3rYgv', 'author_sequence_number': '2', 'affiliation': None, 'affiliation_slug': None}]", 'journal': 'Journal of Big Data', 'short_journal': 'J Big Data', 'volume': 7.0, 'year': 2020, 'publisher': 'Springer Science and Business Media LLC', 'issue': 1.0, 'page': nan, 'abstract': 'Big data analytics (BDA) in supply chain management (SCM) is receiving a growing attention. This is due to the fact that BDA has a wide range of applications in SCM, including customer behavior analysis, trend analysis, and demand prediction. In this survey, we investigate th

In [32]:
# Build one DataFrame from the per-article record dicts collected in output_data.
processed_articles_df = pd.DataFrame(output_data)

In [33]:
# Column overview with non-null counts; the gap between the row count and the
# 'full_text_markdown' non-null count is the number of articles without full text.
processed_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593 entries, 0 to 592
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   date                593 non-null    object 
 1   title               593 non-null    object 
 2   doi                 593 non-null    object 
 3   authors             593 non-null    object 
 4   journal             593 non-null    object 
 5   short_journal       563 non-null    object 
 6   volume              589 non-null    float64
 7   year                593 non-null    int64  
 8   publisher           593 non-null    object 
 9   issue               403 non-null    float64
 10  page                415 non-null    object 
 11  abstract            587 non-null    object 
 12  full_text_markdown  535 non-null    object 
 13  retrieval_method    593 non-null    object 
dtypes: float64(2), int64(1), object(11)
memory usage: 65.0+ KB


In [34]:
# Preview the first rows, including the new 'full_text_markdown' and
# 'retrieval_method' columns.
processed_articles_df.head()

Unnamed: 0,date,title,doi,authors,journal,short_journal,volume,year,publisher,issue,page,abstract,full_text_markdown,retrieval_method
0,2020-07-25,Predictive big data analytics for supply chain...,10.1186/s40537-020-00329-2,"[{'author_name': 'Seyedeh Mahya Seyedan', 'aut...",Journal of Big Data,J Big Data,7.0,2020,Springer Science and Business Media LLC,1.0,,Big data analytics (BDA) in supply chain manag...,* Survey Paper\n * [Open access](https://www....,BeautifulSoup + html2text
1,2020-02-26,Overcoming Barriers in Supply Chain Analytics—...,10.3390/logistics4010005,"[{'author_name': 'Tino T. Herden', 'author_slu...",Logistics,Logistics,4.0,2020,MDPI AG,1.0,5.0,While supply chain analytics shows promise reg...,Open AccessEditor’s ChoiceArticle\n\n# Overco...,BeautifulSoup + html2text
2,2022-11-30,Decision support system for handling control d...,10.1186/s40537-022-00653-9,"[{'author_name': 'Dimah Alahmadi', 'author_slu...",Journal of Big Data,J Big Data,9.0,2022,Springer Science and Business Media LLC,1.0,,Abstract\n Background\n ...,* Research\n * [Open access](https://www.spri...,BeautifulSoup + html2text
3,2021-09-08,A Systematic Investigation of the Integration ...,10.3390/logistics5030062,"[{'author_name': 'Meike Schroeder', 'author_sl...",Logistics,Logistics,5.0,2021,MDPI AG,3.0,62.0,The main objective of the paper is to analyze ...,Open AccessReview\n\n# A Systematic Investiga...,BeautifulSoup + html2text
4,2021-07-05,"Disorders, Vulnerabilities and Resilience in t...",10.3390/logistics5030048,"[{'author_name': 'Catarina Ferreira', 'author_...",Logistics,Logistics,5.0,2021,MDPI AG,3.0,48.0,The economic and social environment caused by ...,Open AccessEditor’s ChoiceArticle\n\n# Disord...,BeautifulSoup + html2text


In [37]:
# --- 5. Output & Storage ---
# NOTE(review): machine-specific absolute path; other users must edit it before
# running the save cell.
output_parquet_filepath = '/Users/max/Data/TSI_PhD/1_sota/processed_articles_fulltext.parquet' # Define output Parquet filepath

In [35]:

# Persist the processed articles as Parquet, then report the destination both
# in the log and in the notebook output.
processed_articles_df.to_parquet(output_parquet_filepath) # Save DataFrame to Parquet
logger.info(f"Processed data saved to: {output_parquet_filepath}")
print(f"Processed data saved to: {output_parquet_filepath}")

Processed data saved to: /Users/max/Data/TSI_PhD/1_sota/processed_articles_fulltext.parquet


In [38]:
# NOTE(review): machine-specific absolute path, like the Parquet target.
output_json_filepath = '/Users/max/Data/TSI_PhD/1_sota/processed_articles_fulltext.json' # Define output JSON filepath
# orient='records' with lines=True writes one JSON object per line (JSON Lines),
# not a single JSON array, despite the .json extension.
processed_articles_df.to_json(output_json_filepath, orient='records', lines=True) # Save DataFrame as JSON Lines
logger.info(f"Processed data saved to JSON: {output_json_filepath}")
print(f"Processed data saved to JSON: {output_json_filepath}")

Processed data saved to JSON: /Users/max/Data/TSI_PhD/1_sota/processed_articles_fulltext.json


In [None]:
# --- 5. Clustering (Conditional - if enabled in config) ---
if config.use_clustering_in_pipeline:
    logger.info("Clustering pipeline enabled. Starting HDBSCAN clustering...")

    # 1. Retrieve Embeddings from DataFrame
    # Only rows whose 'abstract_embedding' is a list (a vector) are usable. The
    # row mask is kept so the cluster labels can be written back to exactly the
    # rows that were clustered.
    if 'abstract_embedding' in processed_articles_df.columns:
        embedding_mask = processed_articles_df['abstract_embedding'].map(
            lambda value: isinstance(value, list)
        ).to_numpy(dtype=bool)
    else:
        logger.warning("Column 'abstract_embedding' not found in processed_articles_df.")
        embedding_mask = np.zeros(len(processed_articles_df), dtype=bool)

    # Index labels of the clustered rows; the visualization cell refers to this name.
    valid_embedding_indices = processed_articles_df.index[embedding_mask]

    if embedding_mask.any():
        embeddings_array = np.array(
            processed_articles_df.loc[embedding_mask, 'abstract_embedding'].tolist()
        )
    else:
        embeddings_array = np.array([])

    if embeddings_array.size > 0: # Proceed only if embeddings were generated and are valid
        logger.info(f"Embeddings array shape for clustering: {embeddings_array.shape}")

        # 2. Perform HDBSCAN Clustering
        import hdbscan # Import hdbscan library
        clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=2) # Adjust parameters as needed - configurable later?
        clusters = clusterer.fit_predict(embeddings_array)

        # 3. Assign Cluster IDs back to DataFrame
        # `clusters` has one label per embedded row, which can be fewer than the
        # DataFrame's rows, so assign through the mask. Rows without an embedding
        # keep -1, the label HDBSCAN uses for noise, so code that skips -1 also
        # skips them.
        processed_articles_df['cluster_id'] = -1
        processed_articles_df.loc[embedding_mask, 'cluster_id'] = clusters

        # 4. Log Noise Points (Outliers) - HDBSCAN identifies noise as cluster -1
        n_noise = np.sum(clusters == -1)
        logger.info(f"HDBSCAN found {n_noise} noise points (outliers).")
        print(f"HDBSCAN found {n_noise} noise points (outliers).") # Print to notebook output

        logger.info("HDBSCAN clustering completed.")

    else:
        logger.warning("No valid embeddings found for clustering. Skipping clustering step.")
else:
    logger.info("Clustering pipeline disabled in config.")

In [None]:
# --- 6. Organize Clusters (PDFs) and Save Results ---
# Prints the cluster size distribution, creates one directory per cluster and
# logs a few sample titles for each cluster.
if not config.use_clustering_in_pipeline:
    logger.info("Cluster organization (PDFs) skipped as clustering is disabled.")
elif 'cluster_id' not in processed_articles_df.columns:
    # The clustering cell adds 'cluster_id' only when it found valid embeddings;
    # without the column there is nothing to organize.
    logger.warning("No 'cluster_id' column found (clustering produced no labels). Skipping cluster organization.")
else:
    cluster_counts = processed_articles_df['cluster_id'].value_counts().sort_index()
    print("Cluster Distribution:")
    print(cluster_counts)

    # groupby yields each cluster's rows in one pass; sort=False keeps clusters
    # in order of first appearance.
    for cluster_id, cluster_df in processed_articles_df.groupby('cluster_id', sort=False):
        if cluster_id == -1: # Skip unassigned cluster (-1)
            continue
        cluster_dir = config.get_cluster_dir(cluster_id)
        os.makedirs(cluster_dir, exist_ok=True) # Ensure cluster directories exist
        logger.info(f"Cluster {cluster_id}: {len(cluster_df)} articles. Sample titles: {cluster_df['title'].head(3).tolist()}")

In [None]:
# NOTE(review): this rebinds output_json_filepath (set to an absolute path in an
# earlier cell) to a relative filename, so a second JSON Lines copy is written
# relative to the current working directory — confirm this duplicate export is intended.
output_json_filepath = 'processed_articles_fulltext.json' # Choose output filename
processed_articles_df.to_json(output_json_filepath, orient='records', lines=True)
logger.info(f"Processed data saved to: {output_json_filepath}")
print(f"Processed data saved to: {output_json_filepath}")

In [None]:
# --- 7. Basic Analysis and Summary (Optional) ---
print("\nRetrieval Method Distribution:")
print(processed_articles_df['retrieval_method'].value_counts())
print("\nDownload Success Rate:")
if 'download_success' in processed_articles_df.columns:
    print(processed_articles_df['download_success'].value_counts(normalize=True))
else:
    # The full-text pipeline in this notebook does not produce a
    # 'download_success' column, so indexing it unconditionally raises KeyError.
    # Report that, and show the share of articles with retrieved full text instead.
    print("Column 'download_success' not available (PDF download is not part of this run).")
    if 'full_text_markdown' in processed_articles_df.columns:
        print("\nFull-Text Retrieval Success Rate:")
        print(processed_articles_df['full_text_markdown'].notna().value_counts(normalize=True))

In [None]:
# --- 8. Visualization (Optional - Clustering Results if enabled) ---
# Projects the clustered embeddings to 2-D with t-SNE and colours each point by
# its cluster id.
if config.use_clustering_in_pipeline and embeddings_array.size > 0:
    try:
        import inspect
        from sklearn.manifold import TSNE # Import here, only if needed

        # scikit-learn renamed TSNE's iteration cap from 'n_iter' to 'max_iter';
        # use whichever keyword the installed version accepts.
        tsne_params = inspect.signature(TSNE.__init__).parameters
        iter_param = 'max_iter' if 'max_iter' in tsne_params else 'n_iter'

        # t-SNE requires perplexity < number of samples, so cap the default of
        # 30 for small embedding sets.
        n_points = embeddings_array.shape[0]
        tsne = TSNE(
            n_components=2,
            random_state=42,
            perplexity=min(30, n_points - 1),
            **{iter_param: 300},
        )
        tsne_results = tsne.fit_transform(embeddings_array)

        # Cluster ids of exactly the rows that contributed an embedding, in row
        # order. A plain array is passed so seaborn pairs labels with points by
        # position rather than by DataFrame index.
        has_embedding = processed_articles_df['abstract_embedding'].map(
            lambda value: isinstance(value, list)
        ).to_numpy(dtype=bool)
        point_cluster_ids = processed_articles_df.loc[has_embedding, 'cluster_id'].to_numpy()

        plt.figure(figsize=(12, 8))
        sns.scatterplot(x=tsne_results[:, 0], y=tsne_results[:, 1], hue=point_cluster_ids, palette='viridis', legend='full')
        plt.title('Article Clusters Visualized with t-SNE')
        plt.xlabel('TSNE Dimension 1')
        plt.ylabel('TSNE Dimension 2')
        plt.show()
    except ImportError:
        logger.warning("t-SNE visualization requires scikit-learn and matplotlib. Please install them to visualize clusters.")
    except Exception as e:
        # Visualization is optional: log the failure instead of stopping the notebook.
        logger.error(f"Error during t-SNE visualization: {e}", exc_info=True)
else:
    logger.info("t-SNE visualization skipped as clustering is disabled or no embeddings available.")