# Reference List Analysis

This notebook analyzes all the articles references from files stored in our B2 bucket. We load the files directly from B2 using the native B2 SDK (b2sdk) into memory, combine them into a single DataFrame, and produce summary statistics such as:

- Total number of articles
- Distribution by publication year
- List of unique journals
- Additional descriptive statistics

This analysis will help us quickly get an overview of the articles we plan to download.

In [4]:
import os
import shutil

# Update: The source .env file is located in the "app" folder,
# and the target location is now the "notebooks" folder.
source_env_file = os.path.join('app', '.env')
dest_env_file = os.path.join('notebooks', '.env')

try:
    shutil.copy(source_env_file, dest_env_file)
    print("Successfully copied .env file from the app folder to the notebooks folder.")
except Exception as e:
    print(f"Error copying file: {e}")


Error copying file: [Errno 2] No such file or directory: 'app/.env'


In [5]:
import os
import io
import json
import pandas as pd
from b2sdk.v2 import InMemoryAccountInfo, B2Api
from dotenv import load_dotenv, find_dotenv

# Load environment variables from repo root
load_dotenv(find_dotenv())

# Get B2 credentials from environment
application_key = os.getenv("B2_APPLICATION_KEY")
application_key_id = os.getenv("B2_APPLICATION_KEY_ID")
bucket_name = os.getenv("B2_BUCKET_NAME")

if not all([application_key, application_key_id, bucket_name]):
    raise Exception("One or more required B2 credentials are missing from the environment.")

In [9]:
# Set up B2 connection
info = InMemoryAccountInfo()
b2_api = B2Api(info)
b2_api.authorize_account("production", application_key_id, application_key)

# Get bucket object
bucket = b2_api.get_bucket_by_name(bucket_name)

# List all Excel files in the references/xls directory
prefix = 'references/xls/'
files_info = [
    file_version_info
    for file_version_info, _ in bucket.ls(prefix)
    if file_version_info.file_name.endswith(('.csv'))
]

print("Files in B2 bucket under", prefix)
for file_info in files_info:
    print(file_info.file_name)


Files in B2 bucket under references/xls/
references/xls/1.2.2.1 LR - The Specialist Shortage and its Impact.csv
references/xls/1.2.2.2 LR - AI Applications in SCM Decision Support.csv
references/xls/1.2.2.3 LR - Human-AI Collaboration in SCM.csv
references/xls/1.2.2.4 LR - Challenges and Limitations of LLMs in SCM.csv
references/xls/1.2.2.5 LR - Decision-Making Processes.csv
references/xls/1.2.2.6 LR - Agents.csv


In [15]:
import pandas as pd
import io

# Dictionary with exact filenames
file_mapping = {
    'references/xls/1.2.2.1 LR - The Specialist Shortage and its Impact.csv': '1_specialists_df',
    'references/xls/1.2.2.2 LR - AI Applications in SCM Decision Support.csv': '2_aiscm_df',
    'references/xls/1.2.2.3 LR - Human-AI Collaboration in SCM.csv': '3_humanai_df',
    'references/xls/1.2.2.4 LR - Challenges and Limitations of LLMs in SCM.csv': '4_challenges_df',
    'references/xls/1.2.2.5 LR - Decision-Making Processes.csv': '5_decision_df',
    'references/xls/1.2.2.6 LR - Agents.csv': '6_agents_df'
}

# Initialize dictionary to store DataFrames
dataframes = {}

# Download and read each file
for file_info in files_info:
    filename = file_info.file_name
    if filename in file_mapping:
        # Download file content
        downloaded_file = bucket.download_file_by_name(filename)
        
        # Read directly from the downloaded file (no .open() needed)
        file_data = io.StringIO(downloaded_file.read().decode('utf-8')) # Corrected - call .read() as a method
        
        # Read CSV into DataFrame
        df = pd.read_csv(file_data)
        df_name = file_mapping[filename]
        dataframes[df_name] = df
        print(f"Loaded {filename} into {df_name} with shape {df.shape}")
    else:
        print(f"Skipping unmapped file: {filename}")

# Assign DataFrames to individual variables
locals().update(dataframes)

# Print basic info about each DataFrame
for name, df in dataframes.items():
    print(f"\n{name} info:")
    print(df.info())

AttributeError: 'DownloadedFile' object has no attribute 'read'

## Data Overview

Below are the first few rows of the combined references DataFrame to inspect its structure.

In [None]:
references_df.head()

## Summary Statistics

In [None]:
# Total number of articles
total_articles = references_df.shape[0]
print(f"Total number of articles: {total_articles}")

# Distribution by publication year (assuming 'year' column exists)
if 'year' in references_df.columns:
    year_distribution = references_df['year'].value_counts().sort_index()
    print("\nPublication Year Distribution:")
    print(year_distribution)
else:
    print("The column 'year' is not found in the data.")

# List of unique journals (assuming 'journal' column exists)
if 'journal' in references_df.columns:
    unique_journals = references_df['journal'].unique()
    print(f"\nUnique journals ({len(unique_journals)}):")
    print(unique_journals)
else:
    print("The column 'journal' is not found in the data.")

# Publisher distribution if 'publisher' column exists
if 'publisher' in references_df.columns:
    publisher_distribution = references_df['publisher'].value_counts()
    print("\nPublisher Distribution (top 10):")
    print(publisher_distribution.head(10))
else:
    print("The column 'publisher' is not found in the data.")

## DOI Analysis and Missing Data

In [None]:
# Check for missing DOIs
missing_dois = references_df['doi'].isna().sum()
print(f"Number of entries with missing DOIs: {missing_dois}")

# Check DOI patterns
if not missing_dois == len(references_df):
    print("\nSample of DOI patterns:")
    print(references_df['doi'].value_counts().head())

## Abstract Analysis

Analyzing abstracts can help us understand the content distribution and identify potential data quality issues.

In [None]:
if 'abstract' in references_df.columns:
    # Calculate abstract lengths
    references_df['abstract_length'] = references_df['abstract'].apply(lambda x: len(str(x)) if pd.notnull(x) else 0)
    
    # Basic statistics
    print("Abstract Statistics:")
    print(f"Mean length: {references_df['abstract_length'].mean():.2f} characters")
    print(f"Median length: {references_df['abstract_length'].median():.0f} characters")
    print(f"Shortest abstract: {references_df['abstract_length'].min()} characters")
    print(f"Longest abstract: {references_df['abstract_length'].max()} characters")
    
    # Check for missing abstracts
    missing_abstracts = references_df['abstract'].isna().sum()
    print(f"\nNumber of entries with missing abstracts: {missing_abstracts}")
else:
    print("The column 'abstract' is not found in the data.")

## Save Processed Data

Save the processed DataFrame for future use.

In [None]:
# Save to JSON for easier reading
output_json = os.path.join('data', 'references_analysis.json')
references_df.to_json(output_json, orient='records', indent=2)
print(f"Saved processed data to: {output_json}")

# Save basic statistics to a separate file
stats_dict = {
    'total_articles': total_articles,
    'unique_journals': len(references_df['journal'].unique()) if 'journal' in references_df.columns else 0,
    'year_range': f"{references_df['year'].min()}-{references_df['year'].max()}" if 'year' in references_df.columns else 'N/A',
    'missing_dois': missing_dois if 'doi' in references_df.columns else 'N/A',
    'missing_abstracts': missing_abstracts if 'abstract' in references_df.columns else 'N/A'
}

stats_json = os.path.join('data', 'references_stats.json')
with open(stats_json, 'w') as f:
    json.dump(stats_dict, f, indent=2)
print(f"Saved statistics to: {stats_json}")