# Reference List Analysis

This notebook analyzes all the article references stored in the CSV files in the `data/references` folder. We load every CSV file, combine them into a single DataFrame, and produce summary statistics such as:

- Total number of articles
- Distribution by publication year
- List of unique journals
- Additional descriptive statistics

This analysis will help us quickly get an overview of the articles we plan to download.

In [None]:
import glob
import json
import os

import pandas as pd

# Load every CSV file under data/references and combine them into a single
# DataFrame named `references_df`, which the rest of the notebook works on.

# Define the path to the references folder
references_dir = os.path.join('data', 'references')

# Match all CSV files in the references directory. glob returns paths in
# arbitrary order, so sort them to keep the row order of the combined frame
# reproducible from run to run.
csv_pattern = os.path.join(references_dir, '*.csv')
csv_files = sorted(glob.glob(csv_pattern))

if not csv_files:
    raise FileNotFoundError(f"No CSV files found in the directory: {references_dir}")

# Load each CSV file into a DataFrame. Loading is best-effort: a file that
# cannot be read is reported and skipped so one bad file does not block the
# whole analysis, but every failure is recorded and summarised below.
data_frames = []
failed_files = []
for csv_file in csv_files:
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        failed_files.append(csv_file)
        print(f"Error loading {csv_file}: {e}")
    else:
        data_frames.append(df)
        print(f"Loaded {csv_file} with shape {df.shape}")

if failed_files:
    print(f"Skipped {len(failed_files)} of {len(csv_files)} CSV files because of load errors")

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the actual problem instead.
if not data_frames:
    raise ValueError(
        f"None of the {len(csv_files)} CSV files in {references_dir} could be loaded"
    )

# Combine all DataFrames into one
references_df = pd.concat(data_frames, ignore_index=True)
print(f"Combined references data shape: {references_df.shape}")

## Data Overview

Below are the first few rows of the combined references DataFrame to inspect its structure.

In [None]:
# Preview the first five rows (the default for head) of the combined frame
references_df.head(n=5)

## Summary Statistics

In [None]:
# Print headline statistics for the combined references: row count, publication
# year distribution, unique journals and the most frequent publishers.
# Each column is optional; a missing column is reported instead of raising.

# Total number of articles
total_articles = references_df.shape[0]
print(f"Total number of articles: {total_articles}")

# Distribution by publication year (only if a 'year' column exists)
if 'year' in references_df.columns:
    # value_counts drops missing years; sort_index orders the counts by year
    year_distribution = references_df['year'].value_counts().sort_index()
    print("\nPublication Year Distribution:")
    print(year_distribution)
else:
    print("The column 'year' is not found in the data.")

# List of unique journals (only if a 'journal' column exists)
if 'journal' in references_df.columns:
    # Series.unique() keeps NaN as a distinct value, so rows without a journal
    # would show up as an extra "journal" and inflate the count by one.
    # Drop missing values before collecting the unique names.
    unique_journals = references_df['journal'].dropna().unique()
    print(f"\nUnique journals ({len(unique_journals)}):")
    print(unique_journals)
else:
    print("The column 'journal' is not found in the data.")

# Publisher distribution (only if a 'publisher' column exists)
if 'publisher' in references_df.columns:
    publisher_distribution = references_df['publisher'].value_counts()
    print("\nPublisher Distribution (top 10):")
    print(publisher_distribution.head(10))
else:
    print("The column 'publisher' is not found in the data.")

## DOI Analysis and Missing Data

In [None]:
# Count missing DOIs and show the most frequent DOI values.
# The column lookup is guarded like the other optional columns in this
# notebook, so a dataset without a 'doi' column is reported, not a KeyError.
if 'doi' in references_df.columns:
    # isna().sum() yields a numpy integer; convert it to a plain int so the
    # value can also be serialised with the json module.
    missing_dois = int(references_df['doi'].isna().sum())
    print(f"Number of entries with missing DOIs: {missing_dois}")

    # Only list DOI values when at least one row actually has a DOI
    if missing_dois != len(references_df):
        print("\nSample of DOI patterns:")
        print(references_df['doi'].value_counts().head())
else:
    print("The column 'doi' is not found in the data.")

## Abstract Analysis

Analyzing abstracts can help us understand the content distribution and identify potential data quality issues.

In [None]:
# Character-length statistics for the 'abstract' column, plus a count of the
# rows whose abstract is missing. Adds an 'abstract_length' column to
# references_df as a side effect.
if 'abstract' not in references_df.columns:
    print("The column 'abstract' is not found in the data.")
else:
    def _abstract_char_count(value):
        """Return the length of str(value), or 0 when the value is null."""
        if pd.notnull(value):
            return len(str(value))
        return 0

    # Per-row abstract length; missing abstracts are recorded as length 0
    references_df['abstract_length'] = references_df['abstract'].apply(_abstract_char_count)
    abstract_lengths = references_df['abstract_length']

    # Basic statistics
    print("Abstract Statistics:")
    print(f"Mean length: {abstract_lengths.mean():.2f} characters")
    print(f"Median length: {abstract_lengths.median():.0f} characters")
    print(f"Shortest abstract: {abstract_lengths.min()} characters")
    print(f"Longest abstract: {abstract_lengths.max()} characters")

    # Rows with no abstract at all
    missing_abstracts = references_df['abstract'].isna().sum()
    print(f"\nNumber of entries with missing abstracts: {missing_abstracts}")

## Save Processed Data

Save the processed DataFrame for future use.

In [None]:
# Write the combined references frame and a small dictionary of summary
# statistics to JSON files under data/.

# Save to JSON for easier reading
output_json = os.path.join('data', 'references_analysis.json')
references_df.to_json(output_json, orient='records', indent=2)
print(f"Saved processed data to: {output_json}")

# Every statistic is recomputed from references_df here rather than read from
# variables set by earlier cells, so this cell gives the same result whichever
# analysis cells were run. Counts are converted to plain int because
# json.dump cannot serialise numpy integers such as the result of isna().sum().
available_columns = references_df.columns

if 'year' in available_columns:
    # Coerce to numbers and drop missing/unparseable years so the range is
    # written as e.g. "1999-2021" rather than "1999.0-2021.0" or "nan-nan".
    valid_years = pd.to_numeric(references_df['year'], errors='coerce').dropna()
    if valid_years.empty:
        year_range = 'N/A'
    else:
        year_range = f"{int(valid_years.min())}-{int(valid_years.max())}"
else:
    year_range = 'N/A'

# Save basic statistics to a separate file
stats_dict = {
    'total_articles': int(len(references_df)),
    # nunique() ignores missing values, so rows without a journal are not
    # counted as an extra journal
    'unique_journals': int(references_df['journal'].nunique()) if 'journal' in available_columns else 0,
    'year_range': year_range,
    'missing_dois': int(references_df['doi'].isna().sum()) if 'doi' in available_columns else 'N/A',
    'missing_abstracts': int(references_df['abstract'].isna().sum()) if 'abstract' in available_columns else 'N/A'
}

stats_json = os.path.join('data', 'references_stats.json')
with open(stats_json, 'w') as f:
    json.dump(stats_dict, f, indent=2)
print(f"Saved statistics to: {stats_json}")