In [1]:
import pandas as pd
from collections import defaultdict
import string
import re

Get relevant articles

In [None]:
def normalize_text(text):
    """Return a lower-cased, trimmed copy of ``text`` with bracket characters removed.

    Only the characters ``[ ] ( ) { }`` are deleted; all other punctuation is
    kept as-is.

    Args:
        text: A string, a missing value (``None`` / ``NaN``), or any other scalar
            (e.g. a title that pandas parsed as a number).

    Returns:
        str: The normalized text, or ``""`` when ``text`` is missing.
    """
    if pd.isnull(text):  # Missing cells are treated as empty text
        return ""
    if not isinstance(text, str):
        # re.sub only accepts strings; coerce numeric or other scalar cells
        text = str(text)
    return re.sub(r'[\[\](){}]', '', text).strip().lower()

def contains_keywords(text, keywords):
    """Check whether any keyword occurs in ``text`` as a whole word or phrase.

    The text is normalized with ``normalize_text`` (which lower-cases it), and
    each keyword is lower-cased too, so matching is case-insensitive.

    Args:
        text: Raw text to search; missing values never match.
        keywords: Iterable of keyword strings. Empty or blank keywords are
            skipped, because the pattern ``\\b\\b`` would match almost any text.

    Returns:
        bool: True if at least one keyword is found between word boundaries.
    """
    text_normalized = normalize_text(text)
    if not text_normalized:
        return False
    # \b on both sides ensures "schistosome" does not match inside "schistosomes"
    return any(
        re.search(rf'\b{re.escape(keyword.lower())}\b', text_normalized)
        for keyword in keywords
        if keyword and keyword.strip()
    )

def filter_articles(articles_df, keywords, sample_size=5, random_state=42):
    """
    Split articles into relevant / non-relevant based on keywords in titles and abstracts.

    An article is relevant when ``contains_keywords`` is true for its ``Title``
    or its ``Abstract``. The input DataFrame is left untouched; the returned
    frames are copies of its rows with an added boolean ``Relevant`` column.

    Args:
        articles_df (pd.DataFrame): Article metadata with 'Title' and 'Abstract' columns.
        keywords (list): List of keywords to check for relevance.
        sample_size (int): Maximum number of non-relevant articles to sample.
        random_state (int): Seed for the random sample, so the review set is reproducible.

    Returns:
        relevant_articles (pd.DataFrame): Rows flagged as relevant.
        non_relevant_sample (pd.DataFrame): Random sample of the non-relevant rows.
        discarded_articles (pd.DataFrame): All non-relevant rows.
    """
    # astype(bool) keeps `|` and `~` valid even when the frame is empty,
    # where apply() yields a non-boolean dtype
    title_hits = articles_df['Title'].apply(lambda x: contains_keywords(x, keywords)).astype(bool)
    abstract_hits = articles_df['Abstract'].apply(lambda x: contains_keywords(x, keywords)).astype(bool)
    is_relevant = title_hits | abstract_hits

    # assign() returns a new frame, so the caller's DataFrame is not mutated
    flagged = articles_df.assign(Relevant=is_relevant)

    # Separate relevant and non-relevant articles
    relevant_articles = flagged[is_relevant]
    non_relevant_articles = flagged[~is_relevant]

    # Randomly sample non-relevant articles for manual review
    n_sample = min(sample_size, len(non_relevant_articles))
    non_relevant_sample = non_relevant_articles.sample(n=n_sample, random_state=random_state)

    return relevant_articles, non_relevant_sample, non_relevant_articles

def generate_relevant_authors(relevant_articles_df, authors_df, output_path):
    """
    Write the subset of author rows that belong to relevant articles to a CSV file.

    An author row is kept when its 'PMID' appears in the 'PMID' column of
    ``relevant_articles_df``. Nothing is returned; the result only goes to disk.

    Args:
        relevant_articles_df (pd.DataFrame): Relevant articles; must have a 'PMID' column.
        authors_df (pd.DataFrame): Author rows; must have a 'PMID' column.
        output_path (str): Destination CSV path (written without the index).
    """
    wanted_pmids = relevant_articles_df['PMID']
    row_is_relevant = authors_df['PMID'].isin(wanted_pmids)

    # Persist only the matching author rows
    authors_df.loc[row_is_relevant].to_csv(output_path, index=False)

# Input locations for the raw article and author tables
articles_path = 'data/articles.schistosomiasis.csv'
authors_path = 'data/authors.schistosomiasis.csv'

# Read both tables into memory
articles = pd.read_csv(articles_path)
authors = pd.read_csv(authors_path)

# Terms that mark an article as relevant (matched as whole words by contains_keywords)
keywords = [
    "schistosomiasis",
    "schistosoma",
    "parasitic disease",
    "schistosomal",
    "japonicum",
    "oncomelania",
    "cercariae",
    "molluscicide",
    "bilharzia",
    "schistosome",
    "antischistosomal",
    "schistosomes",
    "molluscicidal",
    "snail control",
]

# Split the articles and draw a small review sample from the rejected ones
relevant_articles, non_relevant_sample, discarded_articles = filter_articles(
    articles,
    keywords,
    sample_size=5,
)

# Persist both halves of the split
relevant_articles.to_csv('data/relevant_articles.csv', index=False)
discarded_articles.to_csv('data/discarded_articles.csv', index=False)

# Write the authors that belong to the relevant articles
output_authors_path = 'data/relevant_authors.csv'
generate_relevant_authors(relevant_articles, authors, output_authors_path)

# Show the review sample so false negatives can be spotted by eye
print("Sample of non-relevant articles for manual review:")
print(non_relevant_sample[['PMID', 'Abstract']])


Get final relevant articles table

In [4]:

# Step 1: Load Datasets
articles_path = 'data/relevant_articles.csv'
scimagojr_path = 'data/scimagojr_2023.csv'

articles_df = pd.read_csv(articles_path)
scimagojr = pd.read_csv(scimagojr_path, sep=';')

# Step 2: Clean and Standardize Columns
# Remove spaces from ISSN and ensure consistent formatting
articles_df['ISSN'] = articles_df['ISSN'].str.replace(' ', '').str.lower()
scimagojr['Issn'] = scimagojr['Issn'].str.replace(' ', '').str.lower()

# Standardize journal names for matching
articles_df['Journal'] = articles_df['Journal'].str.strip().str.lower()
scimagojr['Title'] = scimagojr['Title'].str.strip().str.lower()

# Step 3: Clean SJR Values
# The file uses a decimal comma; swap it for a dot and convert to float
scimagojr['SJR'] = scimagojr['SJR'].astype(str).str.replace(',', '.', regex=False).astype(float)

# Step 4: Match and Add Impact Factor
# Build a one-row-per-ISSN lookup:
#   * a cell may hold several comma-separated ISSNs (assumed from the SCImago
#     export layout - TODO confirm against the file), so split and explode them;
#   * hyphens are stripped on both sides so "1234-5678" and "12345678" match;
#   * rows without an ISSN or SJR are dropped, because NaN keys match each
#     other in a pandas merge;
#   * duplicate ISSNs keep their first SJR so the left merge cannot multiply
#     article rows.
issn_lookup = (
    scimagojr[['Issn', 'SJR']]
    .dropna(subset=['Issn', 'SJR'])
    .assign(Issn=lambda frame: frame['Issn'].str.split(','))
    .explode('Issn')
    .reset_index(drop=True)
    .assign(Issn=lambda frame: frame['Issn'].str.replace('-', '', regex=False))
    .loc[lambda frame: frame['Issn'] != '']
    .drop_duplicates(subset='Issn')
)

# Match by ISSN first; the temporary key leaves the stored 'ISSN' column untouched.
# The matched 'Issn' column is kept here and removed later in the notebook.
merged = (
    articles_df
    .assign(_issn_key=articles_df['ISSN'].str.replace('-', '', regex=False))
    .merge(issn_lookup, left_on='_issn_key', right_on='Issn', how='left')
    .drop(columns='_issn_key')
)

# Fill missing SJR using journal names as a fallback.
# A title -> SJR mapping applied with .map() is aligned per row, unlike filling
# from a second, independently merged frame whose row order may differ.
sjr_by_title = (
    scimagojr
    .dropna(subset=['Title', 'SJR'])
    .drop_duplicates(subset='Title')
    .set_index('Title')['SJR']
)
merged['SJR'] = merged['SJR'].fillna(merged['Journal'].map(sjr_by_title))

# Replace missing SJR with 0
merged['SJR'] = merged['SJR'].fillna(0)

# The enrichment must never add or drop articles
assert len(merged) == len(articles_df), "SJR merge changed the number of article rows"

# Step 5: Add "Field" Column
# Academic fields mapped to the keywords whose occurrences count towards them.
# Dict order matters: the classifier breaks ties in favour of the earlier field.
fields_keywords = {
    "Medical Sciences": [
        "infection",
        "hospitalised",
        "hospitalized",
        "circulating",
        "treatment",
        "disease",
    ],
    "Epidemiology and Public Health": [
        "epidemiology",
        "public health",
        "population",
        "spread",
        "incidence",
        "prevalence",
    ],
    "Parasitology and Tropical Medicine": [
        "schistosoma",
        "parasite",
        "parasitology",
        "tropical medicine",
    ],
    "Immunology": [
        "antigen",
        "immune",
        "immunology",
        "immune response",
        "antigenic",
    ],
    "Biological and Biomedical Research": [
        "nitric oxide",
        "biomedical",
        "biology",
        "cellular",
        "biochemistry",
    ],
}

# Function to classify articles based on keyword frequency
def classify_article_by_frequency(title, abstract, keyword_map=None):
    """Assign an article to the field whose keywords occur most often in its text.

    Occurrences are counted as case-insensitive substrings of the title and
    abstract joined by a space. A missing title or abstract is skipped, so an
    article with only one of the two is still classified on what it has.

    Args:
        title: Article title, or a missing value (``None`` / ``NaN``).
        abstract: Article abstract, or a missing value.
        keyword_map (dict | None): Mapping of field name -> list of keywords.
            Defaults to the notebook-level ``fields_keywords``.

    Returns:
        str: The field with the highest keyword count (the earliest field in
        the mapping wins ties), or ``"Other"`` when no keyword occurs at all.
    """
    if keyword_map is None:
        keyword_map = fields_keywords

    available_parts = [str(part) for part in (title, abstract) if not pd.isnull(part)]
    combined_text = " ".join(available_parts).lower()

    field_counts = {
        field: sum(combined_text.count(keyword.lower()) for keyword in keywords)
        for field, keywords in keyword_map.items()
    }

    if not field_counts:
        return "Other"

    # Every field has an entry (possibly 0), so the best count must be checked
    # explicitly; otherwise an article with no hits would get the first field.
    best_field = max(field_counts, key=field_counts.get)
    return best_field if field_counts[best_field] > 0 else "Other"

# Apply classification to each article.
# A plain comprehension over the two columns also works for an empty table,
# where row-wise DataFrame.apply returns a DataFrame that cannot be assigned
# to a single column.
merged["Field"] = [
    classify_article_by_frequency(title, abstract)
    for title, abstract in zip(merged["Title"], merged["Abstract"])
]

# Step 6: Drop Unnecessary Columns
# 'Issn' is only the join helper from the SJR merge; 'ISSN' from the articles stays
merged = merged.drop(columns=['Issn'])

# Step 7: Save the Consolidated Dataset
output_path = 'data/relevant_articles_final.csv'
merged.to_csv(output_path, index=False)

print(f"Final dataset saved to {output_path}")


Final dataset saved to data/relevant_articles_final.csv


Get relevant authors

In [6]:
# Inputs: the classified articles and the authors of the relevant articles
articles_with_field_path = 'data/relevant_articles_final.csv'
relevant_authors_path = 'data/relevant_authors.csv'

articles_df = pd.read_csv(articles_with_field_path)
authors_df = pd.read_csv(relevant_authors_path)

# Fail fast when either table lacks a column the steps below rely on
if not {"PMID", "Field", "SJR"}.issubset(articles_df.columns):
    raise KeyError("The 'relevant_articles_final.csv' must contain 'PMID', 'Field', and 'SJR' columns.")
if not {"PMID", "AuthorForename", "AuthorLastname"}.issubset(authors_df.columns):
    raise KeyError("The 'relevant_authors.csv' must contain 'PMID', 'AuthorForename', and 'AuthorLastname' columns.")

# Attach each article's field and SJR to its author rows (authors keep their order)
authors_with_field_df = authors_df.merge(
    articles_df[['PMID', 'Field', 'SJR']],
    on='PMID',
    how='left',
)

# Per author (forename + lastname): mean SJR and number of PMID entries
author_key = ['AuthorForename', 'AuthorLastname']
author_stats = (
    authors_with_field_df
    .groupby(author_key)
    .agg(
        AverageImpactFactor=('SJR', 'mean'),
        ArticleCount=('PMID', 'count'),
    )
    .reset_index()
)

# Broadcast the per-author statistics back onto every author row
authors_with_field_df = authors_with_field_df.merge(author_stats, on=author_key, how='left')

# Write the enriched author table
output_path = 'data/relevant_authors_with_field.csv'
authors_with_field_df.to_csv(output_path, index=False)

# Inform the user
print(f"Relevant authors with fields, average impact factors, and article counts saved to: {output_path}")


Relevant authors with fields, average impact factors, and article counts saved to: data/relevant_authors_with_field.csv
