In [1]:
# Core data-handling imports (NumPy, pandas) plus standard-library helpers
import numpy as np  # Efficient numerical computations
import pandas as pd  # Data manipulation and analysis
from io import StringIO  # Reading and writing string data

# External library imports
from bs4 import BeautifulSoup  # Parsing HTML and XML documents
import requests  # Sending HTTP requests

# Visualization and Interactive Dashboard libraries
import panel as pn  # Creation of interactive dashboards
import hvplot.pandas  # Extends Pandas plotting capabilities for interactive plots

# Initialize the Panel extension with notebook-wide defaults: components use
# the "stretch_width" sizing mode, the 'material' design and the "fast" template.
pn.extension(sizing_mode="stretch_width", design='material', template="fast")

## Webscrape JEL Codes

In [2]:
# Define URL to scrape
url = 'https://cran.r-project.org/web/classifications/JEL.html'

# Fetch page content. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops the cell on an HTTP error instead of
# parsing an error page and silently ending up with an empty option list.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse HTML
soup = BeautifulSoup(response.text, 'html.parser')

# Find <li> elements whose id attribute starts with 'code:'
jel_codes = soup.find_all('li', id=lambda x: x and x.startswith('code:'))

# Initialize list for descriptions
jel_code_descriptions = []

# Collect the text of every entry whose code is exactly three characters long
for code in jel_codes:
    # The code is the part of the id that follows the 'code:' prefix
    jel_code = code['id'].split(':')[1]

    if len(jel_code) == 3:
        # get_text() gathers the text of the <li> including any child elements
        description = code.get_text(strip=True)
        jel_code_descriptions.append(description)

## Panel for Searching Papers

In [3]:
# Read the cleaned journal records from disk
df = pd.read_csv('Derived/All-Journals-Cleaned.csv')

# The file is wide: several columns whose names start with "Author" / "JEL"
author_columns = [name for name in df.columns if name.startswith("Author")]
jel_columns = [name for name in df.columns if name.startswith("JEL")]

# Distinct author names across all author columns, with missing values removed
flattened_authors = df[author_columns].values.ravel('K')
unique_authors = [name for name in pd.unique(flattened_authors) if pd.notna(name)]


def _join_present(row):
    """Join the non-missing entries of one row with '; '."""
    return '; '.join(row.dropna())


# Collapse the author columns into a single 'Authors' column
df['Authors'] = df[author_columns].apply(_join_present, axis=1)
df = df.drop(columns=author_columns)

# Collapse the JEL columns into a single 'JELs' column
df['JELs'] = df[jel_columns].apply(_join_present, axis=1)
df = df.drop(columns=jel_columns)

# Years offered in the year filter: 1999 through 2024 inclusive
years_list = [*range(1999, 2025)]

# Distinct journal names, in order of first appearance
journal_list = pd.unique(df['Journal']).tolist()

In [4]:
# Most recent filter result; written by filter_data and read by the download callback
global_filtered_df = None

# Filter controls shown above the results
year_input = pn.widgets.MultiChoice(
    name='Year',
    options=years_list,  # fixed list of years
)
journal_input = pn.widgets.MultiChoice(
    name='Journal',
    options=journal_list,  # journals found in the data
)
author_input = pn.widgets.MultiChoice(
    name='Author Name',
    options=unique_authors,  # multi-select over the authors found in the data
)
multi_choice = pn.widgets.MultiChoice(
    name='JEL Codes',
    options=jel_code_descriptions,  # scraped JEL code descriptions
)
abstract_search = pn.widgets.TextInput(
    name='Search Keywords',  # free-text search box
)

def filter_data(selected_years, selected_journals, selected_jel_options, selected_authors, abstract_query):
    """
    Filter the DataFrame based on selected widget criteria.

    Every non-empty criterion narrows the result further (criteria are ANDed).
    The result is also stored in the module-level ``global_filtered_df`` so the
    download callback can export exactly what is displayed.

    Args:
        selected_years: Years to keep; a row matches when its 'Issue' text
            contains any of them. Rows with a missing 'Issue' never match.
        selected_journals: Journal names to keep (exact match on 'Journal').
        selected_jel_options: Selected JEL option labels. The code is the text
            before the first ':'; every selected code must occur in 'JELs'.
        selected_authors: Author names; each must appear as a complete entry
            of the '; '-separated 'Authors' column.
        abstract_query: Text searched for, case-insensitively and literally
            (not as a regular expression), in the 'Abstract', 'Title',
            'Issue', 'Journal' and 'Authors' columns.

    Returns:
        Filtered DataFrame restricted to the columns
        Title, Issue, Journal, Abstract, Authors, Link.
    """
    global global_filtered_df  # Reference global variable to store the filtered results
    filtered_df = df  # Start with the full dataset for filtering

    if selected_years:
        # Vectorised, NaN-safe substring test; a plain `year in issue` raises
        # TypeError as soon as one 'Issue' value is missing.
        issues = filtered_df['Issue']
        year_mask = pd.Series(False, index=issues.index)
        for year in selected_years:
            year_mask = year_mask | issues.str.contains(str(year), regex=False, na=False)
        filtered_df = filtered_df[year_mask]

    if selected_journals:
        filtered_df = filtered_df[filtered_df['Journal'].isin(selected_journals)]

    if selected_authors:
        # Pad both sides with the '; ' separator so only complete names match
        # ("Li, Wei" must not match "Li, Weijia").
        wanted_authors = ['; ' + author + '; ' for author in selected_authors]
        author_mask = filtered_df['Authors'].map(
            lambda authors: all(wanted in '; ' + authors + '; ' for wanted in wanted_authors)
        ).astype(bool)  # stays a boolean mask even when the frame is already empty
        filtered_df = filtered_df[author_mask]

    if selected_jel_options:
        cleaned_jel_options = [option.split(':')[0] for option in selected_jel_options]
        jel_mask = filtered_df['JELs'].map(
            lambda jels: all(jel_option in jels for jel_option in cleaned_jel_options)
        ).astype(bool)  # stays a boolean mask even when the frame is already empty
        filtered_df = filtered_df[jel_mask]

    if abstract_query:
        # Literal search across all text columns. regex=False keeps input such
        # as "C++" or "(" from being compiled as a regular expression and raising.
        query_mask = pd.Series(False, index=filtered_df.index)
        for column in ('Abstract', 'Title', 'Issue', 'Journal', 'Authors'):
            query_mask = query_mask | filtered_df[column].str.contains(
                abstract_query, case=False, regex=False, na=False
            )
        filtered_df = filtered_df[query_mask]

    global_filtered_df = filtered_df[['Title', 'Issue', 'Journal', 'Abstract', 'Authors', 'Link']]
    return global_filtered_df

def get_filtered_data():
    """
    Build the CSV payload for the download button from the latest filter result.

    Returns:
        StringIO holding the CSV text of ``global_filtered_df``; when no result
        has been stored yet, a StringIO holding only the header row.
    """
    if global_filtered_df is None:
        # Nothing has been filtered so far: hand back just the column header
        return StringIO("Title,Issue,Journal,Abstract,Authors,Link\n")

    csv_text = global_filtered_df.to_csv(index=False)  # omit the index column
    return StringIO(csv_text)

# Map each filter_data argument to the widget value that feeds it; pn.bind
# re-runs filter_data whenever one of these values changes
bound_inputs = {
    'selected_years': year_input.param.value,
    'selected_journals': journal_input.param.value,
    'selected_jel_options': multi_choice.param.value,
    'selected_authors': author_input.param.value,
    'abstract_query': abstract_search.param.value,
}
dynamic_view = pn.bind(filter_data, **bound_inputs)

# Download button that exports the most recent filter result as CSV
download_button = pn.widgets.FileDownload(
    callback=get_filtered_data,
    filename="filtered_data.csv",
    button_type="primary",
)

# Filter widgets in one row, then the filtered table, then the download button
filter_controls = pn.Row(year_input, journal_input, multi_choice, author_input, abstract_search)
layout = pn.Column(filter_controls, dynamic_view, download_button)

# Launch the layout
layout.show()

Launching server at http://localhost:49449


<panel.io.server.Server at 0x177c05a50>