In [1]:
# Import relevant libraries
import numpy as np  # For data transformation
print("numpy v", np.__version__)

import pandas as pd  # For data analysis
print("pandas v", pd.__version__)

import matplotlib  # For creating visualizations (bar charts)
import matplotlib.pyplot as plt
print("matplotlib v", matplotlib.__version__)

import wordcloud  # For creating word clouds
from wordcloud import WordCloud
print("wordcloud v", wordcloud.__version__)

import sys  # For system-specific parameters and functions
from collections import Counter # For specialized container types
print("Python v", sys.version)

import re  # For regular expressions
print("re v", re.__version__)

numpy v 1.24.4
pandas v 1.2.4
matplotlib v 3.3.4
wordcloud v 1.9.4
Python v 3.8.8 (default, Apr 13 2021, 12:59:45) 
[Clang 10.0.0 ]
re v 2.2.1


In [2]:
# Load the presenter metadata from CSV into the DataFrame `df`.
# The path is relative, so the file must sit in the kernel's working directory.
# Later cells read these columns from it: "Presenter Type", "Location", "Sector",
# "Career Stage", "Person of Color", "Presentation Topic(s)" and "Poster Title".
df = pd.read_csv("CUP-XXIV_Presenter-Metadata-Clean.csv")

In [3]:
# Who was presenting at CUP XXIV 2025? (3) bar charts

# Function to calculate percentages for bar charts
def calculate_percentages(df, column, labels, total_n=74, speaker_n=32, poster_n=43):
    """Return the percentage of presenters per category of ``column``.

    Three Series are produced, each indexed by ``labels`` (in that order):
    one for all rows, one for rows whose "Presenter Type" contains "Speaker",
    and one for rows whose "Presenter Type" contains "Poster Lead Author".
    Categories in ``labels`` that do not occur get 0; categories that occur
    in the data but are not in ``labels`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and a "Presenter Type" column.
    column : str
        Name of the categorical column to tabulate.
    labels : list
        Category values to report, in the desired output order.
    total_n, speaker_n, poster_n : int, optional
        Denominators for the three groups. The defaults (74, 32, 43) are the
        values that were previously hard-coded in the body.

    Returns
    -------
    tuple of pandas.Series
        ``(total_percentages, speaker_percentages, poster_percentages)``.
    """
    presenter_type = df["Presenter Type"]
    # na=False: a missing presenter type counts as "not in this group" instead
    # of producing a NaN in the mask, which boolean indexing rejects.
    is_speaker = presenter_type.str.contains("Speaker", na=False)
    is_poster = presenter_type.str.contains("Poster Lead Author", na=False)

    def _counts(frame):
        # Occurrences per category, aligned to `labels` with 0 for absent ones.
        return frame[column].value_counts().reindex(labels, fill_value=0)

    total_percentages = (_counts(df) / total_n) * 100
    speaker_percentages = (_counts(df[is_speaker]) / speaker_n) * 100
    poster_percentages = (_counts(df[is_poster]) / poster_n) * 100

    return total_percentages, speaker_percentages, poster_percentages

# Function to prepare demographics data for the second chart
def prepare_demographics_data(df, location_labels, percentage_poc_speaker, percentage_poc_poster, percentage_poc_total,
                              correct_percentage_female_speaker, correct_percentage_female_poster, correct_percentage_female_total):
    """Assemble labels and values for the combined demographics bar chart.

    The chart shows one group of bars per location followed by two extra
    groups, "Person of Color" and "Female".

    The per-location percentages are computed here from ``df`` and
    ``location_labels`` via ``calculate_percentages(df, "Location", ...)``.
    The function previously read them from notebook-level variables, so it
    only worked if those had been created beforehand.

    Parameters
    ----------
    df : pandas.DataFrame
        Presenter table; passed straight to ``calculate_percentages``.
    location_labels : list
        Location categories, in display order.
    percentage_poc_speaker, percentage_poc_poster, percentage_poc_total : float
        "Person of Color" percentages for the three presenter groups.
    correct_percentage_female_speaker, correct_percentage_female_poster, correct_percentage_female_total : float
        "Female" percentages for the three presenter groups.

    Returns
    -------
    tuple
        ``(bar_labels, speaker_values, poster_values, total_values)``; four
        lists of equal length (locations + 2).
    """
    # calculate_percentages returns (total, speaker, poster), in that order.
    location_total, location_speaker, location_poster = calculate_percentages(df, "Location", location_labels)

    bar_labels_location = list(location_labels) + ["Person of Color", "Female"]
    percentages_speaker_location = list(location_speaker.values) + [percentage_poc_speaker, correct_percentage_female_speaker]
    percentages_poster_location = list(location_poster.values) + [percentage_poc_poster, correct_percentage_female_poster]
    percentages_total_location = list(location_total.values) + [percentage_poc_total, correct_percentage_female_total]

    return bar_labels_location, percentages_speaker_location, percentages_poster_location, percentages_total_location

# Function to create a bar chart
def create_bar_chart(ax, bar_labels, percentages_speaker, percentages_poster, percentages_total, title, include_legend_and_note=True):
    """Draw a grouped bar chart (speaker / poster / total) on ``ax``.

    For every entry of ``bar_labels`` three bars are drawn side by side.
    The "total" bars additionally carry an error bar whose length is the
    standard deviation of the speaker and poster values for that label.
    Each bar is annotated with its value formatted to two decimals.

    Parameters
    ----------
    ax : object
        Axes-like object the chart is drawn on.
    bar_labels : sequence of str
        Tick labels, one per bar group.
    percentages_speaker, percentages_poster, percentages_total : sequence of float
        Bar heights for the three series, aligned with ``bar_labels``.
    title : str
        Title placed above the chart.
    include_legend_and_note : bool, optional
        When true, a legend is added in the upper-right corner.
    """
    # Geometry: three bars per group, each 0.25 wide, placed next to each other
    bar_width = 0.25
    x_speaker = np.arange(len(bar_labels))
    x_poster = x_speaker + bar_width
    x_total = x_poster + bar_width

    # Appearance shared by all series
    alpha_value = 0.5  # Transparency setting
    border_thickness = 4

    # (x positions, heights, fill/edge colour, legend label) for each series
    series = (
        (x_speaker, percentages_speaker, "#98D8AA", "Speaker (n=32)"),  # Soft mint green
        (x_poster, percentages_poster, "#C8A2C8", "Poster Lead Author (n=43)"),  # Soft lavender purple
        (x_total, percentages_total, "#E69A45", "Total Presenters (n=74)"),  # Soft orange
    )
    for x_positions, heights, colour, legend_label in series:
        ax.bar(x_positions, heights, color=colour, alpha=1 - alpha_value,
               edgecolor=colour, linewidth=border_thickness, width=bar_width, label=legend_label)

    # Error bars on the total series: spread between speaker and poster values
    std_dev = np.std([percentages_speaker, percentages_poster], axis=0)
    ax.errorbar(x_total, percentages_total, yerr=std_dev, fmt='none', ecolor='black', capsize=5, capthick=2)

    # Axes cosmetics; ticks sit under the middle (poster) bar of each group
    ax.set_xlabel("")
    ax.set_ylabel("Percentage (%)", fontsize=12)
    ax.set_ylim(0, 100)
    ax.set_xticks(x_poster)
    ax.set_xticklabels(bar_labels, rotation=45, ha="right")
    ax.set_title(title, fontsize=14, pad=20)

    # Optional legend
    if include_legend_and_note:
        ax.legend(loc="upper right", frameon=True, facecolor="white", fontsize=15)

    # Value labels above each bar; the total label is lifted past its error bar
    for idx, _ in enumerate(bar_labels):
        speaker_value = percentages_speaker[idx]
        poster_value = percentages_poster[idx]
        total_value = percentages_total[idx]
        ax.text(x_speaker[idx], speaker_value + 2, f"{speaker_value:.2f}", ha='center', fontsize=10)
        ax.text(x_poster[idx], poster_value + 2, f"{poster_value:.2f}", ha='center', fontsize=10)
        ax.text(x_total[idx], total_value + std_dev[idx] + 2, f"{total_value:.2f}", ha='center', fontsize=10)

# Display order of the categories on the x-axes
location_labels = ["CA", "MA", "NM", "NY", "PA", "Other State"]
sector_labels = ["OpenEye", "Biotech", "Academia", "Other Sector"]
career_stage_labels = ["Early", "Mid", "Senior", "Executive"]

# "Person of Color" share per presenter group.
# Denominators: 32 speakers, 43 poster lead authors, 74 presenters overall.
poc_mask = df["Person of Color"] == "Person of Color"
speaker_mask = df["Presenter Type"].str.contains("Speaker")
poster_mask = df["Presenter Type"].str.contains("Poster Lead Author")

percentage_poc_speaker = (df[poc_mask & speaker_mask].shape[0] / 32) * 100
percentage_poc_poster = (df[poc_mask & poster_mask].shape[0] / 43) * 100
percentage_poc_total = (poc_mask.sum() / 74) * 100

# "Female" share per presenter group, from fixed counts (7 speakers, 12 poster lead authors)
correct_percentage_female_speaker = (7 / 32) * 100
correct_percentage_female_poster = (12 / 43) * 100
correct_percentage_female_total = ((7 + 12) / 74) * 100

# Per-category percentages; calculate_percentages returns (total, speaker, poster)
(percentage_location_total,
 percentage_location_speaker,
 percentage_location_poster) = calculate_percentages(df, "Location", location_labels)

(percentage_sector_total,
 percentage_sector_speaker,
 percentage_sector_poster) = calculate_percentages(df, "Sector", sector_labels)

(percentage_career_stage_total,
 percentage_career_stage_speaker,
 percentage_career_stage_poster) = calculate_percentages(df, "Career Stage", career_stage_labels)

# Locations plus the two demographic groups, ready for the second chart
(bar_labels_location,
 percentages_speaker_location,
 percentages_poster_location,
 percentages_total_location) = prepare_demographics_data(
    df,
    location_labels,
    percentage_poc_speaker,
    percentage_poc_poster,
    percentage_poc_total,
    correct_percentage_female_speaker,
    correct_percentage_female_poster,
    correct_percentage_female_total,
)

# One figure, three stacked panels (one bar chart each)
fig, axes = plt.subplots(3, 1, figsize=(14, 18))

# (axes, labels, speaker values, poster values, total values, title, show legend)
chart_specs = [
    # Chart 1: Sectors of CUP XXIV, by Presenter Type (the only panel with a legend)
    (
        axes[0],
        sector_labels,
        list(percentage_sector_speaker.values),
        list(percentage_sector_poster.values),
        list(percentage_sector_total.values),
        "Sectors of CUP XXIV, by Presenter Type",
        True,
    ),
    # Chart 2: Presenter Demographics of CUP XXIV
    (
        axes[1],
        bar_labels_location,
        percentages_speaker_location,
        percentages_poster_location,
        percentages_total_location,
        "Presenter Demographics of CUP XXIV",
        False,
    ),
    # Chart 3: Career Stage of Presenters at CUP XXIV
    (
        axes[2],
        career_stage_labels,
        list(percentage_career_stage_speaker.values),
        list(percentage_career_stage_poster.values),
        list(percentage_career_stage_total.values),
        "Career Stage of Presenters at CUP XXIV",
        False,
    ),
]

for panel, labels, speaker_vals, poster_vals, total_vals, chart_title, show_legend in chart_specs:
    create_bar_chart(
        panel,
        labels,
        speaker_vals,
        poster_vals,
        total_vals,
        chart_title,
        include_legend_and_note=show_legend,
    )

# Tighten the layout, write the PNG, then release the figure
plt.tight_layout()
plt.savefig("CUPXXIV-BarCharts.png", dpi=300, bbox_inches='tight')
plt.close()  # Close the figure to free up memory

In [4]:
# What were the topics presented at CUP XXIV 2025? (2) word clouds

# Function to preprocess text for word clouds
def preprocess_text(text, stopwords):
    """Lower-case ``text``, split it into word tokens and drop stopwords.

    Missing values (as judged by ``pd.isna``) yield an empty string.
    Otherwise the surviving tokens are returned joined by single spaces.
    """
    if pd.isna(text):
        return ""

    kept_tokens = []
    for token in re.findall(r'\b\w+\b', text.lower()):
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Function to combine specific phrases in a frequency dictionary
def combine_phrases(freq_dict, phrases_to_combine):
    """Merge the words of selected phrases into single frequency entries.

    For each ``phrase -> combined_word`` pair, the phrase is considered
    present when its words appear as consecutive keys of ``freq_dict`` (in
    the dict's iteration order). When present, the individual word entries
    are removed and ``combined_word`` is stored with the sum of their
    frequencies.

    Matching is done on whole keys. The previous implementation searched for
    the phrase as a raw substring of the space-joined keys, so keys such as
    "research" followed by "engine" were mistaken for "search engine"; that
    removed the wrong entries and could store a wrong or zero frequency.

    Parameters
    ----------
    freq_dict : dict
        Word -> frequency mapping. It is modified in place.
    phrases_to_combine : dict
        Space-separated phrase -> key to store the merged frequency under.

    Returns
    -------
    dict
        The same ``freq_dict`` object, after merging.
    """
    for phrase, combined_word in phrases_to_combine.items():
        phrase_words = phrase.split()
        if not phrase_words:
            # Nothing to merge for an empty phrase.
            continue

        # Snapshot of the keys; refreshed per phrase because merges change them.
        keys = list(freq_dict)
        span = len(phrase_words)
        phrase_present = any(
            keys[start:start + span] == phrase_words
            for start in range(len(keys) - span + 1)
        )
        if phrase_present:
            # Right-hand side runs first, so the word entries are popped
            # before the combined key is (re)assigned.
            freq_dict[combined_word] = sum(freq_dict.pop(word, 0) for word in phrase_words)
    return freq_dict

# Function to generate a word cloud
def generate_word_cloud(frequencies, colormap):
    """Build a word cloud from a word -> frequency mapping.

    ``colormap`` is forwarded to ``WordCloud`` unchanged; all other options
    are fixed here. The result of ``generate_from_frequencies`` is returned.
    """
    cloud_options = dict(
        width=800,
        height=520,
        background_color='white',
        colormap=colormap,
        max_words=100,
        contour_width=2,
        contour_color='black',
        relative_scaling=0.5,
        min_font_size=10,
        max_font_size=300,
    )
    cloud = WordCloud(**cloud_options)
    return cloud.generate_from_frequencies(frequencies)

# Function to plot word clouds
def plot_word_clouds(topics_wc, titles_wc):
    """Render two word clouds stacked vertically and save them as a PNG.

    ``topics_wc`` is drawn in the top panel ("Presenter Topics") and
    ``titles_wc`` in the bottom panel ("Poster Titles"). Each panel has its
    axis hidden and a black rectangle drawn around it. The figure is written
    to "CUPXXIV-WordClouds.png" and then closed.
    """
    _fig, axes = plt.subplots(2, 1, figsize=(10, 14.5))

    # (panel, image, heading): top panel first, then bottom
    panels = (
        (axes[0], topics_wc, "Presenter Topics"),
        (axes[1], titles_wc, "Poster Titles"),
    )
    for panel, cloud_image, heading in panels:
        panel.imshow(cloud_image, interpolation='bilinear')
        panel.axis("off")
        panel.set_title(heading, fontsize=28, pad=10)
        # Unit rectangle in axes coordinates, i.e. a frame around the panel
        frame = plt.Rectangle((0, 0), 1, 1, fill=False, edgecolor='black', linewidth=2, transform=panel.transAxes)
        panel.add_patch(frame)

    plt.tight_layout()
    plt.savefig("CUPXXIV-WordClouds.png", dpi=300, bbox_inches='tight')
    plt.close()  # Close the figure to free up memory

# Words ignored when counting, and multi-word phrases to merge into one entry
stopwords = {
    "and", "or", "the", "of", "in", "to", "with", "on",
    "for", "a", "an", "as", "by", "at", "from",
}
phrases_to_combine = {
    "search engine": "search engine",
    "molecular binding": "molecular binding",
    "dna encoded libraries": "dna encoded libraries",
    "extreme scale": "extreme scale"
}

# Clean every "Presentation Topic(s)" / "Poster Title" cell, then join each column into one string
cleaned_topics = df["Presentation Topic(s)"].apply(preprocess_text, stopwords=stopwords)
cleaned_titles = df["Poster Title"].apply(preprocess_text, stopwords=stopwords)
topics = cleaned_topics.str.cat(sep=" ")
titles = cleaned_titles.str.cat(sep=" ")

# Word counts, keeping only words that occur more than once
topics_freq = {
    word: freq
    for word, freq in Counter(topics.split()).items()
    if freq > 1
}
titles_freq = {
    word: freq
    for word, freq in Counter(titles.split()).items()
    if freq > 1
}

# Merge the configured multi-word phrases in the "Presenter Topics" counts
topics_freq = combine_phrases(topics_freq, phrases_to_combine)

# Drop unwanted words from the "Poster Titles" counts
words_to_remove = {"using", "based", "fro", "exploring", "enhancing", "learning", "against", "efficient", "what", "flexible", "cov", "2"}
titles_freq = {
    word: freq
    for word, freq in titles_freq.items()
    if word not in words_to_remove
}

# Show "sars" as "sars-cov-2" (str.replace leaves words without "sars" untouched),
# then make sure the stray "cov" and "2" tokens are gone
titles_freq = {
    word.replace("sars", "sars-cov-2"): freq
    for word, freq in titles_freq.items()
}
titles_freq = {
    word: freq
    for word, freq in titles_freq.items()
    if word not in {"cov", "2"}
}

# Build both clouds and write them to disk
topics_wc = generate_word_cloud(topics_freq, 'viridis')
titles_wc = generate_word_cloud(titles_freq, 'plasma')
plot_word_clouds(topics_wc, titles_wc)