# IWRC Comprehensive Project Type Breakdown Analysis

## Purpose
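This analysis generates 16 static chart files (13 Total-vs-Seed comparison charts, a few of them saved under two file names as requested, plus 3 award-type charts) comparing **Total Projects** vs. **Seed Funding (104B)**. Most charts cover the 10-year period (2015-2024); the investment comparison also shows the 5-year period (2020-2024).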

**Output Directory:** `FINAL_DELIVERABLES_2_backup_20251125_194954 copy 2/visualizations/static_breakdown/`

---

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import warnings
warnings.filterwarnings('ignore')

# Configure pandas display options
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:,.2f}'.format)

# Configure visualization settings
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['figure.dpi'] = 300
plt.rcParams['font.size'] = 11

# IWRC Brand Colors
COLORS = {
    'teal': '#258372',
    'olive': '#639757',
    'dark_gray': '#333333',
    'light_gray': '#f0f0f0',
    'blue': '#1f77b4',
    'orange': '#ff7f0e'
}

# Create output directory
# The default is the original machine-specific location. Set the
# IWRC_OUTPUT_DIR environment variable to write the charts somewhere else
# without editing this cell; when it is unset the behaviour is unchanged.
output_dir = os.environ.get(
    'IWRC_OUTPUT_DIR',
    '/Users/shivpat/seed-fund-tracking/FINAL_DELIVERABLES_2_backup_20251125_194954 copy 2/visualizations/static_breakdown',
)
os.makedirs(output_dir, exist_ok=True)

print(f'✓ Libraries imported and output directory created at: {output_dir}')

✓ Libraries imported and output directory created at: /Users/shivpat/seed-fund-tracking/FINAL_DELIVERABLES_2_backup_20251125_194954 copy 2/visualizations/static_breakdown


In [2]:
# Read the 'Project Overview' sheet of the consolidated tracking workbook
file_path = '../../data/consolidated/IWRC Seed Fund Tracking.xlsx'
df = pd.read_excel(file_path, sheet_name='Project Overview')

# Spreadsheet header -> short column name used by the rest of the notebook.
# Keys are matched verbatim by DataFrame.rename, so whitespace matters
# (note the trailing space in 'Project ID ').
col_map = {
    'Project ID ': 'project_id',
    'Award Type': 'award_type',
    'Project Title': 'project_title',
    'Project PI': 'pi_name',
    'Academic Institution of PI': 'institution',
    'Award Amount Allocated ($) this must be filled in for all lines': 'award_amount',
    'Number of PhD Students Supported by WRRA $': 'phd_students',
    'Number of MS Students Supported by WRRA $': 'ms_students',
    'Number of Undergraduate Students Supported by WRRA $': 'undergrad_students',
    'Number of Post Docs Supported by WRRA $': 'postdoc_students',
    "Award, Achievement, or Grant\n (This may include awards and achievements for projects from the previous year to this 5-year cycle, so long as they were not already included in last year's report)": 'awards_grants',
    'Monetary Benefit of Award or Achievement (if applicable; use NA if not applicable)': 'monetary_benefit',
    "Description of Award, Achievement, or Grant\n (This may include awards and achievements for projects from the previous year to this 5-year cycle, so long as they were not already included in last year's report)": 'award_description',
    'WRRI Science Priority that Best Aligns with this Project': 'science_priority',
    'Keyword (Primary)': 'keyword_primary',
}

df_work = df.rename(columns=col_map)

# Student head-count columns: anything that is not a number becomes 0
student_cols = ['phd_students', 'ms_students', 'undergrad_students', 'postdoc_students']
df_work[student_cols] = (
    df_work[student_cols]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# Year Extraction Logic
def extract_year(project_id):
    """Return the year encoded in a project identifier, or None.

    A four-digit year starting with 19 or 20 anywhere in the identifier takes
    precedence. Otherwise a two-digit fiscal-year tag such as ``FY21``
    (case-insensitive) is read as 2000 plus those two digits.

    Args:
        project_id: Raw identifier value; anything ``pd.isna`` treats as
            missing yields None. Other values are converted with ``str``.

    Returns:
        The year as an int, or None when no year can be found.
    """
    if pd.isna(project_id):
        return None

    text = str(project_id).strip()

    four_digit = re.search(r'(20\d{2}|19\d{2})', text)
    if four_digit:
        return int(four_digit.group(1))

    fiscal = re.search(r'FY(\d{2})', text, re.IGNORECASE)
    if fiscal:
        # Exactly two digits are captured, so the value is always below 100
        # and the century offset applies unconditionally.
        return 2000 + int(fiscal.group(1))

    return None

# Attach the year parsed from each project ID
df_work['project_year'] = df_work['project_id'].apply(extract_year)

# Define Tracks: the seed track is every row whose award type is exactly
# 'Base Grant (104b)'; a copy keeps later edits from touching df_work.
df_seed_only = df_work.loc[df_work['award_type'].eq('Base Grant (104b)')].copy()

print('✓ Data Loaded & Processed')
print(f'  Total Projects: {df_work.shape[0]}')
print(f'  Seed Funding (104B): {df_seed_only.shape[0]}')

✓ Data Loaded & Processed
  Total Projects: 354
  Seed Funding (104B): 142


In [3]:
# --- 1. INVESTMENT & GENERAL METRICS ---

def plot_investment_comparison(filename, title):
    """Save a grouped bar chart of award dollars: all projects vs. 104B seed.

    Two groups are drawn, one for 2015-2024 and one for 2020-2024, using the
    notebook-level ``df_work`` and ``df_seed_only`` frames.

    Args:
        filename: File name (inside ``output_dir``) the chart is written to.
        title: Title shown above the chart.
    """
    periods = [('10-Year (2015-2024)', 2015, 2024), ('5-Year (2020-2024)', 2020, 2024)]

    def invested(frame, first_year, last_year):
        # Sum of award_amount over rows whose project_year is in the window
        in_window = frame['project_year'].between(first_year, last_year)
        return frame.loc[in_window, 'award_amount'].sum()

    all_amounts = [invested(df_work, start, end) for _, start, end in periods]
    seed_amounts = [invested(df_seed_only, start, end) for _, start, end in periods]

    period_names = [name for name, _, _ in periods]
    positions = np.arange(len(period_names))
    bar_width = 0.35

    fig, ax = plt.subplots(figsize=(10, 6))
    all_bars = ax.bar(positions - bar_width/2, all_amounts, bar_width,
                      label='Total Projects', color=COLORS['teal'])
    seed_bars = ax.bar(positions + bar_width/2, seed_amounts, bar_width,
                       label='Seed Funding (104B)', color=COLORS['olive'])

    ax.set_ylabel('Investment Amount ($)', fontweight='bold')
    ax.set_title(title, fontweight='bold', pad=20)
    ax.set_xticks(positions)
    ax.set_xticklabels(period_names)
    ax.legend()

    # Dollar value printed just above every bar
    for bar in (*all_bars, *seed_bars):
        amount = bar.get_height()
        centre = bar.get_x() + bar.get_width()/2
        ax.annotate(f'${amount:,.0f}', xy=(centre, amount),
                    xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=9)

    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    plt.tight_layout()
    plt.savefig(f'{output_dir}/{filename}', dpi=300, bbox_inches='tight')
    plt.close()
    print(f'✓ Saved {filename}')

# Generate both investment charts (likely similar content requested)
plot_investment_comparison('investment_comparison.png', 'IWRC Investment Comparison: Total vs. Seed')
plot_investment_comparison('iwrc_investment_comparison.png', 'IWRC Investment Comparison (Detailed)')

# Projects by Year
def plot_projects_by_year():
    """Save a grouped bar chart of project counts per year for 2015-2024.

    Counts come from the notebook-level ``df_work`` (all projects) and
    ``df_seed_only`` (104B) frames. Every year from 2015 through 2024 gets a
    slot on the x-axis, so a year without projects shows as an empty position
    instead of being dropped from the axis.
    """
    first_year, last_year = 2015, 2024
    years = list(range(first_year, last_year + 1))

    def counts_per_year(frame):
        in_window = frame['project_year'].between(first_year, last_year)
        # between() is False for missing years, so the int cast cannot hit NaN;
        # it gives the counts an integer index that lines up with `years`.
        per_year = frame.loc[in_window, 'project_year'].astype(int).value_counts()
        return per_year.reindex(years, fill_value=0)

    t_vals = counts_per_year(df_work)
    s_vals = counts_per_year(df_seed_only)

    x = np.arange(len(years))
    width = 0.35

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(x - width/2, t_vals, width, label='Total Projects', color=COLORS['teal'])
    ax.bar(x + width/2, s_vals, width, label='Seed Funding (104B)', color=COLORS['olive'])

    ax.set_ylabel('Number of Projects', fontweight='bold')
    ax.set_title('Number of Projects by Year (2015-2024)', fontweight='bold', pad=20)
    ax.set_xticks(x)
    ax.set_xticklabels(years)
    ax.legend()

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.tight_layout()
    plt.savefig(f'{output_dir}/projects_by_year.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print('✓ Saved projects_by_year.png')

plot_projects_by_year()

✓ Saved investment_comparison.png


✓ Saved iwrc_investment_comparison.png


✓ Saved projects_by_year.png


In [4]:
# --- 2. STUDENT ANALYSIS ---

def plot_students_trained():
    """Save a grouped bar chart of students supported, by level, for 2015-2024.

    Sums the ``student_cols`` columns of the notebook-level ``df_work`` and
    ``df_seed_only`` frames and writes the same figure under two file names.
    """
    def headcounts(frame):
        # Column-wise totals over rows whose project_year is in 2015-2024
        in_window = frame['project_year'].between(2015, 2024)
        return frame.loc[in_window, student_cols].sum()

    all_students = headcounts(df_work)
    seed_students = headcounts(df_seed_only)

    levels = ['PhD', 'MS', 'Undergrad', 'Post-Doc']
    positions = np.arange(len(levels))
    bar_width = 0.35

    fig, ax = plt.subplots(figsize=(10, 6))
    all_bars = ax.bar(positions - bar_width/2, all_students, bar_width,
                      label='Total Projects', color=COLORS['teal'])
    seed_bars = ax.bar(positions + bar_width/2, seed_students, bar_width,
                       label='Seed Funding (104B)', color=COLORS['olive'])

    ax.set_ylabel('Number of Students', fontweight='bold')
    ax.set_title('Students Trained by Type (2015-2024)', fontweight='bold', pad=20)
    ax.set_xticks(positions)
    ax.set_xticklabels(levels)
    ax.legend()

    # Whole-number label above each bar; zero-height bars stay unlabelled
    for bar in (*all_bars, *seed_bars):
        count = bar.get_height()
        if not count > 0:
            continue
        centre = bar.get_x() + bar.get_width()/2
        ax.annotate(f'{int(count)}', xy=(centre, count),
                    xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=9)

    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    plt.tight_layout()
    # The second file is an intentional duplicate of the first
    for name in ('students_trained.png', 'students_trained_breakdown.png'):
        plt.savefig(f'{output_dir}/{name}', dpi=300, bbox_inches='tight')
    plt.close()
    print('✓ Saved students_trained.png & students_trained_breakdown.png')

plot_students_trained()

def plot_student_distribution_pie():
    """Save side-by-side pie charts of the student mix for 2015-2024.

    The left pie uses the notebook-level ``df_work`` frame and the right pie
    uses ``df_seed_only``; each title carries the summed head-count as N.
    """
    def headcounts(frame):
        # Column-wise totals over rows whose project_year is in 2015-2024
        in_window = frame['project_year'].between(2015, 2024)
        return frame.loc[in_window, student_cols].sum()

    slice_names = ['PhD', 'MS', 'Undergrad', 'Post-Doc']
    palette = [COLORS['teal'], COLORS['olive'], '#ffbf00', '#a6a6a6']  # Custom palette

    panels = [
        ('Total Projects', headcounts(df_work)),
        ('Seed Funding (104B)', headcounts(df_seed_only)),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(14, 7))
    for ax, (heading, totals) in zip(axes, panels):
        ax.pie(totals, labels=slice_names, autopct='%1.1f%%', startangle=90, colors=palette)
        ax.set_title(f'{heading}\n(N={int(totals.sum())})', fontweight='bold')

    plt.suptitle('Student Distribution Comparison (2015-2024)', fontweight='bold', fontsize=14)
    plt.tight_layout()
    plt.savefig(f'{output_dir}/student_distribution_pie.png', dpi=300, bbox_inches='tight')
    plt.close()
    print('✓ Saved student_distribution_pie.png')

✓ Saved students_trained.png & students_trained_breakdown.png


In [5]:
plot_student_distribution_pie()

✓ Saved student_distribution_pie.png


In [6]:
# --- 3. INSTITUTIONAL ANALYSIS ---

def plot_institutional_reach():
    """Save a two-bar chart of distinct institutions funded in 2015-2024.

    One bar counts unique ``institution`` values in the notebook-level
    ``df_work`` frame, the other in ``df_seed_only``.
    """
    def distinct_institutions(frame):
        in_window = frame['project_year'].between(2015, 2024)
        return frame.loc[in_window, 'institution'].nunique()

    track_names = ['Total Projects', 'Seed Funding (104B)']
    reach = [distinct_institutions(df_work), distinct_institutions(df_seed_only)]

    fig, ax = plt.subplots(figsize=(8, 6))
    bars = ax.bar(track_names, reach,
                  color=[COLORS['teal'], COLORS['olive']], width=0.5)

    ax.set_ylabel('Number of Unique Institutions', fontweight='bold')
    ax.set_title('Institutional Reach (2015-2024)', fontweight='bold', pad=20)

    # Count printed in bold above each bar
    for bar in bars:
        h = bar.get_height()
        centre = bar.get_x() + bar.get_width()/2
        ax.annotate(f'{h}', xy=(centre, h),
                    xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=12, fontweight='bold')

    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    plt.tight_layout()
    plt.savefig(f'{output_dir}/institutional_reach.png', dpi=300, bbox_inches='tight')
    plt.close()
    print('✓ Saved institutional_reach.png')

plot_institutional_reach()

def plot_university_funding_comparison():
    """Save a horizontal grouped bar chart for the ten best-funded institutions.

    Institutions are ranked by summed ``award_amount`` over 2015-2024 in the
    notebook-level ``df_work`` frame; each gets a second bar with its 104B
    total from ``df_seed_only`` (0 when it has no seed awards in the window).
    """
    window_all = df_work[df_work['project_year'].between(2015, 2024)]
    window_seed = df_seed_only[df_seed_only['project_year'].between(2015, 2024)]

    # Ten largest per-institution totals, already in descending order
    top_totals = (
        window_all.groupby('institution')['award_amount']
        .sum()
        .sort_values(ascending=False)
        .head(10)
    )
    top_inst = top_totals.index

    # Seed totals aligned to the same institutions; missing ones become 0
    seed_totals = (
        window_seed.groupby('institution')['award_amount']
        .sum()
        .reindex(top_inst)
        .fillna(0)
    )

    rows = np.arange(len(top_inst))
    bar_height = 0.35

    # Truncate long names so the axis labels stay readable
    display_names = []
    for inst in top_inst:
        display_names.append(inst[:30] + '...' if len(inst) > 30 else inst)

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.barh(rows - bar_height/2, top_totals, bar_height, label='Total Projects', color=COLORS['teal'])
    ax.barh(rows + bar_height/2, seed_totals, bar_height, label='Seed Funding (104B)', color=COLORS['olive'])

    ax.set_yticks(rows)
    ax.set_yticklabels(display_names)
    ax.set_xlabel('Total Funding ($)', fontweight='bold')
    ax.set_title('Top 10 Institutions by Funding (2015-2024)', fontweight='bold', pad=20)
    ax.legend()
    ax.invert_yaxis()  # largest total at the top

    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    plt.tight_layout()
    plt.savefig(f'{output_dir}/university_funding_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()
    print('✓ Saved university_funding_comparison.png')

✓ Saved institutional_reach.png


In [7]:
plot_university_funding_comparison()

✓ Saved university_funding_comparison.png


In [8]:
# --- 4. TOPIC AREA ANALYSIS ---

def plot_topic_areas():
    """Save three charts comparing topic counts: all projects vs. 104B seed.

    Topics are the eight most frequent ``science_priority`` values among
    2015-2024 rows of the notebook-level ``df_work`` frame. Seed counts come
    from ``df_seed_only`` for the same window. Files written to ``output_dir``:

    * ``topic_areas_stacked.png`` - seed count at the bottom with the
      remaining (non-seed) projects stacked on top, so each bar's full height
      is the total.
    * ``topic_areas_pyramid_stacked.png`` and its duplicate
      ``topic_areas_pyramid_stacked_preagg.png`` - totals to the left of zero,
      seed counts to the right.
    * ``topic_areas_overlapping.png`` - grouped bars.
    """
    # Local import: only the pyramid chart needs a custom tick formatter
    from matplotlib.ticker import FuncFormatter

    # Use 'science_priority' as topic
    df_10 = df_work[df_work['project_year'].between(2015, 2024)]
    df_s_10 = df_seed_only[df_seed_only['project_year'].between(2015, 2024)]

    # Get top topics
    top_topics = df_10['science_priority'].value_counts().head(8).index

    t_counts = df_10['science_priority'].value_counts().reindex(top_topics)
    s_counts = df_s_10['science_priority'].value_counts().reindex(top_topics, fill_value=0)

    # 1. Stacked Bar
    # df_seed_only is a subset of df_work, so total minus seed is the count of
    # non-seed projects; stacking it on the seed bar keeps the full height
    # equal to the total while making the 'Other Projects' label accurate.
    other_counts = t_counts - s_counts
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(top_topics, other_counts, bottom=s_counts, label='Other Projects',
           color=COLORS['teal'], alpha=0.7)
    ax.bar(top_topics, s_counts, label='Seed Funding (104B)', color=COLORS['olive'])
    plt.xticks(rotation=45, ha='right')
    plt.title('Topic Areas: Total vs Seed (Stacked)', fontweight='bold')
    plt.legend()
    plt.tight_layout()
    plt.savefig(f'{output_dir}/topic_areas_stacked.png', dpi=300)
    plt.close(fig)

    # 2. Pyramid (Side-by-Side Horizontal)
    fig, ax = plt.subplots(figsize=(12, 8))
    y = np.arange(len(top_topics))
    ax.barh(y, -t_counts, color=COLORS['teal'], label='Total Projects')
    ax.barh(y, s_counts, color=COLORS['olive'], label='Seed Funding')
    ax.set_yticks(y)
    ax.set_yticklabels(top_topics)
    ax.axvline(0, color='black', linewidth=0.8)
    # Totals are plotted as negatives only to place them left of zero; show
    # magnitudes on the axis so it does not read as negative project counts.
    ax.xaxis.set_major_formatter(FuncFormatter(lambda value, _pos: f'{abs(value):g}'))
    plt.title('Topic Areas Pyramid (2015-2024)', fontweight='bold')
    plt.legend()
    plt.tight_layout()
    plt.savefig(f'{output_dir}/topic_areas_pyramid_stacked.png', dpi=300)
    plt.savefig(f'{output_dir}/topic_areas_pyramid_stacked_preagg.png', dpi=300)  # Duplicate as requested
    plt.close(fig)

    # 3. Overlapping (Grouped Bar)
    fig, ax = plt.subplots(figsize=(12, 6))
    x = np.arange(len(top_topics))
    width = 0.35
    ax.bar(x - width/2, t_counts, width, label='Total Projects', color=COLORS['teal'])
    ax.bar(x + width/2, s_counts, width, label='Seed Funding', color=COLORS['olive'])
    ax.set_xticks(x)
    ax.set_xticklabels(top_topics, rotation=45, ha='right')
    plt.title('Topic Areas Overlapping Comparison', fontweight='bold')
    plt.legend()
    plt.tight_layout()
    plt.savefig(f'{output_dir}/topic_areas_overlapping.png', dpi=300)
    plt.close(fig)

    print('✓ Saved topic_areas_stacked.png, topic_areas_pyramid_stacked.png, topic_areas_overlapping.png')

plot_topic_areas()

✓ Saved topic_areas_stacked.png, topic_areas_pyramid_stacked.png, topic_areas_overlapping.png


In [9]:
# --- 5. AWARD BREAKDOWN ---

def plot_award_breakdown():
    """Save a grouped bar chart of follow-on outcomes by category, 2015-2024.

    Each ``awards_grants`` value is bucketed by the first keyword it contains
    (grant, then award, then achievement; anything else is 'Other', missing
    values are ignored). Only Grant, Award and Achievement are plotted, for
    the notebook-level ``df_work`` and ``df_seed_only`` frames.
    """
    keyword_to_category = (
        ('grant', 'Grant'),
        ('award', 'Award'),
        ('achievement', 'Achievement'),
    )

    def categorize_award(text):
        if pd.isna(text):
            return None
        lowered = str(text).lower()
        # Order matters: the first keyword found decides the category
        for keyword, category in keyword_to_category:
            if keyword in lowered:
                return category
        return 'Other'

    def category_counts(frame):
        in_window = frame['project_year'].between(2015, 2024)
        categories = frame.loc[in_window, 'awards_grants'].apply(categorize_award)
        return categories.value_counts()

    all_counts = category_counts(df_work)
    seed_counts = category_counts(df_seed_only)

    cats = ['Grant', 'Award', 'Achievement']
    all_heights = [all_counts.get(cat, 0) for cat in cats]
    seed_heights = [seed_counts.get(cat, 0) for cat in cats]

    positions = np.arange(len(cats))
    bar_width = 0.35

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(positions - bar_width/2, all_heights, bar_width, label='Total Projects', color=COLORS['teal'])
    ax.bar(positions + bar_width/2, seed_heights, bar_width, label='Seed Funding (104B)', color=COLORS['olive'])

    ax.set_ylabel('Count', fontweight='bold')
    ax.set_title('Follow-on Awards by Type (2015-2024)', fontweight='bold', pad=20)
    ax.set_xticks(positions)
    ax.set_xticklabels(cats)
    ax.legend()

    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    plt.tight_layout()
    plt.savefig(f'{output_dir}/award_breakdown.png', dpi=300, bbox_inches='tight')
    plt.close()
    print('✓ Saved award_breakdown.png')

plot_award_breakdown()

✓ Saved award_breakdown.png


In [10]:
# --- 6. SPECIFIC AWARD TYPE VISUALIZATIONS (REQUESTED) ---

def _save_award_type_bar_chart(values, filename, *, color, ylabel, title,
                               format_label, label_fontsize=None):
    """Draw one labelled bar per entry of *values* and save it to ``output_dir``.

    Args:
        values: Series indexed by award type; its values are the bar heights.
        filename: File name (inside ``output_dir``) the chart is written to.
        color: Bar colour.
        ylabel: Y-axis label.
        title: Chart title.
        format_label: Callable turning a bar height into its annotation text.
        label_fontsize: Annotation font size; None keeps matplotlib's default.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    bars = ax.bar(values.index, values.values, color=color)

    ax.set_ylabel(ylabel, fontweight='bold')
    ax.set_title(title, fontweight='bold', pad=20)
    plt.xticks(rotation=45, ha='right')

    # Only pass fontsize when one was requested so the default stays untouched
    text_kwargs = {} if label_fontsize is None else {'fontsize': label_fontsize}
    for bar in bars:
        h = bar.get_height()
        ax.annotate(format_label(h), xy=(bar.get_x() + bar.get_width()/2, h),
                    xytext=(0, 3), textcoords="offset points", ha='center', va='bottom',
                    **text_kwargs)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.tight_layout()
    plt.savefig(f'{output_dir}/{filename}', dpi=300, bbox_inches='tight')
    plt.close(fig)


def plot_specific_award_types():
    """Save three per-award-type bar charts for 2015-2024 projects.

    Uses the notebook-level ``df_work`` frame and writes, to ``output_dir``:
    project counts (``award_type_overview.png``), summed ``award_amount``
    (``award_type_investment_comparison.png``) and mean ``award_amount``
    (``award_type_avg_per_project.png``), each ordered from largest to
    smallest.
    """
    # Filter for 2015-2024
    df_10 = df_work[df_work['project_year'].between(2015, 2024)].copy()
    amount_by_type = df_10.groupby('award_type')['award_amount']

    def as_count(h):
        return f'{h}'

    def as_dollars(h):
        return f'${h:,.0f}'

    # 1. Award Type Overview (Project Counts)
    _save_award_type_bar_chart(
        df_10['award_type'].value_counts(),
        'award_type_overview.png',
        color=COLORS['teal'],
        ylabel='Number of Projects',
        title='Project Count by Award Type (2015-2024)',
        format_label=as_count,
    )

    # 2. Investment Comparison (Total Funding by Type)
    _save_award_type_bar_chart(
        amount_by_type.sum().sort_values(ascending=False),
        'award_type_investment_comparison.png',
        color=COLORS['olive'],
        ylabel='Total Investment ($)',
        title='Total Investment by Award Type (2015-2024)',
        format_label=as_dollars,
        label_fontsize=9,
    )

    # 3. Average per Project
    _save_award_type_bar_chart(
        amount_by_type.mean().sort_values(ascending=False),
        'award_type_avg_per_project.png',
        color=COLORS['blue'],
        ylabel='Average Investment ($)',
        title='Average Investment per Project by Award Type (2015-2024)',
        format_label=as_dollars,
    )

    print('✓ Saved award_type_overview.png, award_type_investment_comparison.png, award_type_avg_per_project.png')

plot_specific_award_types()

✓ Saved award_type_overview.png, award_type_investment_comparison.png, award_type_avg_per_project.png
