Group Names: Quinlan OConnell, Samay Mohapatra, Sebastian Martinez, Shyam Patel, Vinay Sangamalli, Justin Yang

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns


# ============================================
# TASK C: CREATE BINARY COLUMN
# ============================================

def create_binary_target(df):
    """
    Add a 0/1 ``binary`` column splitting campaigns at the median amount raised.

    A campaign gets 1 ("high $$") when its ``amount_raised`` is greater than
    or equal to the median, and 0 ("low $$") otherwise. The column is written
    onto the DataFrame that was passed in, and a short summary is printed.

    Returns the same DataFrame together with the median that was used.
    """
    rule = "=" * 60

    median_amount = df['amount_raised'].median()
    is_high = df['amount_raised'] >= median_amount
    df['binary'] = is_high.astype(int)

    # Gather everything that is reported before printing it.
    class_counts = df['binary'].value_counts().sort_index()
    n_high = (df['binary'] == 1).sum()
    n_low = (df['binary'] == 0).sum()

    print(rule)
    print("TASK C: Binary Target Variable Created")
    print(rule)
    print(f"Median amount raised: ${median_amount:,.2f}")
    print("\nClass distribution:")
    print(class_counts)
    print(f"\nHigh $$ (binary=1): {n_high} campaigns")
    print(f"Low $$ (binary=0): {n_low} campaigns")
    print(rule)

    return df, median_amount


# ============================================
# TASK D: LOGISTIC REGRESSION MODELS
# ============================================

def prepare_features(df, text_column, include_duration=True, max_features=100):
    """
    Build the model inputs: Bag-of-Words counts from one text column, plus
    (optionally) the campaign duration as an extra numeric column.

    Missing text is treated as an empty string; missing ``duration_days``
    values are replaced by that column's median. The vectorizer is fitted on
    every row of ``df``.

    Returns ``(X, y, vectorizer, feature_names)`` where ``X`` is a DataFrame
    indexed like ``df``, ``y`` holds the values of ``df['binary']``, and
    ``feature_names`` is a NumPy array of the vocabulary terms (followed by
    ``'duration_days'`` when duration is included).
    """
    # Empty strings stand in for missing text so the vectorizer never sees NaN.
    documents = df[text_column].fillna('')

    vectorizer = CountVectorizer(
        stop_words='english',       # drop common words such as 'the', 'and'
        strip_accents='unicode',
        lowercase=True,
        max_features=max_features,  # keep only the most frequent terms
    )
    counts = vectorizer.fit_transform(documents)
    columns = list(vectorizer.get_feature_names_out())

    X = pd.DataFrame(counts.toarray(), columns=columns, index=df.index)

    if include_duration:
        duration = df['duration_days']
        duration_median = duration.median()
        X['duration_days'] = duration.fillna(duration_median).values
        columns.append('duration_days')

        print(
            f"   ‚ÑπÔ∏è  Duration: filled {duration.isna().sum()} missing values with median ({duration_median})")

    # Last-resort guard: anything still missing becomes 0.
    if X.isna().any().any():
        print("   ‚ö†Ô∏è  WARNING: Still found NaN values, filling with 0")
        X = X.fillna(0)

    y = df['binary'].values

    return X, y, vectorizer, np.array(columns)

def train_and_evaluate_model(X, y, model_name):
    """
    Train a logistic regression on an 80/20 stratified split and report on it.

    Prints accuracy, the confusion matrix and a classification report for the
    held-out 20%, and saves a confusion-matrix heatmap as a PNG in the
    current working directory.

    Parameters
    ----------
    X : DataFrame of numeric features (must not contain NaN).
    y : array of 0/1 labels (must not contain NaN).
    model_name : human-readable label used in the printed report, the plot
        title and the PNG file name.

    Returns
    -------
    dict with keys ``model``, ``accuracy``, ``confusion_matrix``,
    ``predictions``, ``y_test``, ``X_train`` and ``X_test``.

    Raises
    ------
    ValueError
        If ``X`` or ``y`` contains NaN values.
    """
    # Raise explicitly rather than ``assert`` so the checks still run under
    # ``python -O``.
    if X.isna().any().any():
        raise ValueError("X contains NaN values!")
    if np.isnan(y).any():
        raise ValueError("y contains NaN values!")

    # Split data: 80% train, 20% test, keeping the class balance in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Train Logistic Regression
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # Alternative accuracy calculation as specified in assignment
    prediction_errors = np.sum(y_test != y_pred)
    total_cases = len(y_test)
    accuracy_alt = 1 - (prediction_errors / total_cases)

    print(f"\n{'=' * 60}")
    print(f"MODEL: {model_name}")
    print(f"{'=' * 60}")
    print(f"Training samples: {len(X_train)}")
    print(f"Testing samples: {len(X_test)}")
    print(f"Number of features: {X.shape[1]}")

    print(f"\nüìä ACCURACY: {accuracy:.4f} ({accuracy * 100:.2f}%)")
    print(f"   (Formula: 1 - {prediction_errors}/{total_cases} = {accuracy_alt:.4f})")

    print(f"\nüìã CONFUSION MATRIX:")
    print(f"                  Predicted")
    print(f"                  Low(0)  High(1)")
    print(f"Actual  Low(0)     {cm[0, 0]:3d}     {cm[0, 1]:3d}")
    print(f"        High(1)    {cm[1, 0]:3d}     {cm[1, 1]:3d}")

    print(f"\nüìà CLASSIFICATION REPORT:")
    print(classification_report(y_test, y_pred, target_names=['Low $$', 'High $$']))

    # Plot confusion matrix. The tick labels avoid '$' because matplotlib
    # would interpret a pair of dollar signs as math text.
    plt.figure(figsize=(8, 6))

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Low Money', 'High Money'],
                yticklabels=['Low Money', 'High Money'])

    plt.title(f'Confusion Matrix: {model_name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.tight_layout()

    # Save figure. Treat '+' as a separator and collapse every run of
    # whitespace into one underscore, e.g. "Image Labels + Duration" ->
    # "Image_Labels_Duration". The earlier chained replace() calls left a
    # double underscore because spaces were already gone by the time the
    # double-space replacement ran.
    slug = "_".join(model_name.replace("+", " ").split())
    filename = f'confusion_matrix_{slug}.png'
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()

    print(f"\n‚úì Confusion matrix saved as: {filename}")

    return {
        'model': model,
        'accuracy': accuracy,
        'confusion_matrix': cm,
        'predictions': y_pred,
        'y_test': y_test,
        'X_train': X_train,
        'X_test': X_test
    }


def get_top_predictive_words(model, feature_names, top_n=15):
    """
    Print the words most associated with high and with low fundraising.

    Only words with a positive coefficient are listed as predicting HIGH
    fundraising and only words with a negative coefficient as predicting LOW
    fundraising, so a word can never appear in both lists. The
    ``'duration_days'`` feature is excluded because it is not a word.

    Parameters
    ----------
    model : fitted linear model exposing ``coef_``; row 0 is used.
    feature_names : list or NumPy array of names aligned with ``coef_[0]``.
    top_n : maximum number of words to list per direction.

    Returns
    -------
    (top_high_words, top_low_words) : two lists of ``(word, coefficient)``
        pairs, strongest first. Both are empty when there are no text
        features.
    """
    coefs = np.asarray(model.coef_[0])
    # Accept plain lists as well as arrays: boolean-mask indexing needs an array.
    names = np.asarray(feature_names)

    # Filter out 'duration_days' if present
    keep = np.array([name != 'duration_days' for name in names], dtype=bool)
    word_coefs = coefs[keep]
    words = names[keep]

    if len(word_coefs) == 0:
        print("\n‚ö†Ô∏è  No text features to analyze (only duration)")
        return [], []

    ascending = np.argsort(word_coefs)

    # Strongest positive coefficients first; the sign filter keeps negative
    # words out of the HIGH list when fewer than top_n words are positive.
    high_idx = [i for i in ascending[::-1] if word_coefs[i] > 0][:top_n]
    # Most negative coefficients first, restricted to genuinely negative ones.
    low_idx = [i for i in ascending if word_coefs[i] < 0][:top_n]

    top_high_words = [(str(words[i]), float(word_coefs[i])) for i in high_idx]
    top_low_words = [(str(words[i]), float(word_coefs[i])) for i in low_idx]

    print(f"\nüîù TOP {len(top_high_words)} WORDS PREDICTING HIGH FUNDRAISING:")
    for word, coef in top_high_words:
        print(f"   {word:20s} ({coef:+.4f})")

    print(f"\nüîª TOP {len(top_low_words)} WORDS PREDICTING LOW FUNDRAISING:")
    for word, coef in top_low_words:
        print(f"   {word:20s} ({coef:+.4f})")

    return top_high_words, top_low_words


# ============================================
# MAIN EXECUTION
# ============================================

def main():
    """
    Run Tasks C and D from start to finish.

    Reads ``campaigns_with_labels.csv``, adds the binary target and saves the
    result, then trains three logistic-regression models (image labels,
    description, and both texts combined -- each with duration), prints a
    comparison plus canned conclusions, and writes ``model_comparison.csv``.
    """
    rule = "=" * 60

    def section(title):
        # Every console section opens with the same three-line banner.
        print("\n" + rule)
        print(title)
        print(rule)

    section("GOFUNDME PREDICTIVE ANALYSIS - TASKS C & D")

    # ---- Load and inspect the data ----
    print("\nüìÇ Loading data...")
    df = pd.read_csv('campaigns_with_labels.csv')
    print(f"‚úì Loaded {len(df)} campaigns")

    print("\nüîç Checking for missing values...")
    audited_columns = ['image_labels', 'description', 'amount_raised', 'duration_days']
    print(df[audited_columns].isna().sum())

    # ---- Task C: binary target ----
    df, _median_amount = create_binary_target(df)
    df.to_csv('campaigns_with_binary.csv', index=False)
    print("\n‚úì Saved to 'campaigns_with_binary.csv'")

    # ---- Task D: three models ----
    # (result key, text column, section title, model label)
    model_specs = [
        ('image_labels', 'image_labels',
         "MODEL 1: Using IMAGE_LABELS", "Image Labels + Duration"),
        ('description', 'description',
         "MODEL 2: Using DESCRIPTION TEXT", "Description + Duration"),
        ('combined', 'combined_text',
         "MODEL 3: Using COMBINED (Image Labels + Description)", "Combined + Duration"),
    ]

    results = {}
    feature_counts = {}
    for key, text_column, title, label in model_specs:
        section(title)
        if text_column == 'combined_text':
            # The combined column is only built once its model is reached.
            df['combined_text'] = df['image_labels'].fillna('') + ' ' + df['description'].fillna('')
        X, y, _vectorizer, feature_names = prepare_features(
            df, text_column, include_duration=True)
        results[key] = train_and_evaluate_model(X, y, label)
        get_top_predictive_words(results[key]['model'], feature_names)
        feature_counts[key] = X.shape[1]

    # ---- Comparison ----
    section("üìä FINAL COMPARISON")

    comparison_df = pd.DataFrame({
        'Model': [label for _, _, _, label in model_specs],
        'Accuracy': [results[key]['accuracy'] for key, _, _, _ in model_specs],
        'Features': [feature_counts[key] for key, _, _, _ in model_specs],
    })
    print(comparison_df.to_string(index=False))

    best_row = comparison_df['Accuracy'].idxmax()
    best_model_name = comparison_df.loc[best_row, 'Model']
    best_accuracy = comparison_df['Accuracy'].max()

    print(f"\nüèÜ BEST MODEL: {best_model_name}")
    print(f"   Accuracy: {best_accuracy:.4f} ({best_accuracy * 100:.2f}%)")

    # ---- Conclusions ----
    section("üí° CONCLUSIONS")

    acc_image = results['image_labels']['accuracy']
    acc_description = results['description']['accuracy']
    acc_combined = results['combined']['accuracy']
    best_single = max(acc_image, acc_description)

    print("\n1. FEATURE COMPARISON:")
    if acc_image > acc_description:
        print("   ‚Üí Image labels are MORE predictive than text descriptions")
        print("   ‚Üí Visual elements matter more for fundraising success")
    else:
        print("   ‚Üí Text descriptions are MORE predictive than image labels")
        print("   ‚Üí The story/narrative matters more than visuals")

    acc_diff = abs(acc_combined - best_single)

    print("\n2. COMBINATION EFFECT:")
    if acc_combined > best_single:
        print(f"   ‚Üí Combining features IMPROVES accuracy by {acc_diff:.4f}")
        print("   ‚Üí Both visual and textual elements contribute unique information")
    else:
        print("   ‚Üí Combining features does NOT significantly improve accuracy")
        print("   ‚Üí One feature type captures most of the predictive power")

    print("\n3. DURATION IMPACT:")
    print("   ‚Üí Duration is included as a feature in all models")
    print("   ‚Üí Longer campaigns may have more time to accumulate donations")

    print("\n4. OVERALL MODEL PERFORMANCE:")
    best_pct = f"{best_accuracy * 100:.1f}"
    if best_accuracy > 0.70:
        print(f"   ‚Üí {best_pct}% accuracy is GOOD for this prediction task")
        print("   ‚Üí The model can help predict successful campaigns")
    elif best_accuracy > 0.60:
        print(f"   ‚Üí {best_pct}% accuracy is MODERATE")
        print("   ‚Üí Better than random, but room for improvement")
    else:
        print(f"   ‚Üí {best_pct}% accuracy is LIMITED")
        print("   ‚Üí Other factors (not in data) may drive success")

    # ---- Persist the comparison and list the outputs ----
    comparison_df.to_csv('model_comparison.csv', index=False)
    print("\n‚úì Saved comparison to 'model_comparison.csv'")

    section("‚úÖ TASKS C & D COMPLETE!")
    print("\nGenerated files:")
    generated_files = (
        "campaigns_with_binary.csv",
        "confusion_matrix_Image_Labels_Duration.png",
        "confusion_matrix_Description_Duration.png",
        "confusion_matrix_Combined_Duration.png",
        "model_comparison.csv",
    )
    for generated in generated_files:
        print(f"  ‚Ä¢ {generated}")


# Only run the pipeline when this code is executed directly, not on import.
if __name__ == "__main__":
    main()


GOFUNDME PREDICTIVE ANALYSIS - TASKS C & D

üìÇ Loading data...
‚úì Loaded 1000 campaigns

üîç Checking for missing values...
image_labels      1
description       0
amount_raised     0
duration_days    10
dtype: int64
TASK C: Binary Target Variable Created
Median amount raised: $1,585.00

Class distribution:
binary
0    498
1    502
Name: count, dtype: int64

High $$ (binary=1): 502 campaigns
Low $$ (binary=0): 498 campaigns

‚úì Saved to 'campaigns_with_binary.csv'

MODEL 1: Using IMAGE_LABELS
   ‚ÑπÔ∏è  Duration: filled 10 missing values with median (9.0)

MODEL: Image Labels + Duration
Training samples: 800
Testing samples: 200
Number of features: 101

üìä ACCURACY: 0.8400 (84.00%)
   (Formula: 1 - 32/200 = 0.8400)

üìã CONFUSION MATRIX:
                  Predicted
                  Low(0)  High(1)
Actual  Low(0)      85      15
        High(1)     17      83

üìà CLASSIFICATION REPORT:
              precision    recall  f1-score   support

      Low $$       0.83      0.85  

Model 3: TOP 15 WORDS PREDICTING HIGH FUNDRAISING:
   read                 (+2.0915)
   fees                 (+1.8599)
   2025                 (+1.7273)
   sport                (+1.3740)
   football             (+1.2822)
   photo                (+1.2348)
   raised               (+1.1912)
   coaches              (+1.1830)
   just                 (+1.0647)
   make                 (+1.0423)
   soccer               (+0.9963)
   usa                  (+0.9377)
   competitive          (+0.9318)
   represent            (+0.9001)
   ve                   (+0.8989)

Model 3: TOP 15 WORDS PREDICTING LOW FUNDRAISING:
   hi                   (-1.9581)
   expenses             (-1.9387)
   family               (-1.8517)
   training             (-1.7211)
   thank                (-1.7066)
   game                 (-1.6419)
   donate               (-1.6227)
   players              (-1.6023)
   organized            (-1.3507)
   gofundme             (-1.3339)
   equipment            (-1.1276)
   happy                (-1.0753)
   goal                 (-1.0600)
   women                (-1.0034)
   teams                (-0.9358)

Model Comparison
- Image Labels + Duration ->    Accuracy: 0.840       Features: 101
- Description + Duration  ->   Accuracy: 0.975       Features: 101
- Combined + Duration ->    Accuracy: 0.985       Features: 101

Conclusions:

1. Feature Comparison:
   → Text descriptions are MORE predictive than image labels
   → The story/narrative matters more than visuals

2. Combination Effect:
   → Combining features IMPROVES accuracy by 0.0100
   → Both visual and textual elements contribute unique information

3. Duration Impact:
   → Duration is included as a feature in all models
   → Longer campaigns may have more time to accumulate donations

4. Overall Model Performance:
   → 98.5% accuracy is GOOD for this prediction task
   → The model can help predict successful campaigns

In [None]:
# ============================================
# TASK E: TOPIC MODELING (LDA)
# ============================================

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Step 1: load the campaigns file (with the binary column) saved earlier
df = pd.read_csv("campaigns_with_binary.csv")

# Step 2: one text field per campaign = image labels + description
# (missing values count as empty strings)
image_text = df["image_labels"].fillna('')
description_text = df["description"].fillna('')
df["combined_text"] = image_text + ' ' + description_text

# Step 3: Bag-of-Words counts over the combined text
vectorizer = CountVectorizer(
    stop_words='english',
    strip_accents='unicode',
    lowercase=True,
    max_features=1000,
)
X = vectorizer.fit_transform(df["combined_text"])

# Step 4: fit a 5-topic Latent Dirichlet Allocation model
# (fit() returns the estimator itself, so the call can be chained)
lda = LatentDirichletAllocation(
    n_components=5,
    learning_method='batch',
    max_iter=20,
    random_state=42,
).fit(X)

# Step 5: show the top words of each topic
def display_topics(model, feature_names, n_top_words=10):
    """Print, for every topic in ``model.components_``, its ``n_top_words``
    highest-weighted terms (heaviest first), looked up in ``feature_names``."""
    print("\n============================================================")
    print("TOP WORDS PER TOPIC")
    print("============================================================")
    for topic_number, weights in enumerate(model.components_, start=1):
        # Sort ascending, flip to descending, keep the leading entries.
        heaviest = weights.argsort()[::-1][:n_top_words]
        top_words = []
        for position in heaviest:
            top_words.append(feature_names[position])
        print(f"Topic {topic_number}: {', '.join(top_words)}")

vocabulary = vectorizer.get_feature_names_out()
display_topics(lda, vocabulary, n_top_words=10)

# Step 6: per-campaign topic weights, appended to df as Topic_1..Topic_k
topic_cols = [f"Topic_{number}" for number in range(1, lda.n_components + 1)]
topic_weights = lda.transform(X)
topic_df = pd.DataFrame(topic_weights, columns=topic_cols)
df = pd.concat([df, topic_df], axis=1)

# Step 7: compare the bottom (Q1) and top (Q4) fundraising quartiles
amounts = df["amount_raised"]
q1_cutoff = amounts.quantile(0.25)
q4_cutoff = amounts.quantile(0.75)

low_df = df[amounts <= q1_cutoff]
high_df = df[amounts >= q4_cutoff]

low_means = []
high_means = []
for col in topic_cols:
    low_means.append(low_df[col].mean())
    high_means.append(high_df[col].mean())

comparison = pd.DataFrame({
    "Topic": topic_cols,
    "Low_Q1_Avg": low_means,
    "High_Q4_Avg": high_means,
})

rule = "============================================================"

print("\n" + rule)
print("AVERAGE TOPIC WEIGHTS BY FUNDRAISING QUARTILE")
print(rule)
print(comparison.round(3).to_string(index=False))

# Step 8: write both tables to disk
df.to_csv("campaigns_with_topics.csv", index=False)
comparison.to_csv("topic_quartile_comparison.csv", index=False)

print("\n" + rule)
print("‚úÖ TASK E COMPLETE!")
print(rule)
print("Generated files:")
for generated in ("campaigns_with_topics.csv", "topic_quartile_comparison.csv"):
    print(f"  ‚Ä¢ {generated}")



TOP WORDS PER TOPIC
Topic 1: share, organizer, donate, fundraiser, sports, help, support, goal, donation, protected
Topic 2: sports, fundraiser, donate, team, share, organizer, support, football, goal, youth
Topic 3: organizer, team, sports, share, fundraiser, soccer, donate, support, donations, goal
Topic 4: fundraiser, sports, team, organizer, donate, support, share, help, goal, gofundme
Topic 5: team, fundraiser, sports, donate, share, organizer, support, goal, help, donation

AVERAGE TOPIC WEIGHTS BY FUNDRAISING QUARTILE
  Topic  Low_Q1_Avg  High_Q4_Avg
Topic_1       0.134        0.170
Topic_2       0.221        0.114
Topic_3       0.079        0.225
Topic_4       0.141        0.250
Topic_5       0.425        0.241

‚úÖ TASK E COMPLETE!
Generated files:
  ‚Ä¢ campaigns_with_topics.csv
  ‚Ä¢ topic_quartile_comparison.csv


In [None]:
# ============================================
# TASK E: TOPIC MODELING (LDA) ‚Äî 3 Topics Version
# ============================================

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Step 1: load the campaigns file (with the binary column) saved earlier
df = pd.read_csv("campaigns_with_binary.csv")

# Step 2: one text field per campaign = image labels + description
# (missing values count as empty strings)
image_text = df["image_labels"].fillna('')
description_text = df["description"].fillna('')
df["combined_text"] = image_text + ' ' + description_text

# Step 3: Bag-of-Words counts over the combined text
vectorizer = CountVectorizer(
    stop_words='english',
    strip_accents='unicode',
    lowercase=True,
    max_features=1500,     # slightly larger vocabulary for more variety
)
X = vectorizer.fit_transform(df["combined_text"])

# Step 4: fit a 3-topic Latent Dirichlet Allocation model
# (fit() returns the estimator itself, so the call can be chained)
lda = LatentDirichletAllocation(
    n_components=3,        # 3 instead of 5, aiming for broader themes
    learning_method='batch',
    max_iter=30,           # more passes than the 5-topic run
    random_state=42,
).fit(X)

# Step 5: show the top words of each topic
def display_topics(model, feature_names, n_top_words=10):
    """Print, for every topic in ``model.components_``, its ``n_top_words``
    highest-weighted terms (heaviest first), looked up in ``feature_names``."""
    print("\n============================================================")
    print("TOP WORDS PER TOPIC (3-Topic Model)")
    print("============================================================")
    for topic_number, weights in enumerate(model.components_, start=1):
        # Sort ascending, flip to descending, keep the leading entries.
        heaviest = weights.argsort()[::-1][:n_top_words]
        top_words = []
        for position in heaviest:
            top_words.append(feature_names[position])
        print(f"Topic {topic_number}: {', '.join(top_words)}")

vocabulary = vectorizer.get_feature_names_out()
display_topics(lda, vocabulary, n_top_words=10)

# Step 6: per-campaign topic weights, appended to df as Topic_1..Topic_k
topic_cols = [f"Topic_{number}" for number in range(1, lda.n_components + 1)]
topic_weights = lda.transform(X)
topic_df = pd.DataFrame(topic_weights, columns=topic_cols)
df = pd.concat([df, topic_df], axis=1)

# Step 7: compare the bottom (Q1) and top (Q4) fundraising quartiles
amounts = df["amount_raised"]
q1_cutoff = amounts.quantile(0.25)
q4_cutoff = amounts.quantile(0.75)

low_df = df[amounts <= q1_cutoff]
high_df = df[amounts >= q4_cutoff]

low_means = []
high_means = []
for col in topic_cols:
    low_means.append(low_df[col].mean())
    high_means.append(high_df[col].mean())

comparison = pd.DataFrame({
    "Topic": topic_cols,
    "Low_Q1_Avg": low_means,
    "High_Q4_Avg": high_means,
})

rule = "============================================================"

print("\n" + rule)
print("AVERAGE TOPIC WEIGHTS BY FUNDRAISING QUARTILE")
print(rule)
print(comparison.round(3).to_string(index=False))

# Step 8: write both tables to disk
df.to_csv("campaigns_with_topics_3topics.csv", index=False)
comparison.to_csv("topic_quartile_comparison_3topics.csv", index=False)

print("\n" + rule)
print("‚úÖ TASK E (3-Topic Model) COMPLETE!")
print(rule)
print("Generated files:")
for generated in ("campaigns_with_topics_3topics.csv",
                  "topic_quartile_comparison_3topics.csv"):
    print(f"  ‚Ä¢ {generated}")



TOP WORDS PER TOPIC (3-Topic Model)
Topic 1: team, sports, fundraiser, share, donate, organizer, support, goal, help, donations
Topic 2: fundraiser, donate, share, organizer, support, sports, team, help, goal, donation
Topic 3: sports, fundraiser, donate, team, organizer, share, support, goal, help, donations

AVERAGE TOPIC WEIGHTS BY FUNDRAISING QUARTILE
  Topic  Low_Q1_Avg  High_Q4_Avg
Topic_1       0.359        0.238
Topic_2       0.353        0.402
Topic_3       0.288        0.360

‚úÖ TASK E (3-Topic Model) COMPLETE!
Generated files:
  ‚Ä¢ campaigns_with_topics_3topics.csv
  ‚Ä¢ topic_quartile_comparison_3topics.csv


We performed Latent Dirichlet Allocation (LDA) on the combined text of image labels and descriptions to identify common themes among GoFundMe campaigns. After testing both a 3-topic and a 5-topic model, the 5-topic configuration was retained because it produced more interpretable and differentiated clusters of words, even though overlap remained due to the repetitive nature of fundraising language (for example, frequent use of “donate,” “help,” and “support”). The 3-topic version collapsed several subtle patterns into broader categories, while the 5-topic model revealed clearer distinctions such as youth-oriented sports campaigns versus general community fundraisers.

The five topics were interpreted and labeled as follows:
Topic 1 – **Community Outreach and Sharing**
Topic 2 – **Youth Sports Fundraisers**
Topic 3 – **Club or Soccer Campaigns**
Topic 4 – **General Fundraising Appeals**
Topic 5 – **Team Spirit and Collective Support**

When comparing topic weights between high- and low-fundraising quartiles, higher-performing campaigns showed greater emphasis on Topics 3 and 4, suggesting that broader and sport-specific appeals attract more donations. In contrast, lower-fundraising campaigns leaned toward Topics 2 and 5, which focus more narrowly on smaller youth or team-based efforts. Overall, the 5-topic model offered clearer interpretability and stronger insights into how different campaign narratives relate to fundraising success.


### Part F

See attached .docx file
