In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from convokit import Corpus, download

gap_corpus = Corpus(filename=download("gap-corpus"))
print("Download complete!")

# Print summary statistics to verify the dataset
gap_corpus.print_summary_stats()

# Function to extract leadership scores from the corpus
def extract_leadership_data(corpus):
    """
    Extract leadership scores for all speakers in the corpus,
    grouped by conversation.
    """
    data = []
    
    # Get all conversations
    for conv_id, conversation in corpus.conversations.items():
        # Get conversation metadata
        conv_metadata = conversation.meta
        group_number = conv_metadata.get('Group Number', 'N/A')
        
        # Get speakers in this conversation
        speakers_in_conv = set()
        for utterance in conversation.iter_utterances():
            speakers_in_conv.add(utterance.speaker.id)
        
        # For each speaker in this conversation, get their leadership score
        for speaker_id in speakers_in_conv:
            speaker = corpus.get_speaker(speaker_id)
            leadership_score = speaker.meta.get('Ind_Lead', 'N/A')
            
            # Add to our data list
            data.append({
                'Conversation ID': conv_id,
                'Group Number': group_number,
                'Speaker ID': speaker_id,
                'Leadership Score': leadership_score
            })
    
    # Convert to DataFrame
    return pd.DataFrame(data)

# Extract leadership data and convert to DataFrame
print("Extracting leadership data...")
leadership_df = extract_leadership_data(gap_corpus)

# Sort by conversation ID and leadership score (highest first)
leadership_df = leadership_df.sort_values(
    by=['Conversation ID', 'Leadership Score'], 
    ascending=[True, False]
)

# Display the leadership data grouped by conversation
print("\nLeadership scores by conversation:")
pd.set_option('display.max_rows', None)  # Show all rows
print(leadership_df)

# Find speakers with highest leadership scores
print("\nTop 10 speakers with highest self-rated leadership scores:")
top_leaders = leadership_df.sort_values(by='Leadership Score', ascending=False).head(10)
print(top_leaders)

# Print leadership data in a pretty format grouped by conversation
print("\nLeadership Scores by Conversation (Pretty Print):")
for conv_id, group in leadership_df.groupby('Conversation ID'):
    print(f"\n{'='*50}")
    print(f"Conversation ID: {conv_id}")
    print(f"{'='*50}")
    
    # Format the data for pretty printing
    pretty_data = group[['Speaker ID', 'Leadership Score']].reset_index(drop=True)
    pretty_data.index = pretty_data.index + 1  # Start index from 1 for readability
    
    # Print with formatting
    print(pretty_data.to_string(index=True, col_space=20))
    
    # Print the person with highest leadership score in this conversation
    highest_leader = group.iloc[0]  # Already sorted by leadership score (highest first)
    print(f"\nHighest self-rated leader: {highest_leader['Speaker ID']} with score: {highest_leader['Leadership Score']}")

# Calculate leadership statistics
print("\nLeadership score statistics:")
print(leadership_df['Leadership Score'].describe())

# Count how many people gave themselves each leadership score
print("\nDistribution of leadership scores:")
score_distribution = leadership_df['Leadership Score'].value_counts().sort_index()
print(score_distribution)

# Print overall highest leadership scores across all conversations
print("\n" + "="*50)
print("TOP LEADERS ACROSS ALL CONVERSATIONS")
print("="*50)
top_overall = leadership_df.sort_values(by='Leadership Score', ascending=False).head(10)
print(top_overall[['Conversation ID', 'Speaker ID', 'Leadership Score']].to_string(index=False, col_space=20))

# Print leadership score distribution in a nice format
print("\n" + "="*50)
print("LEADERSHIP SCORE DISTRIBUTION")
print("="*50)
score_counts = leadership_df['Leadership Score'].value_counts().sort_index()
for score, count in score_counts.items():
    print(f"Score {score}: {count} people ({count/len(leadership_df)*100:.1f}%)")

Downloading the GAP corpus...
Dataset already exists at /Users/sofiansyed/.convokit/saved-corpora/gap-corpus
Download complete!
Number of Speakers: 84
Number of Utterances: 8009
Number of Conversations: 28
Extracting leadership data...

Leadership scores by conversation:
   Conversation ID Group Number Speaker ID  Leadership Score
2         1.Pink.1            1     1.Pink                 5
0         1.Pink.1            1    1.Green                 4
1         1.Pink.1            1     1.Blue                 2
4      10.Orange.1           10  10.Orange                 4
3      10.Orange.1           10    10.Pink                 3
5        11.Pink.1           11   11.Green                 5
6        11.Pink.1           11    11.Pink                 4
8        12.Blue.1           12    12.Pink                 4
9        12.Blue.1           12  12.Orange                 4
7        12.Blue.1           12    12.Blue                 3
10       12.Blue.1           12   12.Green               

In [2]:
# Enhanced Politeness Analysis for GAP Corpus
print("Running enhanced politeness analysis...")

# A more sophisticated function to estimate politeness with a wider range of indicators
def enhanced_politeness_estimate(corpus):
    data = []
    
    # Expanded lists of politeness indicators with weights
    politeness_indicators = {
        # Direct politeness markers (strong)
        'please': 2.0, 
        'thank you': 2.0,
        'thanks': 1.5,
        'appreciate': 1.5,
        'grateful': 1.5,
        'sorry': 1.5,
        'excuse': 1.5,
        'pardon': 1.5,
        
        # Hedges and indirect requests (moderate)
        'would you': 1.0,
        'could you': 1.0,
        'may i': 1.0,
        'might': 0.8,
        'perhaps': 0.8,
        'possibly': 0.8,
        'i think': 0.7,
        'i believe': 0.7,
        'wondering': 0.7,
        
        # Agreement and acknowledgment (moderate)
        'good point': 1.2,
        'good idea': 1.2,
        'you\'re right': 1.2,
        'true': 0.7,
        'agree': 1.0,
        'makes sense': 1.0,
        'see what you mean': 1.0,
        'understand': 0.7,
        
        # Collaborative language (subtle)
        'we': 0.5,
        'us': 0.5,
        'our': 0.5,
        'together': 0.7,
        'let\'s': 0.7,
        'why don\'t we': 0.8,
        'what if we': 0.8,
        'we could': 0.7,
        
        # Deference and respect (subtle)
        'everyone': 0.5,
        'your thoughts': 0.8,
        'what do you think': 0.8,
        'opinion': 0.6,
        'feel free': 0.8,
        'up to you': 0.8
    }
    
    impoliteness_indicators = {
        # Direct impoliteness (strong)
        'shut up': 2.0,
        'stupid': 2.0,
        'idiot': 2.0,
        'dumb': 1.8,
        'ridiculous': 1.5,
        'nonsense': 1.5,
        'useless': 1.5,
        'waste': 1.5,
        
        # Dismissal and disagreement (moderate)
        'whatever': 1.2,
        'not true': 1.2,
        'wrong': 1.0,
        'incorrect': 1.0,
        'false': 1.0,
        'bad idea': 1.0,
        'terrible idea': 1.2,
        'not going to work': 1.0,
        
        # Commanding language (moderate)
        'must': 0.7,
        'have to': 0.7,
        'need to': 0.6,
        'do this': 0.6,
        'do that': 0.6,
        
        # Interruption and impatience (subtle)
        'anyway': 0.5,
        'moving on': 0.5,
        'get to the point': 0.8,
        'already said': 0.7,
        'already know': 0.7,
        
        # Dismissive transitions (subtle)
        'but,': 0.4,
        'however,': 0.4,
        'actually,': 0.4,
        'no,': 0.5,
        'not really': 0.6
    }
    
    print("Analyzing utterances with enhanced politeness detection...")
    
    # Process each conversation
    for conv_id, conversation in corpus.conversations.items():
        try:
            group_number = conversation.meta.get('Group Number', 'N/A')
            speaker_politeness = {}
            speaker_utterance_count = {}
            
            # Process each utterance
            for utterance in conversation.iter_utterances():
                speaker_id = utterance.speaker.id
                text = utterance.text.lower() if hasattr(utterance, 'text') and utterance.text else ""
                
                # Skip very short utterances or non-verbal indicators
                if len(text) < 2 or text in ["$", "%", "#"]:
                    continue
                
                # Initialize politeness score for this utterance
                utterance_politeness = 0.0
                
                # Check for polite indicators
                for indicator, weight in politeness_indicators.items():
                    count = text.count(indicator)
                    if count > 0:
                        utterance_politeness += count * weight
                
                # Check for impolite indicators
                for indicator, weight in impoliteness_indicators.items():
                    count = text.count(indicator)
                    if count > 0:
                        utterance_politeness -= count * weight
                
                # Add sentence length factor (longer sentences tend to be more formal/polite)
                words = text.split()
                if len(words) > 10:
                    utterance_politeness += 0.2
                
                # Update speaker totals
                if speaker_id not in speaker_politeness:
                    speaker_politeness[speaker_id] = 0.0
                    speaker_utterance_count[speaker_id] = 0
                
                speaker_politeness[speaker_id] += utterance_politeness
                speaker_utterance_count[speaker_id] += 1
            
            # Calculate average politeness for each speaker
            for speaker_id in speaker_politeness:
                if speaker_utterance_count[speaker_id] > 0:
                    avg_politeness = speaker_politeness[speaker_id] / speaker_utterance_count[speaker_id]
                    
                    data.append({
                        'Conversation ID': conv_id,
                        'Group Number': group_number,
                        'Speaker ID': speaker_id,
                        'Politeness Score': avg_politeness,
                        'Utterance Count': speaker_utterance_count[speaker_id]
                    })
        
        except Exception as e:
            print(f"Error processing conversation {conv_id}: {str(e)}")
    
    print(f"Analysis complete. Found data for {len(data)} speakers.")
    return pd.DataFrame(data)

try:
    # Extract politeness data using our enhanced method
    print("Extracting enhanced politeness metrics...")
    politeness_df = enhanced_politeness_estimate(gap_corpus)
    
    if politeness_df.empty:
        print("No politeness data could be extracted. Please check the corpus content.")
    else:
        # Sort by conversation ID and politeness score
        politeness_df = politeness_df.sort_values(
            by=['Conversation ID', 'Politeness Score'], 
            ascending=[True, False]
        )
        
        # Print politeness data in a pretty format grouped by conversation
        print("\nPoliteness Scores by Conversation:")
        for conv_id, group in politeness_df.groupby('Conversation ID'):
            print(f"\n{'='*50}")
            print(f"Conversation ID: {conv_id}")
            print(f"{'='*50}")
            
            # Format the data for pretty printing
            pretty_data = group[['Speaker ID', 'Politeness Score', 'Utterance Count']].reset_index(drop=True)
            pretty_data.index = pretty_data.index + 1  # Start index from 1 for readability
            
            # Format politeness score to 4 decimal places
            pretty_data['Politeness Score'] = pretty_data['Politeness Score'].map(lambda x: f"{x:.4f}")
            
            # Print with formatting
            print(pretty_data.to_string(index=True, col_space=20))
            
            # Print the most and least polite people in this conversation
            if len(group) > 0:
                most_polite = group.iloc[0]  # Already sorted by politeness score (highest first)
                least_polite = group.iloc[-1]  # The last one is the least polite
                
                print(f"\nMost polite: {most_polite['Speaker ID']} with score: {most_polite['Politeness Score']}")
                print(f"Least polite: {least_polite['Speaker ID']} with score: {least_polite['Politeness Score']}")
            else:
                print("\nNo speakers with sufficient data in this conversation.")

        # Print overall highest and lowest politeness scores across all conversations
        print("\n" + "="*50)
        print("TOP 10 MOST POLITE SPEAKERS ACROSS ALL CONVERSATIONS")
        print("="*50)
        top_polite = politeness_df.sort_values(by='Politeness Score', ascending=False).head(10)
        top_polite['Politeness Score'] = top_polite['Politeness Score'].map(lambda x: f"{x:.4f}")
        print(top_polite[['Conversation ID', 'Speaker ID', 'Politeness Score']].to_string(index=False, col_space=20))

        print("\n" + "="*50)
        print("TOP 10 LEAST POLITE SPEAKERS ACROSS ALL CONVERSATIONS")
        print("="*50)
        bottom_polite = politeness_df.sort_values(by='Politeness Score', ascending=True).head(10)
        bottom_polite['Politeness Score'] = bottom_polite['Politeness Score'].map(lambda x: f"{x:.4f}")
        print(bottom_polite[['Conversation ID', 'Speaker ID', 'Politeness Score']].to_string(index=False, col_space=20))
        
except Exception as e:
    print(f"An error occurred: {str(e)}")

Running enhanced politeness analysis...
Extracting enhanced politeness metrics...
Analyzing utterances with enhanced politeness detection...
Analysis complete. Found data for 84 speakers.

Politeness Scores by Conversation:

Conversation ID: 1.Pink.1
                               Speaker ID     Politeness Score      Utterance Count
1                                 1.Green               0.2305                   95
2                                  1.Blue               0.2288                  132
3                                  1.Pink               0.1877                  122

Most polite: 1.Green with score: 0.23052631578947363
Least polite: 1.Pink with score: 0.18770491803278685

Conversation ID: 10.Orange.1
                               Speaker ID     Politeness Score      Utterance Count
1                               10.Orange               0.2983                   59
2                                 10.Pink               0.2788                   52

Most polite: 10.Orange 

In [3]:
# Politeness and Leadership Correlation Analysis
print("Analyzing correlation between politeness and self-perceived leadership...")

# Function to extract leadership scores from the corpus
def extract_leadership_data(corpus):
    data = []
    
    # Get all conversations
    for conv_id, conversation in corpus.conversations.items():
        # Get conversation metadata
        conv_metadata = conversation.meta
        group_number = conv_metadata.get('Group Number', 'N/A')
        
        # Get speakers in this conversation
        speakers_in_conv = set()
        for utterance in conversation.iter_utterances():
            speakers_in_conv.add(utterance.speaker.id)
        
        # For each speaker in this conversation, get their leadership score
        for speaker_id in speakers_in_conv:
            speaker = corpus.get_speaker(speaker_id)
            leadership_score = speaker.meta.get('Ind_Lead', None)
            
            # Add to our data list
            if leadership_score is not None:
                data.append({
                    'Conversation ID': conv_id,
                    'Group Number': group_number,
                    'Speaker ID': speaker_id,
                    'Leadership Score': leadership_score
                })
    
    # Convert to DataFrame
    return pd.DataFrame(data)

# Compare politeness and leadership
def analyze_politeness_leadership_relationship(politeness_df, leadership_df):
    # Merge the two dataframes on Conversation ID and Speaker ID
    merged_df = pd.merge(
        politeness_df, 
        leadership_df, 
        on=['Conversation ID', 'Speaker ID'],
        how='inner'
    )
    
    # Analysis results
    results = {
        'total_conversations': 0,
        'most_polite_is_leader': 0,
        'least_polite_is_leader': 0,
        'conversations_with_match': [],
        'conversations_without_match': []
    }
    
    # Analyze each conversation
    for conv_id, group in merged_df.groupby('Conversation ID'):
        # Skip conversations with only one person (no comparison possible)
        if len(group) <= 1:
            continue
            
        results['total_conversations'] += 1
        
        # Find the top leader(s) in this conversation
        max_leadership = group['Leadership Score'].max()
        leaders = group[group['Leadership Score'] == max_leadership]['Speaker ID'].tolist()
        
        # Find the most and least polite people
        group_sorted = group.sort_values(by='Politeness Score', ascending=False)
        most_polite = group_sorted.iloc[0]['Speaker ID']
        least_polite = group_sorted.iloc[-1]['Speaker ID']
        
        # Check if most polite is a leader
        most_polite_is_leader = most_polite in leaders
        if most_polite_is_leader:
            results['most_polite_is_leader'] += 1
            
        # Check if least polite is a leader
        least_polite_is_leader = least_polite in leaders
        if least_polite_is_leader:
            results['least_polite_is_leader'] += 1
            
        # Record detailed match information
        match_info = {
            'Conversation ID': conv_id,
            'Leaders': leaders,
            'Most Polite': most_polite,
            'Least Polite': least_polite,
            'Most Polite Is Leader': most_polite_is_leader,
            'Least Polite Is Leader': least_polite_is_leader
        }
        
        if most_polite_is_leader or least_polite_is_leader:
            results['conversations_with_match'].append(match_info)
        else:
            results['conversations_without_match'].append(match_info)
            
    return results, merged_df

try:
    # Extract leadership data
    print("Extracting leadership data...")
    leadership_df = extract_leadership_data(gap_corpus)
    
    if leadership_df.empty:
        print("No leadership data found in the corpus. Ensure 'Ind_Lead' metadata is available.")
    else:
        # Analyze the relationship
        results, merged_data = analyze_politeness_leadership_relationship(politeness_df, leadership_df)
        
        # Calculate percentages
        total_convs = results['total_conversations']
        most_polite_pct = (results['most_polite_is_leader'] / total_convs * 100) if total_convs > 0 else 0
        least_polite_pct = (results['least_polite_is_leader'] / total_convs * 100) if total_convs > 0 else 0
        
        # Print summary results
        print("\n" + "="*70)
        print("POLITENESS AND LEADERSHIP CORRELATION SUMMARY")
        print("="*70)
        print(f"Total conversations analyzed: {total_convs}")
        print(f"Most polite person was a self-perceived leader: {results['most_polite_is_leader']} times ({most_polite_pct:.2f}%)")
        print(f"Least polite person was a self-perceived leader: {results['least_polite_is_leader']} times ({least_polite_pct:.2f}%)")
        
        # Print detailed results for conversations with matches
        print("\n" + "="*70)
        print("CONVERSATIONS WHERE POLITENESS MATCHED LEADERSHIP")
        print("="*70)
        if results['conversations_with_match']:
            for i, match in enumerate(results['conversations_with_match'], 1):
                print(f"\n{i}. Conversation: {match['Conversation ID']}")
                print(f"   Leaders: {', '.join(match['Leaders'])}")
                print(f"   Most Polite: {match['Most Polite']} (Leader match: {match['Most Polite Is Leader']})")
                print(f"   Least Polite: {match['Least Polite']} (Leader match: {match['Least Polite Is Leader']})")
        else:
            print("No conversations found where politeness matched leadership.")
        
        # Print detailed results for conversations without matches
        print("\n" + "="*70)
        print("CONVERSATIONS WHERE POLITENESS DID NOT MATCH LEADERSHIP")
        print("="*70)
        if results['conversations_without_match']:
            for i, match in enumerate(results['conversations_without_match'], 1):
                print(f"\n{i}. Conversation: {match['Conversation ID']}")
                print(f"   Leaders: {', '.join(match['Leaders'])}")
                print(f"   Most Polite: {match['Most Polite']}")
                print(f"   Least Polite: {match['Least Polite']}")
        else:
            print("No conversations found where politeness did not match leadership.")
            
        # Calculate additional statistics
        leader_politeness = merged_data[merged_data['Leadership Score'] == merged_data.groupby('Conversation ID')['Leadership Score'].transform('max')]
        non_leader_politeness = merged_data[merged_data['Leadership Score'] < merged_data.groupby('Conversation ID')['Leadership Score'].transform('max')]
        
        avg_leader_politeness = leader_politeness['Politeness Score'].mean()
        avg_non_leader_politeness = non_leader_politeness['Politeness Score'].mean() if len(non_leader_politeness) > 0 else 0
        
        # Print additional statistics
        print("\n" + "="*70)
        print("ADDITIONAL STATISTICS")
        print("="*70)
        print(f"Average politeness score of self-perceived leaders: {avg_leader_politeness:.4f}")
        if len(non_leader_politeness) > 0:
            print(f"Average politeness score of non-leaders: {avg_non_leader_politeness:.4f}")
            print(f"Difference (Leader - Non-leader): {avg_leader_politeness - avg_non_leader_politeness:.4f}")
        else:
            print("No non-leader data available for comparison.")
            
        # Print correlation coefficient and other statistics in a neat format
        print("\n" + "="*70)
        print("CORRELATION ANALYSIS")
        print("="*70)
        correlation = merged_data['Politeness Score'].corr(merged_data['Leadership Score'])
        print(f"Pearson correlation between politeness and leadership: {correlation:.4f}")
        
        if correlation > 0.3:
            print("This indicates a moderate positive correlation - more polite people tend to rate themselves as leaders.")
        elif correlation < -0.3:
            print("This indicates a moderate negative correlation - less polite people tend to rate themselves as leaders.")
        else:
            print("This indicates little to no correlation between politeness and self-perceived leadership.")
            
        # Add contingency table
        print("\n" + "="*70)
        print("CONTINGENCY ANALYSIS")
        print("="*70)
        print(f"{'Metric':<40} {'Count':<10} {'Percentage':<15}")
        print("-" * 65)
        print(f"{'Total conversations':<40} {total_convs:<10} {'100.00%':<15}")
        print(f"{'Most polite person is a leader':<40} {results['most_polite_is_leader']:<10} {most_polite_pct:.2f}%")
        print(f"{'Least polite person is a leader':<40} {results['least_polite_is_leader']:<10} {least_polite_pct:.2f}%")
        print(f"{'Neither match':<40} {total_convs - results['most_polite_is_leader'] - results['least_polite_is_leader']:<10} {100 - most_polite_pct - least_polite_pct:.2f}%")
            
except Exception as e:
    print(f"An error occurred: {str(e)}")
    import traceback
    traceback.print_exc()

Analyzing correlation between politeness and self-perceived leadership...
Extracting leadership data...

POLITENESS AND LEADERSHIP CORRELATION SUMMARY
Total conversations analyzed: 28
Most polite person was a self-perceived leader: 15 times (53.57%)
Least polite person was a self-perceived leader: 11 times (39.29%)

CONVERSATIONS WHERE POLITENESS MATCHED LEADERSHIP

1. Conversation: 1.Pink.1
   Leaders: 1.Pink
   Most Polite: 1.Green (Leader match: False)
   Least Polite: 1.Pink (Leader match: True)

2. Conversation: 10.Orange.1
   Leaders: 10.Orange
   Most Polite: 10.Orange (Leader match: True)
   Least Polite: 10.Pink (Leader match: False)

3. Conversation: 11.Pink.1
   Leaders: 11.Green
   Most Polite: 11.Pink (Leader match: False)
   Least Polite: 11.Green (Leader match: True)

4. Conversation: 12.Blue.1
   Leaders: 12.Orange, 12.Pink
   Most Polite: 12.Orange (Leader match: True)
   Least Polite: 12.Pink (Leader match: True)

5. Conversation: 13.Yellow.1
   Leaders: 13.Yellow, 13