In [1]:
from convokit import Corpus, download
corpus = Corpus(filename=download("gap-corpus"))


Dataset already exists at /Users/sofiansyed/.convokit/saved-corpora/gap-corpus


In [2]:
import nltk
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/sofiansyed/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [6]:
from convokit import Corpus, download
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import re
import pandas as pd
from collections import defaultdict

# Computes lexical diversity and sentiment for each speaker per conversation.
# Extracts the top self-perceived leader (from Ind_Lead).
# Ranks speakers by:
# Lexical diversity
# Sentiment
# Checks if the self-perceived leader appears among the top-k scorers for each metric

nltk.download('vader_lexicon')

# Load the GAP corpus
corpus = Corpus(filename=download("gap-corpus"))

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to compute lexical diversity
def lexical_diversity(text):
    words = re.findall(r'\b\w+\b', text.lower())
    return len(set(words)) / len(words) if words else 0

# Function to extract leadership scores
def extract_leadership_data(corpus):
    """
    Extracts leadership scores for all speakers in the corpus,
    grouped by conversation.
    """
    data = []
    
    for conv_id, conversation in corpus.conversations.items():
        group_number = conversation.meta.get('Group Number', 'N/A')
        speakers_in_conv = set(utt.speaker.id for utt in conversation.iter_utterances())

        for speaker_id in speakers_in_conv:
            speaker = corpus.get_speaker(speaker_id)
            leadership_score = speaker.meta.get('Ind_Lead', 0)  # Default to 0 if missing
            
            data.append({
                'Conversation ID': conv_id,
                'Group Number': group_number,
                'Speaker ID': speaker_id,
                'Leadership Score': leadership_score
            })
    
    return pd.DataFrame(data)

# Extract leadership data
leadership_df = extract_leadership_data(corpus)
leadership_df = leadership_df.sort_values(by=['Conversation ID', 'Leadership Score'], ascending=[True, False])

# Store leadership scores in a dictionary
conversation_leaders = {conv_id: group.iloc[0]['Speaker ID'] for conv_id, group in leadership_df.groupby('Conversation ID') if not group.empty}
conversation_results = []

# Iterate through conversations
for conv in corpus.iter_conversations():
    speaker_data = defaultdict(lambda: {'word_count': 0, 'vocab': set(), 'sentiment_sum': 0, 'utterance_count': 0})

    for utt in conv.iter_utterances():
        speaker_id = utt.speaker.id
        text = utt.text.lower()

        # Tokenize and update vocab & word count
        words = re.findall(r'\b\w+\b', text)
        speaker_data[speaker_id]['vocab'].update(words)
        speaker_data[speaker_id]['word_count'] += len(words)

        # Sentiment analysis
        sentiment = sia.polarity_scores(text)['compound']
        speaker_data[speaker_id]['sentiment_sum'] += sentiment

        # Count utterances
        speaker_data[speaker_id]['utterance_count'] += 1

    # Compute scores for each speaker
    lexical_diversity_scores = {speaker: len(data['vocab']) / data['word_count'] if data['word_count'] else 0 for speaker, data in speaker_data.items()}
    sentiment_scores = {speaker: data['sentiment_sum'] / data['utterance_count'] if data['utterance_count'] else 0 for speaker, data in speaker_data.items()}

    # Sort scores
    lexical_diversity_sorted = sorted(lexical_diversity_scores.items(), key=lambda x: x[1], reverse=True)
    sentiment_sorted = sorted(sentiment_scores.items(), key=lambda x: x[1], reverse=True)

    # Retrieve the highest self-rated leader
    self_perceived_leader = conversation_leaders.get(conv.id, "Unknown")

    # Handle cases where there are no speakers in the conversation
    if not lexical_diversity_sorted or not sentiment_sorted:
        continue

    lexical_matches = [
        self_perceived_leader in [lexical_diversity_sorted[i][0] for i in range(min(k + 1, len(lexical_diversity_sorted)))]
        for k in range(len(lexical_diversity_sorted))
    ]
    
    sentiment_matches = [
        self_perceived_leader in [sentiment_sorted[i][0] for i in range(min(k + 1, len(sentiment_sorted)))]
        for k in range(len(sentiment_sorted))
    ]

    conversation_results.append({
        "conversation_id": conv.id,
        "self_perceived_leader": self_perceived_leader,
        "lexical_diversity": {
            "ordered_scores": lexical_diversity_sorted,
            "matches": lexical_matches
        },
        "sentiment": {
            "ordered_scores": sentiment_sorted,
            "matches": sentiment_matches
        }
    })

total_conversations = len(conversation_results)

if total_conversations > 0:
    max_k_lexical = max(len(result["lexical_diversity"]["matches"]) for result in conversation_results)
    max_k_sentiment = max(len(result["sentiment"]["matches"]) for result in conversation_results)

    lexical_match_percentages = [sum(result["lexical_diversity"]["matches"][k] for result in conversation_results if k < len(result["lexical_diversity"]["matches"])) / total_conversations * 100 for k in range(max_k_lexical)]
    sentiment_match_percentages = [sum(result["sentiment"]["matches"][k] for result in conversation_results if k < len(result["sentiment"]["matches"])) / total_conversations * 100 for k in range(max_k_sentiment)]
else:
    lexical_match_percentages = []
    sentiment_match_percentages = []

for result in conversation_results:
    print(f"\n=== Conversation: {result['conversation_id']} ===")
    print(f"Self-Perceived Leader: {result['self_perceived_leader']}")

    print("\nLexical Diversity Ranking:")
    for speaker, score in result["lexical_diversity"]["ordered_scores"]:
        print(f"  {speaker}: {score:.4f}")

    print("\nSentiment Ranking:")
    for speaker, score in result["sentiment"]["ordered_scores"]:
        print(f"  {speaker}: {score:.4f}")

    print("\nLeadership Matching Progression:")
    for k, match in enumerate(result["lexical_diversity"]["matches"]):
        print(f"  Top {k + 1} lexical scorers match leader? {match}")
    
    for k, match in enumerate(result["sentiment"]["matches"]):
        print(f"  Top {k + 1} sentiment scorers match leader? {match}")

print("\n=== Final Summary ===")
print(f"Total Conversations Analyzed: {total_conversations}")

for k, percentage in enumerate(lexical_match_percentages):
    print(f"Lexical Diversity - Top {k + 1} Scorers Match Percentage: {percentage:.2f}%")

for k, percentage in enumerate(sentiment_match_percentages):
    print(f"Sentiment - Top {k + 1} Scorers Match Percentage: {percentage:.2f}%")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/sofiansyed/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Dataset already exists at /Users/sofiansyed/.convokit/saved-corpora/gap-corpus

=== Conversation: 1.Pink.1 ===
Self-Perceived Leader: 1.Pink

Lexical Diversity Ranking:
  1.Green: 0.4195
  1.Pink: 0.3646
  1.Blue: 0.2959

Sentiment Ranking:
  1.Blue: 0.1386
  1.Pink: 0.0742
  1.Green: 0.0155

Leadership Matching Progression:
  Top 1 lexical scorers match leader? False
  Top 2 lexical scorers match leader? True
  Top 3 lexical scorers match leader? True
  Top 1 sentiment scorers match leader? False
  Top 2 sentiment scorers match leader? True
  Top 3 sentiment scorers match leader? True

=== Conversation: 10.Orange.1 ===
Self-Perceived Leader: 10.Orange

Lexical Diversity Ranking:
  10.Pink: 0.4596
  10.Orange: 0.4020

Sentiment Ranking:
  10.Orange: 0.1565
  10.Pink: 0.1045

Leadership Matching Progression:
  Top 1 lexical scorers match leader? False
  Top 2 lexical scorers match leader? True
  Top 1 sentiment scorers match leader? True
  Top 2 sentiment scorers match leader? True

===

In [7]:
import pandas as pd

# Example data structure (use your actual variables here)
max_k = max(len(lexical_match_percentages), len(sentiment_match_percentages))

# Create summary DataFrame
summary_df = pd.DataFrame({
    'Top K': [f'Top {k+1}' for k in range(max_k)],
    'Lexical Diversity Match (%)': lexical_match_percentages + [None]*(max_k - len(lexical_match_percentages)),
    'Sentiment Match (%)': [f'{sentiment_match_percentages[k]:.2f}%' if k < len(sentiment_match_percentages) else None for k in range(max_k)]
})

# Display the summary table
print("\n=== Final Summary Table ===")
print(summary_df.to_string(index=False))



=== Final Summary Table ===
Top K  Lexical Diversity Match (%) Sentiment Match (%)
Top 1                    17.857143              25.00%
Top 2                    60.714286              78.57%
Top 3                    67.857143              78.57%
Top 4                    21.428571              21.43%


In [5]:
Lexical diversity measures the variety or richness of vocabulary used by a speaker within their utterances during a conversation. 
A high lexical diversity indicates that the speaker employs a broader range of vocabulary, suggesting greater verbal creativity, 
sophistication, or confidence in language use, which can correlate positively with perceived leadership abilities. 
Conversely, low lexical diversity might indicate more repetitive or limited language, potentially reflecting less confidence or 
assertiveness. The computed lexical diversity score is calculated by taking the ratio of unique words to total words spoken. 
Specifically, in our code, it calculates the ratio between the number of unique words used and the total number of words spoken.


Sentiment analysis captures the emotional tone or valence of the language a speaker uses during a conversation. 
Specifically, the VADER (Valence Aware Dictionary and sEntiment Reasoner) sentiment analyzer provides a score indicating whether 
the speakers utterances are predominantly positive, negative, or neutral. A more consistently positive sentiment may correlate 
with leadership through its association with optimism, encouragement, or supportiveness, while negative sentiment might indicate conflict, 
criticism, or dissatisfaction, potentially undermining perceived leadership. The Sentiment Match in our therefore tests whether 
positive sentiment usage correlates with perceived leadership. 

SyntaxError: invalid syntax (3780783933.py, line 1)