In [2]:
# This block loads the corpus and then creates two DataFrames: one for persuader utterances (filtered with persuasive labels) and one for persuadee utterances.from convokit import Corpus, download
import pandas as pd
from convokit import Corpus, download

# Load the Persuasion for Good Corpus
persuasion_corpus = Corpus(filename=download("persuasionforgood-corpus"))
print("✅ Persuasion for Good Corpus loaded successfully.\n")
persuasion_corpus.print_summary_stats()


# These are specific persuasion strategies/labels used by persuaders that are tagged in the dataset.
persuasive_labels = {
    'credibility-appeal', 'emotion-appeal', 'logical-appeal', 
    'foot-in-the-door', 'personal-story', 'proposition-of-donation', 
    'self-modeling', 'ask-donate-more', 'positive-reaction-to-donation',
    'praise-user', 'thank', 'disagree-donation-more',
    'donation-information', 'acknowledgement', 'agree-donation',
    'personal-related-inquiry'
}


# Select persuader utterances where the persuader is actively using a recognized persuasion technique/label from above.
filtered_persuader_utts = [
    utt for utt in persuasion_corpus.iter_utterances()
    if utt.meta.get('role') == 0 
    and 'label_1' in utt.meta 
    and isinstance(utt.meta['label_1'], list)
    and any(label in persuasive_labels for label in utt.meta['label_1'])
]
# Dataframe for above, persuader filtered text.
persuader_df = pd.DataFrame({
    'utterance': [utt.text for utt in filtered_persuader_utts],
    'speaker_id': [utt.speaker.id for utt in filtered_persuader_utts],
    'conversation_id': [utt.conversation_id for utt in filtered_persuader_utts],
    'role': [utt.meta['role'] for utt in filtered_persuader_utts],
    'labels': [utt.meta['label_1'] for utt in filtered_persuader_utts]
})
print(f"✅ Persuader utterances count: {len(persuader_df)}")



# Filter persuadee utterances (role 1; we include all their utterances)
filtered_persuadee_utts = [
    utt for utt in persuasion_corpus.iter_utterances()
    if utt.meta.get('role') == 1
]
# Create a DataFrame for persuadee utterances
persuadee_df = pd.DataFrame({
    'utterance': [utt.text for utt in filtered_persuadee_utts],
    'speaker_id': [utt.speaker.id for utt in filtered_persuadee_utts],
    'conversation_id': [utt.conversation_id for utt in filtered_persuadee_utts],
    'role': [utt.meta['role'] for utt in filtered_persuadee_utts]
})
print(f"✅ Persuadee utterances count: {len(persuadee_df)}")


# Dictionary mapping conversation IDs to donation amounts.
conversation_donations = {
    conv.id: conv.meta.get('donation_ee', 0)
    for conv in persuasion_corpus.iter_conversations()
}
# Both DataFrames get the donation outcome from their conversation
persuader_df['donation_amount'] = persuader_df['conversation_id'].map(conversation_donations)
persuadee_df['donation_amount'] = persuadee_df['conversation_id'].map(conversation_donations)


# Add speaker metadata (age, sex, education) to both DataFrames
speaker_meta = {
    speaker.id: {
        'age': speaker.meta.get('age'),
        'sex': speaker.meta.get('sex'),
        'education': speaker.meta.get('edu')
    }
    for speaker in persuasion_corpus.iter_speakers()
}
for df in [persuader_df, persuadee_df]:
    df['age'] = df['speaker_id'].map(lambda x: speaker_meta.get(x, {}).get('age'))
    df['sex'] = df['speaker_id'].map(lambda x: speaker_meta.get(x, {}).get('sex'))
    df['education'] = df['speaker_id'].map(lambda x: speaker_meta.get(x, {}).get('education'))

print("\n✅ Sample Persuader DataFrame:")
display(persuader_df.head())

print("\n✅ Sample Persuadee DataFrame:")
display(persuadee_df.head())


Dataset already exists at /Users/sofiansyed/.convokit/saved-corpora/persuasionforgood-corpus
✅ Persuasion for Good Corpus loaded successfully.

Number of Speakers: 1285
Number of Utterances: 20932
Number of Conversations: 1017
✅ Persuader utterances count: 2354
✅ Persuadee utterances count: 10332

✅ Sample Persuader DataFrame:


Unnamed: 0,utterance,speaker_id,conversation_id,role,labels,donation_amount,age,sex,education
0,Where are you from?,A3A07QA5U733HQ,0,0,[personal-related-inquiry],0.0,34.0,Male,Less than four-year college
1,I like that they're committed to helping child...,A3A07QA5U733HQ,0,0,"[credibility-appeal, credibility-appeal]",0.0,34.0,Male,Less than four-year college
2,I'm planning on donating most of my earnings t...,A3A07QA5U733HQ,0,0,"[self-modeling, proposition-of-donation]",0.0,34.0,Male,Less than four-year college
3,Yes it would. Any little bit helps. Thank you ...,A3A07QA5U733HQ,0,0,"[positive-to-inquiry, logical-appeal, thank]",0.0,34.0,Male,Less than four-year college
4,I know normally I am too but I was convinced t...,A22WWSTT8TU7G1,41,0,[logical-appeal],0.05,36.0,Female,Less than four-year college



✅ Sample Persuadee DataFrame:


Unnamed: 0,utterance,speaker_id,conversation_id,role,donation_amount,age,sex,education
0,Hi. I am doing good. How about you?,A25L985XCNESXE,0,1,0.0,50.0,Female,Less than four-year college
1,"Haha. Same here, but it really feels like a Mo...",A25L985XCNESXE,0,1,0.0,50.0,Female,Less than four-year college
2,I can not believe how warm it is already.,A25L985XCNESXE,0,1,0.0,50.0,Female,Less than four-year college
3,I am from the Midwest. What about you?,A25L985XCNESXE,0,1,0.0,50.0,Female,Less than four-year college
4,"Oh, yep. You are definitely in for warm weathe...",A25L985XCNESXE,0,1,0.0,50.0,Female,Less than four-year college


In [6]:
import glob, os


# Define path to your ValueNet data directory - CHANGE IF NECESSARY
data_path = './data/'


# Load files
value_files = glob.glob(os.path.join(data_path, '*.csv'))
value_files = [f for f in value_files if os.path.basename(f).lower() not in ['train.csv', 'test.csv', 'eval.csv', 'meta.csv']]


# For each file, extracts the value dimension name from the filename and adds it as a column
valuenet_df = pd.concat([
    pd.read_csv(file).assign(value_dimension=os.path.basename(file).replace('.csv', ''))
    for file in value_files
], ignore_index=True)


print(f"✅ Loaded ValueNet dataset with {len(valuenet_df)} scenarios.")
print("Available Value Dimensions:", valuenet_df['value_dimension'].unique())
display(valuenet_df.head())


✅ Loaded ValueNet dataset with 21374 scenarios.
Available Value Dimensions: ['SECURITY' 'BENEVOLENCE' 'ACHIEVEMENT' 'SELF-DIRECTION' 'POWER'
 'UNIVERSALISM' 'STIMULATION' 'CONFORMITY' 'TRADITION' 'HEDONISM']


Unnamed: 0.1,Unnamed: 0,uid,scenario,label,value_dimension
0,0,51609,i never want to love again.,-1,SECURITY
1,1,51610,I'm drowning and nobody knows,0,SECURITY
2,2,51601,Hanging out with friends is too much work,-1,SECURITY
3,3,51611,A friend is moving away for good,-1,SECURITY
4,4,51604,"My family is messed up, I just wanna vent.",-1,SECURITY


In [7]:
# Sentence-BERT to embed the texts for both pipelines 
# (persuaders and persuadees) and then match each utterance 
# to the most similar ValueNet scenario.
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
model = SentenceTransformer('all-MiniLM-L6-v2')

# For Persuaders
persuader_texts = persuader_df['utterance'].tolist()
persuader_embeddings = model.encode(persuader_texts, batch_size=64, show_progress_bar=True)


# For Persuadees
persuadee_texts = persuadee_df['utterance'].tolist()
persuadee_embeddings = model.encode(persuadee_texts, batch_size=64, show_progress_bar=True)


# For ValueNet scenarios
valuenet_texts = valuenet_df['scenario'].tolist()
valuenet_embeddings = model.encode(valuenet_texts, batch_size=64, show_progress_bar=True)


print("✅ Embeddings generated:")
print(f"- Persuader utterances: {len(persuader_embeddings)}")
print(f"- Persuadee utterances: {len(persuadee_embeddings)}")
print(f"- ValueNet scenarios: {len(valuenet_embeddings)}")

Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Batches:   0%|          | 0/162 [00:00<?, ?it/s]

Batches:   0%|          | 0/334 [00:00<?, ?it/s]

✅ Embeddings generated:
- Persuader utterances: 2354
- Persuadee utterances: 10332
- ValueNet scenarios: 21374


In [8]:
# Define a function to match utterances to ValueNet scenarios
def match_to_valuenet(utterance_embeddings, texts):
    sim_matrix = cosine_similarity(utterance_embeddings, valuenet_embeddings)
    best_indices = np.argmax(sim_matrix, axis=1)
    similarity_scores = sim_matrix[np.arange(len(texts)), best_indices]
    matched_scenarios = valuenet_df.iloc[best_indices].reset_index(drop=True)
    return matched_scenarios, similarity_scores

# Match for persuaders, add value and similarity score
matched_persuader, pers_sim_scores = match_to_valuenet(persuader_embeddings, persuader_texts)
persuader_df['matched_scenario'] = matched_persuader['scenario'].values
persuader_df['value_dimension'] = matched_persuader['value_dimension'].values
persuader_df['similarity_score'] = pers_sim_scores

# Match for persuadees, add value and similarity score
matched_persuadee, persd_sim_scores = match_to_valuenet(persuadee_embeddings, persuadee_texts)
persuadee_df['matched_scenario'] = matched_persuadee['scenario'].values
persuadee_df['value_dimension'] = matched_persuadee['value_dimension'].values
persuadee_df['similarity_score'] = persd_sim_scores

print("\nSample Persuader DataFrame with ValueNet matching:")
display(persuader_df.head())
print("\nSample Persuadee DataFrame with ValueNet matching:")
display(persuadee_df.head())



Sample Persuader DataFrame with ValueNet matching:


Unnamed: 0,utterance,speaker_id,conversation_id,role,labels,donation_amount,age,sex,education,matched_scenario,value_dimension,similarity_score
0,Where are you from?,A3A07QA5U733HQ,0,0,[personal-related-inquiry],0.0,34.0,Male,Less than four-year college,I hate where I live,POWER,0.465415
1,I like that they're committed to helping child...,A3A07QA5U733HQ,0,0,"[credibility-appeal, credibility-appeal]",0.0,34.0,Male,Less than four-year college,"I work in international charity, saving kids",SECURITY,0.478816
2,I'm planning on donating most of my earnings t...,A3A07QA5U733HQ,0,0,"[self-modeling, proposition-of-donation]",0.0,34.0,Male,Less than four-year college,I wanna donate money to a homeless man,BENEVOLENCE,0.530597
3,Yes it would. Any little bit helps. Thank you ...,A3A07QA5U733HQ,0,0,"[positive-to-inquiry, logical-appeal, thank]",0.0,34.0,Male,Less than four-year college,Feeling good would be a change,STIMULATION,0.36721
4,I know normally I am too but I was convinced t...,A22WWSTT8TU7G1,41,0,[logical-appeal],0.05,36.0,Female,Less than four-year college,emailing my sisters beauty pageant company to ...,SECURITY,0.512934



Sample Persuadee DataFrame with ValueNet matching:


Unnamed: 0,utterance,speaker_id,conversation_id,role,donation_amount,age,sex,education,matched_scenario,value_dimension,similarity_score
0,Hi. I am doing good. How about you?,A25L985XCNESXE,0,1,0.0,50.0,Female,Less than four-year college,I am not doing great,ACHIEVEMENT,0.569396
1,"Haha. Same here, but it really feels like a Mo...",A25L985XCNESXE,0,1,0.0,50.0,Female,Less than four-year college,I feel really sad today,BENEVOLENCE,0.461626
2,I can not believe how warm it is already.,A25L985XCNESXE,0,1,0.0,50.0,Female,Less than four-year college,I'm really enjoying global warming right now,STIMULATION,0.484216
3,I am from the Midwest. What about you?,A25L985XCNESXE,0,1,0.0,50.0,Female,Less than four-year college,I hate where I live,POWER,0.401387
4,"Oh, yep. You are definitely in for warm weathe...",A25L985XCNESXE,0,1,0.0,50.0,Female,Less than four-year college,I hate sunny weather. I'm in my comfort zone w...,HEDONISM,0.525832


In [13]:
# This block conducts the correlation analyses separately. 
# For persuadees, we examine how their matched value dimensions relate to their donation amounts. 
# For persuaders, we compute how the value dimensions in their persuasive utterances correlate with the 
# donation amounts received from their respective persuadees.



# --- Analysis for Persuaders ---
# Group by value dimension and compute the average donation amount per persuader value
persuader_donation_by_value = persuader_df.groupby('value_dimension')['donation_amount'].mean().sort_values(ascending=False)
print("✅ Persuader Value Dimensions vs. Persuadee Donation Amount (Average):")
print(persuader_donation_by_value)


# Optionally, count how many utterances per value dimension for context
persuader_value_counts = persuader_df['value_dimension'].value_counts()
print("\nPersuader utterance counts by Value Dimension:")
print(persuader_value_counts)


# --- Analysis for Persuadees ---
# Group by value dimension and compute the average donation amount per persuadee value
persuadee_donation_by_value = persuadee_df.groupby('value_dimension')['donation_amount'].mean().sort_values(ascending=False)
print("\n✅ Persuadee Value Dimensions vs. Their Donation Amount (Average):")
print(persuadee_donation_by_value)


# Count of utterances per value dimension for persuadees
persuadee_value_counts = persuadee_df['value_dimension'].value_counts()
print("\nPersuadee utterance counts by Value Dimension:")
print(persuadee_value_counts)


✅ Persuader Value Dimensions vs. Persuadee Donation Amount (Average):
value_dimension
SELF-DIRECTION    2.559091
HEDONISM          1.793506
BENEVOLENCE       1.568344
SECURITY          1.306920
STIMULATION       0.497143
TRADITION         0.467647
UNIVERSALISM      0.279481
ACHIEVEMENT       0.253636
CONFORMITY        0.174706
POWER             0.151852
Name: donation_amount, dtype: float64

Persuader utterance counts by Value Dimension:
value_dimension
BENEVOLENCE       1304
SECURITY           659
UNIVERSALISM       135
HEDONISM            77
STIMULATION         63
ACHIEVEMENT         44
POWER               27
CONFORMITY          17
TRADITION           17
SELF-DIRECTION      11
Name: count, dtype: int64

✅ Persuadee Value Dimensions vs. Their Donation Amount (Average):
value_dimension
CONFORMITY        3.312801
STIMULATION       3.209544
SECURITY          2.979776
BENEVOLENCE       2.153477
HEDONISM          1.764055
TRADITION         1.231240
POWER             1.094848
SELF-DIRECTION