In [36]:
import pandas as pd
import glob
import os

# Define path to your ValueNet data directory
data_path = './data/'  # ADJUST if your directory changes


# List of CSV files (excluding meta files like train.csv, eval.csv, test.csv, meta.csv) (provided by AI)
value_files = [f for f in glob.glob(os.path.join(data_path, '*.csv')) 
               if os.path.basename(f).lower() not in ['train.csv', 'test.csv', 'eval.csv', 'meta.csv']]


# Load all files into one DataFrame using a list comprehension with assign() to add the value dimension. (provided by AI)
valuenet_df = pd.concat([
    pd.read_csv(file).assign(value_dimension=os.path.basename(file).replace('.csv', ''))
    for file in value_files
], ignore_index=True)


print(f"✅ Loaded ValueNet dataset with {len(valuenet_df)} scenarios.")
print("Available Value Dimensions:", valuenet_df['value_dimension'].unique())
display(valuenet_df.head())

✅ Loaded ValueNet dataset with 21374 scenarios.
Available Value Dimensions: ['SECURITY' 'BENEVOLENCE' 'ACHIEVEMENT' 'SELF-DIRECTION' 'POWER'
 'UNIVERSALISM' 'STIMULATION' 'CONFORMITY' 'TRADITION' 'HEDONISM']


Unnamed: 0.1,Unnamed: 0,uid,scenario,label,value_dimension
0,0,51609,i never want to love again.,-1,SECURITY
1,1,51610,I'm drowning and nobody knows,0,SECURITY
2,2,51601,Hanging out with friends is too much work,-1,SECURITY
3,3,51611,A friend is moving away for good,-1,SECURITY
4,4,51604,"My family is messed up, I just wanna vent.",-1,SECURITY


In [23]:
# # NOTE THIS EMBEDDING MODEL NOT USED. IT DID NOT GIVE HYPOTHESIZED RESULTS AND IS MORE BASIC EMBEDDER. WE USE SETUP FROM NEXT BLOCK.
# from sklearn.metrics.pairwise import cosine_similarity # Calculate similarity between vectors
# import numpy as np
# from sklearn.feature_extraction.text import TfidfVectorizer # Convert text to vetors


# # persuasion_texts = []: List of all utterances from our text (persuasion corpus)
# persuasion_texts = []
# for utterance in persuasion_corpus.iter_utterances():
#     persuasion_texts.append(utterance.text)


# # This extracts the scenario texts from the ValueNet dataset.
# valuenet_scenarios = valuenet_df['scenario'].tolist()


# # A TF-IDF vectorizer converts text into  vectors by analyzing word frequency and 
# # distinctiveness across documents. With max_features=1000, it selects only the top 1000 
# # most common words to create more manageable representations while ignoring rare terms. 
# # Here's a basic example of how TF-IDF works:
# # Imagine we have two persuasion utterances:
# # "Please donate to help children in need"
# # "Your donation will help children get education"
# # And one ValueNet scenario:
# # "Helping children access education"
# # The TF-IDF vectorizer would:
# # Create a vocabulary from all texts: ["please", "donate", "to"...]
# # Count how often each word appears in each text and how unique it is across all texts
# # Convert each text into numerical vectors such that each number represents the importance of a word
# # When measuring similarity, utterance #2 would likely be more similar to the ValueNet scenario than utterance #1, 
# # because they share important distinctive words like "education" that get higher TF-IDF scores. (AI Help)
# vectorizer = TfidfVectorizer(max_features=1000)
# persuasion_embeddings = vectorizer.fit_transform(persuasion_texts)
# valuenet_embeddings = vectorizer.transform(valuenet_scenarios)

# # Compute cosine similarity matrix
# similarity_matrix = cosine_similarity(persuasion_embeddings, valuenet_embeddings)


# # For each utterance (row) in our target tezt, find index of most similar ValueNet scenario
# most_similar_indices = np.argmax(similarity_matrix, axis=1)


# # Each row in this DataFrame corresponds to the best-matching ValueNet scenario for each persuasion utterance and that scenario's value. (AI help)
# matched_scenarios = valuenet_df.iloc[most_similar_indices].reset_index(drop=True)


# utterances_df = pd.DataFrame({
#     'utterance': persuasion_texts,
#     'matched_scenario': matched_scenarios['scenario'].values,
#     'value_dimension': matched_scenarios['value_dimension'].values,
#     'similarity_score': similarity_matrix[np.arange(len(persuasion_texts)), most_similar_indices]
# })

In [37]:
# Import the required libraries
from convokit import Corpus, download


# Load the Persuasion for Good Corpus
persuasion_corpus = Corpus(filename=download("persuasionforgood-corpus"))
print("✅ Persuasion for Good Corpus loaded successfully.\n")
persuasion_corpus.print_summary_stats()


# Iterate through all utterances in the corpus, and for each utterance get text attribbute
persuasion_texts = [utt.text for utt in persuasion_corpus.iter_utterances()]


# Show how many utterances we have
print(f"Extracted {len(persuasion_texts)} utterances from the persuasion corpus")

Dataset already exists at /Users/sofiansyed/.convokit/saved-corpora/persuasionforgood-corpus
✅ Persuasion for Good Corpus loaded successfully.

Number of Speakers: 1285
Number of Utterances: 20932
Number of Conversations: 1017
Extracted 20932 utterances from the persuasion corpus


In [38]:
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
# https://www.sbert.net/
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')


# --- Generate Embeddings for Persuasion for Good Utterances ---
persuasion_embeddings = model.encode(persuasion_texts, batch_size=64, show_progress_bar=True)
# --- Generate Embeddings for ValueNet scenarios ---
valuenet_texts = valuenet_df['scenario'].tolist()


# Generate embeddings (ValueNet scenarios)
valuenet_embeddings = model.encode(valuenet_texts, batch_size=64, show_progress_bar=True)


print("✅ Embeddings generated successfully:")
print(f"- Persuasion utterances embedded: {len(persuasion_embeddings)}")
print(f"- ValueNet scenarios embedded: {len(valuenet_embeddings)}")


Batches:   0%|          | 0/328 [00:00<?, ?it/s]

Batches:   0%|          | 0/334 [00:00<?, ?it/s]

✅ Embeddings generated successfully:
- Persuasion utterances embedded: 20932
- ValueNet scenarios embedded: 21374


In [39]:
# Import necessary library
from sklearn.metrics.pairwise import cosine_similarity 
import numpy as np


# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(persuasion_embeddings, valuenet_embeddings)


# For each utterance (row), find index of most similar ValueNet scenario. Then get all the scenarios' data
most_similar_indices = np.argmax(similarity_matrix, axis=1)
matched_scenarios = valuenet_df.iloc[most_similar_indices].reset_index(drop=True)


utterances_df = pd.DataFrame({
    'utterance': persuasion_texts,
    'matched_scenario': matched_scenarios['scenario'].values,
    'value_dimension': matched_scenarios['value_dimension'].values,
    'similarity_score': similarity_matrix[np.arange(len(persuasion_texts)), most_similar_indices]
})


print("✅ Matching done successfully!")
print("Here's how the matches look (first few rows):")
display(utterances_df.head())


✅ Matching done successfully!
Here's how the matches look (first few rows):


Unnamed: 0,utterance,matched_scenario,value_dimension,similarity_score
0,Good morning. How are you doing today?,not saying good morning,CONFORMITY,0.652229
1,Hi. I am doing good. How about you?,I am not doing great,ACHIEVEMENT,0.569396
2,I'm doing pretty good for a Tuesday morning.,I've literally done nothing at work since Thur...,SECURITY,0.588009
3,"Haha. Same here, but it really feels like a Mo...",I feel really sad today,BENEVOLENCE,0.461626
4,Ugh yes it does!,This makes me mad,CONFORMITY,0.341693


In [40]:
display(utterances_df)


Unnamed: 0,utterance,matched_scenario,value_dimension,similarity_score
0,Good morning. How are you doing today?,not saying good morning,CONFORMITY,0.652229
1,Hi. I am doing good. How about you?,I am not doing great,ACHIEVEMENT,0.569396
2,I'm doing pretty good for a Tuesday morning.,I've literally done nothing at work since Thur...,SECURITY,0.588009
3,"Haha. Same here, but it really feels like a Mo...",I feel really sad today,BENEVOLENCE,0.461626
4,Ugh yes it does!,This makes me mad,CONFORMITY,0.341693
...,...,...,...,...
20927,My small change won't do a thing for those kid...,putting my daughter's children at risk of bein...,UNIVERSALISM,0.472319
20928,Well with our current president it's not likel...,wanting a reaction for a donation,BENEVOLENCE,0.492208
20929,"If it makes you happy, go ahead and take .50",How Much Do You Like To Give Head From 1-100?,HEDONISM,0.478676
20930,"Oh thank you so much, you have no idea how muc...",asking my brother for 50/50 pay when we babysit,SECURITY,0.401692


In [41]:
# Adding potentially beneficial metadata to main dataframe of matched utterances. (AI Help for block)
# This block allows us to use the utterances_df for all analyses
utterances_df['speaker_id'] = [utt.speaker.id for utt in persuasion_corpus.iter_utterances()]
utterances_df['conversation_id'] = [utt.conversation_id for utt in persuasion_corpus.iter_utterances()]
utterances_df['role'] = [utt.meta['role'] for utt in persuasion_corpus.iter_utterances()]

# Add speaker metadata (age, sex, education)
speaker_meta = {
    speaker.id: {
        'age': speaker.meta.get('age'),
        'sex': speaker.meta.get('sex'),
        'education': speaker.meta.get('edu')
    }
    for speaker in persuasion_corpus.iter_speakers()
}
utterances_df['age'] = utterances_df['speaker_id'].map(lambda x: speaker_meta[x]['age'])
utterances_df['sex'] = utterances_df['speaker_id'].map(lambda x: speaker_meta[x]['sex'])
utterances_df['education'] = utterances_df['speaker_id'].map(lambda x: speaker_meta[x]['education'])


# Add donation outcome metadata
conversation_donations = {conv.id: conv.meta.get('donation_ee', 0) for conv in persuasion_corpus.iter_conversations()}
utterances_df['donation_amount'] = utterances_df['conversation_id'].map(conversation_donations)


In [29]:
# Each speaker's most frequently used value dimension and 
# then counts how many speakers primarily use each value dimension.
top_values_by_speaker = utterances_df.groupby('speaker_id')['value_dimension'] \
                        .agg(lambda x: x.value_counts().idxmax())
top_values_by_speaker_summary = top_values_by_speaker.value_counts()
print("\n✅ Number of Speakers per Top Value Dimension:")
print(top_values_by_speaker_summary)



✅ Number of Speakers per Top Value Dimension:
value_dimension
BENEVOLENCE     1044
SECURITY         212
HEDONISM          16
STIMULATION        5
CONFORMITY         4
ACHIEVEMENT        3
UNIVERSALISM       1
Name: count, dtype: int64


In [30]:
# Overall frequency of each value dimension across all utterances.
common_values = utterances_df['value_dimension'].value_counts()
print("\n✅ Most Common Value Dimensions Overall:")
print(common_values)


✅ Most Common Value Dimensions Overall:
value_dimension
BENEVOLENCE       10490
SECURITY           4886
HEDONISM           1487
UNIVERSALISM        993
STIMULATION         921
ACHIEVEMENT         766
CONFORMITY          705
POWER               345
TRADITION           191
SELF-DIRECTION      148
Name: count, dtype: int64


In [42]:
# This analyzes how value dimensions are distributed across different age groups.
value_age_distribution = pd.crosstab(
    index=utterances_df['value_dimension'], 
    columns=utterances_df['age'], 
    normalize='columns'
)
print("\n✅ Value Distribution by Age (columns=age groups, rows=values):")
print(value_age_distribution)


✅ Value Distribution by Age (columns=age groups, rows=values):
age                  3.0    18.0      19.0      20.0      21.0      22.0  \
value_dimension                                                            
ACHIEVEMENT      0.000000  0.024  0.082474  0.053140  0.038585  0.047619   
BENEVOLENCE      0.545455  0.448  0.381443  0.429952  0.520900  0.495238   
CONFORMITY       0.000000  0.048  0.061856  0.038647  0.032154  0.026190   
HEDONISM         0.000000  0.072  0.041237  0.086957  0.054662  0.085714   
POWER            0.000000  0.032  0.010309  0.009662  0.012862  0.016667   
SECURITY         0.272727  0.264  0.340206  0.265700  0.292605  0.223810   
SELF-DIRECTION   0.000000  0.008  0.010309  0.000000  0.000000  0.004762   
STIMULATION      0.000000  0.048  0.010309  0.038647  0.025723  0.042857   
TRADITION        0.000000  0.008  0.000000  0.024155  0.000000  0.007143   
UNIVERSALISM     0.181818  0.048  0.061856  0.053140  0.022508  0.050000   

age                  23

In [32]:
# How value dimensions are distributed across different sexes.
value_sex_distribution = pd.crosstab(
    index=utterances_df['value_dimension'], 
    columns=utterances_df['sex'], 
    normalize='columns'
)
print("\n✅ Value Distribution by Sex (columns=sex, rows=values):")
print(value_sex_distribution)



✅ Value Distribution by Sex (columns=sex, rows=values):
sex                Female      Male     Other
value_dimension                              
ACHIEVEMENT      0.035745  0.037587  0.053333
BENEVOLENCE      0.501650  0.500791  0.426667
CONFORMITY       0.032538  0.035015  0.013333
HEDONISM         0.070452  0.071711  0.093333
POWER            0.014053  0.019189  0.013333
SECURITY         0.240592  0.225816  0.240000
SELF-DIRECTION   0.006508  0.007814  0.000000
STIMULATION      0.041592  0.045500  0.120000
TRADITION        0.008300  0.010188  0.000000
UNIVERSALISM     0.048571  0.046390  0.040000


In [33]:
# How value dimensions are distributed across different education levels.
value_education_distribution = pd.crosstab(
    index=utterances_df['value_dimension'], 
    columns=utterances_df['education'], 
    normalize='columns'
)
print("\n✅ Value Distribution by Education (columns=education, rows=values):")
print(value_education_distribution)



✅ Value Distribution by Education (columns=education, rows=values):
education        Four-year college  Less than four-year college  Postgraduate
value_dimension                                                              
ACHIEVEMENT               0.035064                     0.037222      0.039313
BENEVOLENCE               0.501262                     0.498543      0.509924
CONFORMITY                0.037322                     0.032992      0.025954
HEDONISM                  0.066676                     0.073973      0.072519
POWER                     0.015938                     0.016167      0.019847
SECURITY                  0.231239                     0.235830      0.229771
SELF-DIRECTION            0.007438                     0.007426      0.004962
STIMULATION               0.043698                     0.043519      0.045038
TRADITION                 0.009961                     0.008835      0.008397
UNIVERSALISM              0.051401                     0.045493      0.04

In [34]:
# how value dimensions are distributed between persuaders and persuadees.
value_role_distribution = pd.crosstab(
    index=utterances_df['value_dimension'], 
    columns=utterances_df['role'], 
    normalize='columns'
)
print("\n✅ Value Distribution by Role (columns=0=Persuader,1=Persuadee, rows=values):")
print(value_role_distribution)



✅ Value Distribution by Role (columns=0=Persuader,1=Persuadee, rows=values):
role                    0         1
value_dimension                    
ACHIEVEMENT      0.026509  0.046942
BENEVOLENCE      0.496132  0.506291
CONFORMITY       0.035189  0.032133
HEDONISM         0.068774  0.073364
POWER            0.014057  0.018970
SECURITY         0.262358  0.203736
SELF-DIRECTION   0.004340  0.009872
STIMULATION      0.037358  0.050813
TRADITION        0.006792  0.011518
UNIVERSALISM     0.048491  0.046361


In [43]:
# Relationship between value dimensions and donation outcomes, split by role.
# For persuaders (role 0)
persuader_donation_by_value = utterances_df[utterances_df['role'] == 0].groupby('value_dimension')['donation_amount'].mean().sort_values(ascending=False)
print("\n✅ Average Donation Amount per Value Dimension (Persuaders):")
print(persuader_donation_by_value)

# For persuadees (role 1)
persuadee_donation_by_value = utterances_df[utterances_df['role'] == 1].groupby('value_dimension')['donation_amount'].mean().sort_values(ascending=False)
print("\n✅ Average Donation Amount per Value Dimension (Persuadees):")
print(persuadee_donation_by_value)


✅ Average Donation Amount per Value Dimension (Persuaders):
value_dimension
ACHIEVEMENT       3.669466
CONFORMITY        2.599678
UNIVERSALISM      2.507685
HEDONISM          2.313580
BENEVOLENCE       2.309097
SELF-DIRECTION    2.012826
SECURITY          1.917454
TRADITION         1.100417
STIMULATION       0.943283
POWER             0.919262
Name: donation_amount, dtype: float64

✅ Average Donation Amount per Value Dimension (Persuadees):
value_dimension
CONFORMITY        3.309789
STIMULATION       3.214705
SECURITY          2.976551
BENEVOLENCE       2.152692
HEDONISM          1.763707
TRADITION         1.258655
POWER             1.100153
SELF-DIRECTION    1.074510
ACHIEVEMENT       0.992165
UNIVERSALISM      0.619207
Name: donation_amount, dtype: float64
