In [2]:
# !pip install -q spacy sentence-transformers scikit-learn
# !python -m spacy download en_core_web_trf
import numpy as np
import pandas as pd
import glob
import os
import spacy

# SpaCy is an industrial-strength NLP library that processes text in a pipeline fashion, 
# applying a sequence of components (tokenizer, tagger, parser, etc.) to analyze language. 
# In your code, SpaCy is specifically used for sentence segmentation through the 
# line nlp = spacy.load("en_core_web_trf") which loads a transformer-based English language model, 
# and the split_into_sentences() function which uses 
# SpaCy's sentence boundary detection capabilities (doc.sents) to intelligently break narrative text into individual sentences. 
# Unlike simple split methods, SpaCy recognizes complex sentence boundaries by understanding punctuation 
# in context, handling abbreviations, quotes, and other linguistic complexities.

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

print("Loading SpaCy 'en_core_web_trf' for sentence tokenization...")
nlp = spacy.load("en_core_web_trf")
print("SpaCy loaded.\n")

def split_into_sentences(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]

def embed_sentences(sentences, model):
    return model.encode(sentences, convert_to_numpy=True)


Loading SpaCy 'en_core_web_trf' for sentence tokenization...
SpaCy loaded.



In [3]:
data_path = './data/'  # MAKE SURE THE PATH IS RIGHT
value_files = [
    f for f in glob.glob(os.path.join(data_path, '*.csv'))
    if os.path.basename(f).lower() not in ['train.csv', 'test.csv', 'eval.csv', 'meta.csv']
]

valuenet_df = pd.concat([
    pd.read_csv(file).assign(value_dimension=os.path.basename(file).replace('.csv', ''))
    for file in value_files
], ignore_index=True)

# Convert scenario col to string
valuenet_df['scenario'] = valuenet_df['scenario'].astype(str)
valuenet_scenarios = valuenet_df['scenario'].tolist()

print(f"Loaded ValueNet with {len(valuenet_df)} total items from {len(value_files)} CSV files.")
print("Sample of ValueNet data:")
display(valuenet_df.head(5))

# Two different types of BERT Models
embedding_models = {
    "MiniLM": SentenceTransformer("all-MiniLM-L6-v2"),
    "MPNet": SentenceTransformer("all-mpnet-base-v2")
}

print("Models loaded:", list(embedding_models.keys()))


Loaded ValueNet with 21374 total items from 10 CSV files.
Sample of ValueNet data:


Unnamed: 0.1,Unnamed: 0,uid,scenario,label,value_dimension
0,0,51609,i never want to love again.,-1,SECURITY
1,1,51610,I'm drowning and nobody knows,0,SECURITY
2,2,51601,Hanging out with friends is too much work,-1,SECURITY
3,3,51611,A friend is moving away for good,-1,SECURITY
4,4,51604,"My family is messed up, I just wanna vent.",-1,SECURITY


Models loaded: ['MiniLM', 'MPNet']


In [4]:
# Create KMeans clusters for ValueNet scenarios
def create_valuenet_clusters():
    valuenet_cluster_info = {}
    
    for model_name, model_obj in embedding_models.items():
        print(f"Creating KMeans clusters for {model_name}...")
        
        # Embed all ValueNet scenarios
        scenario_embeddings = embed_sentences(valuenet_scenarios, model_obj)
        
        # Create KMeans with 10 clusters (one for each Schwartz value)
        kmeans = KMeans(n_clusters=10, random_state=42)
        cluster_labels = kmeans.fit_predict(scenario_embeddings)
        
        # Map cluster IDs to value names based on most common value in each cluster
        cluster_value_counts = {}
        for i, label in enumerate(cluster_labels):
            value_dim = valuenet_df.iloc[i]['value_dimension']
            if label not in cluster_value_counts:
                cluster_value_counts[label] = {}
            if value_dim not in cluster_value_counts[label]:
                cluster_value_counts[label][value_dim] = 0
            cluster_value_counts[label][value_dim] += 1
        
        # Assign the most common value name to each cluster
        cluster_names = {}
        for cluster_id, value_counts in cluster_value_counts.items():
            most_common_value = max(value_counts.items(), key=lambda x: x[1])[0]
            cluster_names[cluster_id] = most_common_value
        
        # Store everything for this model
        valuenet_cluster_info[model_name] = {
            'kmeans': kmeans,
            'cluster_names': cluster_names,
            'embeddings': scenario_embeddings
        }
        print("done")
    return valuenet_cluster_info

# Create the clusters
valuenet_cluster_info = create_valuenet_clusters()

Creating KMeans clusters for MiniLM...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


done
Creating KMeans clusters for MPNet...
done


In [5]:
# This function creates a "center" (average embedding) for each value in valueNet scenariosby
# 1. Finding all scenarios belonging to a specific value dimension
# 2. Converting each scenario to an embedding
# 3. Averaging these embeddings to create a single representative vector for that value dimension
def create_dimension_centers(valuenet_df, model):
    dimension_centers = {}
    all_dims = valuenet_df['value_dimension'].unique()
    for dim_name in all_dims:
        # Subset scenarios for this dimension only
        subset_df = valuenet_df[valuenet_df['value_dimension'] == dim_name]
        scenario_texts = subset_df['scenario'].tolist()
        
        # Embed them
        scenario_embeds = embed_sentences(scenario_texts, model)
        
        # Average across them => dimension center
        dim_center = np.mean(scenario_embeds, axis=0)
        dimension_centers[dim_name] = dim_center
    
    return dimension_centers


# Splits a  text into sentences and converts them to embeddings
# For each value center, finds the top N = 3 sentences from the text that are most similar to that value
# Returns a dictionary mapping each value dimension to its most similar sentences from the narrative (AI generated)
def find_top_sentences_for_each_value_dimension(text, model, dimension_centers, top_n = 3):
    sentences = split_into_sentences(text)
    text_embeddings = embed_sentences(sentences, model)
    results = {}
    for dim_name, dim_center in dimension_centers.items():
        sims = cosine_similarity(text_embeddings, dim_center.reshape(1, -1)).flatten()
        
        # Get top_n sentences
        top_indices = np.argsort(sims)[-top_n:]
        top_sents = [(sentences[i], sims[i]) for i in top_indices]
        
        results[dim_name] = top_sents
    return results

def run_dimension_center_approach(narrative_text, top_n = 3):
    for model_name, model_obj in embedding_models.items():
        print(f"\n=== [Dimension-Center Approach for {model_name}] ===")
        

        dim_centers = create_dimension_centers(valuenet_df, model_obj)        
        # 2) Find top lines in the text for each dimension center
        value_map = find_top_sentences_for_each_value_dimension(narrative_text, model_obj, dim_centers, top_n)
        
        # 3) Print results
        for dim_name, top_sents in value_map.items():
            print(f"\nValue Dimension: {dim_name}")
            for line, sim_score in top_sents:
                print(f"  [Sim={sim_score:.4f}] {line}")


In [6]:
# This function finds the sentences most similar to a specific cluster center.
def get_top_sentences_for_cluster(sentences, text_embeddings, center_vec, top_n=3):
    sims = cosine_similarity(text_embeddings, center_vec.reshape(1, -1)).flatten()
    top_indices = np.argsort(sims)[-top_n:]
    top_sents = [(sentences[i], sims[i]) for i in top_indices]
    return top_sents



# 1) Convert text to sentences, sentences to embeddings, and For each cluster center in ValueNet's KMeans, pick top_n text lines
def find_top_sentences_for_each_value_cluster(text, model, valuenet_clust_info, top_n=3):
    # Prepare text sentences
    sentences = split_into_sentences(text)
    text_embeddings = embed_sentences(sentences, model)

    # Retrieve cluster centers + names from valuenet
    kmeans_model = valuenet_clust_info['kmeans']
    cluster_centers = kmeans_model.cluster_centers_
    cluster_name_map = valuenet_clust_info['cluster_names']

    # For each cluster, pick top_n lines
    results = {}
    for cluster_id in range(len(cluster_centers)):
        center_vec = cluster_centers[cluster_id]
        top_sents = get_top_sentences_for_cluster(sentences, text_embeddings, center_vec, top_n=top_n)
        # store them
        results[cluster_id] = top_sents
    return sentences, text_embeddings, results

# This function runs the KMeans cluster matching approach for each model.
def run_block_3_kmeans_cluster_matching(narrative_text, top_n=3):
    for model_name, model_obj in embedding_models.items():
        print(f"\n=== [Block 3: KMeans Value Matching for {model_name}] ===")
        clust_info = valuenet_cluster_info[model_name]
        sentences, text_embeds, cluster_map = find_top_sentences_for_each_value_cluster(
            narrative_text, model_obj, clust_info, top_n
        )

        for cluster_id, top_sents in cluster_map.items():
            # get the cluster label
            cluster_label = clust_info['cluster_names'].get(cluster_id, "Unknown")
            print(f"\n  >> Cluster ID {cluster_id}, Value: {cluster_label}")
            for sent, sim_score in top_sents:
                print(f"     [Sim={sim_score:.4f}] {sent}")



In [7]:
# Finding sentences whose embeddings are furthest from the mean (most different from the average sentence)
def find_outliers_by_distance(text, model, top_n=3):
    sentences = split_into_sentences(text)
    embeddings = embed_sentences(sentences, model)
    mean_vec = np.mean(embeddings, axis=0, keepdims=True)
    distances = 1 - cosine_similarity(embeddings, mean_vec).flatten()
    top_indices = np.argsort(distances)[-top_n:]
    top_outliers = [(sentences[i], distances[i]) for i in top_indices]
    return sentences, embeddings, top_outliers

# This function runs the outlier detection for each model and finds top n outliers.
def run_block_4_outliers(narrative_text, top_n=3):
    for model_name, model_obj in embedding_models.items():
        print(f"\n=== [Block 4: Outlier Sentences for {model_name}] ===")
        sents, emb, outliers = find_outliers_by_distance(narrative_text, model_obj, top_n)
        for sent, dist_score in outliers:
            print(f"[Dist={dist_score:.4f}] {sent}")



In [8]:
# 1) Define the narrative text
narrative_text = """
Oh, you know, just another week here at the home. Nothing too exciting, but it’s been nice.
Monday, I had knitting circle. Finally finished that blue shawl for my granddaughter, Lizzie—she’s off at college and always forgetting a jacket.
Tuesday, Mabel and I had lunch together. The meatloaf was terrible, but we had a good laugh about it.
She was going on about her great-grandson’s piano recital. Said he’s the next Mozart, but, well, we’ll see.
Wednesday, David called. He’s busy as ever, but he always makes time for me. Said he’s bringing the kids by Sunday.
I told him not to let them eat all my candy again. Last time, they cleared out my
whole stash of Werther’s. Thursday was bingo night. I was one number away from winning that big tea basket. Just my luck.
Yesterday, I sat out in the garden for a bit. The roses are starting to come in, and it was warm enough to just sit and listen to the birds for a while.
That was nice. And today? Well, not much yet, but I hear there’s peach cobbler for dessert tonight. If it’s as good as last week’s, I might have to get an extra slice.
""".strip()

# 2) Print top text lines for each value dimension (new approach)
print("\n====== BLOCK 2 (New Dimension-Center Approach) ======")
run_dimension_center_approach(narrative_text, top_n=3)
# New Dimension-Center Approach: Finds which sentences align most closely with each value (e.g., Benevolence, Security) by comparing them to that value’s averaged embedding, helping us see event-like statements tied to specific human values.

# 3) Run KMeans Cluster Matching (ADDED THIS)
print("\n====== BLOCK 3: KMEANS CLUSTER MATCHING ======")
run_block_3_kmeans_cluster_matching(narrative_text, top_n=3)
# KMeans Cluster Matching: Groups ValueNet scenarios into clusters and matches sentences to cluster centers


# 3) Print outlier lines (Block 4)
print("\n====== BLOCK 4: OUTLIER SENTENCES ======")
run_block_4_outliers(narrative_text, top_n=3)
# Outlier Sentences: Identifies which lines deviate the most from the overall mean embedding, highlighting unique or noteworthy events that stand out from the rest of the text.

print("\n✅ DONE! Check the printed output above for all results.")




=== [Dimension-Center Approach for MiniLM] ===

Value Dimension: SECURITY
  [Sim=0.3417] Monday, I had knitting circle.
  [Sim=0.3522] Tuesday, Mabel and I had lunch together.
  [Sim=0.4770] I told him not to let them eat all my candy again.

Value Dimension: BENEVOLENCE
  [Sim=0.3320] Finally finished that blue shawl for my granddaughter, Lizzie—she’s off at college and always forgetting a jacket.
  [Sim=0.3417] He’s busy as ever, but he always makes time for me.
  [Sim=0.4737] I told him not to let them eat all my candy again.

Value Dimension: ACHIEVEMENT
  [Sim=0.3673] Monday, I had knitting circle.
  [Sim=0.3730] Yesterday, I sat out in the garden for a bit.
  [Sim=0.4280] I told him not to let them eat all my candy again.

Value Dimension: SELF-DIRECTION
  [Sim=0.3410] Monday, I had knitting circle.
  [Sim=0.3712] Yesterday, I sat out in the garden for a bit.
  [Sim=0.4541] I told him not to let them eat all my candy again.

Value Dimension: POWER
  [Sim=0.3275] Monday, I had k