In [None]:
# Install required packages
!pip install sentence-transformers
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install networkx
!pip install community
!pip install python-igraph
!pip install matplotlib
!pip install gensim
!pip install tqdm     # For progress bars

In [43]:
# Add to your pipeline for complete reproducibility
import random
import numpy as np
import os

# Set deterministic seeds
random.seed(42)
np.random.seed(42)
os.environ['PYTHONHASHSEED'] = str(42)

# Document all hyperparameters explicitly
HYPERPARAMETERS = {
    'embedding_model': 'all-MiniLM-L6-v2',
    'umap_neighbors': 25,
    'umap_components': 5,
    'umap_min_dist': 0.10,
    'hdbscan_min_size': 50,
    'hdbscan_min_samples': 10,
    'similarity_threshold': 0.7,
    'top_n_words': 1}

In [44]:
# Module 1: Core Embedding Model
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import re
from typing import List, Dict, Tuple

class EmbeddingModel:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """
        Initialize the core embedding model
        """
        self.model = SentenceTransformer(model_name)
        self.stance_mapping = {'favor': 1, 'against': -1, 'neutral': 0}
    
    def preprocess_text(self, text: str) -> str:
        """Clean and preprocess text for better embeddings"""
        if pd.isna(text):
            return ""
        # Remove URLs, mentions, hashtags for cleaner text
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'#\w+', '', text)
        return text.strip()
    
    def create_tweet_embeddings(self, df: pd.DataFrame) -> Dict[str, np.ndarray]:
        """
        Create embeddings for tweets including original content and context
        """
        tweet_embeddings = {}
        
        # Combine different text components for richer embeddings
        for idx, row in df.iterrows():
            # Create enriched text by combining tweet content with stance and intention context
            original_tweet = self.preprocess_text(row['text_sender'])
            
            # Add stance context
            stance_context = ""
            if pd.notna(row['final_stance']):
                stance_context = f"stance: {row['final_stance']} "
            
            # Add intentions context
            intentions = ""
            if pd.notna(row['final_intention1']):
                intentions += f"intention1: {row['final_intention1']} "
            if pd.notna(row['final_intention2']):
                intentions += f"intention2: {row['final_intention2']} "
            
            # Add explanation context
            explanation = ""
            if pd.notna(row['final_stance_explanation']):
                explanation += f"stance_explanation: {row['final_stance_explanation']} "
            if pd.notna(row['explanation_intention']):
                explanation += f"intentions_explanation: {row['explanation_intention']} "
            
            # Combine all components
            enriched_text = f"{original_tweet} {stance_context} {intentions} {explanation}".strip()
            
            # Create embedding - FIXED: Use self.model.encode()
            embedding = self.model.encode(enriched_text)
            tweet_embeddings[idx] = embedding
            
        return tweet_embeddings

In [45]:
# Module 2: User-Level Aggregation
import numpy as np
from collections import defaultdict

class UserEmbeddingAggregator:
    def __init__(self, embedding_model):
        self.embedding_model = embedding_model
    
    def create_user_embeddings(self, df: pd.DataFrame, tweet_embeddings: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
        """
        Create user-level embeddings by aggregating their tweet embeddings
        """
        user_embeddings = {}
        
        # Group tweets by user (using party_SENDER_eng column for user identification)
        if 'party_SENDER_eng' not in df.columns:
            # Create artificial user groups based on tweet patterns
            df['party_SENDER_eng'] = self._create_user_groups(df, tweet_embeddings)
        
        # Aggregate embeddings for each user
        user_groups = df.groupby('party_SENDER_eng')
        
        for user_id, group in user_groups:
            # Get all tweet embeddings for this user
            user_tweet_indices = group.index.tolist()
            user_tweet_embeddings = [tweet_embeddings[idx] for idx in user_tweet_indices]
            
            # Average the embeddings
            if user_tweet_embeddings:  # Check if list is not empty
                user_embedding = np.mean(user_tweet_embeddings, axis=0)
                user_embeddings[user_id] = user_embedding
            
        return user_embeddings
    
    def _create_user_groups(self, df: pd.DataFrame, tweet_embeddings: Dict[str, np.ndarray]) -> List[str]:
        """
        Create artificial user groups based on tweet content similarity
        """
        # Simple grouping based on tweet content similarity
        n_tweets = len(df)
        groups = []
        
        # For demonstration, create groups based on tweet content similarity
        for i in range(n_tweets):
            groups.append(f"party_{i % 5}")  # Simple grouping into 5 groups
            
        return groups


In [46]:
# Module 3: Topic-Stance-Intentions Embeddings
class TopicStanceIntentionsEmbedder:
    def __init__(self, embedding_model):
        # Store the actual SentenceTransformer model
        self.model = embedding_model.model  # This is the correct way
    
    def create_topic_stance_intentions_embeddings(self, df: pd.DataFrame) -> Dict[str, np.ndarray]:
        """
        Create embeddings that capture topic-stance-intentions relationships
        """
        topic_stance_embeddings = {}
        
        # Group by policy topic (using target_policy column)
        if 'target_policy' not in df.columns:
            df['target_policy'] = 'public_spending_limits'  # Default topic
            
        topic_groups = df.groupby('target_policy')
        
        for topic, group in topic_groups:
            # Create topic-stance-intentions combination embeddings
            for idx, row in group.iterrows():
                # Create comprehensive text for topic-stance-intentions embedding
                topic_text = f"topic: {topic}"
                stance_text = f"stance: {row['final_stance']}" if pd.notna(row['final_stance']) else ""
                intention1_text = f"intention1: {row['final_intention1']}" if pd.notna(row['final_intention1']) else ""
                intention2_text = f"intention2: {row['final_intention2']}" if pd.notna(row['final_intention2']) else ""
                
                # Combine all elements
                combined_text = f"{topic_text} {stance_text} {intention1_text} {intention2_text}".strip()
                
                if combined_text.strip():
                    # Use the actual SentenceTransformer's encode method
                    embedding = self.model.encode(combined_text)
                    topic_stance_embeddings[f"{topic}_{idx}"] = embedding
        
        return topic_stance_embeddings


In [47]:
# Module 4: Network Graph Construction
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

class NetworkGraphBuilder:
    def __init__(self, embedding_model):
        self.embedding_model = embedding_model
    
    def create_network_graph(self, df: pd.DataFrame, tweet_embeddings: Dict[str, np.ndarray]) -> nx.Graph:
        """
        Create a network graph with tweet embeddings as nodes
        """
        G = nx.Graph()
        
        # Add tweet nodes with embeddings
        for idx, embedding in tweet_embeddings.items():
            tweet_text = self.embedding_model.preprocess_text(df.loc[idx, 'text_sender'])
            G.add_node(idx, 
                      embedding=embedding,
                      tweet=tweet_text,
                      stance=df.loc[idx, 'final_stance'] if 'final_stance' in df.columns else None,
                      intentions1=df.loc[idx, 'final_intention1'] if 'final_intention1' in df.columns else None,
                      intentions2=df.loc[idx, 'final_intention2'] if 'final_intention2' in df.columns else None,
                      topic=df.loc[idx, 'target_policy'] if 'target_policy' in df.columns else 'unknown')
        
        # Add edges based on tweet relationships
        # Simple similarity-based edges
        node_indices = list(tweet_embeddings.keys())
        for i in range(len(node_indices)):
            for j in range(i+1, len(node_indices)):
                idx1, idx2 = node_indices[i], node_indices[j]
                similarity = cosine_similarity([tweet_embeddings[idx1]], [tweet_embeddings[idx2]])[0][0]
                
                # Add edge if similarity is above threshold
                if similarity > 0.7:  # Adjust threshold as needed
                    G.add_edge(idx1, idx2, similarity=similarity)
        
        return G


In [48]:
# Module 5: User Stance Profiling
class UserStanceProfiler:
    def __init__(self, embedding_model):
        self.embedding_model = embedding_model
    
    def get_user_stance_profile(self, df: pd.DataFrame) -> Dict[str, Dict]:
        """
        Create stance profiles for users (parties)
        """
        user_stance_profiles = {}
        
        if 'party_SENDER_eng' not in df.columns:
            # Create a simple grouping if column doesn't exist
            df['party_SENDER_eng'] = [f"party_{i % 5}" for i in range(len(df))]
        
        user_groups = df.groupby('party_SENDER_eng')
        
        for user_id, group in user_groups:
            stance_counts = group['final_stance'].value_counts()
            intention_counts = {}
            
            # Count intentions
            for idx, row in group.iterrows():
                if pd.notna(row['final_intention1']):
                    intention_counts[row['final_intention1']] = intention_counts.get(row['final_intention1'], 0) + 1
                if pd.notna(row['final_intention2']):
                    intention_counts[row['final_intention2']] = intention_counts.get(row['final_intention2'], 0) + 1
            
            user_stance_profiles[user_id] = {
                'stance_distribution': stance_counts.to_dict(),
                'intention_distribution': intention_counts,
                'avg_stance_score': self._calculate_avg_stance_score(group['final_stance']),
                'total_tweets': len(group)
            }
        
        return user_stance_profiles
    
    def _calculate_avg_stance_score(self, stance_series) -> float:
        """Calculate average stance score (favor=1, against=-1, neutral=0)"""
        scores = [self.embedding_model.stance_mapping.get(stance, 0) for stance in stance_series if pd.notna(stance)]
        return np.mean(scores) if scores else 0


In [49]:
# Add this debugging code before the pipeline runs
def debug_embedding_model():
    embedding_model = EmbeddingModel()
    print("Type of embedding_model:", type(embedding_model))
    print("embedding_model attributes:", [attr for attr in dir(embedding_model) if not attr.startswith('_')])
    print("Has encode method:", hasattr(embedding_model, 'encode'))
    
    # Check if model attribute exists and its type
    if hasattr(embedding_model, 'model'):
        print("Type of embedding_model.model:", type(embedding_model.model))
        print("embedding_model.model attributes:", [attr for attr in dir(embedding_model.model) if not attr.startswith('_')])
        print("Has encode method on model:", hasattr(embedding_model.model, 'encode'))
    
debug_embedding_model()

Type of embedding_model: <class '__main__.EmbeddingModel'>
embedding_model attributes: ['create_tweet_embeddings', 'model', 'preprocess_text', 'stance_mapping']
Has encode method: False
Type of embedding_model.model: <class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>
embedding_model.model attributes: ['T_destination', 'active_adapter', 'active_adapters', 'add_adapter', 'add_module', 'append', 'apply', 'backend', 'bfloat16', 'buffers', 'call_super_init', 'check_peft_compatible_model', 'children', 'compile', 'cpu', 'cuda', 'default_prompt_name', 'delete_adapter', 'device', 'disable_adapters', 'double', 'dtype', 'dump_patches', 'enable_adapters', 'encode', 'encode_document', 'encode_multi_process', 'encode_query', 'eval', 'evaluate', 'extend', 'extra_repr', 'fit', 'float', 'forward', 'get_adapter_state_dict', 'get_backend', 'get_buffer', 'get_extra_state', 'get_max_seq_length', 'get_model_kwargs', 'get_parameter', 'get_sentence_embedding_dimension', 'get_sentence_feat

In [31]:
# Test the fix
embedding_model = EmbeddingModel()
topic_embedder = TopicStanceIntentionsEmbedder(embedding_model)

# Test that the model attribute works
test_text = "Hello world"
embedding = topic_embedder.model.encode(test_text)  # This should work now
print("Test successful! Embedding shape:", embedding.shape)


Test successful! Embedding shape: (384,)


In [50]:
# Module 6: Complete Pipeline
def run_narrative_embedding_pipeline(df: pd.DataFrame) -> Dict:
    """
    Complete pipeline for narrative embedding analysis
    """
    # Initialize components
    embedding_model = EmbeddingModel()
    user_aggregator = UserEmbeddingAggregator(embedding_model)
    topic_embedder = TopicStanceIntentionsEmbedder(embedding_model)
    graph_builder = NetworkGraphBuilder(embedding_model)
    stance_profiler = UserStanceProfiler(embedding_model)
    
    # Step 1: Create tweet embeddings
    tweet_embeddings = embedding_model.create_tweet_embeddings(df)
    
    # Step 2: Create user embeddings
    user_embeddings = user_aggregator.create_user_embeddings(df, tweet_embeddings)
    
    # Step 3: Create topic-stance-intentions embeddings
    topic_stance_embeddings = topic_embedder.create_topic_stance_intentions_embeddings(df)
    
    # Step 4: Create network graph
    graph = graph_builder.create_network_graph(df, tweet_embeddings)
    
    # Step 5: Get user stance profiles
    user_profiles = stance_profiler.get_user_stance_profile(df)
    
    return {
        'tweet_embeddings': tweet_embeddings,
        'user_embeddings': user_embeddings,
        'topic_stance_embeddings': topic_stance_embeddings,
        'graph': graph,
        'user_profiles': user_profiles
    }


In [20]:
# Test the embedding model
embedding_model = EmbeddingModel()
test_text = "This is a test tweet"
embedding = embedding_model.model.encode(test_text)
print("Embedding shape:", embedding.shape)
print("Test successful!")


Embedding shape: (384,)
Test successful!


In [51]:
# Module 7: Usage Example
def main():
    # Load the actual dataset
    df = pd.read_csv("final_FI_01122025_public_spending.csv")
    
    print("Dataset shape:", df.shape)
    print("Columns:", df.columns.tolist())
    
    # Run complete pipeline
    results = run_narrative_embedding_pipeline(df)
    
    print("Pipeline completed successfully!")
    print(f"Number of tweet embeddings: {len(results['tweet_embeddings'])}")
    print(f"Number of user embeddings: {len(results['user_embeddings'])}")
    print(f"Number of topic-stance embeddings: {len(results['topic_stance_embeddings'])}")

if __name__ == "__main__":
    main()

Dataset shape: (940, 186)
Columns: ['dataset', 'party_SENDER_eng', 'text_sender', 'A1_stance', 'A2_stance', 'A3_stance', 'adv_favor_stance', 'adv_against_stance', 'adv_neutral_stance', 'final_stance', 'confidence_stance', 'final_stance_explanation', 'A1_intention', 'A2_intention', 'A3_intention', 'final_intention1', 'final_intention2', 'confidence_intention', 'explanation_intention', 'targeted_pol_reaction_text_tr', 'topic_id', 'target_policy', 'topic_top_words', 'meta_json', 'actor_type_sender', 'reply_settings_company', 'entities_company', 'in_reply_to_user_id_company', 't_company', 'public_metrics_company', 'edit_history_tweet_ids_company', 'author_id_company', 'created_at_company', 'conversation_id_company', 'referenced_tweets_company', 'lang_company', 'id', 'author_url_company', 'author_verified_type_company', 'description_sender', 'author_profile_image_url_company', 'author_created_at_company', 'author_name_company', 'author_public_metrics_company', 'author_verified_company', 'au

In [52]:
# Module 8: Community Detection and Mapping (Simplified)
import community as community_louvain
import numpy as np
from collections import defaultdict

class CommunityExtractor:
    def __init__(self, embedding_model):
        self.model = embedding_model.model
        self.community_mapping = {}  # Maps node indices to community IDs
    
    def extract_tweet_communities(self, graph):
        """
        Extract communities from the tweet network graph using Louvain method
        """
        # Use Louvain method for community detection
        partition = community_louvain.best_partition(graph)
        self.community_mapping = partition
        return partition
    
    def map_communities_to_csv(self, df, graph, community_partition):
        """
        Map community assignments back to the original CSV file
        """
        # Add community column to dataframe
        df['community_id'] = None
        
        # Map each tweet to its community
        for node_idx, community_id in community_partition.items():
            if node_idx < len(df):  # Make sure index is valid
                df.loc[node_idx, 'community_id'] = community_id
        
        return df
    
    def get_community_statistics(self, df, community_partition):
        """
        Get statistics about communities
        """
        community_stats = defaultdict(list)
        
        for node_idx, community_id in community_partition.items():
            if node_idx < len(df):
                # Get the original tweet data for this node
                row = df.iloc[node_idx]
                community_stats[community_id].append({
                    'stance': row.get('final_stance', 'unknown'),
                    'intention1': row.get('final_intention1', 'unknown'),
                    'intention2': row.get('final_intention2', 'unknown'),
                    'topic': row.get('target_policy', 'unknown'),
                    'tweet_text': row.get('text_sender', 'unknown')[:50] + '...' if pd.notna(row.get('text_sender')) else 'unknown'
                })
        
        return community_stats

# Enhanced pipeline with community detection (simplified)
def run_enhanced_narrative_embedding_pipeline(df: pd.DataFrame) -> Dict:
    """
    Complete pipeline for narrative embedding analysis with community detection
    """
    # Initialize components
    embedding_model = EmbeddingModel()
    user_aggregator = UserEmbeddingAggregator(embedding_model)
    topic_embedder = TopicStanceIntentionsEmbedder(embedding_model)
    graph_builder = NetworkGraphBuilder(embedding_model)
    stance_profiler = UserStanceProfiler(embedding_model)
    community_extractor = CommunityExtractor(embedding_model)
    
    # Step 1: Create tweet embeddings
    tweet_embeddings = embedding_model.create_tweet_embeddings(df)
    
    # Step 2: Create user embeddings
    user_embeddings = user_aggregator.create_user_embeddings(df, tweet_embeddings)
    
    # Step 3: Create topic-stance-intentions embeddings
    topic_stance_embeddings = topic_embedder.create_topic_stance_intentions_embeddings(df)
    
    # Step 4: Create network graph
    graph = graph_builder.create_network_graph(df, tweet_embeddings)
    
    # Step 5: Extract communities (using only Louvain method)
    tweet_communities = community_extractor.extract_tweet_communities(graph)
    
    # Step 6: Map communities back to CSV
    df_with_communities = community_extractor.map_communities_to_csv(df, graph, tweet_communities)
    
    # Step 7: Get community statistics
    community_stats = community_extractor.get_community_statistics(df_with_communities, tweet_communities)
    
    # Step 8: Get user stance profiles
    user_profiles = stance_profiler.get_user_stance_profile(df_with_communities)
    
    return {
        'tweet_embeddings': tweet_embeddings,
        'user_embeddings': user_embeddings,
        'topic_stance_embeddings': topic_stance_embeddings,
        'graph': graph,
        'user_profiles': user_profiles,
        'communities': tweet_communities,
        'community_stats': community_stats,
        'df_with_communities': df_with_communities
    }


In [59]:
# Run the enhanced pipeline
results = run_enhanced_narrative_embedding_pipeline(df)

# Check the results
print("Number of communities detected:", len(set(results['communities'].values())))
print("First 10 community assignments:", list(results['communities'].items())[:10])

# Export to CSV
results['df_with_communities'].to_csv('final_FI_01122025_with_communities.csv', index=False)
print("CSV with communities saved!")

# Check community statistics
print("Community statistics:")
for comm_id, stats in list(results['community_stats'].items())[:3]:  # Show first 3 communities
    print(f"Community {comm_id}: {len(stats)} tweets")


KeyError: 'text_sender'

In [61]:
print(df)

                                               tweet stance  \
0  Climate change is a serious threat that needs ...    pro   
1  We should focus on economic growth over enviro...    con   
2  The new policy will help reduce carbon emissio...    pro   
3  This policy benefits the wealthy at the expens...    con   
4  Social media platforms should regulate misinfo...    pro   
5  Government oversight of social media is an ove...    con   

                                  stance_explanation        intentions1  \
0  Climate change affects everyone and requires u...           advocacy   
1  Economic stability is more important than envi...         opposition   
2   Policy will create positive environmental impact     policy_support   
3  Policy creates inequality and benefits only ce...  policy_opposition   
4  Misinformation harms public discourse and demo...         regulation   
5  Government regulation limits free speech and p...            freedom   

     intentions2    policy_topic

In [58]:
# Quick debugging
print("Available columns in DataFrame:")
print(df.columns.tolist())
print("\nFirst few rows of text_sender column:")
print(df['text_sender'].head())

Available columns in DataFrame:
['tweet', 'stance', 'stance_explanation', 'intentions1', 'intentions2', 'policy_topic', 'user_id']

First few rows of text_sender column:


KeyError: 'text_sender'

In [None]:
# Export the dataframe with communities to CSV
df_with_communities.to_csv('final_FI_01122025_with_communities.csv', index=False)
print("CSV with communities saved as 'final_FI_01122025_with_communities.csv'")

# Export community statistics
import json
with open('community_statistics.json', 'w') as f:
    json.dump(results['community_stats'], f, indent=2)
print("Community statistics saved as 'community_statistics.json'")


In [None]:
# Analyze community patterns
def analyze_communities(df_with_communities):
    """
    Analyze and visualize community patterns
    """
    # Group by community and analyze
    community_analysis = df_with_communities.groupby('community_id').agg({
        'final_stance': lambda x: x.value_counts().to_dict(),
        'final_intention1': lambda x: x.value_counts().to_dict(),
        'target_policy': lambda x: x.value_counts().to_dict(),
        'party_SENDER_eng': lambda x: x.value_counts().to_dict()
    }).reset_index()
    
    return community_analysis

# Run analysis
community_analysis = analyze_communities(df_with_communities)
print("Community Analysis:")
print(community_analysis)


In [None]:
# Create a summary of each community
def create_community_summary(df_with_communities):
    """
    Create a summary of each community's characteristics
    """
    summary = []
    
    for community_id in set(df_with_communities['community_id'].dropna()):
        community_data = df_with_communities[df_with_communities['community_id'] == community_id]
        
        summary.append({
            'community_id': community_id,
            'tweet_count': len(community_data),
            'average_stance_score': community_data['final_stance'].value_counts().get('favor', 0) - 
                                  community_data['final_stance'].value_counts().get('against', 0),
            'main_topics': community_data['target_policy'].value_counts().head(3).to_dict(),
            'main_intentions': community_data['final_intention1'].value_counts().head(3).to_dict(),
            'main_parties': community_data['party_SENDER_eng'].value_counts().head(3).to_dict()
        })
    
    return summary

# Generate summary
community_summary = create_community_summary(df_with_communities)
print("Community Summary:")
for comm in community_summary:
    print(f"Community {comm['community_id']}: {comm['tweet_count']} tweets")
