# Analysis of Apollo Dialogue Conversations

This notebook loads conversation data from DynamoDB and analyzes interactions.

In [None]:
import os
import sys
import boto3
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime
from tqdm.notebook import tqdm

# Make modules that live in the parent directory importable from this notebook
sys.path.append('..')

# Plot defaults: seaborn "whitegrid" style and a 12x8 inch default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## Configure AWS Credentials

First, make sure your AWS credentials are available. Prefer loading them from environment variables; if you set them directly in the cell below instead, do not commit this notebook with the credentials included.

In [ ]:
# Resolve the AWS settings from the environment, falling back to defaults.
# If you write credentials directly into this cell, never commit the notebook with them.

AWS_REGION = os.getenv('AWS_REGION', 'eu-north-1')  # fallback: Stockholm region
DYNAMODB_TABLE = os.getenv('DYNAMODB_TABLE', 'apollolytics_dialogues')

# Fallback when the credentials are not provided through environment variables:
# os.environ['AWS_ACCESS_KEY_ID'] = 'your_access_key_id'
# os.environ['AWS_SECRET_ACCESS_KEY'] = 'your_secret_access_key'

print(f"Using AWS region: {AWS_REGION}")
print(f"DynamoDB table: {DYNAMODB_TABLE}")

## Connect to DynamoDB and Load Data

In [None]:
# Build a handle to the configured DynamoDB table
dynamodb = boto3.resource('dynamodb', region_name=AWS_REGION)
table = dynamodb.Table(DYNAMODB_TABLE)

# Probe the table with a one-item scan; a failure is reported, not raised
try:
    response = table.scan(Limit=1)
except Exception as e:
    print(f"Error connecting to DynamoDB: {str(e)}")
else:
    print(f"Successfully connected to DynamoDB table: {DYNAMODB_TABLE}")

In [None]:
# Function to scan all items from a DynamoDB table
def scan_table(table_name):
    """Scan every item from a DynamoDB table, following pagination.

    Args:
        table_name: Name of the table to scan (resolved through the
            module-level ``dynamodb`` resource), or any object exposing a
            boto3-style ``scan(**kwargs)`` method.

    Returns:
        list: The ``Items`` of all result pages, in the order returned.
    """
    # Honor the argument instead of silently scanning the global `table`
    if isinstance(table_name, str):
        target = dynamodb.Table(table_name)
    else:
        target = table_name

    items = []
    scan_kwargs = {}

    print("Scanning DynamoDB table...")
    while True:
        response = target.scan(**scan_kwargs)
        items.extend(response.get('Items', []))
        print(f"Retrieved {len(items)} items so far...")

        # A missing (or empty) LastEvaluatedKey means this was the final page
        start_key = response.get('LastEvaluatedKey')
        if not start_key:
            break
        # Resume the next request right after the last key of this page
        scan_kwargs['ExclusiveStartKey'] = start_key

    print(f"Total items retrieved: {len(items)}")
    return items

# Pull every item of the configured table into memory
items = scan_table(table_name=DYNAMODB_TABLE)

In [None]:
# Build a DataFrame with one row per scanned DynamoDB item
df = pd.DataFrame(data=items)

# Report the frame's dimensions, then preview its first rows
print(f"Data shape: {df.shape}")
df.head()

## Explore Session Data

In [None]:
# Cast `timestamp` to int, read it as epoch seconds, and store it as a datetime column
epoch_seconds = df['timestamp'].astype(int)
df['datetime'] = pd.to_datetime(epoch_seconds, unit='s')

# Distinct session identifiers present in the data
session_ids = df['session_id'].unique()
print(f"Number of unique sessions: {len(session_ids)}")

# Number of rows per event type
event_counts = df['event_type'].value_counts()
print("\nEvent types:", event_counts, sep="\n")

In [None]:
# Bar chart of how often each event type occurs
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=event_counts.index, y=event_counts.values)
ax.set(title='Distribution of Event Types', xlabel='Event Type', ylabel='Count')
plt.xticks(rotation=45)
plt.show()

## Analyze Dialogue Modes

In [None]:
# Keep only the events recorded at session initialization
is_session_init = df['event_type'] == 'session_init'
session_init_df = df[is_session_init]

# Tally the initialization events per dialogue mode
dialogue_modes = session_init_df['dialogue_mode'].value_counts()
print("Dialogue Modes:", dialogue_modes, sep="\n")

# Bar chart of the dialogue mode tally
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=dialogue_modes.index, y=dialogue_modes.values)
ax.set(title='Distribution of Dialogue Modes', xlabel='Dialogue Mode', ylabel='Count')
plt.xticks(rotation=45)
plt.show()

## Analyze Origin URLs

In [None]:
# Tally the session initialization events per origin URL
origin_urls = session_init_df['origin_url'].value_counts()
print("Origin URLs:", origin_urls, sep="\n")

# Bar chart of the tally; labels are turned vertical and the layout tightened
plt.figure(figsize=(12, 6))
ax = sns.barplot(x=origin_urls.index, y=origin_urls.values)
ax.set(title='Distribution of Origin URLs', xlabel='Origin URL', ylabel='Count')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

## Analyze Messages

In [None]:
# Keep only the message events
is_message = df['event_type'] == 'message'
message_df = df[is_message]

# Tally the messages per role
role_counts = message_df['role'].value_counts()
print("Message Roles:", role_counts, sep="\n")

# Bar chart of the role tally
plt.figure(figsize=(8, 6))
ax = sns.barplot(x=role_counts.index, y=role_counts.values)
ax.set(title='Distribution of Message Roles', xlabel='Role', ylabel='Count')
plt.show()

## Analyze Message Length

In [None]:
# Function to calculate message length from transcript or content
def get_message_length(row):
    """Return the character count of a message row.

    The ``transcript`` field takes precedence over ``message_content``.
    Only string values are measured: a missing, null, or non-string value
    is skipped instead of being passed to ``len`` (which would raise for
    numbers or count elements for lists).

    Args:
        row: Mapping-like object with a ``get`` method (a dict or a
            pandas Series).

    Returns:
        int: Length of the first string-valued field, or 0 if neither
        field holds a string.
    """
    for field in ('transcript', 'message_content'):
        value = row.get(field)
        if isinstance(value, str):
            return len(value)
    return 0

# Add the message length column.
# .assign() returns a new frame, so the filtered slice of `df` is never
# written to in place (avoids pandas' SettingWithCopyWarning).
message_df = message_df.assign(
    message_length=message_df.apply(get_message_length, axis=1)
)

# Group by role and calculate the average message length
avg_length_by_role = message_df.groupby('role')['message_length'].mean()
print("Average Message Length by Role:")
print(avg_length_by_role)

# Plot the average message length by role
plt.figure(figsize=(8, 6))
sns.barplot(x=avg_length_by_role.index, y=avg_length_by_role.values)
plt.title('Average Message Length by Role')
plt.ylabel('Average Character Count')
plt.xlabel('Role')
plt.show()

## Analyze Messages per Session

In [None]:
# Number of message events in each session
messages_per_session = message_df.groupby('session_id').size()

# Summary statistics of the per-session message counts
print(f"Average messages per session: {messages_per_session.mean():.2f}")
print(f"Median messages per session: {messages_per_session.median()}")
print(f"Max messages in a session: {messages_per_session.max()}")
print(f"Min messages in a session: {messages_per_session.min()}")

# Histogram (with a KDE overlay) of the per-session message counts
plt.figure(figsize=(10, 6))
ax = sns.histplot(messages_per_session, kde=True)
ax.set(title='Distribution of Messages per Session', xlabel='Number of Messages', ylabel='Frequency')
plt.show()

## Analyze Propaganda Results

In [None]:
# Restrict the event table to propaganda analysis events
is_propaganda_event = df['event_type'] == 'propaganda_analysis'
propaganda_df = df[is_propaganda_event]
print(f"Number of propaganda analysis events: {len(propaganda_df)}")

# This cell will need customization based on the actual structure of propaganda results
# Here's a sample approach that assumes propaganda_result is a complex nested structure

# Function to extract propaganda categories from the result
def extract_propaganda_categories(row):
    """Return the category names found in a row's propaganda result.

    The categories are the keys of ``row['propaganda_result']['data']``.
    Any row that does not match that shape yields an empty list. The shape
    is validated explicitly rather than with a bare ``except``, so a kernel
    interrupt raised during a long ``apply`` is no longer swallowed.

    Args:
        row: Mapping-like object with a ``get`` method (a dict or a
            pandas Series).

    Returns:
        list: Keys of the ``data`` dict, or ``[]`` when the row has no
        ``propaganda_result`` dict containing a ``data`` dict.
    """
    result = row.get('propaganda_result') if hasattr(row, 'get') else None
    if not isinstance(result, dict):
        return []

    data = result.get('data')
    if not isinstance(data, dict):
        return []

    return list(data.keys())

# Derive the per-event category lists and chart how often each category occurs.
# Deliberately best-effort: any failure is reported instead of raised.
try:
    # .assign() returns a new frame, so the filtered slice of `df` is never
    # written to in place (avoids pandas' SettingWithCopyWarning)
    propaganda_df = propaganda_df.assign(
        categories=propaganda_df.apply(extract_propaganda_categories, axis=1)
    )

    # Explode the categories list to count occurrences
    categories_exploded = propaganda_df.explode('categories')
    category_counts = categories_exploded['categories'].value_counts()

    print("\nPropaganda Categories:")
    print(category_counts)

    # Plot propaganda categories
    plt.figure(figsize=(12, 6))
    sns.barplot(x=category_counts.index, y=category_counts.values)
    plt.title('Distribution of Propaganda Categories')
    plt.ylabel('Count')
    plt.xlabel('Category')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
except Exception as e:
    print(f"Error analyzing propaganda results: {str(e)}")
    print("You may need to customize this section based on your actual data structure.")

## Session Timeline Analysis

In [None]:
# Sample a single session to analyze its timeline (None when there are no sessions)
session_id_sample = session_ids[0] if len(session_ids) > 0 else None

# Stays None when there is no session to show, so the cell then renders no output
timeline_df = None

# Compare against None explicitly: a falsy but valid id (0, '') must still be analyzed
if session_id_sample is not None:
    # Filter data for the sampled session, ordered by its timestamp values
    session_data = df[df['session_id'] == session_id_sample].sort_values('timestamp')

    print(f"Timeline for session: {session_id_sample}")
    print(f"Number of events: {len(session_data)}")

    # Simplified timeline: event type and time only, with a fresh 0..n-1 index
    timeline_df = session_data[['event_type', 'datetime']].reset_index(drop=True)

# Must be the cell's last top-level expression: Jupyter does not render a
# bare expression that is nested inside an `if` body
timeline_df

## Export Processed Data

In [None]:
# Write the event table (including the derived `datetime` column) to a CSV file, without the index
df.to_csv(path_or_buf='apollolytics_dialogue_data.csv', index=False)
print("Data exported to apollolytics_dialogue_data.csv")

## Custom Analysis

This section can be expanded for specific analyses you want to perform on your dialogue data.

In [None]:
# Add your custom analysis here
# For example, you might want to analyze:
# - Correlation between dialogue mode and conversation length
# - Most common propaganda techniques by article source
# - User engagement metrics
# - Sentiment analysis of messages
# - etc.