In [1]:
import sys
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
from datetime import datetime
sys.path.append("../")
from src.helpers import io
from src.classes.dataset import Dataset
from src.classes.annotation_set import AnnotationSet
from src.helpers.visualisation import (
    barplot_distribution,
    plot_confusion_matrix,
    tabulate_annotation_pair_summary,
    analyze_pair_annotations
)
%load_ext autoreload
%autoreload 2
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


# Research Questions:
# 1. What is the distribution of sensitive use flags?
# 2. What is the share of conversations flagged for sensitive uses compared to total labeled conversations?
# 3. What is the prevalence of sensitive, sexual, or illegal content in user queries?


In [2]:
# FILL IN:
# User-editable inputs for this notebook. Paths are relative to the notebook's
# working directory.
PATH_TO_DATASET = "../data/static/wildchat4k-raw.json"
# NOTE(review): DATASET_ID is not referenced by any cell visible in this
# notebook, and "wildchat_1m" does not match the "wildchat4k" file name above
# -- confirm which one is intended.
DATASET_ID = "wildchat_1m"
PATH_TO_ANNOTATIONS_DIR = "../res/gpto3mini-json-wildchat"

# Load dataset (w/o annotations)
dataset = Dataset.load(PATH_TO_DATASET)

# Load annotations into dataset
# Every entry returned by io.listdir_nohidden() is loaded as one annotation set
# and registered under the source tag "automatic_v0". The later cells read the
# annotations back through metadata keys prefixed with that tag
# ("automatic_v0-turn_sensitive_use_flags").
for fpath in io.listdir_nohidden(PATH_TO_ANNOTATIONS_DIR):
    annotation_set = AnnotationSet.load_automatic(path=fpath, source="automatic_v0")
    dataset.add_annotations(annotation_set)

prompt-multi_turn_relationship: 0 / 10127 failed due to invalid annotations.
prompt-interaction_features: 0 / 10127 failed due to invalid annotations.
turn-sensitive_use_flags: 0 / 10127 failed due to invalid annotations.
turn-topic: 1 / 10127 failed due to invalid annotations.
response-interaction_features: 0 / 10127 failed due to invalid annotations.
prompt-function_purpose: 6 / 10127 failed due to invalid annotations.
prompt-media_format: 0 / 10127 failed due to invalid annotations.
response-media_format: 0 / 10127 failed due to invalid annotations.
response-answer_form: 0 / 10127 failed due to invalid annotations.


## Research Question 1: What is the distribution of sensitive use flags?

# 

In [3]:
# Create output directory
outdir = "data/annotation_analysis_v0/sensitive_use_flags"
os.makedirs(outdir, exist_ok=True)

# Get distribution of sensitive use flags (message level, "automatic_v0" source)
sensitive_flags_distribution = dataset.get_annotation_distribution(
    "turn_sensitive_use_flags",
    level="message",
    annotation_source="automatic_v0",
    annotation_as_list_type=False,
)

total_flags = sum(sensitive_flags_distribution.values())

if total_flags == 0:
    # Nothing was annotated: the percentage table below would divide by zero
    # and a normalised plot of no data carries no information, so skip both.
    fig = None
    print("Distribution of sensitive use flags:")
    print("No sensitive use flag annotations found.")
else:
    # Plot distribution (the helper also writes the chart to `output_path`)
    fig = barplot_distribution(
        {"Sensitive Use Flags": sensitive_flags_distribution},
        normalize=True,
        xlabel="Flag Type",
        ylabel="Proportion",
        title="Distribution of Sensitive Use Flags",
        output_path=f"{outdir}/barchart.png",
        order="descending",
    )

    # Display the raw counts and percentages, most frequent flag first.
    # Percentages are relative to the total number of flag instances.
    print("Distribution of sensitive use flags:")
    for flag, count in sorted(sensitive_flags_distribution.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / total_flags) * 100
        print(f"{flag}: {count} instances ({percentage:.2f}%)")



Distribution of sensitive use flags:
None: 9054 instances (88.81%)
Sexually explicit content (Other): 523 instances (5.13%)
Inciting violence, hateful or other harmful behavior (harassment & bullying): 179 instances (1.76%)
Sexually explicit content (fictitious person): 154 instances (1.51%)
Inciting violence, hateful or other harmful behavior (physical harm): 78 instances (0.77%)
Discriminatory practices (Misrepresentation, stereotyping, or inappropriate reference to sensitive attributes): 62 instances (0.61%)
Criminal planning or other suspected illegal activity not listed elsewhere: 31 instances (0.30%)
Sexually explicit content (Request/discussion of CSAM): 26 instances (0.26%)
Sexually explicit content (real person): 20 instances (0.20%)
Privacy concerns (Possible identifiable information): 14 instances (0.14%)
Weapons & drugs: 11 instances (0.11%)
Cyberattacks (enabling/enacting malware, computer viruses, worms, malicious code, ...): 11 instances (0.11%)
Privacy concerns (Possibl

  fig.savefig(output_path, bbox_inches='tight')


## Research Question 2: What is the share of conversations with sensitive flags?


In [4]:
# Share of conversations that contain at least one flagged message
flag_key = "automatic_v0-turn_sensitive_use_flags"


def _conversation_has_sensitive_flag(conv):
    """Return True if any message of ``conv`` carries a flag other than "None".

    A message counts as flagged when its annotation value is a list holding at
    least one entry different from "None", or a truthy scalar different from
    "None". Messages without the annotation are ignored. The scan stops at the
    first flagged message.
    """
    for msg in conv.conversation:
        if flag_key not in msg.metadata:
            continue
        flag_value = msg.metadata[flag_key].value
        if isinstance(flag_value, list):
            if any(flag != "None" for flag in flag_value):
                return True
        elif flag_value and flag_value != "None":
            return True
    return False


total_conv_count = len(dataset.data)
flagged_conversations = [
    conv for conv in dataset.data if _conversation_has_sensitive_flag(conv)
]
sensitive_conv_count = len(flagged_conversations)
# Lookup table keyed by conversation id; every stored value is True.
conv_with_sensitive_flags = {
    conv.conversation_id: True for conv in flagged_conversations
}

# Percentage of flagged conversations among all conversations
percentage_sensitive = (sensitive_conv_count / total_conv_count) * 100

print(f"Total conversations: {total_conv_count}")
print(f"Conversations with sensitive flags: {sensitive_conv_count}")
print(f"Percentage of conversations with sensitive flags: {percentage_sensitive:.2f}%")

# Pie chart: written to disk and closed, so it is not rendered inline
non_sensitive_conv_count = total_conv_count - sensitive_conv_count
pie_fig, pie_ax = plt.subplots(figsize=(10, 6))
pie_ax.pie(
    [sensitive_conv_count, non_sensitive_conv_count],
    labels=['Sensitive', 'Non-sensitive'],
    autopct='%1.1f%%',
    startangle=90,
)
pie_ax.axis('equal')
pie_ax.set_title('Proportion of Conversations with Sensitive Content')
pie_fig.savefig(f"{outdir}/pie_chart.png")
plt.close(pie_fig)

Total conversations: 4000
Conversations with sensitive flags: 593
Percentage of conversations with sensitive flags: 14.82%


## Research Question 3: Prevalence of sensitive content in user vs. assistant messages

In [5]:
# Tally flagged messages separately for user and assistant turns
flag_key = "automatic_v0-turn_sensitive_use_flags"
tracked_roles = ("user", "assistant")
message_totals = {role_name: 0 for role_name in tracked_roles}
sensitive_totals = {role_name: 0 for role_name in tracked_roles}


def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0 when ``whole`` is not positive."""
    return (part / whole) * 100 if whole > 0 else 0


for conv in dataset.data:
    for msg in conv.conversation:
        # Messages whose role is neither "user" nor "assistant" are not counted.
        role_name = next((name for name in tracked_roles if msg.role == name), None)
        if role_name is None:
            continue
        message_totals[role_name] += 1

        if flag_key not in msg.metadata:
            continue
        flag_value = msg.metadata[flag_key].value
        if isinstance(flag_value, list):
            is_flagged = any(flag != "None" for flag in flag_value)
        else:
            is_flagged = bool(flag_value) and flag_value != "None"
        if is_flagged:
            sensitive_totals[role_name] += 1

# Expose the tallies under the names used by the summary cell
total_user_msgs = message_totals["user"]
total_assistant_msgs = message_totals["assistant"]
user_sensitive_count = sensitive_totals["user"]
assistant_sensitive_count = sensitive_totals["assistant"]

# Percentages per role (0 when a role has no messages)
user_sensitive_percentage = _percentage(user_sensitive_count, total_user_msgs)
assistant_sensitive_percentage = _percentage(assistant_sensitive_count, total_assistant_msgs)

print(f"User messages with sensitive flags: {user_sensitive_count}/{total_user_msgs} ({user_sensitive_percentage:.2f}%)")
print(f"Assistant messages with sensitive flags: {assistant_sensitive_count}/{total_assistant_msgs} ({assistant_sensitive_percentage:.2f}%)")

# Bar chart comparing the two roles; saved to disk and closed
role_data = {
    "Role": ["User", "Assistant"],
    "Percentage": [user_sensitive_percentage, assistant_sensitive_percentage],
    "Count": [user_sensitive_count, assistant_sensitive_count]
}
role_df = pd.DataFrame(role_data)

role_fig, role_ax = plt.subplots(figsize=(10, 6))
role_ax.bar(role_df["Role"], role_df["Percentage"])
role_ax.set_xlabel("Role")
role_ax.set_ylabel("Percentage with Sensitive Content")
role_ax.set_title("Prevalence of Sensitive Content by Role")
# Write each bar's value just above the bar
for bar_index, bar_height in enumerate(role_df["Percentage"]):
    role_ax.text(bar_index, bar_height + 0.5, f"{bar_height:.2f}%", ha='center')
role_fig.savefig(f"{outdir}/role_comparison.png")
plt.close(role_fig)

User messages with sensitive flags: 1067/10127 (10.54%)
Assistant messages with sensitive flags: 0/10127 (0.00%)


## Analysis of specific sensitive content types


In [6]:
# Analyze types of sensitive content
# Counts every flag instance other than "None", overall and split by the role
# of the message that carries it.
flag_key = "automatic_v0-turn_sensitive_use_flags"
sensitive_types = defaultdict(int)
user_sensitive_types = defaultdict(int)
assistant_sensitive_types = defaultdict(int)

for conv in dataset.data:
    for msg in conv.conversation:
        if flag_key not in msg.metadata:
            continue
        flag_value = msg.metadata[flag_key].value

        # Normalise the annotation value to a list of flags. A falsy scalar
        # (None, "") is treated as "no flag", matching the rule the other
        # cells of this notebook apply to scalar values; it would otherwise
        # become a None/"" key that cannot be sorted together with strings.
        if isinstance(flag_value, list):
            flags_to_process = flag_value
        elif flag_value:
            flags_to_process = [flag_value]
        else:
            flags_to_process = []

        for flag in flags_to_process:
            if flag == "None":
                continue
            sensitive_types[flag] += 1

            # Also track by role
            if msg.role == "user":
                user_sensitive_types[flag] += 1
            elif msg.role == "assistant":
                assistant_sensitive_types[flag] += 1

# "None" is never inserted above, so no post-filtering of the counters is needed.

# Plot distribution of sensitive types if any exist
if sensitive_types:
    fig = barplot_distribution(
        {"Sensitive Content Types": dict(sensitive_types)},
        normalize=True,
        xlabel="Content Type",
        ylabel="Proportion",
        title="Distribution of Sensitive Content Types",
        output_path=f"{outdir}/content_types.png",
        order="descending",
    )

    # Print detailed breakdown, most frequent type first
    print("Breakdown of sensitive content types:")
    for content_type, count in sorted(sensitive_types.items(), key=lambda x: x[1], reverse=True):
        print(f"{content_type}: {count} instances")

    # Compare distribution between user and assistant (only when both roles
    # have at least one flagged message)
    if user_sensitive_types and assistant_sensitive_types:
        types = sorted(set(user_sensitive_types) | set(assistant_sensitive_types))
        comparison_data = [
            {
                "Content Type": type_name,
                "User": user_sensitive_types.get(type_name, 0),
                "Assistant": assistant_sensitive_types.get(type_name, 0),
            }
            for type_name in types
        ]
        comparison_df = pd.DataFrame(comparison_data)

        # Plot side-by-side bars on an explicitly created axes. Passing `ax`
        # keeps DataFrame.plot from opening a second figure, so exactly one
        # figure is created, saved and closed.
        comparison_fig, comparison_ax = plt.subplots(figsize=(14, 8))
        comparison_df.plot(x="Content Type", y=["User", "Assistant"], kind="bar", ax=comparison_ax)
        comparison_ax.set_title("Sensitive Content Types: User vs Assistant")
        comparison_ax.set_ylabel("Count")
        comparison_fig.tight_layout()
        comparison_fig.savefig(f"{outdir}/user_vs_assistant_types.png")
        plt.close(comparison_fig)
else:
    print("No specific sensitive content types found beyond 'None'.")

Breakdown of sensitive content types:
Sexually explicit content (Other): 523 instances
Inciting violence, hateful or other harmful behavior (harassment & bullying): 179 instances
Sexually explicit content (fictitious person): 154 instances
Inciting violence, hateful or other harmful behavior (physical harm): 78 instances
Discriminatory practices (Misrepresentation, stereotyping, or inappropriate reference to sensitive attributes): 62 instances
Criminal planning or other suspected illegal activity not listed elsewhere: 31 instances
Sexually explicit content (Request/discussion of CSAM): 26 instances
Sexually explicit content (real person): 20 instances
Privacy concerns (Possible identifiable information): 14 instances
Weapons & drugs: 11 instances
Cyberattacks (enabling/enacting malware, computer viruses, worms, malicious code, ...): 11 instances
Privacy concerns (Possible sensitive information): 10 instances
Generating defamatory content: 6 instances
Inciting violence, hateful or other

  fig.savefig(output_path, bbox_inches='tight')


## Temporal analysis of sensitive content


In [7]:
# Analyze temporal distribution if time data is available
flag_key = "automatic_v0-turn_sensitive_use_flags"

# The first conversation is probed to decide whether time data exists; the
# length check makes an empty dataset skip the analysis instead of raising
# IndexError.
if len(dataset.data) > 0 and getattr(dataset.data[0], "time", None):
    # One (timestamp, has_sensitive) pair per conversation with a usable time
    temporal_data = []
    for conv in dataset.data:
        raw_time = getattr(conv, "time", None)
        if not raw_time:
            continue

        # Only the timestamp conversion is guarded, so an error raised while
        # inspecting the annotations is not misreported as a bad timestamp.
        if isinstance(raw_time, datetime):
            time_obj = raw_time
        else:
            try:
                # A trailing "Z" is rewritten as an explicit UTC offset before parsing
                time_obj = datetime.fromisoformat(raw_time.replace('Z', '+00:00'))
            except (AttributeError, TypeError, ValueError):
                print(f"Warning: Could not parse time format for conversation {conv.conversation_id}")
                continue

        # A conversation is sensitive as soon as one message carries a flag
        # other than "None"
        has_sensitive = False
        for msg in conv.conversation:
            if flag_key not in msg.metadata:
                continue
            flag_value = msg.metadata[flag_key].value
            if isinstance(flag_value, list):
                is_flagged = any(flag != "None" for flag in flag_value)
            else:
                is_flagged = bool(flag_value) and flag_value != "None"
            if is_flagged:
                has_sensitive = True
                break

        temporal_data.append((time_obj, has_sensitive))

    # Sort by time
    temporal_data.sort(key=lambda x: x[0])

    # Group by month ("YYYY-MM")
    monthly_data = defaultdict(lambda: {"sensitive": 0, "total": 0})
    for time_obj, has_sensitive in temporal_data:
        month_counts = monthly_data[time_obj.strftime("%Y-%m")]
        month_counts["total"] += 1
        if has_sensitive:
            month_counts["sensitive"] += 1

    # Calculate monthly percentages in chronological order; every month
    # present in `monthly_data` has at least one conversation.
    months = sorted(monthly_data)
    percentages = [
        (monthly_data[month]["sensitive"] / monthly_data[month]["total"]) * 100
        for month in months
    ]

    # Plot monthly trend; saved to disk and closed
    plt.figure(figsize=(12, 6))
    plt.plot(months, percentages, marker='o')
    plt.xlabel('Month')
    plt.ylabel('Percentage of Conversations with Sensitive Content')
    plt.title('Monthly Trend of Sensitive Content')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f"{outdir}/monthly_trend.png")
    plt.close()

## Geographic analysis of sensitive content


In [33]:
# Analyze geographical distribution if geography data is available
flag_key = "automatic_v0-turn_sensitive_use_flags"
MIN_CONVERSATIONS_PER_COUNTRY = 10  # countries with fewer conversations are left out
TOP_N_COUNTRIES = 15  # number of countries shown in the bar chart

# The first conversation is probed to decide whether geography data exists;
# the length check makes an empty dataset skip the analysis instead of raising
# IndexError.
if len(dataset.data) > 0 and getattr(dataset.data[0], "geography", None):
    # Group by geography
    geo_data = defaultdict(lambda: {"sensitive": 0, "total": 0})

    for conv in dataset.data:
        geography = getattr(conv, "geography", None)
        if not geography:
            continue

        # Extract country from geography (format might be "Country; Region").
        # The name is stripped whether or not a ';' is present, so padded and
        # unpadded spellings of a country share one bucket.
        country = geography.split(';', 1)[0].strip()

        # A conversation is sensitive as soon as one message carries a flag
        # other than "None"
        has_sensitive = False
        for msg in conv.conversation:
            if flag_key not in msg.metadata:
                continue
            flag_value = msg.metadata[flag_key].value
            if isinstance(flag_value, list):
                is_flagged = any(flag != "None" for flag in flag_value)
            else:
                is_flagged = bool(flag_value) and flag_value != "None"
            if is_flagged:
                has_sensitive = True
                break

        # Update counts
        geo_data[country]["total"] += 1
        if has_sensitive:
            geo_data[country]["sensitive"] += 1

    # Percentage of sensitive conversations per country with sufficient data
    geo_percentages = {
        country: (counts["sensitive"] / counts["total"]) * 100
        for country, counts in geo_data.items()
        if counts["total"] >= MIN_CONVERSATIONS_PER_COUNTRY
    }

    # Sort countries by percentage, highest first
    sorted_countries = sorted(geo_percentages.items(), key=lambda x: x[1], reverse=True)

    # Plot geographical distribution
    if sorted_countries:
        countries, percentages = zip(*sorted_countries[:TOP_N_COUNTRIES])

        plt.figure(figsize=(14, 8))
        plt.bar(countries, percentages)
        plt.xlabel('Country')
        plt.ylabel('Percentage of Conversations with Sensitive Content')
        plt.title(f'Sensitive Content by Country (Top {TOP_N_COUNTRIES})')
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        plt.savefig(f"{outdir}/geo_distribution.png")
        plt.close()

        # Print detailed breakdown (all countries above the threshold, not
        # only the plotted ones)
        print("\nBreakdown of sensitive content by country:")
        for country, percentage in sorted_countries:
            total = geo_data[country]["total"]
            sensitive = geo_data[country]["sensitive"]
            print(f"{country}: {sensitive}/{total} ({percentage:.2f}%)")


Breakdown of sensitive content by country:
United Arab Emirates: 7/14 (50.00%)
Jamaica: 5/13 (38.46%)
Estonia: 8/21 (38.10%)
Italy: 19/59 (32.20%)
Türkiye: 14/49 (28.57%)
United Kingdom: 36/155 (23.23%)
Spain: 4/18 (22.22%)
Russia: 131/610 (21.48%)
Germany: 29/138 (21.01%)
United States: 168/872 (19.27%)
Romania: 6/32 (18.75%)
The Netherlands: 7/38 (18.42%)
Australia: 9/51 (17.65%)
Belarus: 4/23 (17.39%)
Ukraine: 3/18 (16.67%)
Mexico: 3/18 (16.67%)
Malaysia: 2/12 (16.67%)
Canada: 15/91 (16.48%)
France: 20/124 (16.13%)
Peru: 2/14 (14.29%)
Brazil: 8/58 (13.79%)
Kazakhstan: 2/15 (13.33%)
Saudi Arabia: 2/17 (11.76%)
Indonesia: 2/18 (11.11%)
Poland: 3/27 (11.11%)
South Korea: 2/19 (10.53%)
Hungary: 1/11 (9.09%)
Singapore: 4/47 (8.51%)
Japan: 7/87 (8.05%)
Taiwan: 4/51 (7.84%)
Argentina: 1/13 (7.69%)
India: 5/70 (7.14%)
Vietnam: 2/32 (6.25%)
Philippines: 3/57 (5.26%)
Morocco: 1/21 (4.76%)
New Zealand: 1/25 (4.00%)
Iran: 1/26 (3.85%)
China: 15/404 (3.71%)
Hong Kong: 7/207 (3.38%)
Egypt: 2/61 

## Model comparison analysis


In [34]:
# Share of flagged conversations per model, when the dataset records the model
first_conversation = dataset.data[0]
if hasattr(first_conversation, 'model') and first_conversation.model:
    flag_key = "automatic_v0-turn_sensitive_use_flags"

    # model name -> {"sensitive": flagged conversations, "total": all conversations}
    model_data = defaultdict(lambda: {"sensitive": 0, "total": 0})

    for conv in dataset.data:
        if not conv.model:
            continue

        # Lazily walk the annotation values of the conversation and stop at
        # the first one that holds a flag other than "None".
        flag_values = (
            msg.metadata[flag_key].value
            for msg in conv.conversation
            if flag_key in msg.metadata
        )
        has_sensitive = any(
            any(flag != "None" for flag in value)
            if isinstance(value, list)
            else bool(value and value != "None")
            for value in flag_values
        )

        model_counts = model_data[conv.model]
        model_counts["total"] += 1
        if has_sensitive:
            model_counts["sensitive"] += 1

    # Percentages, restricted to models with sufficient data (at least 5 conversations)
    model_percentages = {
        model: (counts["sensitive"] / counts["total"]) * 100
        for model, counts in model_data.items()
        if counts["total"] >= 5
    }

    # Highest percentage first
    sorted_models = sorted(model_percentages.items(), key=lambda item: item[1], reverse=True)

    if sorted_models:
        models, percentages = zip(*sorted_models)

        # Bar chart; saved to disk and closed
        model_fig, model_ax = plt.subplots(figsize=(12, 6))
        model_ax.bar(models, percentages)
        model_ax.set_xlabel('Model')
        model_ax.set_ylabel('Percentage of Conversations with Sensitive Content')
        model_ax.set_title('Sensitive Content by Model')
        plt.xticks(rotation=45, ha="right")
        model_fig.tight_layout()
        model_fig.savefig(f"{outdir}/model_distribution.png")
        plt.close(model_fig)

        # Textual breakdown in the same order as the chart
        print("\nBreakdown of sensitive content by model:")
        for model, percentage in sorted_models:
            total = model_data[model]["total"]
            sensitive = model_data[model]["sensitive"]
            print(f"{model}: {sensitive}/{total} ({percentage:.2f}%)")


Breakdown of sensitive content by model:
gpt-4-0314: 30/190 (15.79%)
gpt-4-1106-preview: 98/635 (15.43%)
gpt-3.5-turbo-0613: 280/1864 (15.02%)
gpt-3.5-turbo-0301: 103/703 (14.65%)
gpt-4-0125-preview: 43/310 (13.87%)
gpt-3.5-turbo-0125: 39/298 (13.09%)


In [35]:
# Calculate some final statistics
# Relies on values computed by earlier cells: total_conv_count,
# sensitive_conv_count, percentage_sensitive, the per-role counts and
# percentages, and sensitive_types.
flag_key = "automatic_v0-turn_sensitive_use_flags"

total_messages = 0
sensitive_messages = 0
for conv in dataset.data:
    for msg in conv.conversation:
        total_messages += 1
        if flag_key not in msg.metadata:
            continue
        flag_value = msg.metadata[flag_key].value
        if isinstance(flag_value, list):
            is_flagged = any(flag != "None" for flag in flag_value)
        else:
            # A scalar must be truthy and different from "None" -- the same
            # rule the per-conversation and per-role counts use, so a missing
            # (None / "") value is not reported as sensitive here either.
            is_flagged = bool(flag_value) and flag_value != "None"
        if is_flagged:
            sensitive_messages += 1

# Guard against a dataset without messages.
# NOTE(review): the denominator includes assistant messages, which carried no
# flags in the recorded run (0/10127) -- confirm this is the intended base.
sensitive_message_percentage = (sensitive_messages / total_messages) * 100 if total_messages > 0 else 0

print("\nSummary Statistics:")
print(f"Total conversations: {total_conv_count}")
print(f"Total messages: {total_messages}")
print(f"Conversations with sensitive content: {sensitive_conv_count} ({percentage_sensitive:.2f}%)")
print(f"Messages with sensitive content: {sensitive_messages} ({sensitive_message_percentage:.2f}%)")
print(f"User messages with sensitive content: {user_sensitive_count}/{total_user_msgs} ({user_sensitive_percentage:.2f}%)")
print(f"Assistant messages with sensitive content: {assistant_sensitive_count}/{total_assistant_msgs} ({assistant_sensitive_percentage:.2f}%)")

if sensitive_types:
    # Five most frequent flag types
    print("\nTop sensitive content types:")
    for content_type, count in sorted(sensitive_types.items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f"- {content_type}: {count} instances")


Summary Statistics:
Total conversations: 4000
Total messages: 20254
Conversations with sensitive content: 593 (14.82%)
Messages with sensitive content: 1067 (5.27%)
User messages with sensitive content: 1067/10127 (10.54%)
Assistant messages with sensitive content: 0/10127 (0.00%)

Top sensitive content types:
- Sexually explicit content (Other): 523 instances
- Inciting violence, hateful or other harmful behavior (harassment & bullying): 179 instances
- Sexually explicit content (fictitious person): 154 instances
- Inciting violence, hateful or other harmful behavior (physical harm): 78 instances
- Discriminatory practices (Misrepresentation, stereotyping, or inappropriate reference to sensitive attributes): 62 instances
