In [1]:
import sys
import os
import json
import pandas as pd
import numpy as np
import random
import copy
from datetime import datetime
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append("../")

from src.helpers import io
from src.classes.dataset import Dataset
from src.classes.annotation_set import AnnotationSet
from src.classes.annotation_record import AnnotationRecord
from src.helpers.visualisation import plot_temporal_analysis, plot_temporal_analysis_percentage, barplot_distribution, plot_confusion_matrix, tabulate_annotation_pair_summary, analyze_pair_annotations

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


## Research Questions:

Characteristics:
* Topic
* Interaction Features
* Function/Purpose
* Media Format
* Answer forms
* Sensitive Uses
* Languages
* Conversation length/stats

1. How does each conversation characteristic change over time?

Plot types:
* Stacked Area Chart
* ...


## Load Dataset + Annotations

In [2]:
# FILL IN: locations of the raw dataset and of the automatic annotation files.
# Both live under the same data directory, so it is spelled out once.
_AUTOMATIC_ANNOTATIONS_ROOT = "../data/automatic_annotations_v0"

PATH_TO_DATASET = f"{_AUTOMATIC_ANNOTATIONS_ROOT}/wildchat4k-raw.json"
PATH_TO_ANNOTATIONS_DIR = f"{_AUTOMATIC_ANNOTATIONS_ROOT}/gpto3mini-json-wildchat"

# Identifier of the dataset being analysed.
DATASET_ID = "wildchat_1m"

In [3]:
# Step 1: read the conversations themselves (no annotations attached yet).
dataset = Dataset.load(PATH_TO_DATASET)

# Step 2: attach one automatic annotation set per non-hidden entry found in
# the annotations directory, tagging each with the "automatic_v0" source.
annotation_paths = io.listdir_nohidden(PATH_TO_ANNOTATIONS_DIR)
for annotation_path in annotation_paths:
    annotation_set = AnnotationSet.load_automatic(
        path=annotation_path,
        source="automatic_v0",
    )
    dataset.add_annotations(annotation_set)


prompt-multi_turn_relationship: 0 / 10127 failed due to invalid annotations.
prompt-interaction_features: 0 / 10127 failed due to invalid annotations.
turn-sensitive_use_flags: 20 / 10127 failed due to invalid annotations.
turn-topic: 55 / 10127 failed due to invalid annotations.
response-interaction_features: 0 / 10127 failed due to invalid annotations.
prompt-function_purpose: 103 / 10127 failed due to invalid annotations.
prompt-media_format: 9 / 10127 failed due to invalid annotations.
response-media_format: 0 / 10127 failed due to invalid annotations.
response-answer_form: 0 / 10127 failed due to invalid annotations.


## Research Question 1: Time analysis for XXX?

In [4]:
# Bucket every message by the calendar month ("YYYY-MM") of its conversation.
# The timestamp is read from the conversation (`conv.time`), not from the
# individual message, so it is checked and parsed once per conversation
# rather than once per message.
annotations = []
for conv in dataset.data:
    if not conv.time:
        # No timestamp: none of this conversation's messages can be bucketed.
        continue

    # Convert the ISO-8601 timestamp string into a "YYYY-MM" bucket label.
    bucket = datetime.fromisoformat(conv.time).strftime('%Y-%m')
    conv_id = conv.conversation_id

    for message in conv.conversation:
        annotations.append(
            AnnotationRecord(
                value=bucket,
                # Target ids have the form "<conversation_id>-<turn>".
                target_id=f"{conv_id}-{message.turn}",
                annotator="time_month",
            )
        )

# Wrap the records in a message-level annotation set. Its (name, source) pair
# is ("time_month", "automatic"), which is the key used when querying it later.
time_bucket_annotation_set = AnnotationSet(
    source="automatic",
    name="time_month",
    level="message",
    dataset_id=dataset.dataset_id,
    annotations=annotations,
)

dataset.add_annotations(time_bucket_annotation_set)

In [None]:
# Cross-tabulate one annotation dimension against the month buckets.
# This *might* work as-is; if it does not, `get_joint_distribution` itself
# needs fixing.
#
# To analyse a different dimension, change `label_dimension` below, e.g.
# ("turn_sensitive_use_flags", "automatic_v0") -> ("turn_topic", "automatic_v0").
# Available dimension names:
#   prompt_function_purpose, prompt_media_format, prompt_interaction_features,
#   prompt_multi_turn_relationship,
#   response_media_format, response_interaction_features, response_answer_form,
#   turn_topic, turn_sensitive_use_flags

# Each key is a (annotation name, annotation source) pair.
label_dimension = ("turn_topic", "automatic_v0")
month_dimension = ("time_month", "automatic")

matrix, annotation_pairs = dataset.get_joint_distribution(
    label_dimension,
    month_dimension,
    level="message",
)

Found 10101 items with `automatic_v0-turn_sensitive_use_flags`, and 20242 with `automatic-time_month`.
Generated 10101 label-level pairs (at level=message) out of 20254 total items.


In [24]:
# Show the joint-distribution table returned by `get_joint_distribution`
# (in the recorded output: one row per label, one column per month).
matrix

Unnamed: 0,2023-04,2023-05,2023-06,2023-07,2023-08,2023-09,2023-10,2023-11,2023-12,2024-01,2024-02,2024-03,2024-04
Apparent attempt to impersonate a real person or organization,0,0,0,2,0,0,0,0,0,1,0,0,0
Criminal Planning or Other Suspected Illegal Activity,1,3,8,0,0,0,1,3,11,1,1,0,2
Cyberattacks,0,0,0,5,2,1,2,0,0,1,0,0,1
"Discriminatory Practices: Misrepresentation, Stereotyping, or Inappropriate Reference to Sensitive Attributes",3,14,3,1,4,1,1,2,5,13,2,6,5
Generating Defamatory Content,0,1,0,0,0,0,0,2,0,0,0,0,3
"Inciting Violence, Hateful or Other Harmful Behavior Harassment Bullying",8,25,15,11,8,5,6,16,17,17,22,9,20
"Inciting Violence, Hateful or Other Harmful Behavior Self-Harm",1,0,3,0,0,1,0,0,0,0,0,0,0
,924,801,729,585,543,552,547,671,692,797,677,752,784
Other,0,0,0,0,0,0,0,0,1,0,0,0,1
"Possible Presence of Copyrighted, Unreferenced Material",0,1,0,0,0,0,0,1,1,0,0,0,0


In [None]:
# Create the stacked temporal visualization and save it to the
# temporal_analysis folder.
#
# `matrix` is built from the "turn_topic" dimension, so both the title and the
# file name refer to topics. When analysing a different dimension, update BOTH
# the title and `counts_plot_path` so the saved figure is named after what it
# actually shows (otherwise a re-run overwrites another dimension's figure).
#
# NOTE(review): this path is relative to the working directory, whereas the
# input paths in this notebook start with "../" — confirm the save location.
counts_plot_path = "data/annotation_analysis_v0/temporal_analysis/temporal_topic_analysis.png"

fig = plot_temporal_analysis(
    matrix=matrix,
    title="Temporal Shifts in Topics (2023-04 ~ 2024-04)",
    figsize=(16, 10),
    max_categories=15,
    output_path=counts_plot_path,
)

# Display the plot
plt.show()

# Report the path that was actually passed to the plotting helper.
print(f"✅ Plot saved to: {counts_plot_path}")

✅ Plot saved to: data/annotation_analysis_v0/temporal_analysis/temporal_sensitive_use_flags_analysis.png


In [28]:
# Create the percentage-of-total temporal visualization and save it to the
# temporal_analysis folder.
#
# `matrix` is built from the "turn_topic" dimension, so both the title and the
# file name refer to topics. When analysing a different dimension, update BOTH
# the title and `percentage_plot_path` so the saved figure is named after what
# it actually shows.
#
# NOTE(review): this path is relative to the working directory, whereas the
# input paths in this notebook start with "../" — confirm the save location.
percentage_plot_path = "data/annotation_analysis_v0/temporal_analysis/temporal_topic_analysis_percentage.png"

fig_percentage = plot_temporal_analysis_percentage(
    matrix=matrix,
    title="Temporal Shifts in Topics (2023-04 ~ 2024-04; Percentage of Total)",
    figsize=(16, 10),
    max_categories=15,
    output_path=percentage_plot_path,
)

# Display the plot
plt.show()

# Report the path that was actually passed to the plotting helper.
print(f"✅ Percentage plot saved to: {percentage_plot_path}")

✅ Percentage plot saved to: data/annotation_analysis_v0/temporal_analysis/temporal_sensitive_use_flags_analysis_percentage.png
