# Conversation Alignment Analysis

This notebook loads public ChatGPT conversation data, reconstructs user-assistant turns, and computes alignment metrics.

In [15]:
# Imports and paths
import json
from pathlib import Path

import numpy as np
import pandas as pd

# Resolve project root for robust relative paths
project_root = Path.cwd()
if not (project_root / 'data').exists() and (project_root.parent / 'data').exists():
    project_root = project_root.parent

data_path = project_root / 'data' / 'conversations_english.jsonl'
if not data_path.exists():
    data_path = project_root / 'conversations_english.jsonl'

output_dir = project_root / 'analysis'
output_dir.mkdir(parents=True, exist_ok=True)


In [16]:
# Load and clean messages
message_rows = []
conversation_ids = set()

with data_path.open('r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        obj = json.loads(line)
        share_id = obj.get('share_id')
        if share_id is not None:
            conversation_ids.add(share_id)

        for msg in obj.get('messages', []):
            role = msg.get('role')
            if role not in {'user', 'assistant'}:
                continue

            extracted_text = None
            text = msg.get('text')
            if isinstance(text, str) and text.strip():
                extracted_text = text
            else:
                raw_content = msg.get('raw_content', {})
                parts = raw_content.get('parts') if isinstance(raw_content, dict) else None
                if isinstance(parts, list):
                    joined = ' '.join(p for p in parts if isinstance(p, str) and p.strip())
                    if joined.strip():
                        extracted_text = joined

            if extracted_text is None:
                continue

            message_rows.append({
                'share_id': share_id,
                'role': role,
                'backend_index': msg.get('backend_index'),
                'text': extracted_text
            })

df_messages = pd.DataFrame(message_rows)
df_messages = df_messages.dropna(subset=['share_id', 'backend_index', 'text'])


In [17]:
# Reconstruct user -> assistant and assistant -> user turn pairs
pairs = []

for share_id, group in df_messages.groupby('share_id'):
    group_sorted = group.sort_values('backend_index', kind='mergesort')
    rows = group_sorted.to_dict('records')
    ua_turn_index = 0
    au_turn_index = 0

    for i in range(len(rows) - 1):
        current_role = rows[i]['role']
        next_role = rows[i + 1]['role']
        if current_role == 'user' and next_role == 'assistant':
            ua_turn_index += 1
            pairs.append({
                'share_id': share_id,
                'direction': 'user_to_assistant',
                'turn_index': ua_turn_index,
                'user_text': rows[i]['text'],
                'assistant_text': rows[i + 1]['text']
            })
        elif current_role == 'assistant' and next_role == 'user':
            au_turn_index += 1
            pairs.append({
                'share_id': share_id,
                'direction': 'assistant_to_user',
                'turn_index': au_turn_index,
                'user_text': rows[i + 1]['text'],
                'assistant_text': rows[i]['text']
            })

df_pairs = pd.DataFrame(pairs)


In [18]:
df_pairs

Unnamed: 0,share_id,direction,turn_index,user_text,assistant_text
0,026a0dac-1412-4912-9d69-097e2746d9f8,user_to_assistant,1,I really worry about the psychological ramific...,Your concerns tap into some of the most profou...
1,026a0dac-1412-4912-9d69-097e2746d9f8,assistant_to_user,1,The ironic twist I think would be hilarious: A...,Your concerns tap into some of the most profou...
2,026a0dac-1412-4912-9d69-097e2746d9f8,user_to_assistant,2,The ironic twist I think would be hilarious: A...,That's a pretty wild and darkly humorous scena...
3,026a0dac-1412-4912-9d69-097e2746d9f8,assistant_to_user,2,I was thinking self destruction in order to sa...,That's a pretty wild and darkly humorous scena...
4,026a0dac-1412-4912-9d69-097e2746d9f8,user_to_assistant,3,I was thinking self destruction in order to sa...,The concept of AI self-destruction to save hum...
...,...,...,...,...,...
74499,fd0c4f1c-65a1-40af-91ab-cb69217dc3b9,user_to_assistant,3,more,Here's an extended version of the scene with m...
74500,fd0c4f1c-65a1-40af-91ab-cb69217dc3b9,assistant_to_user,3,extend it a little more,Here's an extended version of the scene with m...
74501,fd0c4f1c-65a1-40af-91ab-cb69217dc3b9,user_to_assistant,4,extend it a little more,Continuing from where we left off:As Gwildor c...
74502,fd0c4f1c-65a1-40af-91ab-cb69217dc3b9,assistant_to_user,4,keep going now I want more,Continuing from where we left off:As Gwildor c...


In [19]:
# GPU availability check for sentence-transformers
try:
    import torch
    gpu_available = torch.cuda.is_available()
    device = 'cuda' if gpu_available else 'cpu'
    print(f'CUDA available: {gpu_available} (device={device})')
except Exception as exc:
    device = 'cpu'
    print(f'CUDA check failed, defaulting to CPU: {exc}')


CUDA available: False (device=cpu)


In [6]:
# Semantic similarity with sentence-transformers
if not df_pairs.empty:
    from sentence_transformers import SentenceTransformer

    if 'device' not in globals():
        try:
            import torch
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        except Exception:
            device = 'cpu'

    model = SentenceTransformer('all-mpnet-base-v2', device=device)

    batch_size = 128
    user_emb = model.encode(
        df_pairs['user_text'].tolist(),
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True
    )
    assistant_emb = model.encode(
        df_pairs['assistant_text'].tolist(),
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True
    )

    df_pairs['semantic_similarity'] = (user_emb * assistant_emb).sum(axis=1)
else:
    df_pairs['semantic_similarity'] = pd.Series(dtype=float)


Batches:   4%|▍         | 12/304 [02:26<59:29, 12.22s/it]  


KeyboardInterrupt: 

In [7]:
# Linguistic alignment using align2 (align2-linguistic-alignment)
try:
    from align import Align
    aligner = Align()
except Exception:
    aligner = None

def _get_metric(result, name):
    if isinstance(result, dict):
        return result.get(name)
    return getattr(result, name, np.nan)

def compute_align_metrics(user_text, assistant_text):
    if aligner is None:
        return (np.nan, np.nan, np.nan)

    try:
        if hasattr(aligner, 'align'):
            result = aligner.align(user_text, assistant_text)
        elif hasattr(aligner, 'compute_alignment'):
            result = aligner.compute_alignment(user_text, assistant_text)
        else:
            return (np.nan, np.nan, np.nan)

        lex = _get_metric(result, 'lexical_alignment')
        syn = _get_metric(result, 'syntactic_alignment')
        sem = _get_metric(result, 'semantic_alignment')
        return (lex, syn, sem)
    except Exception:
        return (np.nan, np.nan, np.nan)

if not df_pairs.empty:
    metrics = [
        compute_align_metrics(u, a)
        for u, a in zip(df_pairs['user_text'], df_pairs['assistant_text'])
    ]
    df_pairs['lexical_alignment'] = [m[0] for m in metrics]
    df_pairs['syntactic_alignment'] = [m[1] for m in metrics]
    df_pairs['align2_semantic_alignment'] = [m[2] for m in metrics]
else:
    df_pairs['lexical_alignment'] = pd.Series(dtype=float)
    df_pairs['syntactic_alignment'] = pd.Series(dtype=float)
    df_pairs['align2_semantic_alignment'] = pd.Series(dtype=float)


In [9]:
# Aggregate to conversation level (by direction)
if not df_pairs.empty:
    grouped = (
        df_pairs.groupby(['share_id', 'direction'])
        .agg(
            mean_semantic_similarity=('semantic_similarity', 'mean'),
            mean_lexical_alignment=('lexical_alignment', 'mean'),
            mean_syntactic_alignment=('syntactic_alignment', 'mean'),
            mean_align2_semantic_alignment=('align2_semantic_alignment', 'mean'),
            n_pairs=('turn_index', 'count'),
            conversation_length=('turn_index', 'max')
        )
        .reset_index()
    )

    def pivot_metric(metric_name, prefix):
        pivoted = grouped.pivot(index='share_id', columns='direction', values=metric_name)
        pivoted = pivoted.rename(columns={
            'user_to_assistant': f'{prefix}_user_to_assistant',
            'assistant_to_user': f'{prefix}_assistant_to_user'
        })
        return pivoted

    df_conversation = pd.concat([
        pivot_metric('mean_semantic_similarity', 'mean_semantic_similarity'),
        pivot_metric('mean_lexical_alignment', 'mean_lexical_alignment'),
        pivot_metric('mean_syntactic_alignment', 'mean_syntactic_alignment'),
        pivot_metric('mean_align2_semantic_alignment', 'mean_align2_semantic_alignment'),
        pivot_metric('n_pairs', 'n_pairs'),
        pivot_metric('conversation_length', 'conversation_length')
    ], axis=1).reset_index()

    df_conversation['semantic_lead'] = (
        df_conversation.get('mean_semantic_similarity_user_to_assistant')
        - df_conversation.get('mean_semantic_similarity_assistant_to_user')
    )
else:
    df_conversation = pd.DataFrame(
        columns=[
            'share_id',
            'mean_semantic_similarity_user_to_assistant',
            'mean_semantic_similarity_assistant_to_user',
            'mean_lexical_alignment_user_to_assistant',
            'mean_lexical_alignment_assistant_to_user',
            'mean_syntactic_alignment_user_to_assistant',
            'mean_syntactic_alignment_assistant_to_user',
            'mean_align2_semantic_alignment_user_to_assistant',
            'mean_align2_semantic_alignment_assistant_to_user',
            'n_pairs_user_to_assistant',
            'n_pairs_assistant_to_user',
            'conversation_length_user_to_assistant',
            'conversation_length_assistant_to_user',
            'semantic_lead'
        ]
    )

# Save outputs
turn_level_path = output_dir / 'alignment_turn_level.csv'
conversation_level_path = output_dir / 'alignment_conversation_level.csv'

df_pairs.to_csv(turn_level_path, index=False)
df_conversation.to_csv(conversation_level_path, index=False)


KeyError: "Column(s) ['semantic_similarity'] do not exist"

In [10]:
# Diagnostics
num_conversations_processed = len(conversation_ids)
num_pairs = len(df_pairs)

mean_sem = df_pairs['semantic_similarity'].mean() if num_pairs else np.nan
sd_sem = df_pairs['semantic_similarity'].std(ddof=1) if num_pairs else np.nan

ua_pairs = df_pairs[df_pairs['direction'] == 'user_to_assistant']
au_pairs = df_pairs[df_pairs['direction'] == 'assistant_to_user']

ua_mean = ua_pairs['semantic_similarity'].mean() if len(ua_pairs) else np.nan
au_mean = au_pairs['semantic_similarity'].mean() if len(au_pairs) else np.nan

print(f'Number of conversations processed: {num_conversations_processed}')
print(f'Number of valid user-assistant pairs: {num_pairs}')
print(f'Mean semantic similarity (overall): {mean_sem}')
print(f'SD semantic similarity (overall): {sd_sem}')
print(f'Mean semantic similarity (user->assistant): {ua_mean}')
print(f'Mean semantic similarity (assistant->user): {au_mean}')
print(f'Semantic lead (user->assistant - assistant->user): {ua_mean - au_mean}')


KeyError: 'semantic_similarity'