### 00 Dataset Generation
This notebook generates the dataset to be used for training

### Imports

In [1]:
%load_ext autoreload

import os
os.chdir("..")

In [2]:
import json
import re
import uuid
import pandas as pd
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

### Load raw text messages

In [3]:
raw_df = pd.read_csv("data/ryan_text_messages.csv", dtype={
    'Sender ID': str,
    'Subject': str
})

print(f"Loaded {len(raw_df)} messages")
raw_df.head()

Loaded 783325 messages


Unnamed: 0,Chat Session,Message Date,Delivered Date,Read Date,Edited Date,Service,Type,Sender ID,Sender Name,Status,Replying to,Subject,Text,Attachment,Attachment type
0,hoganites,2024-02-11 23:37:26,,,,iMessage,Outgoing,,,Sent,,,Guys,,
1,hoganites,2024-02-11 23:37:30,,,,iMessage,Outgoing,,,Sent,,,,IMG_3206.PNG,Image
2,hoganites,2024-02-11 23:37:33,,,,iMessage,Outgoing,,,Sent,,,This can’t be real,,
3,hoganites,2024-02-11 23:42:00,,2024-02-11 23:42:03,,iMessage,Incoming,15627745147.0,Will Park,Read,,,Ew,,
4,hoganites,2024-08-04 19:11:21,,,,iMessage,Outgoing,,,Sent,,,What day are ygs free to go to MGrill?,,


### Filter the messages

In [4]:
original_count = len(raw_df)

EXCLUDED_WORDS = ["nigger", "fuck", "shit", "nigga", "fag", "bitch", "chink", "retard", "bastard", "gay", "whore"]

df = raw_df.copy()

print(f"Original number of messages: {original_count:,}")

# Remove empty text Messages
df = df[df['Text'].notna()]

# Remove empty dates
df = df[df['Message Date'].notna()]
# Filter out messages that are just numbers (likely verification codes)
is_verification_code = df['Text'].astype(str).str.match(r'^\d{4,8}$')
df = df[~is_verification_code]

# Remove words that are excluded that I sent, and remove all outgoing messages with attachments
pattern = '|'.join(EXCLUDED_WORDS)
outgoing_mask = df['Type'] == 'Outgoing'
text_lower = df['Text'].astype(str).str.lower()
contains_excluded = text_lower.str.contains(pattern, na=False, regex=True)
has_attachment = df['Attachment'].notna()

# Keep all incoming messages OR outgoing messages that (don't contain excluded words AND don't have attachments)
keep_mask = ~outgoing_mask | (outgoing_mask & ~contains_excluded & ~has_attachment)
df = df[keep_mask]

print(f"Final message count: {len(df)}")


Original number of messages: 783,325
Final message count: 750762


### Tool Call Conversion Functions

In [16]:
REACTION_PATTERNS = {
    "Loved": "love",
    "Liked": "like",
    "Disliked": "dislike",
    "Laughed at": "laugh",
    "Emphasized": "emphasize",
    "Questioned": "question"
}

def detect_reaction(text: str) -> Optional[Dict[str, str]]:
    """Detect if text is a reaction message and extract reaction type and quoted text.
    
    Returns:
        Dict with 'reaction_type' and 'quoted_text' if reaction detected, None otherwise
    """
    if not text:
        return None
    
    for reaction_label, reaction_type in REACTION_PATTERNS.items():
        if text.startswith(reaction_label + " "):
            quoted_match = re.search(r'[“](.*?)[”]', text)
            if quoted_match:
                quoted_text = quoted_match.group(1).strip()
                return {
                    'reaction_type': reaction_type,
                    'quoted_text': quoted_text
                }
    
    return None


def find_message_by_text(
    messages: List[Dict],
    target_text: str,
    start_idx: int,
    guid_map: Dict[int, str],
    target_speaker: Optional[str] = None,
    target_timestamp: Optional[str] = None
) -> Optional[str]:
    """Find message GUID by matching text (search backwards chronologically).
    
    Args:
        messages: List of message dicts (in chronological order)
        target_text: Text to match (may be truncated with ellipsis)
        start_idx: Index to start searching backwards from
        guid_map: Mapping of message indices to GUIDs
        target_speaker: Optional speaker name to match
        target_timestamp: Optional timestamp to match
        
    Returns:
        GUID string if match found, None otherwise
    """
    if not target_text:
        return None
    
    target_text_clean = target_text.rstrip("…").strip()
    
    if not target_text_clean:
        return None
    
    for i in range(start_idx - 1, -1, -1):
        if i >= len(messages):
            continue
        msg = messages[i]
        
        msg_text = msg.get('text', '')
        if not msg_text or target_text_clean not in msg_text:
            continue
        
        if target_speaker is not None:
            msg_speaker = msg.get('speaker', '')
            if msg_speaker != target_speaker:
                continue
        
        if target_timestamp is not None:
            msg_timestamp = msg.get('timestamp', '')
            if msg_timestamp != target_timestamp:
                continue
        
        return guid_map.get(i)
    
    return None

def parse_replying_to_format(replying_to_str: str) -> Optional[Dict[str, str]]:
    """Parse replying_to formatted string to extract speaker, timestamp, and text.
    
    Format: "➜ Replying to {speaker}, {timestamp}: « {text} »"
    
    Returns:
        Dict with 'speaker', 'timestamp', 'text' if parsed, None otherwise
    """
    if not replying_to_str or not replying_to_str.strip():
        return None
    
    pattern = r'➜ Replying to (.+?), (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}): « (.+?) »'
    match = re.search(pattern, replying_to_str)
    if match:
        return {
            'speaker': match.group(1).strip(),
            'timestamp': match.group(2).strip(),
            'text': match.group(3).strip()
        }
    
    return None

def convert_to_tool_call(msg: Dict, messages: List[Dict], msg_idx: int, guid_map: Dict[int, str]) -> Tuple[str, str, Optional[str]]:
    """Convert a message to tool call format.
    
    Returns:
        Tuple of (tool_call_string, conversion_type, replying_to_guid)
        conversion_type: 'reaction', 'reply', 'send_message', or 'failed'
    """
    text = msg.get('text', '')
    replying_to_str = msg.get('replying_to')
    
    reaction_info = detect_reaction(text)
    if reaction_info:
        quoted_text = reaction_info['quoted_text']
        reaction_type = reaction_info['reaction_type']
        
        message_guid = find_message_by_text(messages, quoted_text, msg_idx, guid_map)
        if message_guid:
            tool_call = f'react(message_guid="{message_guid}", reaction_type="{reaction_type}")'
            return tool_call, 'reaction', None
        else:
            tool_call = f'send_message(text={repr(text)})'
            return tool_call, 'failed', None
    
    if replying_to_str:
        replying_to_info = parse_replying_to_format(replying_to_str)
        if replying_to_info:
            replying_to_guid = find_message_by_text(
                messages,
                replying_to_info['text'],
                msg_idx,
                guid_map,
                target_speaker=replying_to_info.get('speaker'),
                target_timestamp=replying_to_info.get('timestamp')
            )
            if replying_to_guid:
                tool_call = f'reply(message_guid="{replying_to_guid}", text={repr(text)})'
                return tool_call, 'reply', replying_to_guid
    
    tool_call = f'send_message(text={repr(text)})'
    return tool_call, 'send_message', None

def process_conversation_with_tool_calls(messages: List[Dict]) -> Tuple[List[Dict], Dict[str, int]]:
    """Process conversation messages to add tool calls and GUIDs.
    
    Failed conversions are filtered out (not included in output).
    
    Returns:
        Tuple of (updated_messages_list, conversion_stats)
    """
    stats = {
        'total_outgoing': 0,
        'reaction': 0,
        'reply': 0,
        'send_message': 0,
        'failed': 0
    }
    
    guid_map = {}
    updated_messages = []
    
    for i, msg in enumerate(messages):
        guid = str(uuid.uuid4()).upper()
        guid_map[i] = guid
        
        if msg.get('type') == 'Outgoing':
            stats['total_outgoing'] += 1
            tool_call, conversion_type, replying_to_guid = convert_to_tool_call(msg, messages, i, guid_map)
            
            if conversion_type == 'failed':
                stats['failed'] += 1
                continue
            
            updated_msg = msg.copy()
            updated_msg['guid'] = guid
            updated_msg['tool_call'] = tool_call
            stats[conversion_type] = stats.get(conversion_type, 0) + 1
            
            if replying_to_guid:
                updated_msg['replying_to_guid'] = replying_to_guid
            
            updated_messages.append(updated_msg)
        else:
            updated_msg = msg.copy()
            updated_msg['guid'] = guid
            updated_msg['tool_call'] = None
            updated_messages.append(updated_msg)
    
    return updated_messages, stats

print("Tool call conversion functions loaded")


Tool call conversion functions loaded


### Create conversations

In [17]:
OUTGOING_SPEAKER_NAME = "Ryan Amiri"

def parse_message_date(date_str):
    """Parse Message Date string to datetime object."""
    if pd.isna(date_str) or date_str == '':
        return None
    try:
        return datetime.strptime(str(date_str).strip(), '%Y-%m-%d %H:%M:%S')
    except (ValueError, TypeError):
        return None

def get_speaker_name(row):
    """Get speaker name for a message row."""
    if row['Type'] == 'Outgoing':
        return OUTGOING_SPEAKER_NAME
    else:
        sender_name = row.get('Sender Name', '')
        sender_id = row.get('Sender ID', '')
        if pd.notna(sender_name) and str(sender_name).strip():
            return str(sender_name).strip()
        elif pd.notna(sender_id) and str(sender_id).strip():
            return str(sender_id).strip()
        else:
            return 'Unknown'

# Sort messages by date within each chat session
df_sorted = df.copy().reset_index()
df_sorted['_parsed_date'] = df_sorted['Message Date'].apply(parse_message_date)
df_sorted = df_sorted.sort_values(['Chat Session', '_parsed_date', 'index'])
df_sorted = df_sorted.drop(columns=['_parsed_date', 'index'])

# Group by Chat Session and create conversations
conversations = []
filtered_out_count = 0

total_stats = {
    'total_outgoing': 0,
    'reaction': 0,
    'reply': 0,
    'send_message': 0,
    'failed': 0
}

for chat_session, group_df in df_sorted.groupby('Chat Session'):
    # Filter out conversations with no outgoing messages (messages you sent)
    has_outgoing = (group_df['Type'] == 'Outgoing').any()
    if not has_outgoing:
        filtered_out_count += 1
        continue
    
    messages = []
    
    for _, row in group_df.iterrows():
        message = {
            'timestamp': row['Message Date'],
            'type': row['Type'],
            'speaker': get_speaker_name(row),
            'text': str(row['Text']).strip(),
            "replying_to": str(row['Replying to']).strip() if pd.notna(row.get('Replying to')) else None
        }
        messages.append(message)
    
    updated_messages, conv_stats = process_conversation_with_tool_calls(messages)
    
    for key in total_stats:
        total_stats[key] += conv_stats.get(key, 0)
    
    conversations.append({
        'chat_session': chat_session,
        'message_count': len(updated_messages),
        'messages': updated_messages
    })

if filtered_out_count > 0:
    print(f"Filtered out {filtered_out_count:,} conversations with no outgoing messages")

print(f"Created {len(conversations)} conversations")
print(f"Total messages: {sum(c['message_count'] for c in conversations)}")

print()
print("Conversion Statistics:")
print(f"Total Outgoing Messages: {total_stats['total_outgoing']}")
print(f"Reactions: {total_stats['reaction']}")
print(f"Replies: {total_stats['reply']}")
print(f"Send Messages: {total_stats['send_message']}")
print(f"Failed Conversions (filtered out): {total_stats['failed']}")

# Save to JSON
output_path = Path("data/conversations.json")
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(conversations, f, indent=2, ensure_ascii=False)

print()
print(f"Saved conversations to {output_path}")


Filtered out 1,228 conversations with no outgoing messages
Created 1116 conversations
Total messages: 747001

Conversion Statistics:
Total Outgoing Messages: 395558
Reactions: 6538
Replies: 9860
Send Messages: 379156
Failed Conversions (filtered out): 4

Saved conversations to data/conversations.json


### Transfer to HPC Cluster

After generating the conversations.json file, transfer it to the HPC cluster:

```bash
rsync -avz --progress training/data/conversations.json amiri.ry@login.explorer.northeastern.edu:/projects/llpr/amiri.ry/projects/yap-for-me/training/data/
```

Or using scp:

```bash
scp training/data/conversations.json amiri.ry@login.explorer.northeastern.edu:/projects/llpr/amiri.ry/projects/yap-for-me/training/data/
```