### 00 Dataset Generation
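This notebook generates the dataset used for training: it loads the raw text-message export, filters the messages, and groups them into per-chat conversations that are saved as JSON.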

### Imports

In [1]:
# Load IPython's autoreload extension.
# NOTE(review): no `%autoreload` mode is selected in this cell — confirm whether
# automatic reloading is actually intended to be active.
%load_ext autoreload

import os
# Move the working directory up one level; presumably so that the relative
# "data/..." paths used later resolve from the project root — TODO confirm.
# NOTE: the path is relative to the current directory, so re-running this cell
# in the same kernel moves up one more level each time.
os.chdir("..")

In [2]:
import json
import pandas as pd
from datetime import datetime
from pathlib import Path

### Load raw text messages

In [3]:
# Read the ID and subject columns as strings; every other column keeps the
# dtype pandas infers on its own.
string_columns = {'Sender ID': str, 'Subject': str}
raw_df = pd.read_csv("data/ryan_text_messages.csv", dtype=string_columns)

print(f"Loaded {len(raw_df)} messages")
raw_df.head()

Loaded 783325 messages


Unnamed: 0,Chat Session,Message Date,Delivered Date,Read Date,Edited Date,Service,Type,Sender ID,Sender Name,Status,Replying to,Subject,Text,Attachment,Attachment type
0,hoganites,2024-02-11 23:37:26,,,,iMessage,Outgoing,,,Sent,,,Guys,,
1,hoganites,2024-02-11 23:37:30,,,,iMessage,Outgoing,,,Sent,,,,IMG_3206.PNG,Image
2,hoganites,2024-02-11 23:37:33,,,,iMessage,Outgoing,,,Sent,,,This can’t be real,,
3,hoganites,2024-02-11 23:42:00,,2024-02-11 23:42:03,,iMessage,Incoming,15627745147.0,Will Park,Read,,,Ew,,
4,hoganites,2024-08-04 19:11:21,,,,iMessage,Outgoing,,,Sent,,,What day are ygs free to go to MGrill?,,


### Filter the messages

In [4]:
EXCLUDED_WORDS = ["nigger", "fuck", "shit", "nigga", "fag", "bitch", "chink", "retard", "bastard", "gay", "whore"]

original_count = len(raw_df)
print(f"Original number of messages: {original_count:,}")

# Step 1: discard rows that have no message text or no message date.
df = raw_df.dropna(subset=['Text', 'Message Date'])

# Step 2: discard messages whose entire text is 4-8 digits (likely verification codes).
digits_only = df['Text'].astype(str).str.match(r'^\d{4,8}$')
df = df.loc[~digits_only]

# Step 3: incoming messages are always kept; an outgoing message is kept only
# when it contains none of the excluded words (case-insensitive substring
# match) and carries no attachment.
excluded_regex = '|'.join(EXCLUDED_WORDS)
is_incoming = df['Type'] != 'Outgoing'
has_excluded_word = (
    df['Text'].astype(str).str.lower().str.contains(excluded_regex, na=False, regex=True)
)
carries_attachment = df['Attachment'].notna()

df = df.loc[is_incoming | ~(has_excluded_word | carries_attachment)]

print(f"Final message count: {len(df)}")


Original number of messages: 783,325
Final message count: 750762


### Create conversations

In [5]:
# Speaker label that get_speaker_name returns for every message whose Type is 'Outgoing'.
OUTGOING_SPEAKER_NAME = "Ryan Amiri"

def parse_message_date(date_str):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string into a datetime.

    Surrounding whitespace is ignored. Missing values (NaN/None), the empty
    string, and anything that does not match the expected format all yield
    None rather than raising.
    """
    is_missing = pd.isna(date_str) or date_str == ''
    if is_missing:
        return None

    try:
        parsed = datetime.strptime(str(date_str).strip(), '%Y-%m-%d %H:%M:%S')
    except (ValueError, TypeError):
        parsed = None
    return parsed

def get_speaker_name(row):
    """Return the display name of the person who sent the message in ``row``.

    Args:
        row: A mapping-like message record (e.g. a pandas Series or a dict)
            with a 'Type' key and, optionally, 'Sender Name' and 'Sender ID'.

    Returns:
        OUTGOING_SPEAKER_NAME when the row's Type is 'Outgoing'. Otherwise the
        stripped sender name if present, else the stripped sender ID (with a
        float-style trailing ".0" removed from all-digit IDs), else 'Unknown'.
    """
    if row['Type'] == 'Outgoing':
        return OUTGOING_SPEAKER_NAME

    sender_name = row.get('Sender Name', '')
    if pd.notna(sender_name) and str(sender_name).strip():
        return str(sender_name).strip()

    sender_id = row.get('Sender ID', '')
    if pd.notna(sender_id) and str(sender_id).strip():
        sender_id = str(sender_id).strip()
        # Numeric IDs can arrive float-formatted (e.g. "15627745147.0");
        # drop that artifact so the speaker label is the plain number.
        # Non-numeric IDs are left untouched.
        if sender_id.endswith('.0') and sender_id[:-2].isdigit():
            sender_id = sender_id[:-2]
        return sender_id

    return 'Unknown'

# Order messages chronologically inside each chat session. The original row
# index (exposed as the 'index' column by reset_index) breaks ties between
# messages with the same timestamp; both helper columns are dropped afterwards.
df_sorted = (
    df.copy()
    .reset_index()
    .assign(_parsed_date=lambda frame: frame['Message Date'].apply(parse_message_date))
    .sort_values(['Chat Session', '_parsed_date', 'index'])
    .drop(columns=['_parsed_date', 'index'])
)

# Build one conversation record per chat session, skipping (and counting)
# sessions that contain no outgoing message.
conversations = []
filtered_out_count = 0

for chat_session, session_rows in df_sorted.groupby('Chat Session'):
    if not session_rows['Type'].eq('Outgoing').any():
        filtered_out_count += 1
        continue

    # One dict per message, in the already-sorted order of the session.
    messages = [
        {
            'timestamp': row['Message Date'],
            'type': row['Type'],
            'speaker': get_speaker_name(row),
            'text': str(row['Text']).strip(),
            "replying_to": (
                str(row['Replying to']).strip()
                if pd.notna(row.get('Replying to'))
                else None
            ),
        }
        for _, row in session_rows.iterrows()
    ]

    conversations.append({
        'chat_session': chat_session,
        'message_count': len(messages),
        'messages': messages,
    })

if filtered_out_count:
    print(f"Filtered out {filtered_out_count:,} conversations with no outgoing messages")

total_messages = sum(conversation['message_count'] for conversation in conversations)
print(f"Created {len(conversations)} conversations")
print(f"Total messages: {total_messages}")

# Write the conversations to disk as indented UTF-8 JSON, creating the
# target directory first if it does not exist yet.
output_path = Path("data") / "conversations.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open('w', encoding='utf-8') as json_file:
    json.dump(conversations, json_file, indent=2, ensure_ascii=False)

print(f"Saved conversations to {output_path}")


Filtered out 1,228 conversations with no outgoing messages
Created 1116 conversations
Total messages: 747005
Saved conversations to data/conversations.json
