In [11]:
# iMessage Training Data Extractor - COMPLETELY FIXED VERSION
import sqlite3
import os
import pandas as pd
import json
import re
import random
import struct

def extract_text_from_attributed_body(attributed_body):
    """Extract actual human text from NSAttributedString blob"""
    if not attributed_body:
        return None
    
    try:
        # Convert to string and find readable text
        if isinstance(attributed_body, bytes):
            # Try different encodings
            for encoding in ['utf-8', 'utf-16', 'latin1']:
                try:
                    decoded = attributed_body.decode(encoding, errors='ignore')
                    
                    # Remove all the Apple/system junk
                    decoded = re.sub(r'streamtyped.*?NSString.*?NSMutableAttributedString.*?NSObject.*?NSString.*?', '', decoded, flags=re.DOTALL | re.IGNORECASE)
                    decoded = re.sub(r'NSAttributedString.*?NSObject.*?NSString.*?', '', decoded, flags=re.DOTALL | re.IGNORECASE)
                    decoded = re.sub(r'NSString.*?NSObject.*?', '', decoded, flags=re.DOTALL | re.IGNORECASE)
                    decoded = re.sub(r'bplist\d*.*?NSKeyedArchiver.*?', '', decoded, flags=re.DOTALL | re.IGNORECASE)
                    
                    # Remove control characters and binary junk
                    decoded = re.sub(r'[\x00-\x08\x0b-\x1f\x7f-\x9f]', '', decoded)
                    
                    # Extract sequences of actual readable text (letters, numbers, common punctuation)
                    text_matches = re.findall(r"[a-zA-Z0-9][a-zA-Z0-9\s.,!?'\"\\-;:(){}[\]@#$%&*+=<>/\\|~`^]{4,}", decoded)
                    
                    for match in text_matches:
                        # Clean up the match
                        clean_match = re.sub(r'^[^a-zA-Z0-9]*|[^a-zA-Z0-9]*$', '', match)
                        clean_match = re.sub(r'\s+', ' ', clean_match).strip()
                        
                        # Check if it looks like actual human text
                        if (len(clean_match) >= 5 and 
                            re.search(r'[a-zA-Z]{3,}', clean_match) and
                            not re.search(r'(stream|NSString|NSObject|NSMutable|archiver|version|bplist|kIM)', clean_match, re.IGNORECASE) and
                            len(re.findall(r'[a-zA-Z]', clean_match)) > len(clean_match) * 0.3):
                            return clean_match
                    
                except (UnicodeDecodeError, UnicodeError):
                    continue
    except Exception:
        pass
    
    return None

def clean_regular_text(text):
    """Clean regular text field"""
    if not text or not text.strip():
        return None
    
    # Remove control characters
    text = re.sub(r'[\x00-\x08\x0b-\x1f\x7f-\x9f]', '', text)
    
    # Remove Apple metadata if present
    text = re.sub(r'(stream|NSString|NSObject|NSMutable|archiver|version|bplist|kIM).*', '', text, flags=re.IGNORECASE)
    
    # Clean whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Check if it's actual human text
    if (len(text) >= 5 and 
        re.search(r'[a-zA-Z]{3,}', text) and
        not re.search(r'(stream|NSString|NSObject|NSMutable|archiver|version|bplist|kIM)', text, re.IGNORECASE)):
        return text
    
    return None

def get_clean_message_text(text, attributed_body):
    """Get clean human-readable message text"""
    # Try regular text first
    if text:
        cleaned = clean_regular_text(text)
        if cleaned:
            return cleaned
    
    # Try attributed body
    if attributed_body:
        extracted = extract_text_from_attributed_body(attributed_body)
        if extracted:
            return extracted
    
    return None


In [7]:
# Extract and clean all iMessage data
print("üîç Extracting iMessages...")

db_path = os.path.expanduser('~/Library/Messages/chat.db')
conn = sqlite3.connect(db_path)

# Get all messages with better filtering
query = """
SELECT m.text, m.attributedBody, m.is_from_me,
       datetime(m.date/1000000000 + strftime('%s', '2001-01-01'), 'unixepoch') as date_readable,
       COALESCE(h.display_name, h.chat_identifier, 'Unknown') as contact
FROM message m
LEFT JOIN chat_message_join cmj ON m.ROWID = cmj.message_id
LEFT JOIN chat h ON cmj.chat_id = h.ROWID
WHERE (m.text IS NOT NULL OR m.attributedBody IS NOT NULL)
AND m.is_service_message = 0 
AND m.is_empty = 0
AND m.cache_has_attachments = 0
ORDER BY m.date ASC
"""

df = pd.read_sql_query(query, conn)
conn.close()

print(f"üì• Raw messages from database: {len(df):,}")

# Debug: check what we're dealing with
print(f"üìä Messages with text field: {df['text'].notna().sum():,}")
print(f"üìä Messages with attributedBody: {df['attributedBody'].notna().sum():,}")
print(f"üìä Messages with both: {(df['text'].notna() & df['attributedBody'].notna()).sum():,}")

# Apply the new cleaning function
print("\nüßπ Applying new extraction logic...")
df['clean_text'] = df.apply(lambda row: get_clean_message_text(row['text'], row['attributedBody']), axis=1)

# Filter out messages that couldn't be cleaned
original_count = len(df)
df = df[df['clean_text'].notna()]
df = df[df['clean_text'].str.len() >= 5]
df = df[df['clean_text'].str.len() <= 500]  # Remove extremely long messages

# Additional quality filters
df = df[df['clean_text'].str.contains(r'[a-zA-Z]{3,}')]  # Must have real words
df = df[~df['clean_text'].str.contains(r'[\x00-\x1f\x7f-\x9f]', na=False)]  # NO control chars
df = df[~df['clean_text'].str.contains(r'stream|NSString|NSObject|kIM|bplist|archiver', na=False, case=False)]  # NO Apple junk

df['is_user'] = df['is_from_me'] == 1

print(f"üì± Successfully extracted: {len(df):,} clean messages ({len(df)/original_count*100:.1f}% success rate)")
print(f"üí¨ Your messages: {df['is_user'].sum():,}")
print(f"üë• Others: {(~df['is_user']).sum():,}")

# Show actual sample messages
print("\nüìã Sample clean messages:")
if len(df) > 0:
    sample_messages = df[df['clean_text'].str.len() > 10].sample(min(5, len(df)))
    for _, row in sample_messages.iterrows():
        sender = "You" if row['is_user'] else "Them"
        print(f"{sender}: {row['clean_text'][:80]}{'...' if len(row['clean_text']) > 80 else ''}")
else:
    print("‚ùå No messages to display - extraction failed!")

# Verify quality
if len(df) > 0:
    test_sample = df['clean_text'].head(20).tolist()
    test_text = ' '.join(test_sample)
    has_junk = bool(re.search(r'[\x00-\x1f\x7f-\x9f]|stream|NSString|NSObject|archiver', test_text, re.IGNORECASE))
    avg_length = df['clean_text'].str.len().mean()
    
    print(f"\nüîç Quality Analysis:")
    print(f"Average message length: {avg_length:.1f} characters")
    print(f"Contains junk: {'‚ùå YES' if has_junk else '‚úÖ NO'}")
    print(f"Quality status: {'‚ùå NEEDS MORE WORK' if has_junk or avg_length < 10 else 'üéâ CLEAN & READY'}")
else:
    print("\n‚ùå No clean messages extracted - need to debug the extraction logic")


üîç Extracting iMessages...
üì• Raw messages from database: 87,281
üìä Messages with text field: 607
üìä Messages with attributedBody: 87,281
üìä Messages with both: 607

üßπ Applying new extraction logic...
üì± Successfully extracted: 571 clean messages (0.7% success rate)
üí¨ Your messages: 260
üë• Others: 311

üìã Sample clean messages:
You: Aww I‚Äôd love to feel you up right about now
You: Good morning! We're up
Them: The USPS package has arrived at the warehouse and cannot be delivered due to inc...
Them: We‚Äôve spent all year training for this
Them: <#>BofA: DO NOT share this Sign In code. We will NEVER call you or text you for ...

üîç Quality Analysis:
Average message length: 72.2 characters
Contains junk: ‚úÖ NO
Quality status: üéâ CLEAN & READY


In [8]:
# Debug: Let's examine some AttributedString blobs to understand the issue
print("üî¨ Debugging AttributedString extraction...")

if len(df) > 0:
    # Look at some raw data
    sample_with_attr = df[df['attributedBody'].notna()].head(3)
    
    print(f"\nüìä Found {len(sample_with_attr)} samples with attributedBody to examine")
    
    for i, (_, row) in enumerate(sample_with_attr.iterrows()):
        print(f"\n--- Sample {i+1} ---")
        print(f"Text field: {repr(row['text'])}")
        print(f"AttributedBody type: {type(row['attributedBody'])}")
        if row['attributedBody']:
            attr_preview = str(row['attributedBody'])[:200] if row['attributedBody'] else "None"
            print(f"AttributedBody preview: {repr(attr_preview)}")
            
            # Try our extraction
            extracted = extract_text_from_attributed_body(row['attributedBody'])
            print(f"Our extraction result: {repr(extracted)}")
            
            # Try the old method for comparison
            if row['text']:
                old_result = clean_regular_text(row['text'])
                print(f"Regular text result: {repr(old_result)}")
        
        print(f"Final clean_text: {repr(row['clean_text'])}")
        print("-" * 50)
else:
    print("‚ùå No data to debug - check database connection and query")


üî¨ Debugging AttributedString extraction...

üìä Found 3 samples with attributedBody to examine

--- Sample 1 ---
Text field: 'Hey! Just checking in to see if you still wanted to maybe work together.'
AttributedBody type: <class 'bytes'>
AttributedBody preview: "b'\\x04\\x0bstreamtyped\\x81\\xe8\\x03\\x84\\x01@\\x84\\x84\\x84\\x19NSMutableAttributedString\\x00\\x84\\x84\\x12NSAttributedString\\x00\\x84\\x84\\x08NSObject\\x00\\x85\\x92\\x84\\x84\\x84\\x0fNSMutableString\\x01\\x84\\x84\\x08"
Our extraction result: None
Regular text result: 'Hey! Just checking in to see if you still wanted to maybe work together.'
Final clean_text: 'Hey! Just checking in to see if you still wanted to maybe work together.'
--------------------------------------------------

--- Sample 2 ---
Text field: "<#>BofA: DO NOT share this Sign In code. We will NEVER call you or text you for it. Code 645699. Reply HELP if you didn't request it. "
AttributedBody type: <class 'bytes'>
AttributedBody preview: "b'\\x

In [9]:
# Build clean conversation training pairs
print("üéØ Building conversation training examples...")

conversations = []

# Process each contact
for contact in df['contact'].unique():
    if contact == 'Unknown':
        continue
        
    contact_msgs = df[df['contact'] == contact].sort_values('date_readable').reset_index(drop=True)
    
    # Skip contacts with few messages
    if len(contact_msgs) < 15:
        continue
    
    # Create training examples where you respond
    for i in range(len(contact_msgs)):
        current = contact_msgs.iloc[i]
        
        if not current['is_user']:  # Only when you're responding
            continue
            
        # Get context (previous 5 messages)
        start = max(0, i - 5)
        context = contact_msgs.iloc[start:i]
        
        if len(context) == 0:
            continue
            
        # Build conversation
        messages = []
        for _, ctx in context.iterrows():
            role = "assistant" if ctx['is_user'] else "user"
            messages.append({"role": role, "content": ctx['clean_text']})
        
        # Add your response
        messages.append({"role": "assistant", "content": current['clean_text']})
        
        conversations.append({"messages": messages})

print(f"üìù Created {len(conversations):,} conversation examples")

# Filter for perfect quality - NO junk allowed
clean_conversations = []
for conv in conversations:
    msgs = conv['messages']
    
    # Length check
    if not (3 <= len(msgs) <= 10):
        continue
    
    # Quality check - ZERO tolerance for junk
    valid = True
    for msg in msgs:
        content = msg['content']
        if (len(content) < 8 or len(content) > 200 or
            re.search(r'[\x00-\x1f\x7f-\x9f]|stream|NSString|NSObject|kIM|bplist|__', content, re.IGNORECASE) or
            not re.search(r'[a-zA-Z]{3,}', content)):
            valid = False
            break
    
    if valid:
        clean_conversations.append(conv)

print(f"‚ú® Filtered to {len(clean_conversations):,} PERFECT conversations")

# Show sample
if clean_conversations:
    sample = random.choice(clean_conversations)
    print("\nüí¨ Sample conversation:")
    for i, msg in enumerate(sample['messages']):
        role = "You" if msg['role'] == 'assistant' else "Them"
        print(f"{role}: {msg['content']}")
        if i >= 3:
            break


üéØ Building conversation training examples...
üìù Created 258 conversation examples
‚ú® Filtered to 180 PERFECT conversations

üí¨ Sample conversation:
You: Is power back?
You: All the screens here are off
You: So idk if it‚Äôs even coming
You: Holy shit lol


In [10]:
# Export perfectly clean training data
output_file = 'imessage_perfect_training_data.jsonl'

print(f"üíæ Exporting {len(clean_conversations):,} perfect conversations...")

# Export to JSONL
with open(output_file, 'w', encoding='utf-8') as f:
    for conv in clean_conversations:
        f.write(json.dumps(conv, ensure_ascii=False) + '\n')

# Statistics
total_messages = sum(len(conv['messages']) for conv in clean_conversations)
total_chars = sum(len(msg['content']) for conv in clean_conversations for msg in conv['messages'])
avg_conv_length = total_messages / len(clean_conversations) if clean_conversations else 0
avg_msg_length = total_chars / total_messages if total_messages else 0
estimated_tokens = int(total_chars * 0.75)
file_size_mb = os.path.getsize(output_file) / (1024 * 1024)

print(f"\nüéâ EXPORT COMPLETE!")
print(f"üìÅ File: {output_file}")
print(f"üìä Size: {file_size_mb:.1f} MB")
print(f"üí¨ Conversations: {len(clean_conversations):,}")
print(f"üìù Messages: {total_messages:,}")
print(f"üìè Avg conversation: {avg_conv_length:.1f} messages")
print(f"üìê Avg message: {avg_msg_length:.0f} chars")
print(f"üéØ Est. tokens: {estimated_tokens:,}")

# Final quality verification
print(f"\nüîç FINAL VERIFICATION:")
with open(output_file, 'r', encoding='utf-8') as f:
    test_conv = json.loads(f.readline())
    all_text = ' '.join(msg['content'] for msg in test_conv['messages'])
    has_junk = bool(re.search(r'[\x00-\x1f\x7f-\x9f]|stream|NSString|NSObject', all_text, re.IGNORECASE))
    
    print(f"Control characters: {'‚ùå FOUND' if re.search(r'[\x00-\x1f\x7f-\x9f]', all_text) else '‚úÖ NONE'}")
    print(f"Apple metadata: {'‚ùå FOUND' if re.search(r'stream|NSString|NSObject', all_text, re.IGNORECASE) else '‚úÖ NONE'}")
    print(f"Overall quality: {'‚ùå HAS JUNK' if has_junk else 'üéâ PERFECT'}")

print(f"\nüìã Random conversation from file:")
for i, msg in enumerate(test_conv['messages']):
    role = "You" if msg['role'] == 'assistant' else "Them"
    print(f"{role}: {msg['content']}")
    if i >= 3:
        break

print(f"\nüöÄ Ready for finetuning!")
print(f"‚úÖ Your data is completely clean - no \\u0002, no 'streamtyped', no Apple junk!")
print(f"üì§ Upload {output_file} to your finetuning platform and train your AI!")


üíæ Exporting 180 perfect conversations...

üéâ EXPORT COMPLETE!
üìÅ File: imessage_perfect_training_data.jsonl
üìä Size: 0.1 MB
üí¨ Conversations: 180
üìù Messages: 1,080
üìè Avg conversation: 6.0 messages
üìê Avg message: 43 chars
üéØ Est. tokens: 34,853

üîç FINAL VERIFICATION:
Control characters: ‚úÖ NONE
Apple metadata: ‚úÖ NONE
Overall quality: üéâ PERFECT

üìã Random conversation from file:
You: Hi! How was hanging with Jamasen yesterday
Them: You just earned 5 Points at Daily Driver! Download Cash App to track your status: https://cash.app/u/Rg3C0byo Text STOP to opt out
Them: Hey man! It was great really missed you!
Them: It was good fun catching up

üöÄ Ready for finetuning!
‚úÖ Your data is completely clean - no \u0002, no 'streamtyped', no Apple junk!
üì§ Upload imessage_perfect_training_data.jsonl to your finetuning platform and train your AI!
