# Email Intelligence Research - Data Exploration

This notebook explores the Enron email dataset and demonstrates the power of our AI system with real data.

## Research Objectives
1. Load and explore a sample of the Enron dataset (up to 2,000 emails from ten high-volume mailboxes)
2. Analyze email patterns and communication networks
3. Demonstrate descriptive analytics capabilities
4. Store processed data in S3 for further analysis

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import boto3
import json
import os
from pathlib import Path
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Set up plotting
# Global matplotlib/seaborn styling; applies to every figure drawn by later cells.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release - confirm against the environment's version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Run banner with a start timestamp (datetime.now() is called without a timezone).
print("üìä Email Intelligence Research Pipeline - Data Exploration")
print("=" * 60)
print(f"üïê Started at: {datetime.now()}")

In [None]:
# AWS S3 Setup
# Module-level S3 handles shared by the later cells. Only `s3_client` is used
# by the cells visible in this notebook; `s3_resource` is created but not
# referenced again here.
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')

# S3 bucket names (from CDK deployment)
# RESULTS_BUCKET receives the PNG visualizations; PROCESSED_DATA_BUCKET receives
# the JSON summary and datasets. RAW_DATA_BUCKET is only printed in this notebook.
RAW_DATA_BUCKET = 'email-intelligence-raw-data'
PROCESSED_DATA_BUCKET = 'email-intelligence-processed-data'
RESULTS_BUCKET = 'email-intelligence-results'

# Echo the configuration so a run log shows which buckets were targeted.
print("üîó AWS S3 Configuration:")
print(f"   Raw Data: {RAW_DATA_BUCKET}")
print(f"   Processed: {PROCESSED_DATA_BUCKET}")
print(f"   Results: {RESULTS_BUCKET}")

In [None]:
# Load Enron Dataset
# Path to the extracted Enron maildir. Defaults to the original EC2 location;
# set the ENRON_MAILDIR environment variable to point the notebook at a copy
# stored elsewhere without editing this cell.
MAILDIR_PATH = os.environ.get('ENRON_MAILDIR', '/home/ec2-user/maildir')

def load_enron_dataset(maildir_path, limit=1000):
    """
    Load a sample of the Enron maildir into a DataFrame.

    Only the mailboxes named in ``target_users`` are read, and within each
    mailbox only a fixed list of folders. Every regular file found is handed
    to ``parse_email_file``; files it rejects (falsy return) are skipped and
    do not count towards ``limit``.

    Mailboxes and files are visited in sorted order so that a given ``limit``
    always selects the same emails. ``Path.iterdir()`` on its own yields
    entries in arbitrary order, which made the truncated sample depend on the
    filesystem.

    Args:
        maildir_path: Root of the extracted maildir (one sub-directory per user).
        limit: Maximum number of successfully parsed emails to collect.

    Returns:
        pandas.DataFrame with one row per parsed email. Empty when
        ``maildir_path`` does not exist or nothing was parsed.
    """
    maildir = Path(maildir_path)

    if not maildir.exists():
        print(f"‚ùå Maildir not found at {maildir_path}")
        return pd.DataFrame()

    print(f"üìß Loading Enron dataset from {maildir_path}...")

    # Target high-volume users for research (a set: only membership is tested)
    target_users = {
        "kaminski-v", "beck-s", "allen-p", "lay-k", "skilling-j",
        "dasovich-j", "kean-s", "mann-k", "delainey-d", "farmer-d",
    }
    # Folders are tried in this order for every mailbox.
    subfolders = ("sent_items", "inbox", "_sent_mail", "sent", "all_documents")

    emails = []
    for user_dir in sorted(maildir.iterdir()):
        if not user_dir.is_dir() or user_dir.name not in target_users:
            continue

        print(f"   üìÅ Processing {user_dir.name}...")

        for subfolder in subfolders:
            folder_path = user_dir / subfolder
            # is_dir() rather than exists(): a stray regular file with a
            # folder's name must not reach iterdir(), which would raise.
            if not folder_path.is_dir():
                continue

            for email_file in sorted(folder_path.iterdir()):
                if len(emails) >= limit:
                    break
                if not email_file.is_file():
                    continue

                email_data = parse_email_file(email_file)
                if email_data:
                    emails.append(email_data)

                    if len(emails) % 100 == 0:
                        print(f"      ‚ö° Loaded {len(emails)} emails...")

            # Stop scanning further folders once the quota is filled.
            if len(emails) >= limit:
                break

        # ...and further mailboxes.
        if len(emails) >= limit:
            break

    df = pd.DataFrame(emails)
    print(f"‚úÖ Loaded {len(df)} emails for research analysis")
    return df

def parse_email_file(file_path):
    """
    Parse one maildir message file into a flat dict.

    Headers are read up to the first blank line. Folded headers (continuation
    lines that start with a space or tab) are joined onto the header they
    continue, so wrapped subjects and multi-line ``To:``/``Cc:`` lists are no
    longer cut off after their first physical line.

    Args:
        file_path: Path of the message file (``pathlib.Path`` or ``str``). The
            two directory components above the file name are recorded as
            ``user`` and ``folder``.

    Returns:
        dict with keys ``file_path``, ``user``, ``folder``, ``subject``,
        ``from``, ``to``, ``cc``, ``date``, ``body`` and ``body_length``, or
        ``None`` when the file cannot be read, the path has fewer than three
        components, the subject is empty, or the body is 50 characters or
        shorter.
    """
    # Best-effort loader: unreadable or oddly located files are skipped rather
    # than aborting the whole dataset load. Only the I/O and path handling is
    # guarded, so genuine parsing bugs are no longer silently swallowed.
    try:
        path = Path(file_path)
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
        user, folder = path.parts[-3], path.parts[-2]
    except (OSError, IndexError, TypeError, ValueError):
        return None

    email = {
        'file_path': str(file_path),
        'user': user,      # mailbox directory name
        'folder': folder,  # folder inside the mailbox
        'subject': '',
        'from': '',
        'to': [],
        'cc': [],
        'date': '',
        'body': '',
        'body_length': 0
    }

    lines = content.split('\n')

    # Collect logical header lines, unfolding continuations. If no blank line
    # is found, body_start stays 0 and the whole file is treated as the body.
    header_lines = []
    body_start = 0
    for i, raw_line in enumerate(lines):
        line = raw_line.strip()
        if line == '':
            body_start = i + 1
            break
        if raw_line[0] in ' \t' and header_lines:
            header_lines[-1] = f"{header_lines[-1]} {line}"
        else:
            header_lines.append(line)

    # Parse headers (a repeated header overwrites the earlier value)
    for line in header_lines:
        if line.startswith('Subject: '):
            email['subject'] = line[9:].strip()
        elif line.startswith('From: '):
            email['from'] = extract_email_address(line[6:])
        elif line.startswith('To: '):
            email['to'] = parse_recipients(line[4:])
        elif line.startswith('Cc: '):
            email['cc'] = parse_recipients(line[4:])
        elif line.startswith('Date: '):
            email['date'] = line[6:].strip()

    # Extract body
    if body_start < len(lines):
        email['body'] = '\n'.join(lines[body_start:]).strip()
        email['body_length'] = len(email['body'])

    # Only return emails with meaningful content
    if email['body_length'] > 50 and email['subject']:
        return email

    return None

def extract_email_address(email_str):
    """Return the first e-mail address found in *email_str*, lower-cased.

    When no address-like token is present, the whole input is returned
    stripped and lower-cased instead.
    """
    import re
    found = re.search(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', email_str)
    if found is None:
        # No address present: fall back to the normalised raw text.
        return email_str.strip().lower()
    return found.group(1).lower().strip()

def parse_recipients(recipients_str):
    """Split a comma-separated recipient header into cleaned addresses.

    Each piece is normalised with ``extract_email_address``; pieces that do
    not yield something containing '@' are dropped. Order is preserved.
    """
    cleaned = (
        extract_email_address(piece.strip())
        for piece in recipients_str.split(',')
    )
    return [address for address in cleaned if '@' in address]

# Load the dataset
# `emails_df` is the module-level DataFrame every later cell reads; it is empty
# when the maildir is missing, which those cells check via `emails_df.empty`.
emails_df = load_enron_dataset(MAILDIR_PATH, limit=2000)  # Load 2000 emails for research

In [None]:
# Dataset Overview and Statistics
# Prints headline numbers for the loaded sample and defines `user_counts` and
# `folder_counts`, which later cells reuse.
print("üìä ENRON DATASET RESEARCH ANALYSIS")
print("=" * 50)

if emails_df.empty:
    # Nothing was loaded, so there is nothing to summarise.
    print("‚ùå No emails loaded. Check dataset path.")
else:
    # Headline figures
    print(f"üìß Total Emails Loaded: {len(emails_df):,}")
    print(f"üë• Unique Users: {emails_df['user'].nunique()}")
    print(f"üìÅ Email Folders: {emails_df['folder'].nunique()}")
    print(f"üìù Average Body Length: {emails_df['body_length'].mean():.0f} characters")
    print(f"üìà Total Characters Processed: {emails_df['body_length'].sum():,}")

    # First ten rows, restricted to the most readable columns
    print("\nüìã Sample Email Data:")
    display(emails_df.head(10)[['user', 'folder', 'subject', 'from', 'body_length']])

    # Per-mailbox email counts
    print("\nüë• Emails by User:")
    user_counts = emails_df['user'].value_counts()
    print(user_counts)

    # Per-folder email counts
    print("\nüìÅ Emails by Folder:")
    folder_counts = emails_df['folder'].value_counts()
    print(folder_counts)

In [None]:
# Visualize Dataset Characteristics
# Draws a 2x2 overview figure, writes it to /tmp and uploads the PNG to the
# results bucket. Uses `user_counts` and `folder_counts` from the overview
# cell and adds a `subject_length` column to `emails_df`.
if not emails_df.empty:
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # Email distribution by user
    user_counts.head(10).plot(kind='bar', ax=axes[0,0], color='skyblue')
    axes[0,0].set_title('üìß Top 10 Users by Email Count')
    axes[0,0].set_xlabel('User')
    axes[0,0].set_ylabel('Email Count')
    axes[0,0].tick_params(axis='x', rotation=45)
    
    # Email body length distribution
    emails_df['body_length'].hist(bins=50, ax=axes[0,1], color='lightgreen', alpha=0.7)
    axes[0,1].set_title('üìù Email Body Length Distribution')
    axes[0,1].set_xlabel('Body Length (characters)')
    axes[0,1].set_ylabel('Frequency')
    
    # Folder distribution
    folder_counts.plot(kind='pie', ax=axes[1,0], autopct='%1.1f%%')
    axes[1,0].set_title('üìÅ Email Distribution by Folder')
    axes[1,0].set_ylabel('')
    
    # Subject length analysis
    emails_df['subject_length'] = emails_df['subject'].str.len()
    emails_df['subject_length'].hist(bins=30, ax=axes[1,1], color='orange', alpha=0.7)
    axes[1,1].set_title('üìã Subject Length Distribution')
    axes[1,1].set_xlabel('Subject Length (characters)')
    axes[1,1].set_ylabel('Frequency')
    
    plt.tight_layout()
    
    # Save BEFORE showing, and through the figure handle: once plt.show() has
    # displayed the figure, pyplot may no longer have it as the current
    # figure, and a later plt.savefig() would write a blank image.
    fig.savefig('/tmp/dataset_overview.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # Upload is best-effort: a failure is reported but does not stop the notebook.
    try:
        s3_client.upload_file(
            '/tmp/dataset_overview.png',
            RESULTS_BUCKET,
            'visualizations/dataset_overview.png'
        )
        print("‚úÖ Visualization saved to S3")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not save to S3: {e}")

In [None]:
# Communication Network Analysis
# Summarises who sends to whom in the loaded sample, draws a sender x recipient
# heatmap for the five most frequent senders and uploads it to the results
# bucket. Defines `all_senders` and `all_recipients`, which the save cell reuses.
if not emails_df.empty:
    print("üï∏Ô∏è COMMUNICATION NETWORK ANALYSIS")
    print("=" * 40)
    
    # Extract all unique email addresses
    all_senders = set(emails_df['from'].dropna())
    all_recipients = set()
    
    for recipients_list in emails_df['to']:
        if isinstance(recipients_list, list):
            all_recipients.update(recipients_list)
    
    print(f"üë• Unique Senders: {len(all_senders)}")
    print(f"üë• Unique Recipients: {len(all_recipients)}")
    print(f"üåê Total Network Size: {len(all_senders.union(all_recipients))}")
    
    # Top communicators
    sender_counts = emails_df['from'].value_counts().head(10)
    print("\nüì§ Top Email Senders:")
    for sender, count in sender_counts.items():
        print(f"   {sender}: {count} emails")
    
    # Analyze recipient patterns (only the 'to' field is counted, not 'cc')
    recipient_counts = {}
    for recipients_list in emails_df['to']:
        if isinstance(recipients_list, list):
            for recipient in recipients_list:
                recipient_counts[recipient] = recipient_counts.get(recipient, 0) + 1
    
    top_recipients = sorted(recipient_counts.items(), key=lambda x: x[1], reverse=True)[:10]
    print("\nüì• Top Email Recipients:")
    for recipient, count in top_recipients:
        print(f"   {recipient}: {count} emails")
    
    # Create communication matrix for top users:
    # rows are senders, columns are recipients, cells count messages.
    top_users = list(sender_counts.head(5).index)
    communication_matrix = pd.DataFrame(0, index=top_users, columns=top_users)
    
    for _, email in emails_df.iterrows():
        sender = email['from']
        if sender in top_users and isinstance(email['to'], list):
            for recipient in email['to']:
                if recipient in top_users:
                    communication_matrix.loc[sender, recipient] += 1
    
    # Visualize communication matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(communication_matrix, annot=True, cmap='Blues', fmt='d')
    plt.title('üï∏Ô∏è Communication Matrix - Top 5 Users')
    plt.xlabel('Recipients')
    plt.ylabel('Senders')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    
    # Save communication analysis BEFORE plt.show(): after the figure has been
    # shown, plt.savefig() may target a fresh empty figure and write a blank PNG.
    plt.savefig('/tmp/communication_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # Upload is best-effort: a failure is reported but does not stop the notebook.
    try:
        s3_client.upload_file(
            '/tmp/communication_matrix.png',
            RESULTS_BUCKET,
            'visualizations/communication_matrix.png'
        )
        print("‚úÖ Communication matrix saved to S3")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not save to S3: {e}")

In [None]:
# Content Analysis - Demonstrate AI Power
# Counts how many subjects contain each business keyword, charts the counts,
# uploads the chart, and picks up to ten emails whose subject matches one of
# the first five keywords. Defines `term_counts` and `business_emails`, which
# the save cell reuses.
if not emails_df.empty:
    print("üß† CONTENT ANALYSIS - AI POWER DEMONSTRATION")
    print("=" * 50)
    
    # Analyze email subjects for business terms
    business_terms = [
        'meeting', 'project', 'report', 'budget', 'contract', 'proposal',
        'deadline', 'schedule', 'review', 'analysis', 'strategy', 'plan',
        'deliverable', 'task', 'urgent', 'important', 'critical', 'asap'
    ]
    
    # Lower-case the subjects once instead of once per term.
    subjects_lower = emails_df['subject'].str.lower()
    
    # Count business terms in subjects (substring match, so 'plan' also
    # matches 'planning'); terms with no hits are left out.
    term_counts = {}
    for term in business_terms:
        count = subjects_lower.str.contains(term, na=False).sum()
        if count > 0:
            term_counts[term] = count
    
    print("üìä Business Terms in Email Subjects:")
    for term, count in sorted(term_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / len(emails_df)) * 100
        print(f"   {term}: {count} emails ({percentage:.1f}%)")
    
    # Visualize business terms
    if term_counts:
        plt.figure(figsize=(12, 6))
        terms = list(term_counts.keys())
        counts = list(term_counts.values())
        
        plt.bar(terms, counts, color='lightcoral', alpha=0.8)
        plt.title('üìä Business Terms Frequency in Email Subjects')
        plt.xlabel('Business Terms')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45)
        plt.tight_layout()
        
        # Save business terms analysis BEFORE plt.show(): after the figure has
        # been shown, plt.savefig() may write a blank image.
        plt.savefig('/tmp/business_terms.png', dpi=300, bbox_inches='tight')
        plt.show()
        
        # Upload is best-effort: a failure is reported but does not stop the notebook.
        try:
            s3_client.upload_file(
                '/tmp/business_terms.png',
                RESULTS_BUCKET,
                'visualizations/business_terms.png'
            )
            print("‚úÖ Business terms analysis saved to S3")
        except Exception as e:
            print(f"‚ö†Ô∏è Could not save to S3: {e}")
    
    # Sample high-value emails for AI processing: the first ten rows whose
    # subject contains any of the first five business terms.
    business_emails = emails_df[
        subjects_lower.str.contains('|'.join(business_terms[:5]), na=False)
    ].head(10)
    
    print(f"\nüéØ Found {len(business_emails)} high-value business emails for AI processing:")
    for _, email in business_emails.iterrows():
        print(f"   üìß {email['subject'][:60]}... (from: {email['from']})")

In [None]:
# Save Processed Dataset to S3
# Writes three JSON objects to the processed-data bucket: a summary, the full
# parsed sample, and the selected business emails. Uses `all_senders`,
# `all_recipients`, `term_counts`, `user_counts`, `folder_counts` and
# `business_emails` defined by earlier cells.
if not emails_df.empty:
    print("üíæ SAVING PROCESSED DATASET TO S3")
    print("=" * 35)
    
    # Create comprehensive dataset summary.
    # pandas/numpy aggregates are converted to built-in int/float here:
    # json cannot encode numpy integers itself, so with `default=str` they
    # were being written as quoted strings instead of JSON numbers.
    dataset_summary = {
        'processing_date': datetime.now().isoformat(),
        'total_emails': len(emails_df),
        'unique_users': int(emails_df['user'].nunique()),
        'unique_senders': len(all_senders),
        'unique_recipients': len(all_recipients),
        'total_characters': int(emails_df['body_length'].sum()),
        'avg_body_length': float(emails_df['body_length'].mean()),
        'business_terms_found': {term: int(n) for term, n in term_counts.items()},
        'top_users': {name: int(n) for name, n in user_counts.head(10).items()},
        'folder_distribution': {name: int(n) for name, n in folder_counts.items()},
        'dataset_quality': {
            'emails_with_subjects': int(emails_df['subject'].notna().sum()),
            'emails_with_senders': int(emails_df['from'].notna().sum()),
            'emails_with_recipients': int(
                emails_df['to'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False).sum()
            ),
            # Computed from the subjects directly so this cell does not depend
            # on the `subject_length` column added by the plotting cell.
            'avg_subject_length': float(emails_df['subject'].str.len().mean())
        }
    }
    
    # Save dataset summary (best-effort: failures are reported, not raised)
    try:
        s3_client.put_object(
            Bucket=PROCESSED_DATA_BUCKET,
            Key='dataset_summary.json',
            Body=json.dumps(dataset_summary, indent=2, default=str),
            ContentType='application/json'
        )
        print("‚úÖ Dataset summary saved to S3")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not save summary to S3: {e}")
    
    # Save processed emails dataset
    try:
        # Convert to JSON for S3 storage
        emails_json = emails_df.to_json(orient='records', indent=2)
        
        s3_client.put_object(
            Bucket=PROCESSED_DATA_BUCKET,
            Key='processed_emails.json',
            Body=emails_json,
            ContentType='application/json'
        )
        print("‚úÖ Processed emails dataset saved to S3")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not save dataset to S3: {e}")
    
    # Save high-value business emails for AI processing
    try:
        business_emails_json = business_emails.to_json(orient='records', indent=2)
        
        s3_client.put_object(
            Bucket=PROCESSED_DATA_BUCKET,
            Key='business_emails_for_ai.json',
            Body=business_emails_json,
            ContentType='application/json'
        )
        print("‚úÖ High-value business emails saved for AI processing")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not save business emails to S3: {e}")
    
    # Closing recap; printed whether or not the uploads above succeeded.
    print(f"\nüìä RESEARCH DATASET READY:")
    print(f"   üìß {len(emails_df):,} emails processed")
    print(f"   üß† {len(business_emails)} high-value emails identified for AI")
    print(f"   üíæ All data stored in S3 for further analysis")
    print(f"   üöÄ Ready for advanced AI processing in next notebooks")

In [None]:
# Research Summary and Next Steps
# Static closing banner. These lines are printed unconditionally: they do not
# check whether any emails were loaded or whether the S3 uploads succeeded.
print("üéØ RESEARCH PHASE 1 COMPLETE")
print("=" * 30)
print("\n‚úÖ ACCOMPLISHED:")
print("   üìä Loaded and analyzed real Enron dataset")
print("   üï∏Ô∏è Mapped communication networks")
print("   üìà Identified business patterns")
print("   üíæ Stored processed data in S3")
print("   üé® Generated research visualizations")

# Pointers to the follow-up notebooks in the series.
print("\nüöÄ NEXT STEPS:")
print("   üìù Notebook 02: Advanced AI Processing")
print("   üß† Notebook 03: Predictive Modeling")
print("   üí° Notebook 04: Prescriptive Analytics")
print("   üóÑÔ∏è Notebook 05: Neo4j Graph Analysis")

# End timestamp, to pair with the start timestamp printed by the first cell.
print(f"\nüïê Completed at: {datetime.now()}")
print("\nüéâ This demonstrates the REAL POWER of our AI system with actual data!")