In [2]:
# Exploratory Data Analysis of Kaggle Enron Email Dataset

# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# 2. Load Dataset
# Adjust the path and column names as needed for your dataset
df = pd.read_csv('../dataset/emails.csv')

# 3. Initial Dataset Exploration
# Quick structural overview: dimensions, column names and a preview of rows.
print("Dataset shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())
print("\nFirst few rows:")
print(df.head())

print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()

# Per-column count of missing values.
print("\nMissing values:")
print(df.isnull().sum())

# 4. Analyze Email Structure
# Peek at one raw message to see how headers and body are laid out.
print(f"\n{'=' * 50}")
print("EMAIL STRUCTURE ANALYSIS")
print("=" * 50)

# Show the first email, truncated to 1000 characters when it is longer.
print("\nSample email message:")
sample_email = df['message'].iloc[0]
if len(sample_email) > 1000:
    print(sample_email[:1000] + "...")
else:
    print(sample_email)

# 5. Extract and Analyze Email Components
def extract_email_components(email_text):
    """Extract common email components from raw email text.

    The text is split at the first blank line into a header block and a body.
    Header fields are looked up only inside the header block and only at the
    start of a line, so that:

    * ``To:`` does not match inside ``X-To:`` (and similar prefixed headers),
    * header-looking text inside the body (e.g. forwarded messages) is ignored,
    * an empty header such as ``Subject:`` yields ``""`` instead of swallowing
      the following header line,
    * folded (multi-line) header values are joined into a single line.

    Args:
        email_text: Raw message text (headers, blank line, body). Non-string
            input (e.g. NaN from a missing cell) is treated as an empty message.

    Returns:
        dict with the keys ``'message_id'``, ``'date'``, ``'from'``, ``'to'``,
        ``'subject'``, ``'cc'``, ``'bcc'`` and ``'body'`` (in that order).
        Header values are stripped strings, ``""`` when the header is present
        but empty, and ``None`` when the header is absent. ``'body'`` is always
        a string (``""`` when there is no body).
    """
    components = {
        'message_id': None,
        'date': None,
        'from': None,
        'to': None,
        'subject': None,
        'cc': None,
        'bcc': None,
        'body': "",
    }

    # Missing values in a DataFrame column show up as float NaN / None.
    if not isinstance(email_text, str):
        return components

    # The first blank line separates the headers from the body.
    parts = re.split(r'\n\s*\n', email_text, maxsplit=1)
    header_block = parts[0]
    if len(parts) > 1:
        components['body'] = parts[1].strip()

    # Unfold continuation lines: a header line break followed by whitespace
    # continues the previous header value (long recipient lists, for example).
    header_block = re.sub(r'[ \t]*\r?\n[ \t]+', ' ', header_block)

    # Anchor at line start; header field names are case-insensitive.
    flags = re.MULTILINE | re.IGNORECASE

    # Message-ID keeps only the text between the angle brackets.
    message_id_match = re.search(
        r'^Message-ID:[ \t]*<([^>\r\n]+)>', header_block, flags
    )
    if message_id_match:
        components['message_id'] = message_id_match.group(1)

    # Remaining single-valued headers share one pattern. "[ \t]*" (not "\s*")
    # keeps the match on the header's own line even when the value is empty.
    simple_headers = (
        ('date', 'Date'),
        ('from', 'From'),
        ('to', 'To'),
        ('subject', 'Subject'),
        ('cc', 'Cc'),
        ('bcc', 'Bcc'),
    )
    for key, header_name in simple_headers:
        header_match = re.search(
            rf'^{header_name}:[ \t]*([^\r\n]*)', header_block, flags
        )
        if header_match:
            components[key] = header_match.group(1).strip()

    return components

# Parse the first few emails and print every non-empty field,
# shortening values longer than 100 characters.
print("\nExtracting email components from sample emails...")
sample_size = min(5, len(df))
for i, raw_message in enumerate(df['message'].iloc[:sample_size]):
    print(f"\n--- Email {i+1} ---")
    components = extract_email_components(raw_message)
    for key, value in components.items():
        if not value:
            # Skip fields that are missing or empty.
            continue
        if len(str(value)) > 100:
            print(f"{key.capitalize()}: {value[:100]}...")
        else:
            print(f"{key.capitalize()}: {value}")

# 6. Statistical Overview of Email Features
section_rule = "=" * 50
print("\n" + section_rule)
print("EMAIL FEATURES STATISTICS")
print(section_rule)

# Only the first 1000 emails are parsed to keep this cell fast.
sample_df = df.head(1000).copy()
components_list = list(map(extract_email_components, sample_df['message']))
components_df = pd.DataFrame(components_list)

# How often each parsed field is present (non-null) in the sample.
total_emails = len(components_df)
print(f"\nAnalyzing {total_emails} emails...")
print("\nEmail components availability:")
for col, non_null_count in components_df.notna().sum().items():
    percentage = (non_null_count / total_emails) * 100
    print(f"{col.capitalize()}: {non_null_count}/{total_emails} ({percentage:.1f}%)")

# Subject line analysis: character lengths of the non-null subjects.
subjects = components_df['subject'].dropna()
subject_lengths = subjects.str.len()
print("\nSubject line statistics:")
print(f"Average subject length: {subject_lengths.mean():.1f} characters")
print(f"Subject length range: {subject_lengths.min()} - {subject_lengths.max()} characters")

# Email body analysis: character lengths of the non-null bodies.
bodies = components_df['body'].dropna()
body_lengths = bodies.str.len()
print("\nEmail body statistics:")
print(f"Average body length: {body_lengths.mean():.1f} characters")
print(f"Body length range: {body_lengths.min()} - {body_lengths.max()} characters")

# Sender analysis: distinct senders and the ten most frequent ones.
senders = components_df['from'].dropna()
print("\nSender information:")
print(f"Unique senders: {senders.nunique()}")
print("Most common senders:")
print(senders.value_counts().head(10))


Dataset shape: (517401, 2)

Column names:
['file', 'message']

First few rows:
                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   file     517401 non-null  object
 1   message  517401 non-null  object
dtypes: object(2)
memory usage: 7.9+ MB
None

Missing values:
file       0
message    0
dtype: int64

EMAIL STRUCTURE ANALYSIS

Sample email message:
Message-ID: <1878298