In [2]:
import pandas as pd
import numpy as np
import os
import email
from email.parser import Parser
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
enron_dir = "../data/raw/enron_dataset_test"
print(os.listdir(enron_dir))

['inbox']


In [4]:
inbox_path = os.path.join(enron_dir, 'inbox')
print(os.listdir(inbox_path)[:5])
print(f'Total files in inbox: {len(os.listdir(inbox_path))}')

['36.', '313.', '264.', '166.', '96.']
Total files in inbox: 343


In [5]:
def parse_email(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
            msg = Parser().parsestr(content)

            # Extract body
            body = []
            if msg.is_multipart():
                for part in msg.walk():
                    if part.get_content_type() == 'text/plain':
                        decoded_part = part.get_payload(decode=True)
                        if decoded_part:
                            decoded_text = decoded_part.decode('utf-8', errors='ignore')
                            body.append(decoded_text)
            else:
                payload = msg.get_payload(decode=True)
                if payload:
                    body.append(payload.decode('utf-8', errors='ignore'))
            
            # Extract recipients
            recipients = []
            for field in ['to', 'cc', 'bcc']:
                field_value = msg.get(field)
                if field_value:
                    # Split and clean recipient addresses
                    field_recipients = [r.strip() for r in field_value.split(',')]
                    recipients.extend(field_recipients)
            
            # Extract headers
            headers = {
                header.lower(): value 
                for header, value in msg.items()
            }

            return {
                'message_id': msg.get('message-id', ''),
                'subject': msg.get('subject', ''),
                'body': '\n'.join(body) if body else '',
                'sender': msg.get('from', ''),
                'date': msg.get('date', ''),
                'recipients': recipients,
                'headers': headers
                
            }
    except Exception as e:
        print(f"Error parsing {file_path}: {str(e)}")
        return None

sample_email = os.path.join(inbox_path, os.listdir(inbox_path)[0])
email_data = parse_email(sample_email)
print("\nSample Email Contents:")

# Print the extracted information
print("\nMessage ID:", email_data['message_id'])
print("\nSubject:", email_data['subject'])
print("\nSender:", email_data['sender'])
print("\nAll Recipients:", email_data['recipients'])
print("\nBody preview:", email_data['body'][:500] if email_data['body'] else "No body")


Sample Email Contents:

Message ID: <33004025.1075853131914.JavaMail.evans@thyme>

Subject: RE: Subsidiary Equity or Phantom Equity

Sender: nancy.corbet@enron.com

All Recipients: ['michelle.cash@enron.com', 'thomas.kalb@enron.com', 'g..bushman@enron.com', 'thomas.kalb@enron.com', 'g..bushman@enron.com']

Body preview: I am copying this to Tom and Teresa for info purposes. Tom can advise John but I think that it would be benefitial for you to leave John a message if you cannot reach him directly since I defer to your expertise in this area. I would be interested in participating in a discussion on this next week but do not hold things up on my account. Thanks, Nancy

-----Original Message-----
From: Cash, Michelle 
Sent: Thursday, October 11, 2001 2:29 AM
To: Corbet, Nancy
Subject: FW: Subsidiary Equity or Pha


In [6]:
def process_emails(directory):
    emails = []
    total_files = len(os.listdir(directory))

    print(f"Starting to precess {total_files} emails...")
    
    for i, filename in enumerate(os.listdir(directory), 1):
        file_path = os.path.join(directory, filename)
        email_data = parse_email(file_path)
    
        if email_data:
            emails.append(email_data)
    
        if i % 100 == 0:
            print(f"Processed {i}/{total_files} emails...")

    print(f"Successfully processed {len(emails)} emails")
    return pd.DataFrame(emails)

df = process_emails(inbox_path)

# Basic analysis
print("\nDataset Overview:")
print(f"Total emails: {len(df)}")
print(f"Columns: {df.columns.tolist()}")

# Basic statistics
print("\nBasic Statistics:")
print(f"Number of unique senders: {df['sender'].nunique()}")
print(f"Number of unique recipients: {len(set([r for recipients in df['recipients'] for r in recipients]))}")
print(f"Number of emails with subjects: {df['subject'].notna().sum()}")
print(f"Average subject length: {df['subject'].str.len().mean():.2f} characters")

# Show sample of the data
print("\nSample of the dataset (first 2 rows):")
pd.set_option('display.max_columns', None)
print(df[['message_id', 'subject', 'sender', 'date']].head(2))

# Save the processed data
df.to_pickle('../data/processed/processed_emails.pkl')
print("\nData saved to '../data/processed/processed_emails.pkl'")



Starting to precess 343 emails...
Processed 100/343 emails...
Processed 200/343 emails...
Processed 300/343 emails...
Successfully processed 343 emails

Dataset Overview:
Total emails: 343
Columns: ['message_id', 'subject', 'body', 'sender', 'date', 'recipients', 'headers']

Basic Statistics:
Number of unique senders: 145
Number of unique recipients: 654
Number of emails with subjects: 343
Average subject length: 28.18 characters

Sample of the dataset (first 2 rows):
                                      message_id  \
0  <33004025.1075853131914.JavaMail.evans@thyme>   
1   <5434593.1075862052937.JavaMail.evans@thyme>   

                                   subject                  sender  \
0  RE: Subsidiary Equity or Phantom Equity  nancy.corbet@enron.com   
1    RE: A few questions about the scripts  rick.johnson@enron.com   

                                    date  
0  Thu, 11 Oct 2001 08:59:26 -0700 (PDT)  
1  Mon, 26 Nov 2001 09:07:58 -0800 (PST)  

Data saved to '../data/proces

In [7]:
df

Unnamed: 0,message_id,subject,body,sender,date,recipients,headers
0,<33004025.1075853131914.JavaMail.evans@thyme>,RE: Subsidiary Equity or Phantom Equity,I am copying this to Tom and Teresa for info p...,nancy.corbet@enron.com,"Thu, 11 Oct 2001 08:59:26 -0700 (PDT)","[michelle.cash@enron.com, thomas.kalb@enron.co...",{'message-id': '<33004025.1075853131914.JavaMa...
1,<5434593.1075862052937.JavaMail.evans@thyme>,RE: A few questions about the scripts,Scripts and the coresponding talking points we...,rick.johnson@enron.com,"Mon, 26 Nov 2001 09:07:58 -0800 (PST)",[michelle.cash@enron.com],{'message-id': '<5434593.1075862052937.JavaMai...
2,<637032.1075862051693.JavaMail.evans@thyme>,WARN issues,Document attached. \n\n - 2k01 sc design m2.doc,fmackin@aol.com,"Sun, 18 Nov 2001 10:27:39 -0800 (PST)",[michelle.cash@enron.com],{'message-id': '<637032.1075862051693.JavaMail...
3,<2906173.1075855361040.JavaMail.evans@thyme>,RE: Employee Trust Documentation,"March!\n \nSeriously, I will find time this we...",david.oxley@enron.com,"Thu, 13 Dec 2001 06:04:55 -0800 (PST)",[michelle.cash@enron.com],{'message-id': '<2906173.1075855361040.JavaMai...
4,<33527112.1075853133594.JavaMail.evans@thyme>,New Agreement,"Michelle -- per our discussion, please prepare...",ann.hill@enron.com,"Mon, 29 Oct 2001 18:05:15 -0800 (PST)","[michelle.cash@enron.com, twanda.sweet@enron.c...",{'message-id': '<33527112.1075853133594.JavaMa...
...,...,...,...,...,...,...,...
338,<15266483.1075855360479.JavaMail.evans@thyme>,Clickathome Q&A as of 11/26,I havn't been able yet to pin down the posted ...,lizzette.palmer@enron.com,"Wed, 5 Dec 2001 16:59:13 -0800 (PST)","[sharon.butcher@enron.com, michelle.cash@enron...",{'message-id': '<15266483.1075855360479.JavaMa...
339,<30167490.1075855360021.JavaMail.evans@thyme>,RE: Human Contact to Answer Questions about HR...,You too! I saw yor 5:30 AM message !\n-------...,fran.mayes@enron.com,"Fri, 30 Nov 2001 04:28:00 -0800 (PST)",[michelle.cash@enron.com],{'message-id': '<30167490.1075855360021.JavaMa...
340,<8735198.1075853133058.JavaMail.evans@thyme>,Casey v. NEPCO,"\nAttached for your review and comment, please...",cneely@dsda.com,"Thu, 25 Oct 2001 11:29:59 -0700 (PDT)","[michelle.cash@enron.com, mattson@enron.com, l...",{'message-id': '<8735198.1075853133058.JavaMai...
341,<27490461.1075862050782.JavaMail.evans@thyme>,RE: can we talk briefly?,"Michelle,\n \nThank you for your reply. I have...",william.pentak@enron.com,"Tue, 13 Nov 2001 15:09:54 -0800 (PST)",[michelle.cash@enron.com],{'message-id': '<27490461.1075862050782.JavaMa...


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import re
import spacy
import numpy as np

class EmailLabeler:
    def __init__(self):
        # Load spaCy for better text processing
        self.nlp = spacy.load('en_core_web_sm')
        
        # Define strong indicators for each category
        self.category_patterns = {
            'PERSONAL': {
                'senders': [
                    '@gmail.com', '@yahoo.com', '@hotmail.com', '@outlook.com'
                ],
                'strong_keywords': [
                    'family', 'personal', 'friend', 'dinner', 'lunch', 'weekend',
                    'vacation', 'holiday', 'birthday', 'social', 'meetup'
                ],
                'subject_patterns': [
                    r'\b(hey|hi|hello)\b', 
                    r'personal matter',
                    r'family|friend'
                ],
                'header_indicators': [
                    'personal', 'private', 'confidential'
                ]
            },
            'CAREER': {
                'senders': [
                    '@linkedin.com', '@indeed.com', '@careers.', '@recruit.',
                    '@talent.', '@hr.', '@jobs.'
                ],
                'strong_keywords': [
                    'job', 'career', 'position', 'opportunity', 'resume', 'cv',
                    'interview', 'recruitment', 'hiring', 'salary', 'application',
                    'qualification', 'experience', 'skill'
                ],
                'subject_patterns': [
                    r'job (opportunity|opening|position)',
                    r'career|position|vacancy',
                    r'interview|recruitment'
                ],
                'header_indicators': [
                    'job', 'career', 'recruitment', 'employment'
                ]
            },
            'FINANCE': {
                'senders': [
                    '@bank.', '@paypal.', '@billing.', '@finance.',
                    '@accounting.', '@invoice.', '@tax.'
                ],
                'strong_keywords': [
                    'transaction', 'payment', 'invoice', 'bank', 'financial',
                    'account', 'balance', 'credit', 'debit', 'statement',
                    'tax', 'investment', 'money', 'fund'
                ],
                'subject_patterns': [
                    r'payment|invoice|transaction',
                    r'financial|statement|balance',
                    r'tax|investment'
                ],
                'header_indicators': [
                    'finance', 'banking', 'payment', 'invoice'
                ]
            },
            'SHOPPING': {
                'senders': [
                    '@amazon.', '@ebay.', '@walmart.', '@shop.',
                    '@store.', '@retail.', '@order.'
                ],
                'strong_keywords': [
                    'order', 'purchase', 'delivery', 'shipping', 'tracking',
                    'item', 'product', 'cart', 'buy', 'price', 'discount',
                    'deal', 'sale', 'shop'
                ],
                'subject_patterns': [
                    r'order (confirmation|status|shipped)',
                    r'delivery|tracking',
                    r'purchase|shopping'
                ],
                'header_indicators': [
                    'order', 'purchase', 'shipping', 'delivery'
                ]
            },
            'HEALTH': {
                'senders': [
                    '@hospital.', '@clinic.', '@healthcare.',
                    '@medical.', '@health.', '@doctor.'
                ],
                'strong_keywords': [
                    'appointment', 'medical', 'health', 'doctor', 'clinic',
                    'prescription', 'medication', 'treatment', 'insurance',
                    'patient', 'healthcare', 'wellness'
                ],
                'subject_patterns': [
                    r'medical|health|appointment',
                    r'doctor|clinic|hospital',
                    r'prescription|medication'
                ],
                'header_indicators': [
                    'medical', 'health', 'healthcare', 'appointment'
                ]
            },
            'SUBSCRIPTIONS': {
                'senders': [
                    '@netflix.', '@spotify.', '@subscription.',
                    '@service.', '@membership.'
                ],
                'strong_keywords': [
                    'subscription', 'membership', 'renewal', 'plan', 'service',
                    'account', 'streaming', 'monthly', 'annual', 'auto-renewal',
                    'premium'
                ],
                'subject_patterns': [
                    r'subscription|membership',
                    r'renewal|plan|service',
                    r'account (status|update)'
                ],
                'header_indicators': [
                    'subscription', 'membership', 'service', 'account'
                ]
            },
            'BUSINESS COMMUNICATION': {
                'senders': [
                    '@company.', '@corp.', '@business.', '@team.', '@hr.', '@consulting.'
                ],
                'strong_keywords': [
                    'meeting', 'agenda', 'project', 'business', 'collaboration', 'team',
                    'report', 'proposal', 'task', 'update', 'deadline', 'strategy',
                    'client', 'conference', 'presentation'
                ],
                'subject_patterns': [
                    r'project|report|business',
                    r'collaboration|meeting|update',
                    r'proposal|deadline|strategy'
                ],
                'header_indicators': [
                    'business', 'project', 'team', 'update', 'meeting'
                ]
            }
        }
        
        # Initialize TF-IDF vectorizer
        self.vectorizer = TfidfVectorizer(
            max_features=1000,
            stop_words='english',
            ngram_range=(1, 2)
        )
        
    def preprocess_text(self, text):
        """Clean and process text using spaCy."""
        doc = self.nlp(text)
        # Keep only nouns, verbs, adjectives, and proper nouns
        tokens = [
            token.lemma_.lower() for token in doc 
            if (token.pos_ in ['NOUN', 'VERB', 'ADJ', 'PROPN'] and 
                not token.is_stop and 
                not token.is_punct)
        ]
        return ' '.join(tokens)
    
    def calculate_category_score(self, email, category, patterns):
        """Calculate confidence score for a category."""
        score = 0
        confidence_factors = []
        
        # Preprocess email content
        text = self.preprocess_text(f"{email['subject']} {email['body']}").lower()
        sender = email['sender'].lower()
        
        # Check sender domain (highest weight)
        for domain in patterns['senders']:
            if domain in sender:
                score += 10
                confidence_factors.append(f"Sender matches {domain}")
        
        # Check strong keywords
        keyword_matches = 0
        for keyword in patterns['strong_keywords']:
            if keyword in text:
                keyword_matches += 1
        score += keyword_matches * 3
        if keyword_matches > 0:
            confidence_factors.append(f"Found {keyword_matches} keywords")
        
        # Check subject patterns
        for pattern in patterns['subject_patterns']:
            if re.search(pattern, email['subject'].lower()):
                score += 5
                confidence_factors.append(f"Subject matches pattern {pattern}")
        
        # Check headers
        headers = email['headers']
        for indicator in patterns['header_indicators']:
            if any(indicator in str(value).lower() for value in headers.values()):
                score += 4
                confidence_factors.append(f"Header contains {indicator}")
        
        return score, confidence_factors
    
    def label_email(self, email):
        """Assign category with confidence score."""
        scores = {}
        all_factors = {}
        
        # Calculate scores for each category
        for category, patterns in self.category_patterns.items():
            score, factors = self.calculate_category_score(email, category, patterns)
            scores[category] = score
            all_factors[category] = factors
        
        # Get the highest scoring category
        max_score = max(scores.values())
        best_category = max(scores.items(), key=lambda x: x[1])[0]
        
        # Calculate confidence level
        confidence_level = min(max_score / 20, 1.0)  # Normalize to 0-1
        
        return {
            'category': best_category,
            'confidence': confidence_level,
            'score': max_score,
            'factors': all_factors[best_category]
        }
    
    def batch_label_emails(self, emails):
        """Label multiple emails and return results."""
        results = []
        for email in emails:
            label_info = self.label_email(email)
            results.append({
                'message_id': email['message_id'],
                'category': label_info['category'],
                'confidence': label_info['confidence'],
                'factors': label_info['factors']
            })
        return results


# Usage example:
labeler = EmailLabeler()

# Label single email
sample_email = {
    'message_id': '123',
    'subject': 'Your Amazon order has shipped',
    'body': 'Your recent purchase has been shipped. Track your delivery...',
    'sender': 'ship-confirm@amazon.com',
    'headers': {'List-Id': 'shipping_notifications'}
}
result = labeler.label_email(sample_email)
print(f"Category: {result['category']}")
print(f"Confidence: {result['confidence']:.2f}")
print("Factors:", result['factors'])

# Batch label emails
labeled_data = labeler.batch_label_emails(df.to_dict('records'))
results_df = pd.DataFrame(labeled_data)

Category: SHOPPING
Confidence: 1.00
Factors: ['Sender matches @amazon.', 'Found 3 keywords', 'Header contains shipping']


In [17]:
# Initialize the labeler
labeler = EmailLabeler()

# Process all emails from our DataFrame
labeled_results = labeler.batch_label_emails(df.to_dict('records'))

# Convert results to DataFrame for analysis
results_df = pd.DataFrame(labeled_results)

# Show distribution of categories
print("\nCategory Distribution:")
print(results_df['category'].value_counts())

# Show average confidence per category
print("\nAverage Confidence by Category:")
print(results_df.groupby('category')['confidence'].mean())

# Show some sample results with high and low confidence
print("\nSample High Confidence Classifications:")
high_conf = results_df[results_df['confidence'] > 0.8].head(3)
for _, row in high_conf.iterrows():
    print(f"\nCategory: {row['category']}")
    print(f"Confidence: {row['confidence']:.2f}")
    print("Factors:", row['factors'])

print("\nSample Low Confidence Classifications:")
low_conf = results_df[results_df['confidence'] < 0.4].head(3)
for _, row in low_conf.iterrows():
    print(f"\nCategory: {row['category']}")
    print(f"Confidence: {row['confidence']:.2f}")
    print("Factors:", row['factors'])

# Save the labeled dataset
results_df.to_csv('../data/processed/auto_labeled_emails.csv', index=False)


Category Distribution:
category
BUSINESS COMMUNICATION    91
PERSONAL                  89
FINANCE                   66
CAREER                    47
SUBSCRIPTIONS             36
SHOPPING                  14
Name: count, dtype: int64

Average Confidence by Category:
category
BUSINESS COMMUNICATION    0.490659
CAREER                    0.286170
FINANCE                   0.418182
PERSONAL                  0.059551
SHOPPING                  0.192857
SUBSCRIPTIONS             0.255556
Name: confidence, dtype: float64

Sample High Confidence Classifications:

Category: BUSINESS COMMUNICATION
Confidence: 1.00
Factors: ['Found 4 keywords', 'Subject matches pattern project|report|business', 'Header contains project']

Category: BUSINESS COMMUNICATION
Confidence: 0.85
Factors: ['Found 4 keywords', 'Subject matches pattern project|report|business']

Category: BUSINESS COMMUNICATION
Confidence: 1.00
Factors: ['Found 8 keywords', 'Subject matches pattern project|report|business']

Sample Low Confid

In [18]:
# Merge the original email data with the labels
detailed_results = pd.merge(
    df, 
    results_df, 
    on='message_id', 
    how='left'
)

# Select and reorder columns, then print
columns_to_show = ['body', 'category', 'confidence']
detailed_results[columns_to_show]

Unnamed: 0,body,category,confidence
0,I am copying this to Tom and Teresa for info p...,FINANCE,0.45
1,Scripts and the coresponding talking points we...,FINANCE,0.30
2,Document attached. \n\n - 2k01 sc design m2.doc,PERSONAL,0.00
3,"March!\n \nSeriously, I will find time this we...",PERSONAL,0.00
4,"Michelle -- per our discussion, please prepare...",FINANCE,0.15
...,...,...,...
338,I havn't been able yet to pin down the posted ...,FINANCE,0.15
339,You too! I saw yor 5:30 AM message !\n-------...,PERSONAL,0.00
340,"\nAttached for your review and comment, please...",PERSONAL,0.00
341,"Michelle,\n \nThank you for your reply. I have...",BUSINESS COMMUNICATION,0.30


In [19]:
for _, row in detailed_results.iterrows():
    print(f"\nCategory: {row['category']}, Confidence: {row['confidence']:.2f}")
    print(f"Subject: {row['subject']}")
    print(f"Body: {row['body']}...")  # Show first 200 characters
    print("-" * 80)


Category: FINANCE, Confidence: 0.45
Subject: RE: Subsidiary Equity or Phantom Equity
Body: I am copying this to Tom and Teresa for info purposes. Tom can advise John but I think that it would be benefitial for you to leave John a message if you cannot reach him directly since I defer to your expertise in this area. I would be interested in participating in a discussion on this next week but do not hold things up on my account. Thanks, Nancy

-----Original Message-----
From: Cash, Michelle 
Sent: Thursday, October 11, 2001 2:29 AM
To: Corbet, Nancy
Subject: FW: Subsidiary Equity or Phantom Equity


FYI re:  equity grants/phantom equity grants for NuTec and related companies.  Any equity-type grants must go to the Enron Board Compensation Committee for approval, which is not simply a rubber stamp.  There must be significant justification as to why such a plan is necessary.  Do you want to let John know about this, or should I?

I have not addressed the equity/phantom equity in the emplo

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

class EmailPreprocessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        
    def clean_text(self, text):
        """Clean and preprocess text."""
        if not isinstance(text, str):
            return ''
            
        # Convert to lowercase
        text = text.lower()
        
        # Remove email addresses
        text = re.sub(r'\S*@\S*\s?', '', text)
        
        # Remove URLs
        text = re.sub(r'http\S+|www.\S+', '', text)
        
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        
        # Tokenize
        tokens = word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [
            self.lemmatizer.lemmatize(token) 
            for token in tokens 
            if token not in self.stop_words
        ]
        
        return ' '.join(tokens)
    
    def preprocess_email(self, row):
        """Combine and preprocess subject and body."""
        subject = str(row['subject']) if pd.notnull(row['subject']) else ''
        body = str(row['body']) if pd.notnull(row['body']) else ''
        
        # Combine subject and body with more weight to subject
        combined_text = subject + ' ' + subject + ' ' + body
        return self.clean_text(combined_text)

# Prepare the data
def prepare_data(df):
    # Initialize preprocessor
    preprocessor = EmailPreprocessor()
    
    # Preprocess emails
    print("Preprocessing emails...")
    df['processed_text'] = df.apply(preprocessor.preprocess_email, axis=1)
    
    # Create feature vectors using TF-IDF
    print("Creating TF-IDF features...")
    vectorizer = TfidfVectorizer(
        max_features=1000,    # Limit features to top 1000 terms
        min_df=2,            # Ignore terms that appear in less than 2 documents
        max_df=0.95,         # Ignore terms that appear in more than 95% of docs
        ngram_range=(1, 2)   # Include both unigrams and bigrams
    )
    
    # Create features and split data
    X = vectorizer.fit_transform(df['processed_text'])
    y = df['category']
    
    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    print("Data preparation completed!")
    print(f"Training set size: {X_train.shape}")
    print(f"Test set size: {X_test.shape}")
    
    return X_train, X_test, y_train, y_test, vectorizer

# Analyze processed text
def analyze_features(vectorizer, X):
    # Get feature names
    feature_names = vectorizer.get_feature_names_out()
    
    # Calculate mean TF-IDF score for each feature
    mean_tfidf = X.mean(axis=0).A1
    
    # Create DataFrame of features and their scores
    feature_scores = pd.DataFrame({
        'feature': feature_names,
        'mean_tfidf': mean_tfidf
    })
    
    # Show top features
    print("\nTop 20 most important features:")
    print(feature_scores.sort_values('mean_tfidf', ascending=False).head(20))

# Use the preprocessing pipeline
X_train, X_test, y_train, y_test, vectorizer = prepare_data(detailed_results)

# Analyze the features
analyze_features(vectorizer, X_train)

# Save processed data if needed
processed_data = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'vectorizer': vectorizer
}

import joblib
joblib.dump(processed_data, '../data/processed/processed_email_data.pkl')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/samiransari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samiransari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/samiransari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/samiransari/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Preprocessing emails...
Creating TF-IDF features...
Data preparation completed!
Training set size: (274, 1000)
Test set size: (69, 1000)

Top 20 most important features:
              feature  mean_tfidf
284             enron    0.042297
274          employee    0.039067
549          michelle    0.039006
540           message    0.037066
666            please    0.033034
803              sent    0.031473
863           subject    0.029450
993             would    0.029374
664              plan    0.029246
622          original    0.027962
672                pm    0.027940
623  original message    0.027111
349                fw    0.026173
716          question    0.026100
72           attached    0.024869
907            thanks    0.023748
27          agreement    0.023069
577              need    0.022963
121              cash    0.022515
978              week    0.022023


['../data/processed/processed_email_data.pkl']

In [29]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

def train_model(X_train, y_train):
    """Train a Multinomial Naive Bayes model."""
    print("Training Multinomial Naive Bayes model...")
    model = MultinomialNB()
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """Evaluate the trained model on the test set."""
    print("Evaluating model...")
    y_pred = model.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

# Use the preprocessing pipeline
X_train, X_test, y_train, y_test, vectorizer = prepare_data(detailed_results)

# Train the model
model = train_model(X_train, y_train)

# Evaluate the model
evaluate_model(model, X_test, y_test)

# Save the trained model and vectorizer if needed
trained_model = {
    'model': model,
    'vectorizer': vectorizer
}
joblib.dump(trained_model, '../models/email_classifier.pkl')

Preprocessing emails...
Creating TF-IDF features...
Data preparation completed!
Training set size: (274, 1000)
Test set size: (69, 1000)
Training Multinomial Naive Bayes model...
Evaluating model...
Classification Report:
                        precision    recall  f1-score   support

BUSINESS COMMUNICATION       0.59      0.72      0.65        18
                CAREER       1.00      0.10      0.18        10
               FINANCE       0.62      0.77      0.69        13
              PERSONAL       0.40      0.67      0.50        18
              SHOPPING       0.00      0.00      0.00         3
         SUBSCRIPTIONS       0.00      0.00      0.00         7

              accuracy                           0.52        69
             macro avg       0.44      0.38      0.34        69
          weighted avg       0.52      0.52      0.46        69

Accuracy: 0.5217


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['../models/email_classifier.pkl']

In [38]:
def predict_email(model, vectorizer, email_text):
    """Predict the category and confidence score of a new email."""
    # Preprocess the email text
    preprocessor = EmailPreprocessor()
    processed_text = preprocessor.clean_text(email_text)
    
    # Transform the preprocessed text using the loaded vectorizer
    email_features = vectorizer.transform([processed_text])
    
    # Make predictions using the loaded model
    predicted_category = model.predict(email_features)[0]
    
    # Get the confidence score for the predicted category
    confidence_score = model.predict_proba(email_features)[0, model.classes_.tolist().index(predicted_category)]
    
    return predicted_category, confidence_score

# Load the trained model and vectorizer
loaded_model = joblib.load('../models/email_classifier.pkl')
model = loaded_model['model']
vectorizer = loaded_model['vectorizer']

# Example usage
new_email_text = "Subject: Important Meeting\nHello Team,\n\nJust a reminder that we have an important meeting scheduled for tomorrow at 10 AM. Please make sure to attend.\n\nBest regards,\nJohn"
new_email_text2 = "Subject: Invoice Payment #1234\nDear Client,\n\nThis is a reminder that your invoice payment of $500 is due by next week. Please process the payment at your earliest convenience.\n\nBank Account Details:\nAccount Number: 1234-5678\nBank: Finance Bank\n\nPlease let us know once the payment is processed.\n\nBest regards,\nBilling Team"
email_text3 = "Subject: Medical Appointment Reminder\nDear Mr. Smith,\n\nThis is a reminder about your upcoming medical appointment scheduled for tomorrow at 2 PM with Dr. Johnson. Please arrive 15 minutes early to complete any necessary paperwork.\n\nPlease bring:\n- Insurance card\n- List of current medications\n- Recent test results if any\n\nIf you need to reschedule, please call our office at least 24 hours in advance.\n\nBest regards,\nCity Health Clinic"

predicted_category, confidence = predict_email(model, vectorizer, email_text3)
print(f"Predicted Category: {predicted_category}")
print(f"Confidence Score: {confidence:.4f}")

Predicted Category: PERSONAL
Confidence Score: 0.2835


In [36]:
import os
import joblib

def predict_email_files(model, vectorizer, email_directory):
    """Predict categories for multiple email files in a directory."""
    preprocessor = EmailPreprocessor()
    
    for filename in os.listdir(email_directory):
        if filename.endswith("."):  # Assuming email files have .txt extension
            file_path = os.path.join(email_directory, filename)
            
            with open(file_path, 'r') as file:
                email_text = file.read()
            
            processed_text = preprocessor.clean_text(email_text)
            email_features = vectorizer.transform([processed_text])
            
            predicted_category = model.predict(email_features)[0]
            confidence_score = model.predict_proba(email_features)[0, model.classes_.tolist().index(predicted_category)]
            
            print(f"File: {filename}")
            print(f"Predicted Category: {predicted_category}")
            print(f"Confidence Score: {confidence_score:.4f}")
            print("---")

# Load the trained model and vectorizer
loaded_model = joblib.load('../models/email_classifier.pkl')
model = loaded_model['model']
vectorizer = loaded_model['vectorizer']

# Directory containing the email files
email_directory = '../data/raw/enron_dataset/allen-p/inbox'

# Predict categories for email files
predict_email_files(model, vectorizer, email_directory)

File: 36.
Predicted Category: BUSINESS COMMUNICATION
Confidence Score: 0.2949
---
File: 19.
Predicted Category: PERSONAL
Confidence Score: 0.2929
---
File: 3.
Predicted Category: PERSONAL
Confidence Score: 0.3690
---
File: 9.
Predicted Category: PERSONAL
Confidence Score: 0.2828
---
File: 13.
Predicted Category: FINANCE
Confidence Score: 0.3076
---
File: 75.
Predicted Category: PERSONAL
Confidence Score: 0.2921
---
File: 35.
Predicted Category: BUSINESS COMMUNICATION
Confidence Score: 0.3504
---
File: 10.
Predicted Category: BUSINESS COMMUNICATION
Confidence Score: 0.3655
---
File: 84.
Predicted Category: FINANCE
Confidence Score: 0.3121
---
File: 42.
Predicted Category: BUSINESS COMMUNICATION
Confidence Score: 0.2663
---
File: 24.
Predicted Category: BUSINESS COMMUNICATION
Confidence Score: 0.3464
---
File: 67.
Predicted Category: BUSINESS COMMUNICATION
Confidence Score: 0.6507
---
File: 41.
Predicted Category: PERSONAL
Confidence Score: 0.2929
---
File: 87.
Predicted Category: FINANC