In [1]:
# Step 1: email text preprocessing

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK English stop-word corpus that stopwords.words('english')
# reads when a preprocessor is constructed below. The captured cell output
# shows the call only reports "already up-to-date" when the package exists.
nltk.download('stopwords')

class EmailPreprocessor:
    """Baseline preprocessor: cleans email text and builds TF-IDF features.

    Holds a Porter stemmer, the NLTK English stop-word set, a TF-IDF
    vectorizer capped at 5000 features, and a label encoder for categories.
    """

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))
        self.vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
        self.label_encoder = LabelEncoder()

    def clean_text(self, text):
        """Return a lowercase, letters-only, stop-word-free, stemmed string.

        Missing values (anything ``pd.isna`` reports as NA) become ``""``.
        """
        if pd.isna(text):
            return ""

        # Lowercase first, then strip everything that is not a letter or
        # whitespace (digits and punctuation disappear here).
        letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

        # split() with no argument already collapses runs of whitespace.
        stemmed_tokens = []
        for token in letters_only.split():
            if token in self.stop_words:
                continue
            stemmed_tokens.append(self.stemmer.stem(token))

        return ' '.join(stemmed_tokens)

    def prepare_features(self, df):
        """Fit the vectorizer and label encoder on ``df`` and return ``(X, y)``.

        Reads the ``Subject``, ``Body`` and ``category`` columns and writes two
        helper columns, ``combined_text`` and ``cleaned_text``, onto ``df``.
        """
        subjects = df['Subject'].fillna('')
        bodies = df['Body'].fillna('')

        # Subject and body are joined with a single space before cleaning.
        df['combined_text'] = subjects + ' ' + bodies
        df['cleaned_text'] = df['combined_text'].apply(self.clean_text)

        features = self.vectorizer.fit_transform(df['cleaned_text'])
        labels = self.label_encoder.fit_transform(df['category'])

        return features, labels


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# import joblib

# class EmailClassifier:
#     def __init__(self):
#         self.models = {
#             'Multinomial_NB': MultinomialNB(),
#             'Decision_Tree': DecisionTreeClassifier(random_state=42),
#             'Random_Forest': RandomForestClassifier(n_estimators=100, random_state=42),
#             'Logistic_Regression': LogisticRegression(max_iter=1000, random_state=42),
#             'SVM': SVC(kernel='linear', random_state=42)
#         }
#         self.best_model = None
#         self.best_accuracy = 0
#         self.preprocessor = EmailPreprocessor()
    
#     def train_models(self, X_train, X_test, y_train, y_test):
#         """Train all models and find the best one"""
#         results = {}
        
#         for name, model in self.models.items():
#             print(f"Training {name}...")
            
#             # Train the model
#             model.fit(X_train, y_train)
            
#             # Make predictions
#             y_pred = model.predict(X_test)
            
#             # Calculate accuracy
#             accuracy = accuracy_score(y_test, y_pred)
#             results[name] = {
#                 'model': model,
#                 'accuracy': accuracy,
#                 'classification_report': classification_report(y_test, y_pred)
#             }
            
#             print(f"{name} Accuracy: {accuracy:.4f}")
            
#             # Update best model
#             if accuracy > self.best_accuracy:
#                 self.best_accuracy = accuracy
#                 self.best_model = model
#                 self.best_model_name = name
        
#         return results
    
#     def save_best_model(self, filepath='best_email_classifier.pkl'):
#         """Save the best model and preprocessor"""
#         model_data = {
#             'model': self.best_model,
#             'preprocessor': self.preprocessor,
#             'model_name': self.best_model_name,
#             'accuracy': self.best_accuracy
#         }
#         joblib.dump(model_data, filepath)
#         print(f"Best model ({self.best_model_name}) saved with accuracy: {self.best_accuracy:.4f}")


In [4]:
# Enhanced model training with better data handling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import joblib

class ImprovedEmailClassifier:
    """Train several scikit-learn classifiers on email features and keep the best.

    Attributes:
        models: Mapping of display name to an unfitted estimator.
        best_model: Fitted estimator with the highest test accuracy so far,
            or ``None`` until ``train_models`` selects one.
        best_model_name: Key in ``models`` of ``best_model`` (``None`` until set).
        best_accuracy: Test accuracy of ``best_model`` (0 until set).
        preprocessor: ``ImprovedEmailPreprocessor`` whose label encoder is used
            to turn encoded labels back into category names.
    """

    def __init__(self):
        self.models = {
            'Multinomial_NB': MultinomialNB(alpha=1.0),  # Better alpha value
            'Decision_Tree': DecisionTreeClassifier(
                random_state=42, 
                max_depth=10,  # Prevent overfitting
                min_samples_split=10,
                min_samples_leaf=5
            ),
            'Random_Forest': RandomForestClassifier(
                n_estimators=100, 
                random_state=42,
                max_depth=10,
                min_samples_split=10,
                class_weight='balanced'  # Handle imbalanced data
            ),
            'Logistic_Regression': LogisticRegression(
                max_iter=1000, 
                random_state=42,
                class_weight='balanced',  # Important for imbalanced data
                C=1.0
            ),
            'SVM': SVC(
                kernel='linear', 
                random_state=42,
                class_weight='balanced',  # Handle imbalanced data
                probability=True  # Enable probability predictions
            )
        }
        self.best_model = None
        # Initialised here so the attribute exists even before training.
        self.best_model_name = None
        self.best_accuracy = 0
        self.preprocessor = ImprovedEmailPreprocessor()
    
    def check_data_balance(self, y):
        """Print the per-category share of ``y`` and report whether it is balanced.

        Args:
            y: Array-like of encoded labels (decoded for display through
                ``self.preprocessor.label_encoder``).

        Returns:
            ``False`` when a single category holds more than 70% of the
            samples, ``True`` otherwise (including for empty input).
        """
        unique, counts = np.unique(y, return_counts=True)
        total = len(y)
        print("Data Distribution:")
        if total == 0:
            # max() on an empty array would raise; nothing to analyse.
            print("  (no samples)")
            return True

        for label, count in zip(unique, counts):
            category = self.preprocessor.label_encoder.inverse_transform([label])[0]
            percentage = (count / total) * 100
            print(f"  {category}: {count} samples ({percentage:.1f}%)")
        
        # Check if any category is over-represented
        max_percentage = max(counts) / total * 100
        if max_percentage > 70:
            print(f"⚠️  Warning: Data imbalance detected! {max_percentage:.1f}% in one category")
            return False
        return True
    
    def train_models(self, X_train, X_test, y_train, y_test):
        """Fit every model, evaluate it on the test split and track the best one.

        Args:
            X_train, X_test: Feature matrices for fitting and evaluation.
            y_train, y_test: Encoded labels matching the feature matrices.

        Returns:
            Dict mapping model name to a dict with keys ``model``,
            ``accuracy``, ``classification_report`` (dict form) and
            ``confusion_matrix``.
        """
        # Imported locally: the notebook's import cells never bring
        # accuracy_score into scope, so the original raised NameError here.
        from sklearn.metrics import accuracy_score

        results = {}
        
        # Check data balance
        print("Training Data Analysis:")
        self.check_data_balance(y_train)
        print("\nTesting Data Analysis:")
        self.check_data_balance(y_test)
        
        for name, model in self.models.items():
            print(f"\nTraining {name}...")
            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            
            # Dict form so callers can pull out per-class metrics. Keys are
            # the string form of the encoded labels ("0", "1", ...).
            report = classification_report(y_test, y_pred, output_dict=True)
            
            results[name] = {
                'model': model,
                'accuracy': accuracy,
                'classification_report': report,
                'confusion_matrix': confusion_matrix(y_test, y_pred)
            }
            
            print(f"{name} Accuracy: {accuracy:.4f}")
            
            # A model that only ever predicts one class is degenerate.
            pred_unique = np.unique(y_pred)
            if len(pred_unique) == 1:
                print(f"⚠️  WARNING: {name} predicts only one class!")
            else:
                print(f"✅ {name} predicts {len(pred_unique)} different classes")
            
            # Degenerate single-class models are never selected as best.
            if accuracy > self.best_accuracy and len(pred_unique) > 1:
                self.best_accuracy = accuracy
                self.best_model = model
                self.best_model_name = name
        
        return results

    def save_best_model(self, filepath='best_email_classifier.pkl'):
        """Persist the best model together with its fitted preprocessor.

        The saved dict uses the keys ``model``, ``preprocessor``,
        ``model_name`` and ``accuracy``, which is the layout
        ``EmailClassificationService`` reads back.

        Args:
            filepath: Destination path for the joblib file.

        Raises:
            ValueError: If no model has been selected by ``train_models`` yet.
        """
        if self.best_model is None:
            raise ValueError("No trained model to save; call train_models() first.")

        model_data = {
            'model': self.best_model,
            'preprocessor': self.preprocessor,
            'model_name': self.best_model_name,
            'accuracy': self.best_accuracy
        }
        joblib.dump(model_data, filepath)
        print(f"Best model ({self.best_model_name}) saved with accuracy: {self.best_accuracy:.4f}")

class ImprovedEmailPreprocessor:
    """Clean email text and turn it into TF-IDF features with encoded labels.

    Uses a Porter stemmer, the NLTK English stop-word set, a TF-IDF
    vectorizer over unigrams and bigrams, and a label encoder for categories.
    """

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))
        # Better TF-IDF parameters
        self.vectorizer = TfidfVectorizer(
            max_features=5000, 
            stop_words='english',
            ngram_range=(1, 2),  # Include bigrams
            min_df=2,  # Ignore terms that appear in less than 2 documents
            max_df=0.8  # Ignore terms that appear in more than 80% of documents
        )
        self.label_encoder = LabelEncoder()
    
    def clean_text(self, text):
        """Normalise one email text for vectorisation.

        Lowercases, removes URLs and email addresses, drops every character
        except letters, whitespace and ``! ? .``, then removes stop words and
        tokens of two characters or fewer and stems what is left.

        Args:
            text: Raw text; NA values (per ``pd.isna``) yield ``""``.

        Returns:
            The cleaned tokens joined by single spaces.
        """
        if pd.isna(text):
            return ""
        
        text = text.lower()
        
        # Remove URLs. The dot after "www" is escaped so that ordinary words
        # merely containing "www" are not swallowed.
        text = re.sub(r'http\S+|www\.\S+', '', text)
        
        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)
        
        # Remove special characters but keep some punctuation
        text = re.sub(r'[^a-zA-Z\s!?.]', '', text)
        
        # split() collapses repeated whitespace on its own.
        words = [
            self.stemmer.stem(word)
            for word in text.split()
            if word not in self.stop_words and len(word) > 2
        ]
        
        return ' '.join(words)
    
    def prepare_features(self, df):
        """Fit the vectorizer and label encoder on ``df``.

        Reads the ``subject``, ``body`` and ``category`` columns and writes
        the helper columns ``combined_text`` and ``cleaned_text`` onto ``df``.
        Rows whose cleaned text is empty are dropped before fitting.

        Args:
            df: DataFrame of emails.

        Returns:
            Tuple ``(X, y, df)``: the fitted TF-IDF matrix, the encoded
            labels, and the filtered frame whose rows align with ``X``.
        """
        subject = df['subject'].fillna('')
        body = df['body'].fillna('')

        # Give the subject double weight by repeating it. The repetition is
        # space-separated: plain string multiplication would fuse the last
        # and first words of the subject into one bogus token, and would
        # differ from the "subject subject body" form used at inference time.
        df['combined_text'] = subject + ' ' + subject + ' ' + body
        
        df['cleaned_text'] = df['combined_text'].apply(self.clean_text)
        
        # Remove empty texts
        df = df[df['cleaned_text'].str.len() > 0]
        
        X = self.vectorizer.fit_transform(df['cleaned_text'])
        y = self.label_encoder.fit_transform(df['category'])
        
        print(f"Features created: {X.shape}")
        print(f"Categories: {list(self.label_encoder.classes_)}")
        
        return X, y, df


In [5]:
def improved_training():
    """Run the full training pipeline on ``email_data.csv``.

    Steps: load the CSV, validate the ``subject``/``body``/``category``
    columns, report the category distribution, build features, make a
    stratified 80/20 split, train every model, save the best one, run the
    sample-email smoke test and print per-class metrics.

    Returns:
        None. Problems (missing file, missing columns, feature or split
        failures) are reported with a printed message and an early return.
    """
    print("🔧 Starting Improved Email Classification Training")
    print("=" * 50)
    
    # Load your email data
    try:
        df = pd.read_csv('email_data.csv')
        print(f"✅ Loaded {len(df)} emails from dataset")
    except FileNotFoundError:
        print("❌ Error: email_data.csv not found!")
        print("Please make sure your training data file exists.")
        return
    
    # Check required columns
    required_columns = ['subject', 'body', 'category']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"❌ Error: Missing columns: {missing_columns}")
        return
    
    # Display data info
    print(f"\nDataset Info:")
    print(f"Total emails: {len(df)}")
    print(f"Categories distribution:")
    category_counts = df['category'].value_counts()
    for category, count in category_counts.items():
        percentage = (count / len(df)) * 100
        print(f"  {category}: {count} ({percentage:.1f}%)")
    
    # Check for data imbalance
    min_category_count = category_counts.min()
    max_category_count = category_counts.max()
    if max_category_count / min_category_count > 10:
        print("⚠️  Warning: Severe data imbalance detected!")
        print("Consider balancing your dataset or using class weights.")
    
    # Initialize classifier
    classifier = ImprovedEmailClassifier()
    
    # Prepare features
    try:
        X, y, _ = classifier.preprocessor.prepare_features(df)
    except Exception as e:
        print(f"❌ Error in feature preparation: {e}")
        return
    
    # Split data with stratification to maintain class distribution.
    # Stratification raises ValueError when a class has too few samples to
    # appear in both splits; report that instead of crashing.
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    except ValueError as e:
        print(f"❌ Error splitting data: {e}")
        return
    
    print(f"\nTraining set: {X_train.shape[0]} emails")
    print(f"Testing set: {X_test.shape[0]} emails")
    
    # Train models and get results
    results = classifier.train_models(X_train, X_test, y_train, y_test)
    
    # Save the best model
    if classifier.best_model is not None:
        classifier.save_best_model()
        
        # Test with sample emails
        test_sample_classifications(classifier)
    else:
        print("❌ No suitable model found! All models may be biased.")
        
    # Print detailed results
    print("\n" + "="*50)
    print("FINAL RESULTS")
    print("="*50)
    class_names = list(classifier.preprocessor.label_encoder.classes_)
    for name, result in results.items():
        print(f"\n{name}:")
        print(f"Accuracy: {result['accuracy']:.4f}")
        print("Per-class performance:")
        report = result['classification_report']
        for index, class_name in enumerate(class_names):
            # The report is built from encoded labels, so its keys are the
            # label indices as strings ("0", "1", ...). Look the class up by
            # name first, then by encoded index; otherwise nothing prints.
            metrics = report.get(str(class_name))
            if not isinstance(metrics, dict):
                metrics = report.get(str(index))
            if not isinstance(metrics, dict):
                continue
            f1 = metrics['f1-score']
            precision = metrics['precision']
            recall = metrics['recall']
            print(f"  {class_name}: F1={f1:.3f}, Precision={precision:.3f}, Recall={recall:.3f}")
    
    if classifier.best_model is not None:
        print(f"\n🏆 Best Model: {classifier.best_model_name}")
        print(f"🎯 Best Accuracy: {classifier.best_accuracy:.4f}")
    
def test_sample_classifications(classifier):
    """Test the model with known examples"""
    test_emails = [
        ("Free money! Click here now!", "Get rich quick! Limited time offer!", "spam"),
        ("Meeting tomorrow at 2 PM", "Hi, don't forget our meeting tomorrow", "primary"),
        ("Your order has shipped", "Thank you for your purchase. Tracking: 123", "updates"),
        ("Friend posted on Facebook", "John posted a new photo", "social"),
        ("Win free iPhone!", "Congratulations! You've won! Click to claim", "spam")
    ]
    
    print("\n🧪 Testing with sample emails:")
    print("-" * 40)
    
    for subject, body, expected in test_emails:
        result = classifier.classifier.classify_email(subject, body)
        status = "✅" if result['category'] == expected else "❌"
        print(f"{status} Expected: {expected}, Got: {result['category']} (confidence: {result['confidence']:.3f})")
        print(f"   Subject: {subject[:50]}...")


In [6]:
class EmailClassificationService:
    """Classify emails with a model bundle saved by the training step.

    The bundle is a dict holding the fitted ``model`` and its
    ``preprocessor`` (with fitted vectorizer and label encoder), plus
    optional ``model_name`` and ``accuracy`` entries.
    """

    def __init__(self, model_path='best_email_classifier.pkl'):
        """Load the trained model bundle.

        Args:
            model_path: Path of the joblib file to load.

        Raises:
            FileNotFoundError: If ``model_path`` does not exist.
            Exception: Any other load failure is printed and re-raised.
        """
        try:
            # NOTE(security): joblib.load unpickles the file, which can run
            # arbitrary code. Only load bundles produced by trusted training.
            self.model_data = joblib.load(model_path)
            self.model = self.model_data['model']
            self.preprocessor = self.model_data['preprocessor']
            print(f"✅ Model loaded: {self.model_data.get('model_name', 'Unknown')}")
            print(f"🎯 Training accuracy: {self.model_data.get('accuracy', 'Unknown')}")
        except FileNotFoundError:
            print("❌ Error: Model file not found! Please train the model first.")
            raise
        except Exception as e:
            print(f"❌ Error loading model: {e}")
            raise
        
        # Category mapping
        # NOTE(review): this static list is not used for prediction; the
        # labels actually returned come from the loaded label encoder.
        self.categories = ['spam', 'non_spam', 'primary', 'social', 'updates']

    def _spam_label(self):
        """Return the spam category spelled the way the label encoder spells it."""
        encoder = getattr(self.preprocessor, 'label_encoder', None)
        for label in getattr(encoder, 'classes_', []):
            if str(label).lower() == 'spam':
                return str(label)
        return 'spam'
    
    def classify_email(self, subject, body):
        """Predict the category of one email.

        Args:
            subject: Email subject line.
            body: Email body text.

        Returns:
            Dict with at least ``category``, ``confidence`` and ``is_spam``.
            A normal prediction also carries ``all_probabilities``,
            ``spam_keyword_score`` and ``cleaned_text_length``. Text that is
            empty after cleaning yields a ``non_spam`` default with
            ``debug_info``. Any exception yields a ``non_spam`` default with
            an ``error`` message instead of propagating.
        """
        try:
            # Subject appears twice to give it more weight than the body.
            combined_text = f"{subject} {subject} {body}"
            
            cleaned_text = self.preprocessor.clean_text(combined_text)
            
            if len(cleaned_text.strip()) == 0:
                print(f"⚠️  Warning: Text became empty after cleaning")
                return {
                    'category': 'non_spam',  # Default to non_spam instead of spam
                    'confidence': 0.5,
                    'is_spam': False,
                    'debug_info': 'Empty text after cleaning'
                }
            
            text_vector = self.preprocessor.vectorizer.transform([cleaned_text])
            prediction = self.model.predict(text_vector)[0]
            
            # Get prediction probabilities if available
            if hasattr(self.model, 'predict_proba'):
                probabilities = self.model.predict_proba(text_vector)[0]
                confidence = float(max(probabilities))
                
                # Debug: Check if model is always predicting the same class
                all_probs = {
                    cat: float(prob) for cat, prob in zip(self.preprocessor.label_encoder.classes_, probabilities)
                }
                print(f"🔍 Debug - All probabilities: {all_probs}")
                
            else:
                confidence = 1.0
                all_probs = None
            
            # Decode the prediction
            category = self.preprocessor.label_encoder.inverse_transform([prediction])[0]
            
            # Additional spam detection rules (as backup)
            spam_keywords = ['free', 'win', 'winner', 'click here', 'limited time', 'urgent', 'congratulations']
            text_lower = combined_text.lower()
            spam_score = sum(1 for keyword in spam_keywords if keyword in text_lower)
            
            # If confidence is low and we detect spam keywords, adjust.
            # Use the model's own spelling of the spam label so stored
            # categories stay consistent.
            if confidence < 0.7 and spam_score >= 2:
                category = self._spam_label()
                confidence = 0.8
            
            category = str(category)
            return {
                'category': category,
                'confidence': confidence,
                # Case-insensitive: the encoder's label may be "Spam", which
                # an exact comparison with 'spam' never matches.
                'is_spam': category.lower() == 'spam',
                'all_probabilities': all_probs,
                'spam_keyword_score': spam_score,
                'cleaned_text_length': len(cleaned_text)
            }
            
        except Exception as e:
            print(f"❌ Classification error: {e}")
            return {
                'category': 'non_spam',  # Default to non_spam, not spam
                'confidence': 0.0,
                'error': str(e),
                'is_spam': False
            }


In [9]:
import mysql.connector
from datetime import datetime
import logging
import numpy as np

class EmailDatabase:
    """MySQL-backed store for classified emails.

    Every public method opens its own connection and always closes the
    cursor and connection, even when the query fails. Failures are printed
    and turned into a neutral return value rather than raised.
    """

    def __init__(self, host='localhost', user='root', 
                 password='Mh11@SrA', database='email_management'):
        """Remember the connection settings and load the classification service.

        Args:
            host: MySQL host name.
            user: MySQL user name.
            password: MySQL password.
            database: Schema that contains the ``emails`` table.
        """
        # NOTE(security): the default password literal is kept only so that
        # existing no-argument callers keep working. Pass the credential in
        # explicitly (for example from an environment variable) and rotate
        # this one; it should not live in a shared notebook.
        self.connection_params = {
            'host': host,
            'user': user,
            'password': password,
            'database': database
        }
        self.classifier = EmailClassificationService()
    
    def get_connection(self):
        """Get database connection"""
        return mysql.connector.connect(**self.connection_params)

    def _fetch_all(self, query, params=None):
        """Run a SELECT and return all rows as dicts, always releasing resources."""
        conn = self.get_connection()
        try:
            cursor = conn.cursor(dictionary=True)
            try:
                if params is None:
                    cursor.execute(query)
                else:
                    cursor.execute(query, params)
                return cursor.fetchall()
            finally:
                cursor.close()
        finally:
            conn.close()
    
    def store_email(self, subject, body, sender_email=None):
        """Classify an email and insert it into the ``emails`` table.

        Args:
            subject: Email subject.
            body: Email body.
            sender_email: Optional sender address; stored as NULL when falsy.

        Returns:
            ``{'email_id', 'classification', 'stored': True}`` on success,
            or ``{'stored': False, 'error': <message>}`` on any failure.
        """
        try:
            classification = self.classifier.classify_email(subject, body)
            
            # Convert NumPy types to Python native types
            confidence_value = float(classification['confidence'])
            
            query = """
            INSERT INTO emails (subject, body, sender_email, category, confidence, is_processed)
            VALUES (%s, %s, %s, %s, %s, %s)
            """
            
            values = (
                str(subject),
                str(body),
                str(sender_email) if sender_email else None,
                str(classification['category']),
                confidence_value,
                True
            )
            
            conn = self.get_connection()
            try:
                cursor = conn.cursor()
                try:
                    cursor.execute(query, values)
                    conn.commit()
                    email_id = cursor.lastrowid
                finally:
                    cursor.close()
            finally:
                conn.close()
            
            print(f"✅ Email stored with ID: {email_id}")
            print(f"📧 Classification: {classification['category']}")
            print(f"🎯 Confidence: {confidence_value:.2f}")
            
            return {
                'email_id': email_id,
                'classification': classification,
                'stored': True
            }
            
        except Exception as e:
            print(f"❌ Error storing email: {e}")
            return {'stored': False, 'error': str(e)}
    
    def get_spam_emails(self):
        """Return all rows whose category is ``'spam'``, newest first ([] on error)."""
        try:
            return self._fetch_all(
                "SELECT * FROM emails WHERE category = 'spam' ORDER BY created_at DESC"
            )
        except Exception as e:
            print(f"❌ Error fetching spam emails: {e}")
            return []
    
    def get_emails_by_category(self, category):
        """Return up to 20 newest rows of ``category`` ([] on error)."""
        try:
            return self._fetch_all(
                "SELECT * FROM emails WHERE category = %s ORDER BY created_at DESC LIMIT 20",
                (category,)
            )
        except Exception as e:
            print(f"❌ Error fetching emails for category {category}: {e}")
            return []
    
    def get_all_emails(self, limit=50):
        """Return the ``limit`` newest rows ([] on error)."""
        try:
            return self._fetch_all(
                "SELECT * FROM emails ORDER BY created_at DESC LIMIT %s",
                (limit,)
            )
        except Exception as e:
            print(f"❌ Error fetching all emails: {e}")
            return []
    
    def manual_cleanup_spam(self, hours_old=1):
        """Delete spam rows created at least ``hours_old`` hours ago.

        Args:
            hours_old: Minimum age in hours; must be zero or greater. A
                negative value would move the cutoff into the future and
                delete every spam row, so it is rejected.

        Returns:
            Number of rows deleted; 0 when nothing matched or on error.
        """
        try:
            hours_old = int(hours_old)
            if hours_old < 0:
                raise ValueError("hours_old must be zero or greater")

            # A single DELETE reports the affected row count itself, so no
            # separate COUNT query (and no gap between the two) is needed.
            delete_query = """
            DELETE FROM emails 
            WHERE category = 'spam' 
            AND created_at <= DATE_SUB(NOW(), INTERVAL %s HOUR)
            """
            conn = self.get_connection()
            try:
                cursor = conn.cursor()
                try:
                    cursor.execute(delete_query, (hours_old,))
                    conn.commit()
                    deleted_count = cursor.rowcount
                finally:
                    cursor.close()
            finally:
                conn.close()

            if deleted_count > 0:
                print(f"🗑️  Cleaned up {deleted_count} old spam emails")
                return deleted_count

            print("🧹 No old spam emails to clean up")
            return 0
                
        except Exception as e:
            print(f"❌ Error during cleanup: {e}")
            return 0


In [12]:
def _prompt_int(prompt, default, minimum=0):
    """Ask for an integer; return ``default`` on blank, invalid or too-small input."""
    raw = input(prompt).strip()
    if not raw:
        return default
    try:
        value = int(raw)
    except ValueError:
        # Only a failed int() conversion is swallowed. A bare except would
        # also hide KeyboardInterrupt and real bugs.
        return default
    return value if value >= minimum else default


def complete_email_system():
    """Run the interactive console menu for classifying and browsing emails.

    Creates an ``EmailDatabase`` and loops on ``input()`` until option 6 is
    chosen. The options store a newly classified email, list spam, list a
    category, delete old spam, and list recent emails.
    """
    
    # Initialize database
    db = EmailDatabase()
    
    print("Complete Email Classification System")
    print("=" * 40)
    
    while True:
        print("\nOptions:")
        print("1. 📝 Classify and store new email")
        print("2. 🚨 View spam emails")
        print("3. 📂 View emails by category")
        print("4. 🗑️  Manual spam cleanup")
        print("5. 📋 View all emails")
        print("6. 🚪 Exit")
        
        # Strip so that stray spaces around the digit do not count as invalid.
        choice = input("Enter your choice (1-6): ").strip()
        
        if choice == '1':
            print("\nEnter email details:")
            subject = input("Subject: ")
            body = input("Body: ")
            sender = input("Sender email (optional): ") or None
            
            # Store email with classification
            result = db.store_email(subject, body, sender)
            
            if result['stored']:
                classification = result['classification']
                print(f"\n✅ Email stored successfully!")
                print(f"📧 Category: {classification['category']}")
                print(f"🎯 Confidence: {classification['confidence']:.2f}")
                
                if classification['is_spam']:
                    print("⚠️  This spam email will be automatically deleted!")
            else:
                print(f"❌ Error: {result['error']}")
                
        elif choice == '2':
            spam_emails = db.get_spam_emails()
            print(f"\nFound {len(spam_emails)} spam emails:")
            
            for email in spam_emails[:5]:  # Show first 5
                print(f"🆔 ID: {email['id']}")
                print(f"📧 Subject: {email['subject'][:50]}...")
                print(f"⏰ Created: {email['created_at']}")
                print(f"🎯 Confidence: {email['confidence']:.2f}")
                print("-" * 30)
                
        elif choice == '3':
            print("\nAvailable categories: spam, non_spam, primary, social, updates")
            category = input("Enter category: ")
            
            emails = db.get_emails_by_category(category)
            print(f"\nFound {len(emails)} emails in '{category}' category:")
            
            for email in emails[:5]:  # Show first 5
                print(f"🆔 ID: {email['id']}")
                print(f"📧 Subject: {email['subject'][:50]}...")
                print(f"🎯 Confidence: {email['confidence']:.2f}")
                print("-" * 30)
                
        elif choice == '4':
            # Negative ages fall back to the default: they would push the
            # cutoff into the future and match every spam row.
            hours = _prompt_int(
                "Delete spam older than how many hours? (default: 1): ", 1, minimum=0
            )
            db.manual_cleanup_spam(hours_old=hours)
                
        elif choice == '5':
            limit = _prompt_int("How many emails to show? (default: 10): ", 10, minimum=1)
                
            emails = db.get_all_emails(limit=limit)
            print(f"\nShowing {len(emails)} most recent emails:")
            
            for email in emails:
                print(f"🆔 ID: {email['id']}")
                print(f"📧 Subject: {email['subject'][:50]}...")
                print(f"📂 Category: {email['category']}")
                print(f"🎯 Confidence: {email['confidence']:.2f}")
                print(f"⏰ Created: {email['created_at']}")
                print("-" * 30)
                
        elif choice == '6':
            print("👋 Goodbye!")
            break
        else:
            print("❌ Invalid choice. Please try again.")

# Launch the interactive menu when this cell/module runs as the main program.
# The call blocks on input() until the user picks option 6 (Exit).
if __name__ == "__main__":
    complete_email_system()


✅ Model loaded: Random_Forest
🎯 Training accuracy: 0.9489381636477202
Complete Email Classification System

Options:
1. 📝 Classify and store new email
2. 🚨 View spam emails
3. 📂 View emails by category
4. 🗑️  Manual spam cleanup
5. 📋 View all emails
6. 🚪 Exit


Enter your choice (1-6):  1



Enter email details:


Subject:  Interview Request for NielsenIQ
Body:  Please see attachment for Teams Link (Video Interview)*  Dear Sajjan,  Congratulations on making it to the next stage of the interview process for the Executive, Data Scientist position at NielsenIQ!     We have scheduled an interview appointment for you on Wednesday, July 30, 2025 at 11:45 AM GMT+5:30. Please find the interview invite attached. If you need to reschedule to a different time and/or date, feel free to contact us at   or pearl.silveira@nielseniq.com.  For your reference, here's the link to the job ad: https://jobs.smartrecruiters.com/ni/NielsenIQ/3dd0fbe8-f1a3-4f50-a624-ca2457889557-executive-data-scientist  To help you best prepare for your interview, please refer to our Candidate booklet.pdf. We built this document to help guide you and provide you with more information about our business, our culture, what to expect from your hiring process, and tips from our team to prepare for success.   We're looking forward to meetin

🔍 Debug - All probabilities: {'Primary': 0.17, 'Promotion': 0.03, 'Social': 0.02, 'Spam': 0.76, 'Updates': 0.02}
✅ Email stored with ID: 14
📧 Classification: Spam
🎯 Confidence: 0.76

✅ Email stored successfully!
📧 Category: Spam
🎯 Confidence: 0.76

Options:
1. 📝 Classify and store new email
2. 🚨 View spam emails
3. 📂 View emails by category
4. 🗑️  Manual spam cleanup
5. 📋 View all emails
6. 🚪 Exit


Enter your choice (1-6):  2



Found 3 spam emails:
🆔 ID: 14
📧 Subject: Interview Request for NielsenIQ...
⏰ Created: 2025-07-27 18:34:56
🎯 Confidence: 0.76
------------------------------
🆔 ID: 13
📧 Subject: You have a new friend request from Priya Sharma...
⏰ Created: 2025-07-27 18:25:52
🎯 Confidence: 0.55
------------------------------
🆔 ID: 12
📧 Subject: 🔥 Flat 50% Off on All Fashion – Today Only!...
⏰ Created: 2025-07-27 18:25:01
🎯 Confidence: 0.49
------------------------------

Options:
1. 📝 Classify and store new email
2. 🚨 View spam emails
3. 📂 View emails by category
4. 🗑️  Manual spam cleanup
5. 📋 View all emails
6. 🚪 Exit


Enter your choice (1-6):  6


👋 Goodbye!
