In [9]:
import os
import json
import shutil
import glob

In [3]:
# Path to the JSON file
file_path = "raw_email_labels_refined.json"

# Load the JSON file
with open(file_path, "r") as file:
    email_labels = json.load(file)

# Define valid fields and values
valid_fields = {
    "Spam": ["Yes", "No"],
    "Time_Sensitive": ["Yes", "No"],
    "Type": ["Event", "Reminder", "N/A"],
    "Category": ["Work", "Study", "Leisure"],
    "Format": ["Online", "In-person"],
    "Action_Required": ["Yes", "No"],
    "Priority_Level": ["Low", "Medium", "High", "Urgent"],
}

# Define all required fields (template)
required_fields = [
    "Spam", "Subject", "Sender", "send_date",
    "Time_Sensitive", "Start", "End", "Type",
    "Category", "Format", "Location",
    "Action_Required", "Priority_Level"
]

# Check each email label
for i, email in enumerate(email_labels):
    missing_fields = [field for field in required_fields if field not in email]
    invalid_fields = []
    
    # Validate existing fields
    for field, valid_values in valid_fields.items():
        if field in email and email[field] not in valid_values:
            invalid_fields.append((field, email[field]))
    
    # Print results for missing or invalid fields
    if missing_fields:
        print(f"Missing fields at index {i}: {missing_fields}")
    if invalid_fields:
        print(f"Invalid values at index {i}: {invalid_fields}")

In [4]:
len(email_labels)

32

In [5]:
for i in range(len(email_labels)):
    email_path = f"./raw_emails/atariarchive{i+1}.txt"

In [7]:
# Path to the raw email labels JSON
file_path = "raw_email_labels_refined.json"

# Directories
raw_email_dir = "./raw_emails"
real_emails_dir = "./real_emails"
spam_dir = "./spams"

# Ensure output directories exist
os.makedirs(real_emails_dir, exist_ok=True)
os.makedirs(spam_dir, exist_ok=True)

# Load email labels
with open(file_path, "r") as file:
    email_labels = json.load(file)

# Initialize JSON data for spam and non-spam
non_spam_json = []
spam_json = []

# Process each email
for i in range(len(email_labels)):
    email_path = os.path.join(raw_email_dir, f"atariarchive{i+1}.txt")
    
    # Check if the email path is valid
    if os.path.exists(email_path):
        with open(email_path, "r") as email_file:
            content = email_file.read()
        # Prepare entry
        email_entry = {"path": email_path, "content": content, "label": email_labels[i]}
        
        # Classify email based on Spam label
        if email_labels[i]["Spam"] == "No":
            non_spam_json.append(email_entry)
            # Copy to non-spam directory
            shutil.copy(email_path, real_emails_dir)
        elif email_labels[i]["Spam"] == "Yes":
            spam_json.append(email_entry)
            # Copy to spam directory
            shutil.copy(email_path, spam_dir)
    else:
        print(f"Invalid path: {email_path}")

# Write the JSON files
with open("./PAIRED_real_email.json", "w") as non_spam_file:
    json.dump(non_spam_json, non_spam_file, indent=4)

with open("./PAIRED_spam_email.json", "w") as spam_file:
    json.dump(spam_json, spam_file, indent=4)

print("Processing complete. JSON files and email copies created.")

Processing complete. JSON files and email copies created.


In [12]:
# Path to Enron spam emails
spam_files_path = "./spams/Enron*.txt"

# Gather all spam file paths
spam_file_paths = glob.glob(spam_files_path)


In [13]:
spam_file_paths

['./spams/Enron5.txt',
 './spams/Enron4.txt',
 './spams/Enron6.txt',
 './spams/Enron7.txt',
 './spams/Enron3.txt',
 './spams/Enron2.txt',
 './spams/Enron1.txt']

In [15]:

# Initialize the list for JSON entries
spam_json = []

# Define the spam label template
spam_template = {
    "Spam": "Yes",
    "Time_Sensitive": "No",
    "Type": "N/A",
    "Category": "Leisure",
    "Format": "Online",
    "Location": "N/A",
    "Action_Required": "No",
    "Priority_Level": "Low"
}

# Process each spam file
for spam_file in spam_file_paths:
    if os.path.exists(spam_file):
        # Read the content of the spam email
        with open(spam_file, "r") as file:
            content = file.read()
        
        # Extract metadata (you may add logic to parse Subject, Sender, etc., if needed)
        email_entry = {
            "path": spam_file,
            "content": content,
            "label": {
                "Spam": "Yes",
                "Subject": "",
                "Sender": "",
                "send_date": "",
                "Time_Sensitive": "No",
                "Start": "",
                "End": "",
                "Type": "N/A",
                "Category": "Leisure",
                "Format": "Online",
                "Location": "N/A",
                "Action_Required": "No",
                "Priority_Level": "Low"
            }
        }
        
        # Add the entry to the JSON list
        spam_json.append(email_entry)
    else:
        print(f"File not found: {spam_file}")

# Save the spam JSON file
with open("./spam_emails.json", "w") as spam_file:
    json.dump(spam_json, spam_file, indent=4)

print("Spam emails processed and stored in spam_emails.json.")

Spam emails processed and stored in spam_emails.json.
