In [None]:
!pip install pandas

Collecting pandas
  Downloading pandas-3.0.0-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.4.2-cp312-cp312-win_amd64.whl.metadata (6.6 kB)
Collecting tzdata (from pandas)
  Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-3.0.0-cp312-cp312-win_amd64.whl (9.7 MB)
   ---------------------------------------- 0.0/9.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.7 MB ? eta -:--:--
   - -------------------------------------- 0.3/9.7 MB ? eta -:--:--
   -- ------------------------------------- 0.5/9.7 MB 1.2 MB/s eta 0:00:08
   --- ------------------------------------ 0.8/9.7 MB 1.3 MB/s eta 0:00:07
   ----- ---------------------------------- 1.3/9.7 MB 1.3 MB/s eta 0:00:07
   ------ --------------------------------- 1.6/9.7 MB 1.3 MB/s eta 0:00:07
   ------- -------------------------------- 1.8/9.7 MB 1.3 MB/s eta 0:00:07
   -------- ------------------------------- 2.1/9.7 MB 1.

In [None]:
import json
from pathlib import Path

import pandas as pd

file_path = "../data/raw/cleaned_enron_emails.json"

# The raw file is expected to hold a list of email dictionaries.
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Keep only real emails: drop junk macOS ".DS_Store" entries and rows whose
# body is empty or whitespace-only, then renumber the rows from 0.
keep_row = (df["Filename"] != ".DS_Store") & (df["Body"].str.strip() != "")
df = df[keep_row].reset_index(drop=True)

# info() prints its summary itself; head() must be the last expression of the
# cell, otherwise its preview is computed and silently discarded.
df.info()
df.head()


<class 'pandas.DataFrame'>
RangeIndex: 516793 entries, 0 to 516792
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   From       516793 non-null  str  
 1   To         516793 non-null  str  
 2   Subject    516793 non-null  str  
 3   Date       516793 non-null  str  
 4   Body       516793 non-null  str  
 5   ThreadKey  516793 non-null  str  
 6   Filename   516793 non-null  str  
dtypes: str(7)
memory usage: 27.6 MB


In [None]:
# Convert the Date strings into timezone-aware UTC timestamps; values that
# cannot be parsed become NaT instead of raising.
parsed_dates = pd.to_datetime(df["Date"], utc=True, errors="coerce")
df["Date"] = parsed_dates

# Discard rows without a usable date, then derive calendar features.
df = df.dropna(subset=["Date"])
date_parts = df["Date"].dt
df["year"] = date_parts.year
df["month"] = date_parts.month



In [None]:
import re
def preprocess_text(text):
    """Normalize raw email text for keyword matching.

    Lowercases the text, deletes every character that is not an ASCII
    lowercase letter, digit or whitespace, and then collapses each run of
    whitespace into a single space.

    Args:
        text: Raw text to clean. Non-string values (e.g. ``None`` or NaN)
            are treated as empty text.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Delete punctuation *before* collapsing whitespace: removing a symbol that
    # sits between two spaces ("a - b") would otherwise leave a double space.
    text = re.sub(r"[^a-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Build normalized copies of the free-text columns; a missing subject is
# cleaned as an empty string.
body_text = df["Body"]
subject_text = df["Subject"].fillna("")
df["clean_body"] = body_text.apply(preprocess_text)
df["clean_subject"] = subject_text.apply(preprocess_text)


In [None]:
def count_recipients(to_field):
    """Return how many non-empty, comma-separated entries a ``To`` value holds.

    Empty or whitespace-only pieces are ignored, so an empty ``To`` string
    counts as 0 recipients (``"".split(",")`` yields ``[""]``, which a plain
    ``len`` would count as 1). Non-string values also count as 0.
    """
    if not isinstance(to_field, str):
        return 0
    return sum(1 for address in to_field.split(",") if address.strip())


def extract_sender_domain(sender):
    """Return the text after the last "@" in ``sender``, or "unknown"."""
    if isinstance(sender, str) and "@" in sender:
        return sender.split("@")[-1]
    return "unknown"


# Length, in characters, of the cleaned body.
df["email_length"] = df["clean_body"].apply(len)
df["num_recipients"] = df["To"].apply(count_recipients)
df["sender_domain"] = df["From"].apply(extract_sender_domain)


In [None]:
# Substrings whose presence marks an email as important.
IMPORTANT_KEYWORDS = [
    "meeting",
    "deadline",
    "urgent",
    "contract",
    "invoice",
    "report",
    "approval",
    "schedule",
]


def label_importance(body, subject):
    """Flag an email as important when it mentions any keyword.

    The subject and body are joined with a single space and scanned for each
    entry of ``IMPORTANT_KEYWORDS`` using plain, case-sensitive substring
    matching.

    Args:
        body: Email body text.
        subject: Email subject text.

    Returns:
        1 if at least one keyword occurs in the joined text, otherwise 0.
    """
    haystack = f"{subject} {body}"
    matched = any(keyword in haystack for keyword in IMPORTANT_KEYWORDS)
    return 1 if matched else 0

# Label every email from its cleaned body and subject. Iterating over the two
# columns directly avoids DataFrame.apply(axis=1), which builds a Series
# object for every row and is very slow on a frame with this many rows.
df["label"] = [
    label_importance(body, subject)
    for body, subject in zip(df["clean_body"], df["clean_subject"])
]



In [None]:
# Class balance of the keyword-based labels (1 = keyword found, 0 = none).
label_counts = df["label"].value_counts()
label_counts


label
0    320189
1    196604
Name: count, dtype: int64

In [None]:
output_path = "../data/processed/emails_with_labels.csv"

# to_csv does not create missing directories, so make sure the target folder
# exists before writing (a no-op when it is already there).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

print("Saved to:", output_path)


Saved to: ../data/processed/emails_with_labels.csv


: 