
# Email Classification Notebook (Rebuilt)

This notebook was reconstructed from a Python script pasted from Colab to fix the error:
> **"Invalid Notebook — The Notebook Does Not Appear to Be Valid JSON"**

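All cells below mirror your original workflow (data download, cleaning, feature engineering, modeling, and Streamlit app scaffolding). You can run them step-by-step in Colab as-is; to run them in a local Jupyter environment, first adjust the hard-coded `/content/...` paths used throughout.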


In [None]:

# Optional: clean environment (disabled by default)
# !rm -rf /content/*

!pip install --upgrade --no-cache-dir gdown


In [None]:

# Download source files from Google Drive (uncomment/adjust as needed)
# Links you previously used:
# https://drive.google.com/file/d/1tFWUoD4mxga4KTVRRQcF8ZuIZD-KkgMr/view?usp=sharing
# https://drive.google.com/file/d/1RrXMsrppQnKb0Absnr9EDQ4wygAYvOnh/view?usp=sharing
# https://drive.google.com/file/d/1_JEJ-2uszpe9p_eOEHYRE9fkEmElgng5/view?usp=sharing
# https://drive.google.com/file/d/1kNZ7dt8M0sxk5aSLA3utzqG4XS_czyaP/view?usp=sharing

# Uncomment if you need to re-download in Colab:
# !gdown https://drive.google.com/uc?id=1_JEJ-2uszpe9p_eOEHYRE9fkEmElgng5
# !gdown https://drive.google.com/uc?id=1kNZ7dt8M0sxk5aSLA3utzqG4XS_czyaP
# !gdown https://drive.google.com/uc?id=1tFWUoD4mxga4KTVRRQcF8ZuIZD-KkgMr
# !gdown https://drive.google.com/uc?id=1RrXMsrppQnKb0Absnr9EDQ4wygAYvOnh

# If you downloaded a zip named archive.zip:
# !unzip -o /content/archive.zip && rm -f /content/archive.zip


In [None]:

import numpy as np
import pandas as pd
import seaborn as sns
import email
import matplotlib.pyplot as plt

# Load the raw emails CSV
# Adjust the path if needed
df = pd.read_csv("/content/emails.csv")

# Save a sample email body to file (optional)
# Row 112233 is used when the frame is long enough, otherwise the first row.
# An empty frame is skipped so the positional lookup cannot raise IndexError.
if len(df) > 0:
    idx = 112233 if len(df) > 112233 else 0
    # Explicit UTF-8 so the write does not depend on the platform's default
    # encoding (which can fail on characters outside that encoding).
    with open('/content/sample_email.txt', 'w', encoding='utf-8') as f:
        f.write(str(df['message'].values[idx]))
else:
    print("emails.csv has no rows; skipping sample email export.")


In [None]:

# Explore a single email: parse the raw text stored at row label 1 and
# print its headers, two selected header values, and the start of the payload.
message = df.at[1, 'message']
e = email.message_from_string(message)

for label, value in (
    ("Headers:", e.items()),
    ("Date:", e.get('Date')),
    ("Subject:", e.get('Subject')),
):
    print(label, value)
print("Body (raw payload):", e.get_payload()[:500], "...")


In [None]:

def get_field(field, messages):
    """Return the `field` header of every raw message, in input order.

    Each item of `messages` is parsed with `email.message_from_string`; the
    resulting list holds whatever `Message.get(field)` returns for it, which
    is None when the header is absent.
    """
    return [email.message_from_string(raw).get(field) for raw in messages]

# New DataFrame column -> email header it is filled from.
header_columns = {
    'date': "Date",
    'subject': "Subject",
    'X-Folder': "X-Folder",
    'X-From': "X-From",
    'X-To': "X-To",
}
for column_name, header_name in header_columns.items():
    df[column_name] = get_field(header_name, df['message'])
df.head(3)


In [None]:

def _payload_text(msg):
    """Return the payload of a parsed message as one string."""
    if not msg.is_multipart():
        return msg.get_payload()
    # For a multipart message get_payload() is a list of sub-messages rather
    # than text, so gather the string payload of every leaf part instead.
    leaves = (part.get_payload() for part in msg.walk() if not part.is_multipart())
    return "\n".join(text for text in leaves if isinstance(text, str))


def body(messages):
    """Return the body text of every raw message, in input order.

    Parameters
    ----------
    messages : iterable of str
        Raw email texts (headers plus payload).

    Returns
    -------
    list of str
        One string per message. Single-part messages yield their payload
        unchanged; multipart messages yield their leaf parts joined by
        newlines, so downstream string handling never receives a list of
        Message objects.
    """
    return [_payload_text(email.message_from_string(m)) for m in messages]

# Store each message's payload in a new 'body' column, then preview the
# first three rows as the cell output.
df['body'] = body(df['message'])
df.head(3)


In [None]:

if 'file' in df.columns:
    def employee(file_series):
        """Return the first '/'-separated component of each path in `file_series`.

        Entries that cannot be split as strings (for example NaN) produce
        np.nan instead of raising.
        """
        column = []
        for path in file_series:
            try:
                column.append(path.split("/")[0])
            except (AttributeError, TypeError):
                # Non-string entry: keep the row but mark the value as missing.
                column.append(np.nan)
        return column

    df['employee'] = employee(df['file'])
    # A bare expression inside an `if` block is not auto-rendered by the
    # notebook, so the preview has to be displayed explicitly.
    display(df.head(3))
else:
    print("Column 'file' not found in df; skipping employee extraction.")


In [None]:

# Report the row count, then (when the folder column exists) tabulate and
# plot the 20 most frequent folders.
print("Rows:", df.shape[0])
if 'X-Folder' not in df.columns:
    print("Column 'X-Folder' not found; skipping folder analysis.")
else:
    print("Unique folders:", df['X-Folder'].nunique())
    folder_counts = df['X-Folder'].value_counts()
    unique_emails = folder_counts.rename_axis('folder_name').reset_index(name='count')
    top_folders = unique_emails.head(20)
    display(top_folders)

    plt.figure(figsize=(10,6))
    sns.barplot(data=top_folders, x='count', y='folder_name', palette="Blues_d")
    plt.title("Top 20 folders")
    plt.xlabel("Count")
    plt.ylabel("Folder_Name")
    plt.show()


In [None]:

# Tabulate and plot the 20 most frequent values of the 'employee' column.
if 'employee' not in df.columns:
    print("Column 'employee' not found; skipping top senders.")
else:
    employee_counts = df['employee'].value_counts().iloc[:20]
    top_20 = employee_counts.rename_axis("Employee_name").reset_index(name="Counts")
    display(top_20)

    plt.figure(figsize=(10,8))
    sns.barplot(data=top_20, x="Counts", y="Employee_name", palette="Blues_d")
    plt.title("Top 20 highest email sender employees")
    plt.xlabel("Count")
    plt.ylabel("Employee_name")
    plt.show()


In [None]:

from dateutil import parser

def change_type(dates):
    """Reformat each date string as "%d-%m-%Y %H:%M:%S".

    Every item is parsed with `parser.parse`; any item whose parsing or
    formatting raises is replaced by np.nan. Returns a list in input order.
    """
    def _reformat(raw):
        try:
            return parser.parse(raw).strftime("%d-%m-%Y %H:%M:%S")
        except Exception:
            return np.nan

    return [_reformat(d) for d in dates]

# Reformat the 'date' column in place when it exists; values that fail to
# parse come back from change_type as NaN.
if 'date' in df.columns:
    df['date'] = change_type(df['date'])

def preprocess_folder(folders):
    """Reduce each folder path to its last backslash-separated part, lower-cased.

    None and the empty string map to np.nan, as does any value on which the
    split/lower chain raises. Returns a list in input order.
    """
    def _leaf_name(folder):
        if folder is None or folder == "":
            return np.nan
        try:
            return folder.split("\\")[-1].lower()
        except Exception:
            return np.nan

    return [_leaf_name(folder) for folder in folders]

# Replace the 'X-Folder' column in place with the normalised leaf folder
# names when that column exists.
if 'X-Folder' in df.columns:
    df['X-Folder'] = preprocess_folder(df['X-Folder'])

def replace_empty_with_nan(series):
    """Return the values of `series` as a list with "" and None swapped for np.nan.

    All other values are passed through untouched and order is preserved.
    """
    return [np.nan if (val == "") or (val is None) else val for val in series]

# Blank subjects / recipients count as missing values.
for col in ('subject', 'X-To'):
    if col not in df.columns:
        continue
    df[col] = replace_empty_with_nan(df[col])

# Drop rows with any missing values (as in your script)
df.dropna(axis=0, how='any', inplace=True)

# Drop columns not needed
unwanted = ('file', 'message', 'date', 'X-From', 'X-To', 'employee')
cols_to_drop = [c for c in unwanted if c in df.columns]
df.drop(columns=cols_to_drop, inplace=True)

# Save cleaned data
cleaned_path = "/content/cleaned_data.csv"
df.to_csv(cleaned_path, index=False)
print("Saved cleaned data -> /content/cleaned_data.csv. Shape:", df.shape)


In [None]:

import re, string, time
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

# keep_default_na=False: rows with missing values were dropped before
# cleaned_data.csv was written, so every field in the file is real text.
# With the default NA parsing, empty bodies and literal strings such as
# "NA" or "null" would be read back as NaN and later turned into the word
# "nan" by astype(str).
df = pd.read_csv("/content/cleaned_data.csv", keep_default_na=False)

# Keep folders with more than n emails
def remove_folders(emails_df, n=150):
    """Return the rows of `emails_df` whose 'X-Folder' occurs more than `n` times.

    Folders with a count of `n` or fewer are treated as small and their rows
    are excluded; rows whose folder is not among the small folders are kept,
    and the original index labels are preserved.
    """
    folder_sizes = emails_df['X-Folder'].value_counts()
    small_folders = folder_sizes[folder_sizes <= n].index
    in_small_folder = emails_df['X-Folder'].isin(small_folders)
    return emails_df.loc[~in_small_folder]

if 'X-Folder' in df.columns:
    # remove_folders returns a filtered selection of df. Take an explicit copy
    # so later column edits act on an independent frame instead of triggering
    # pandas' SettingWithCopyWarning.
    df = remove_folders(df, 150).copy()

# combine subject + body
if {'subject','body'}.issubset(df.columns):
    # Build the new frame in one expression rather than mutating in place:
    # 'text' is appended, then the two source columns are removed.
    df = (
        df.assign(text=df['subject'].astype(str) + " " + df['body'].astype(str))
          .drop(columns=['subject', 'body'])
    )
else:
    print("Columns 'subject'/'body' not found; ensure they exist before combining.")

def preprocess(x):
    """Normalise a text string for embedding.

    Steps, in order: lower-case; replace each run of newlines with a space;
    replace punctuation characters with a space; collapse every run of
    whitespace to a single space. Leading/trailing spaces are not trimmed.
    """
    lowered = x.lower()
    single_line = re.sub(r'\n+', ' ', lowered)
    # NOTE: inside this character class the `\]` from string.punctuation is
    # read as an escaped `]`, so a literal backslash is NOT replaced. The
    # pattern is kept as-is to match the copy used by the Streamlit app.
    without_punct = re.sub("["+string.punctuation+"]", " ", single_line)
    return re.sub(r'\s+', ' ', without_punct)

# Normalise every text, strip English stop words, and report the wall time.
t0 = time.time()
normalized = df['text'].map(preprocess)
df['text'] = normalized.map(lambda s: ' '.join(w for w in s.split() if w not in stop))
elapsed = time.time() - t0
print("Preprocess time (sec):", elapsed)

# (Optional) Filter to specific labels if desired
# labels = ['management','calender','logistics','corporate','online trading','universities','it']
# df = df[df['X-Folder'].isin(labels)].reset_index(drop=True)

preprocessed_path = "/content/preprocessed.csv"
df.to_csv(preprocessed_path, index=False)
print("Saved preprocessed data -> /content/preprocessed.csv. Shape:", df.shape)


In [None]:

!pip install -q sentence-transformers

import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# keep_default_na=False: a text that became empty after stop-word removal is
# written to the CSV as an empty field; default NA parsing would read it back
# as NaN and astype(str) would then embed the literal word "nan".
data = pd.read_csv("/content/preprocessed.csv", keep_default_na=False)
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

text_data = data['text'].astype(str).tolist()

# Encode the whole list in batches instead of one call per text: encode()
# accepts a list of sentences, returns one embedding row per input, and shows
# its own progress bar.
embs = model.encode(text_data, batch_size=32, show_progress_bar=True)

# One row per email, one column per embedding dimension, plus the label.
emb_df = pd.DataFrame(embs)
emb_df['class'] = data['X-Folder'].values.tolist()
emb_df.to_csv('/content/emb_data.csv', index=False)
print("Saved embeddings -> /content/emb_data.csv. Shape:", emb_df.shape)


In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, balanced_accuracy_score, confusion_matrix
from sklearn.base import clone
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
import joblib

# Embedding columns are the features; 'class' holds the folder label.
df = pd.read_csv('/content/emb_data.csv')
X = df.drop(columns=['class'])
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101, stratify=y)

# Candidate classifiers. LinearSVC gets an explicit random_state as well so
# its fit (and therefore the reported score) is reproducible.
lr = LogisticRegression(class_weight='balanced', n_jobs=-1, max_iter=1000)
dt = DecisionTreeClassifier(class_weight='balanced', random_state=101)
svc = LinearSVC(C=1.0, class_weight='balanced', random_state=101)
etc = ExtraTreesClassifier(n_estimators=100, class_weight='balanced', random_state=101)
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=101)
candidates = {'LR': lr, 'DT': dt, 'SVC': svc, 'ETC': etc, 'RFC': rfc}

# Keyed by model name rather than by score, so two models with the same
# balanced accuracy can no longer overwrite each other.
fitted_models = {}
scores = {}
for name, estimator in candidates.items():
    estimator.fit(X_train, y_train)
    scores[name] = balanced_accuracy_score(y_test, estimator.predict(X_test))
    fitted_models[name] = estimator

acc_lr, acc_dt, acc_svc, acc_etc, acc_rfc = (scores[k] for k in ('LR', 'DT', 'SVC', 'ETC', 'RFC'))
print("Balanced Accuracies -> LR:{:.3f}  DT:{:.3f}  SVC:{:.3f}  ETC:{:.3f}  RFC:{:.3f}"
      .format(acc_lr, acc_dt, acc_svc, acc_etc, acc_rfc))

# Pick best model. The already-fitted estimator is saved directly, so the
# persisted model is exactly the one whose score is reported (no second fit).
# NOTE(review): the winner is chosen on the same test split that produced the
# scores, so the reported number is an optimistic estimate.
best_name = max(scores, key=scores.get)
best_score = scores[best_name]
best_model = fitted_models[best_name]

joblib.dump(best_model, '/content/model.sav')
print("Saved best model -> /content/model.sav  (score: {:.3f})".format(best_score))


In [None]:

!pip install -q streamlit==1.20.0 sentence-transformers

# Builds the Streamlit app source as a string and writes it to /content/app.py.
# The app mirrors the training pipeline: subject + body are normalised, English
# stop words are removed, the text is embedded, and the saved classifier
# predicts the folder.
app_code = r'''
import os
import streamlit as st
from sentence_transformers import SentenceTransformer
import email as emlib
import joblib
import re
import string
import warnings
import nltk
from nltk.corpus import stopwords
warnings.filterwarnings("ignore")

# Prefer the model stored next to this script so the app works from any
# working directory; fall back to the current directory otherwise.
_HERE = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(_HERE, "model.sav")
if not os.path.exists(MODEL_PATH):
    MODEL_PATH = "model.sav"

@st.cache_resource
def load_models():
    """Load the sentence-embedding model and the trained classifier once."""
    emb_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    # NOTE: joblib.load unpickles the file; only load a model.sav you produced yourself.
    pred_model = joblib.load(MODEL_PATH)
    return emb_model, pred_model

@st.cache_resource
def load_stopwords():
    """Return the English stop-word set used when the training text was prepared."""
    nltk.download("stopwords", quiet=True)
    return set(stopwords.words("english"))

def preprocess(x: str) -> str:
    """Apply the same normalisation as training, including stop-word removal."""
    x = x.lower()
    x = re.sub(r'\n+', ' ', x)
    x = re.sub("["+string.punctuation+"]", " ", x)
    x = re.sub(r'\s+', ' ', x)
    return ' '.join(w for w in x.split() if w not in STOP_WORDS)

def payload_text(msg) -> str:
    """Return a message payload as text, joining the leaf parts of multipart mail."""
    if not msg.is_multipart():
        return msg.get_payload()
    leaves = (part.get_payload() for part in msg.walk() if not part.is_multipart())
    return "\n".join(t for t in leaves if isinstance(t, str))

emb_model, pred_model = load_models()
STOP_WORDS = load_stopwords()

def predict_email_class2(subject, body):
    """Predict the folder/class for one email given its subject and body text."""
    s = (subject or "") + " " + (body or "")
    s = preprocess(s)
    arr = emb_model.encode(s).ravel()
    cls = pred_model.predict([arr])[0]
    return cls

st.title("Email Classification Web App")
mode = st.selectbox("Input Mode", ["Upload .txt", "Type Manually"], index=0)

subject = ""
body = ""
file = None

if mode == "Upload .txt":
    file = st.file_uploader("Upload a plain-text email file", type=["txt"])
else:
    subject = st.text_input("Subject")
    body = st.text_area("Body", height=200)

if st.button("Predict"):
    if mode == "Upload .txt":
        if file is None:
            st.warning("Please upload a .txt file first.")
            st.stop()
        # errors="replace" keeps the app usable for files that are not valid UTF-8.
        raw = file.read().decode("utf-8", errors="replace")
        e = emlib.message_from_string(raw)
        subject = e.get("Subject", "")
        body = payload_text(e)
    if not ((subject or "") + (body or "")).strip():
        st.warning("Nothing to classify: subject and body are both empty.")
        st.stop()
    pred = predict_email_class2(subject, body)
    st.success(f"Predicted folder/class: {pred}")
'''

# Explicit UTF-8 so the generated script is written the same way on every platform.
with open('/content/app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)

print("Wrote Streamlit app to /content/app.py")
print("Run with:  streamlit run /content/app.py --server.port 8501")
