In [1]:
import os
import re
import email
import numpy as np
import pandas as pd
import tqdm

In [2]:
#  Read emails from the data folders
path = 'enron_with_categories/'
labels = []
data = None
for i in range(1, 7):
    list_emails = []
    dir_path = path + str(i) + '/'
    for file_path in os.listdir(dir_path):
        if '.txt' in file_path:
            labels.append(i-1)
            with open(dir_path + file_path, 'r', encoding="utf-8") as file:
                email_lines = file.read()
                list_emails.append(email_lines)
    d = pd.DataFrame()
    d['text'] = list_emails
    if data is None:
        data = d
    else:
        data = pd.concat([data, d])
data['label'] = labels
data

Unnamed: 0,text,label
0,Message-ID: <23486926.1075842966554.JavaMail.e...,0
1,Message-ID: <18218267.1075862047342.JavaMail.e...,0
2,Message-ID: <15377587.1075846173597.JavaMail.e...,0
3,Message-ID: <17415116.1075863429863.JavaMail.e...,0
4,Message-ID: <26691844.1075852531386.JavaMail.e...,0
...,...,...
138,Message-ID: <30156147.1075849864036.JavaMail.e...,5
139,Message-ID: <1421029.1075849864316.JavaMail.ev...,5
140,Message-ID: <26990460.1075858882856.JavaMail.e...,5
141,Message-ID: <11049182.1075858884165.JavaMail.e...,5


In [3]:
# Function to compute the ratio of non-alphabet to alphabet characters in a line
def compute_ratio(string):
    # Initialize counters for non-alphabet and alphabet characters
    non_alphabet_count = 0
    alphabet_count = 0

    # Iterate through each character in the string
    for char in string:
        if char.isalpha() or char in ['.', ',']:
            alphabet_count += 1
        else:
            non_alphabet_count += 1

    # Calculate the ratio
    if alphabet_count == 0:
        ratio = 100000000
    else:
        ratio = non_alphabet_count / alphabet_count

    return ratio

# Function to remove email addresses
def remove_email_addresses(text):
    # Define the regex pattern for matching email addresses
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'

    # Use the findall function to find all email addresses in the text
    cleaned_text = re.sub(pattern, '', text)

    return cleaned_text

In [4]:
# Discard lines with the following texts and whose compute_ratio is more than 0.3. 
discard = ['forwarded by', 'to:', 'cc:', 'subject', 'from:', 'sent:', '--']

emails = []
for email_text in data['text']:
    msg = email.message_from_string(email_text)
    payload = msg.get_payload()
    payload = remove_email_addresses(payload)
    ans = []
    for line in payload.split('\n'):
        line = line.lower()
        flag = 0
        for phrase in discard:
            if phrase in line:
                flag = 1
                break
        if flag == 0 and len(line) > 0 and compute_ratio(line) < 0.3:
            ans.append(line)
    ans = ' '.join(ans)
    if len(ans) == 0:
        payload = re.sub(r'\s+', ' ', payload).strip()
        ans = payload
    emails.append(ans)

In [5]:
# Store the emails and their labels in a dataframe
input_df = pd.DataFrame()
input_df['text'] = emails
input_df['label'] = labels
input_df

Unnamed: 0,text,label
0,we will be asking for confidential treatment u...,0
1,sent from my blackberry wireless handheld (www...,0
2,thaniks for the note. christie is now organiz...,0
3,"dear all, we have developed analytical souluti...",0
4,"in our july 30, 2001 document entitled ""receip...",0
...,...,...
1658,"looks fine steve and cindy, attached is the el...",5
1659,steve: please review the attached ene officer ...,5
1660,bonnie: please forward to michael kirby. i th...,5
1661,my suggestions are attached. take the gloves o...,5


In [6]:
# Check class distribution
input_df['label'].value_counts()

0    834
3    476
5    143
2    100
4     74
1     36
Name: label, dtype: int64

In [7]:
# To solve the issue of class imbalance, use back translation on minority classes to upsample them
import nlpaug.augmenter.word as naw
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en'
)

In [9]:
from sklearn.model_selection import train_test_split

data = input_df

# Split the data into training, validation, and test sets with stratification
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['label'])
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42, stratify=test_data['label'])

In [10]:
train_data['label'].value_counts()

0    667
3    381
5    114
2     80
4     59
1     29
Name: label, dtype: int64

In [11]:
# Augment minority class data
count_dict = dict(train_data['label'].value_counts())
max_count = max([v for k, v in count_dict.items()])
add_dict = {}
new_train_data = [train_data]
for k in list(count_dict.keys()):
    minority_resamples_count = max_count - count_dict[k]
    minority_samples = []
    labels = []
    if minority_resamples_count != 0:
        for i in range(int(max_count/count_dict[k])):
            minority_class_text = train_data[train_data['label'] == k]['text'].tolist()
            for text in tqdm.tqdm(minority_class_text):
                text_to_augment = back_translation_aug.augment(text)
                if len(text_to_augment) != 0:
                    minority_samples.append(text_to_augment[0])
                    labels.append(k)
        minority_samples_df = pd.DataFrame()
        minority_samples_df['text'] = minority_samples
        minority_samples_df['label'] = labels
        new_train_data.append(minority_samples_df)
new_train_data = pd.concat(new_train_data, axis=0)
new_train_data['label'].value_counts()

100%|██████████| 381/381 [2:01:01<00:00, 19.06s/it]  
100%|██████████| 114/114 [38:55<00:00, 20.48s/it]
100%|██████████| 114/114 [37:09<00:00, 19.56s/it]
100%|██████████| 114/114 [36:32<00:00, 19.23s/it]
100%|██████████| 114/114 [35:39<00:00, 18.77s/it]
100%|██████████| 114/114 [36:42<00:00, 19.32s/it]
100%|██████████| 80/80 [23:01<00:00, 17.27s/it]
100%|██████████| 80/80 [22:32<00:00, 16.90s/it]
100%|██████████| 80/80 [22:37<00:00, 16.96s/it]
100%|██████████| 80/80 [23:16<00:00, 17.45s/it]
100%|██████████| 80/80 [24:23<00:00, 18.29s/it]
100%|██████████| 80/80 [22:24<00:00, 16.81s/it]
100%|██████████| 80/80 [22:15<00:00, 16.69s/it]
100%|██████████| 80/80 [22:58<00:00, 17.23s/it]
100%|██████████| 59/59 [21:14<00:00, 21.59s/it]
100%|██████████| 59/59 [20:58<00:00, 21.34s/it]
100%|██████████| 59/59 [20:48<00:00, 21.15s/it]
100%|██████████| 59/59 [20:41<00:00, 21.03s/it]
100%|██████████| 59/59 [22:16<00:00, 22.66s/it]
100%|██████████| 59/59 [23:14<00:00, 23.63s/it]
100%|██████████| 59/59 [

3    762
2    720
4    708
1    696
5    684
0    667
Name: label, dtype: int64

In [12]:
# Save the new and the old training data and validation and test data
new_train_data.to_csv('new_train_data.csv', index=None)
train_data.to_csv('train_data.csv', index=None)
val_data.to_csv('val_data.csv', index=None)
test_data.to_csv('test_data.csv', index=None)