In [1]:
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F
import pickle

In [2]:
df = pd.read_csv("/content/focused_synthetic_email_dataseta.csv")

In [3]:
def preprocess_data(df):
    # Encode labels
    le_category = LabelEncoder()
    le_email_type = LabelEncoder()

    df['category_encoded'] = le_category.fit_transform(df['category'])
    df['email_type_encoded'] = le_email_type.fit_transform(df['email_type'])

    # Create a vocabulary (you might want to use a more sophisticated tokenization method)
    vocab = set()
    for text in df['subject'] + ' ' + df['body']:
        vocab.update(text.split())

    word_to_idx = {word: i+1 for i, word in enumerate(vocab)}
    word_to_idx['<PAD>'] = 0

    return df, word_to_idx, le_category, le_email_type

In [4]:
class EmailDataset(Dataset):
    def __init__(self, df, word_to_idx, max_length=100):
        self.df = df
        self.word_to_idx = word_to_idx
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        subject = self.text_to_indices(row['subject'])
        body = self.text_to_indices(row['body'])
        category = row['category_encoded']
        email_type = row['email_type_encoded']

        return subject, body, category, email_type

    def text_to_indices(self, text):
        indices = [self.word_to_idx.get(word, 0) for word in text.split()[:self.max_length]]
        if len(indices) < self.max_length:
            indices += [0] * (self.max_length - len(indices))
        return torch.tensor(indices)


In [5]:
class EmailClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes_category, num_classes_email_type, dropout_rate=0.5):
        super(EmailClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads=8, batch_first=True)
        self.conv = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
        self.fc = nn.Linear(hidden_dim, hidden_dim)
        self.category_out = nn.Linear(hidden_dim, num_classes_category)
        self.email_type_out = nn.Linear(hidden_dim, num_classes_email_type)

        # Add dropout layers
        self.dropout = nn.Dropout(dropout_rate)

        # Randomly freeze layers
        self.freeze_layers()

    def freeze_layers(self):
        layers = [self.embedding, self.lstm, self.self_attention, self.conv, self.fc]
        for layer in random.sample(layers, k=2):  # Randomly freeze 2 layers
            for param in layer.parameters():
                param.requires_grad = False

    def forward(self, subject, body):
        x = torch.cat((subject, body), dim=1)
        x = self.embedding(x)
        x = self.dropout(x)  # Apply dropout after embedding

        lstm_out, _ = self.lstm(x)
        lstm_out = self.dropout(lstm_out)  # Apply dropout after LSTM

        attn_out, _ = self.self_attention(lstm_out, lstm_out, lstm_out)
        attn_out = self.dropout(attn_out)  # Apply dropout after self-attention

        conv_out = self.conv(attn_out.transpose(1, 2)).transpose(1, 2)
        conv_out = self.dropout(conv_out)  # Apply dropout after convolution

        pooled = torch.max(conv_out, dim=1)[0]
        fc_out = torch.relu(self.fc(pooled))
        fc_out = self.dropout(fc_out)  # Apply dropout after fully connected layer

        category_output = self.category_out(fc_out)
        email_type_output = self.email_type_out(fc_out)
        return category_output, email_type_output


In [6]:
df, word_to_idx, le_category, le_email_type = preprocess_data(df)

In [15]:
df, word_to_idx, le_category, le_email_type = preprocess_data(df)

# Split the data
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Create datasets and dataloaders
train_dataset = EmailDataset(train_df, word_to_idx)
val_dataset = EmailDataset(val_df, word_to_idx)

BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)


In [37]:
VOCAB_SIZE = len(word_to_idx)
EMBED_DIM = 200
HIDDEN_DIM = 256
NUM_CLASSES_CATEGORY = len(le_category.classes_)
NUM_CLASSES_EMAIL_TYPE = len(le_email_type.classes_)
LEARNING_RATE = 0.0005
NUM_EPOCHS = 8

# Create model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EmailClassifier(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_CLASSES_CATEGORY, NUM_CLASSES_EMAIL_TYPE, dropout_rate=0.5).to(device)

# Loss and optimizer
category_criterion = nn.CrossEntropyLoss()
email_type_criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.5)


In [20]:
state_dict = torch.load('/content/email_classification_model5.pth')

# Load the state dict into your model
model.load_state_dict(state_dict)

  state_dict = torch.load('/content/email_classification_model5.pth')


<All keys matched successfully>

In [21]:
model.eval()

EmailClassifier(
  (embedding): Embedding(2426, 200)
  (lstm): LSTM(200, 256, batch_first=True)
  (self_attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
  )
  (conv): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (fc): Linear(in_features=256, out_features=256, bias=True)
  (category_out): Linear(in_features=256, out_features=3, bias=True)
  (email_type_out): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [17]:
def unfreeze_all_layers(model):
    for param in model.parameters():
        param.requires_grad = True

In [31]:
df.columns

Index(['subject', 'body', 'category', 'email_type', 'category_encoded',
       'email_type_encoded'],
      dtype='object')

In [38]:
for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0
    for batch in train_loader:
        subjects, bodies, categories, email_types = [b.to(device) for b in batch]

        optimizer.zero_grad()
        category_output, email_type_output = model(subjects, bodies)

        category_loss = category_criterion(category_output, categories)
        email_type_loss = email_type_criterion(email_type_output, email_types)
        total_loss = category_loss + email_type_loss

        total_loss.backward()
        optimizer.step()

        train_loss += total_loss.item()

    # Validation
    model.eval()
    val_loss = 0
    category_correct = 0
    email_type_correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            subjects, bodies, categories, email_types = [b.to(device) for b in batch]

            category_output, email_type_output = model(subjects, bodies)

            category_loss = category_criterion(category_output, categories)
            email_type_loss = email_type_criterion(email_type_output, email_types)
            total_loss = category_loss + email_type_loss

            val_loss += total_loss.item()

            _, category_predicted = torch.max(category_output, 1)
            _, email_type_predicted = torch.max(email_type_output, 1)

            total += categories.size(0)
            category_correct += (category_predicted == categories).sum().item()
            email_type_correct += (email_type_predicted == email_types).sum().item()

    train_loss /= len(train_loader)
    val_loss /= len(val_loader)
    category_accuracy = category_correct / total
    email_type_accuracy = email_type_correct / total

    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}]")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Category Accuracy: {category_accuracy:.4f}")
    print(f"Email Type Accuracy: {email_type_accuracy:.4f}")
    print()

    scheduler.step(val_loss)


Epoch [1/8]
Train Loss: 2.2079
Validation Loss: 2.1891
Category Accuracy: 0.3900
Email Type Accuracy: 0.4100

Epoch [2/8]
Train Loss: 2.0911
Validation Loss: 1.8230
Category Accuracy: 0.8100
Email Type Accuracy: 0.4200

Epoch [3/8]
Train Loss: 1.3995
Validation Loss: 1.4837
Category Accuracy: 0.9950
Email Type Accuracy: 0.3250

Epoch [4/8]
Train Loss: 1.2058
Validation Loss: 1.3492
Category Accuracy: 1.0000
Email Type Accuracy: 0.3350

Epoch [5/8]
Train Loss: 1.1784
Validation Loss: 1.3367
Category Accuracy: 1.0000
Email Type Accuracy: 0.4150

Epoch [6/8]
Train Loss: 1.1508
Validation Loss: 1.2840
Category Accuracy: 1.0000
Email Type Accuracy: 0.4050

Epoch [7/8]
Train Loss: 1.1308
Validation Loss: 1.3308
Category Accuracy: 1.0000
Email Type Accuracy: 0.4250

Epoch [8/8]
Train Loss: 1.1033
Validation Loss: 1.2667
Category Accuracy: 0.9850
Email Type Accuracy: 0.5600



In [36]:
# Save model weights
torch.save(model.state_dict(), 'email_classification_model5.pth')
print("Model weights saved to 'email_classification_model5.pth'")


Model weights saved to 'email_classification_model5.pth'


In [58]:
def classify_email(model, subject, body, word_to_idx, le_category, le_email_type, max_length=100, device='cpu'):
    # Preprocess the input
    def text_to_indices(text):
        indices = [word_to_idx.get(word, 0) for word in text.split()[:max_length]]
        if len(indices) < max_length:
            indices += [0] * (max_length - len(indices))
        return torch.tensor(indices).unsqueeze(0)  # Add batch dimension

    # Convert text to indices
    subject_tensor = text_to_indices(subject).to(device)
    body_tensor = text_to_indices(body).to(device)

    # Set model to evaluation mode
    model.eval()

    # Get predictions
    with torch.no_grad():
        category_output, email_type_output = model(subject_tensor, body_tensor)

        # Apply softmax to get probabilities
        category_probs = F.softmax(category_output, dim=1)
        email_type_probs = F.softmax(email_type_output, dim=1)

        # Get the predicted classes
        _, category_pred = torch.max(category_probs, 1)
        _, email_type_pred = torch.max(email_type_probs, 1)

    # Convert predictions to original labels
    predicted_category = le_category.inverse_transform(category_pred.cpu().numpy())[0]
    predicted_email_type = le_email_type.inverse_transform(email_type_pred.cpu().numpy())[0]

    # Get top 3 probabilities for each classification
    top_3_category = torch.topk(category_probs, 3)
    top_3_email_type = torch.topk(email_type_probs, 3)

    # Prepare results
    results = {
        predicted_category, predicted_email_type
    }

    return results

# Usage example:
# Assuming you have your trained model, word_to_idx, le_category, and le_email_type

model.to(device)  # Make sure the model is on the same device as the input tensors

custom_subject = "course syllabus"
custom_body = "Could you send me the updated syllabus for the upcoming course on Machine Learning? I want to prepare early."

results = classify_email(model, custom_subject, custom_body, word_to_idx, le_category, le_email_type, device=device)

# print(f"Predicted Category: {results['predicted_category']}")
# print(f"Predicted Email Type: {results['predicted_email_type']}")
# print("\nTop 3 Categories:")
# for category, prob in results['top_3_categories']:
#     print(f"  {category}: {prob:.4f}")
# print("\nTop 3 Email Types:")
# for email_type, prob in results['top_3_email_types']:
#     print(f"  {email_type}: {prob:.4f}")

print(results)

{'Research Query', 'Student inquiries'}


In [42]:
subject = "Query about peer review request application partnership"
body = "I'm conducting research on research data sharing and would like to inquire about potential resources or collaborations urgently and privately "

In [43]:
results = classify_email(model, subject, body, word_to_idx, le_category, le_email_type, device=device)

print(f"Predicted Category: {results['predicted_category']}")
print(f"Predicted Email Type: {results['predicted_email_type']}")
print("\nTop 3 Categories:")
for category, prob in results['top_3_categories']:
    print(f"  {category}: {prob:.4f}")
print("\nTop 3 Email Types:")
for email_type, prob in results['top_3_email_types']:
    print(f"  {email_type}: {prob:.4f}")

Predicted Category: Student inquiries
Predicted Email Type: Research Query

Top 3 Categories:
  Student inquiries: 0.6261
  Academic collaboration inquiries: 0.3233
  Corporate inquiries: 0.0506

Top 3 Email Types:
  Research Query: 0.3897
  Sensitive Email: 0.3059
  General Information: 0.3044


In [45]:
import pickle

In [46]:
data_to_save = {
    'word_to_idx': word_to_idx,
    'le_category': le_category,
    'le_email_type': le_email_type
}
with open('model_data.pkl', 'wb') as f:
    pickle.dump(data_to_save, f)

In [48]:
VOCAB_SIZE

2426

In [63]:
class EmailClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes_category, num_classes_email_type, dropout_rate=0.5):
        super(EmailClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads=8, batch_first=True)
        self.conv = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
        self.fc = nn.Linear(hidden_dim, hidden_dim)
        self.category_out = nn.Linear(hidden_dim, num_classes_category)
        self.email_type_out = nn.Linear(hidden_dim, num_classes_email_type)

        # Add dropout layers
        self.dropout = nn.Dropout(dropout_rate)

        # Randomly freeze layers
        self.freeze_layers()

    def freeze_layers(self):
        layers = [self.embedding, self.lstm, self.self_attention, self.conv, self.fc]
        for layer in random.sample(layers, k=2):  # Randomly freeze 2 layers
            for param in layer.parameters():
                param.requires_grad = False

    def forward(self, subject, body):
        x = torch.cat((subject, body), dim=1)
        x = self.embedding(x)
        x = self.dropout(x)  # Apply dropout after embedding

        lstm_out, _ = self.lstm(x)
        lstm_out = self.dropout(lstm_out)  # Apply dropout after LSTM

        attn_out, _ = self.self_attention(lstm_out, lstm_out, lstm_out)
        attn_out = self.dropout(attn_out)  # Apply dropout after self-attention

        conv_out = self.conv(attn_out.transpose(1, 2)).transpose(1, 2)
        conv_out = self.dropout(conv_out)  # Apply dropout after convolution

        pooled = torch.max(conv_out, dim=1)[0]
        fc_out = torch.relu(self.fc(pooled))
        fc_out = self.dropout(fc_out)  # Apply dropout after fully connected layer

        category_output = self.category_out(fc_out)
        email_type_output = self.email_type_out(fc_out)
        return category_output, email_type_output

def classify_email(model, subject, body, word_to_idx, le_category, le_email_type, max_length=100, device='cpu'):
    # Preprocess the input
    def text_to_indices(text):
        indices = [word_to_idx.get(word, 0) for word in text.split()[:max_length]]
        if len(indices) < max_length:
            indices += [0] * (max_length - len(indices))
        return torch.tensor(indices).unsqueeze(0)  # Add batch dimension

    # Convert text to indices
    subject_tensor = text_to_indices(subject).to(device)
    body_tensor = text_to_indices(body).to(device)

    # Set model to evaluation mode
    model.eval()

    # Get predictions
    with torch.no_grad():
        category_output, email_type_output = model(subject_tensor, body_tensor)

        # Apply softmax to get probabilities
        category_probs = F.softmax(category_output, dim=1)
        email_type_probs = F.softmax(email_type_output, dim=1)

        # Get the predicted classes
        _, category_pred = torch.max(category_probs, 1)
        _, email_type_pred = torch.max(email_type_probs, 1)

    # Convert predictions to original labels
    predicted_category = le_category.inverse_transform(category_pred.cpu().numpy())[0]
    predicted_email_type = le_email_type.inverse_transform(email_type_pred.cpu().numpy())[0]

    # Get top 3 probabilities for each classification
    top_3_category = torch.topk(category_probs, 3)
    top_3_email_type = torch.topk(email_type_probs, 3)

    # Prepare results
    results = {
        predicted_category, predicted_email_type
    }

    return results


embed_dim = 200
hidden_dim = 256
num_classes_category = 3 #len(le_category.classes_)
num_classes_email_type = 3 #len(le_email_type.classes_)
LEARNING_RATE = 0.0005
NUM_EPOCHS = 8


def load_model_and_data():
    # Load the model
    vocab_size = 2426
    model = EmailClassifier(vocab_size, embed_dim, hidden_dim, num_classes_category, num_classes_email_type)
    model.load_state_dict(torch.load('/content/email_classification_model5.pth', map_location=torch.device('cpu')))
    model.eval()

    # Load other necessary data
    with open('model_data.pkl', 'rb') as f:
        data = pickle.load(f)

    return model, data['word_to_idx'], data['le_category'], data['le_email_type']

# Load model and data
model, word_to_idx, le_category, le_email_type = load_model_and_data()

results = classify_email(model, subject, body, word_to_idx, le_category, le_email_type)
results_list = list(results)

# Extract category and email_type
category = results_list[0]
email_type = results_list[1]

print(f"Category: {category}")
print(f"Email Type: {email_type}")


Category: General Information
Email Type: Student inquiries


  model.load_state_dict(torch.load('/content/email_classification_model5.pth', map_location=torch.device('cpu')))


In [64]:
results

{'General Information', 'Student inquiries'}