# **Email Categorization by fine tuning BERT model**

### Note: ChatGPT is used here for generating training and testing data

In [3]:
!pip install torch transformers scikit-learn pandas numpy



# Generating training data for finetuning BERT model

In [1]:
import random
import csv

## Generating email data from student to HOD

In [2]:
student_email_subjects = [
    "Request for Course Material",
    "Inquiry About Academic Progress",
    "Clarification on Assignment",
    "Extension Request for Submission",
    "Feedback on Recent Lecture",
    "Meeting Request for Project Discussion",
    "Guidance for Exam Preparation",
    "Syllabus Update Query",
    "Suggestions for Research Topic",
    "Request for Office Hours"
]

student_email_bodies = [
    "Dear Professor, I hope this message finds you well. I am requesting the lecture notes from last week's class, as I was unable to attend. Your help is appreciated.",
    "Respected Sir/Madam, Could you kindly update me on my academic standing and current GPA? I would also appreciate advice on how I can improve.",
    "Hello, I am seeking clarification regarding the machine learning assignment. Could you please specify the format in which you expect the submission?",
    "Dear Sir, Due to personal circumstances, I am requesting an extension for the project deadline. I apologize for the inconvenience and appreciate your understanding.",
    "Dear Professor, I wanted to offer some feedback on the last lecture. It was insightful, but the pace was quite fast, and I had difficulty following along.",
    "Respected Sir/Madam, I would like to arrange a meeting to discuss my final year project and would appreciate your advice on my research direction.",
    "Dear Sir, With exams approaching, I wanted to ask for your guidance on which topics to focus on and if you could recommend any study materials.",
    "Hello Professor, I wanted to check if there have been any updates to the syllabus. Could you provide the latest version for reference?",
    "Dear Professor, I am exploring research topics for my project and was hoping you could suggest some relevant areas in artificial intelligence.",
    "Respected Sir, I wanted to inquire about your availability during office hours to discuss some questions related to the course content."
]

# Increasing variation in email body by randomizing structure and wording
def generate_student_data():
    greetings = ["Dear Professor,",
                 "Hello Professor,",
                 "Respected Sir/Madam,",
                 "Hi, Professor,","Greetings, Professor,",
                 "Dear Sir,",
                 "Dear Madam,"]

    ending = ["Thank you for your time.",
              "I appreciate your help.",
              "Looking forward to your reply.",
              "Thanks in advance for your support.",
              "Your guidance is much appreciated.",
              "Thank you for considering my request."]

    return f"{random.choice(greetings)} {random.choice(student_email_bodies)} {random.choice(ending)}"

student_email_data = [
    {
        "email_address": f"student{index}@univeristy.edu",
        "subject": random.choice(student_email_subjects),
        "body": generate_student_data()
    }
    for index in range(1,101)
]

csv_file_path_student = "/content/student_HOD_emails.csv"

with open(csv_file_path_student, mode = 'w', newline = '') as file:
    writer = csv.DictWriter(file, fieldnames = ['email_address', 'subject', 'body'])
    writer.writeheader()
    writer.writerows(student_email_data)



## Generating email data from corporate to HOD

In [3]:
corporate_email_subjects = [
    "Internship Request",
    "Placement Inquiry",
    "Follow-up on Job Opportunities",
    "Request for Placement Details",
    "Inquiry About Internship Program",
    "Seeking Corporate Internship",
    "Placement Status Inquiry",
    "Request for Job Shadowing",
    "Application for Internship",
    "Corporate Internship Details"
]

corporate_email_bodies = [
    "Dear Sir/Madam, I hope this message finds you well. I am writing to inquire about available internship opportunities at your company. I would be grateful if you could provide details on the application process.",
    "Respected Sir/Madam, I would like to express my interest in the upcoming campus placements and request information regarding the companies that will be visiting for recruitment.",
    "Hello, I am a final year student looking for internship opportunities to gain industry exposure. Could you kindly let me know if there are any openings in your organization?",
    "Dear Sir/Madam, I recently applied for the internship position at your company and would like to follow up on the status of my application. I look forward to your response.",
    "Respected Sir/Madam, I wanted to inquire about the corporate internship programs available this summer and whether students from my department would be eligible.",
    "Dear Sir, I am seeking information on placement opportunities in your esteemed company. Could you please provide details on the requirements and deadlines?",
    "Hello, I would like to inquire if your company offers job shadowing programs for students looking to gain insights into corporate roles and responsibilities.",
    "Dear Sir/Madam, I am a final-year student and would like to inquire about internship placements. Could you provide information on available programs and how to apply?",
    "Greetings, I am interested in an internship at your company to gain experience in the field of software engineering. Could you provide the application guidelines?",
    "Dear Sir, I would like to request more details on the corporate internship program your company offers. Specifically, I am interested in the eligibility criteria and duration."
]

def generate_corporate_data():
  greetings = [
      "Dear Sir/Madam,",
      "Hello,",
      "Respected Sir/Madam,",
      "Greetings,",
      "Hi, Sir/Madam,",
  ]

  ending = [
      "Thank you for your time.",
      "Looking forward to your response.",
      "I appreciate your assistance.",
      "Thanks in advance for your help.",
      "Your consideration is greatly appreciated.",
  ]

  return f"{random.choice(greetings)} {random.choice(corporate_email_bodies)} {random.choice(ending)}"

corporate_email_data = [
    {
        "email_address": f"corporate{index}@univeristy.edu",
        "subject": random.choice(corporate_email_subjects),
        "body": generate_corporate_data()
    }
    for index in range(1,151)
]

csv_file_path_corporate = "/content/corporate_HOD_emails.csv"

with open(csv_file_path_corporate, mode = 'w', newline = '') as file:
    writer = csv.DictWriter(file, fieldnames = ['email_address', 'subject', 'body'])
    writer.writeheader()
    writer.writerows(corporate_email_data)


## Generating email data from researcher to HOD

In [4]:
researcher_email_subjects = [
    "Request for Shared Research Data",
    "Inquiry About Research Cooperation",
    "Request for Research Collaboration",
    "Inquiry Regarding Research Facilities",
    "Request for Research Partnership",
    "Follow-up on Data Sharing Request",
    "Cooperation Opportunity in Research",
    "Query About Shared Research Resources",
    "Collaboration on Research Paper",
    "Inquiry on Research Facility Usage"
]


researcher_email_bodies = [
    "Dear Professor, I hope you are doing well. I am currently working on a project related to artificial intelligence and was hoping to gain access to some of the data your team has worked on. Could you provide information on how to proceed with a data-sharing request?",
    "Respected Sir/Madam, I am interested in collaborating on research in the field of sustainable energy and would appreciate it if we could explore possible cooperation opportunities. Your expertise would be invaluable.",
    "Dear Professor, I am currently conducting research in the field of biotechnology and am writing to inquire if we could collaborate on a joint paper. I believe our interests align, and I would love to discuss potential topics.",
    "Respected Professor, I wanted to inquire about the facilities available at your research lab for conducting experiments in material science. I am looking for specific instruments and would like to know if external researchers are allowed to use them.",
    "Dear Sir/Madam, I hope this email finds you well. I would like to propose a research partnership on quantum computing and explore avenues where our expertise could complement each other. Could we arrange a meeting?",
    "Dear Professor, I am following up on my previous request for access to the research data on environmental impact analysis. Could you kindly update me on the status of the data-sharing process?",
    "Respected Sir/Madam, I am reaching out to inquire if you would be interested in collaborating on research focused on renewable energy sources. I believe we could achieve great results together.",
    "Dear Professor, I wanted to request access to shared research resources in your lab, specifically in the area of microbiology. Could you guide me on the process for external researchers?",
    "Respected Professor, I am currently working on a paper on artificial intelligence and wanted to explore if you would be open to collaborating on this research. I believe your insights would add significant value.",
    "Dear Sir, I am interested in using the advanced facilities at your research center for conducting an experiment on nanotechnology. Could you please provide details on how to gain access as an external researcher?"
]

def generate_researcher_data():
  greetings = [
      "Dear Professor,",
      "Hello Professor,",
      "Respected Professor,",
      "Hi, Professor,",
      "Greetings, Professor,"
  ]

  ending = [
      "Thank you for considering my request.",
      "I look forward to your response.",
      "Your assistance would be greatly appreciated.",
      "I hope to hear from you soon.",
      "Thanks in advance for your guidance."
  ]

  return f"{random.choice(greetings)} {random.choice(researcher_email_bodies)} {random.choice(ending)}"

researcher_email_data = [
    {
        "email_address": f"researcher{index}@univeristy.edu",
        "subject": random.choice(researcher_email_subjects),
        "body": generate_researcher_data()
    }
    for index in range(1,151)
]

csv_file_path_researcher = "/content/researcher_HOD_emails.csv"

with open(csv_file_path_researcher, mode = 'w', newline = '') as file:
    writer = csv.DictWriter(file, fieldnames = ['email_address', 'subject', 'body'])
    writer.writeheader()
    writer.writerows(corporate_email_data)


# Fine tuning the BERT model

In [5]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import logging

In [17]:
# Suppressing warnings from the transformers library
logging.set_verbosity_error()

# Loading the CSV files
df_student = pd.read_csv("/content/student_HOD_emails.csv")
df_corporate = pd.read_csv("/content/corporate_HOD_emails.csv")
df_researcher = pd.read_csv("/content/researcher_HOD_emails.csv")

# Adding labels: 0 = student, 1 = corporate, 2 = researcher
df_student['label'] = 0
df_corporate['label'] = 1
df_researcher['label'] = 2

# Combining datasets into one dataframe
df = pd.concat([df_student,df_corporate,df_researcher])

# Shuffling the dataset
df = df.sample(frac=1).reset_index(drop=True)

# Splitting the dataset into training and testings sets
df_train, df_test = train_test_split(df,test_size = 0.2)

# Defining the Hugging face for the model access token
token = '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'

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',use_auth_token = token)

# Creating class for Pytorch
class EmailDataset(Dataset):
  def __init__(self,dataframe,tokenizer,max_len):
    self.dataframe = dataframe
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.dataframe)

  def __getitem__(self,index):
    email = self.dataframe.iloc[index]
    subject = email['subject']
    body = email['body']
    label = email['label']

  # Tokenizing the input (concatenate subject and body)
    inputs = self.tokenizer(
      subject+ " " + body,
      padding = 'max_length',
      truncation = True,
      max_length = self.max_len,
      return_tensors = "pt"
    )

    input_ids = inputs['input_ids'].squeeze(0)
    attention_mask = inputs['attention_mask'].squeeze(0)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'label': torch.tensor(label,dtype = torch.long)
    }

# Defining the parameters
MAX_LEN = 256
BATCH_SIZE = 8
n_EPOCHS = 3
ALPHA = 2e-5


# Preparing the datasets and dataloaders
train_data = EmailDataset(df_train,tokenizer,MAX_LEN)
test_data = EmailDataset(df_test,tokenizer,MAX_LEN)

train_loader = DataLoader(train_data, batch_size = BATCH_SIZE, shuffle = True)
test_loader = DataLoader(test_data, batch_size = BATCH_SIZE, shuffle = False)

# Initializing the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=3,use_auth_token = token)

# Optimizer
optimizer = AdamW(model.parameters(),lr = ALPHA)

# Training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

for epoch in range(n_EPOCHS):
  model.train()
  total_loss = 0

  for batch in train_loader:
    optimizer.zero_grad()

    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)

    outputs = model(input_ids, attention_mask = attention_mask, labels = labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

    total_loss += loss.item()


  avg_loss = total_loss / len(train_loader)
  print(f"Epoch {epoch + 1}/{n_EPOCHS}, Loss: {avg_loss:.4f}")


#Evaluation

model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
  for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)

    outputs = model(input_ids,attention_mask)
    logits = outputs.logits
    predicts = torch.argmax(logits,dim=1)

    all_predictions.extend(predicts.cpu().numpy())
    all_labels.extend(labels.cpu().numpy())


# Calculating accuracy
acc = accuracy_score(all_labels,all_predictions)
print(f"Test Accuracy: {acc * 100:.2f}%")






Epoch 1/3, Loss: 0.9776
Epoch 2/3, Loss: 0.5635
Epoch 3/3, Loss: 0.5339
Test Accuracy: 60.00%


# Testing the model

In [8]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd

In [18]:
token = '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'

# Load the trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', use_auth_token=token)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3, use_auth_token=token)

# Sample test emails
test_emails = [
    {"subject":"Inquiry About Course Material","body":"Dear HOD, I hope this message finds you well. I would like to know if there are any specific textbooks or resources recommended for the upcoming semester's course. Thank you!", "label": 0},
    {"subject":"Internship Opportunity Inquiry","body":"Dear HOD, I am reaching out to inquire if your department is open to hosting interns from our university. We are keen on establishing a collaboration. Looking forward to your response.","label":1},
    {"subject":"Collaboration on Research Project","body":"Dear HOD, I am a researcher interested in exploring collaboration opportunities with your department. I believe our ongoing projects align well. Could we schedule a meeting to discuss this?","label":2},
    {"subject":"Question About Academic Progress","body":"Hello HOD, I hope you are doing well. I wanted to ask about my academic progress and any areas I should focus on to improve. Thank you for your guidance.","label":0},
    {"subject":"Placement Procedure Inquiry","body":"Dear HOD, I am writing to ask about the placement procedure for the upcoming semester. Could you please provide the details? Thank you!","label":1}
]

# Evaluate the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

predictions = []
actual_labels = []

with torch.no_grad():
    for email in test_emails:
        # Prepare input
        inputs = tokenizer(email['subject'] + " " + email['body'],
                           padding='max_length',
                           truncation=True,
                           max_length=256,
                           return_tensors="pt")
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # Get model predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=1).cpu().item()

        # Append predictions and actual labels
        predictions.append(pred)
        actual_labels.append(email['label'])

correct = sum(p==a for p,a in zip(predictions,actual_labels))
acc = correct/len(actual_labels)*100
print(f"Test Accuracy: {acc:.2f}%")

# Printing results
for email,prediction in zip(test_emails,predictions):
  print(f"Subject: {email['subject']}\nPredicted Label: {prediction}\nActual Label: {email['label']}\n")




Test Accuracy: 40.00%
Subject: Inquiry About Course Material
Predicted Label: 0
Actual Label: 0

Subject: Internship Opportunity Inquiry
Predicted Label: 0
Actual Label: 1

Subject: Collaboration on Research Project
Predicted Label: 0
Actual Label: 2

Subject: Question About Academic Progress
Predicted Label: 0
Actual Label: 0

Subject: Placement Procedure Inquiry
Predicted Label: 0
Actual Label: 1

