# Testing prompts for Classifying emails

## Importing libraries

In [1]:
import os
import numpy as np
import requests
from dotenv import load_dotenv
from langsmith import traceable
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    accuracy_score, confusion_matrix, hamming_loss
)
import re

import warnings
warnings.filterwarnings("ignore")

### Getting Path of current file
import os
import sys
from pathlib import Path

path = Path(os.path.dirname(os.getcwd()))
path = str(path)
print(path)
sys.path.insert(1, path)

/Users/saideepbunny/Projects/Email-Assistant-using-Generative-AI


## Setting API keys

In [2]:
load_dotenv(dotenv_path=f"{path}/config/nvidia_token.env")

model1 = "google/gemma-3n-e4b-it"
model2 = "mistralai/mistral-nemotron"
model3 = "openai/gpt-oss-120b"

token = os.getenv("NVIDIA_API_KEY")

## Classification function

In [3]:
def classify_email(model, token, prompt_msg, email_content: str, examples: list = None):
    """
    Classify emails using an NVIDIA LLM with optional few-shot examples.

    Args:
        model (str): NVIDIA LLM model name.
        token (str): API key for NVIDIA LLM.
        prompt_msg (str): Classification instruction message.
        email_content (str): Email text to classify.
        examples (list, optional): Few-shot examples in the form 
                                   [{"email": "example email", "label": "JOB"}, ...]

    Returns:
        str: Cleaned classification result (only capital letters).
    """

    # Build few-shot messages
    messages = []

    # Add few-shot examples if provided
    if examples:
        for ex in examples:
            messages.append(("user", ex["email"]))
            messages.append(("assistant", ex["label"]))

    # Add the actual email to classify
    messages.append((
        "user",
        f"""{{email_content}}\n\n{prompt_msg}"""
    ))

    # Create prompt template
    prompt = ChatPromptTemplate.from_messages(messages)

    # Use the NVIDIA LLM via LangChain
    llm = ChatNVIDIA(
        model=model,
        api_key=token,
        temperature=0.2,
        max_tokens=4096,
        streaming=False
    )

    # Define chain
    chain = prompt | llm | StrOutputParser()

    # Run it
    result = chain.invoke({"email_content": email_content})

    # Extract only capital letters (JOB, MEET, OTHER)
    clean_result = re.sub(r'[^A-Za-z,]+', '', result)
    final_result = clean_result.split(",")
    return np.array(final_result)


## Information extraction function

In [4]:
def extract_JOB_info(model, token, prompt_msg, email_content: str, examples: list = None):
    """
    Extract information from JOB emails using an NVIDIA LLM with optional few-shot examples.

    Args:
        model (str): NVIDIA LLM model name.
        token (str): API key for NVIDIA LLM.
        prompt_msg (str): Classification instruction message.
        email_content (str): Email text to classify.
        examples (list, optional): Few-shot examples in the form 
                                   [{"email": "example email", "label": "JOB"}, ...]

    Returns:
        str: Extracted information result.
    """

    # Build few-shot messages
    messages = []

    # Add few-shot examples if provided
    if examples:
        for ex in examples:
            messages.append(("user", ex["email"]))
            messages.append(("assistant", ex["label"]))

    # Add the actual email to classify
    messages.append((
        "user",
        f"""{{email_content}}\n\n{prompt_msg}"""
    ))

    # Create prompt template
    prompt = ChatPromptTemplate.from_messages(messages)

    # Use the NVIDIA LLM via LangChain
    llm = ChatNVIDIA(
        model=model,
        api_key=token,
        temperature=0.2,
        max_tokens=4096,
        streaming=False
    )

    # Define chain
    chain = prompt | llm | StrOutputParser()

    # Run it
    result = chain.invoke({"email_content": email_content})
    return np.array(result.split("|"))


def job_information_extraction(row, prompt_msg, model, token, examples=None):
    if eval(row['mistral_JOB']) == True:
        result = extract_JOB_info(model, token, prompt_msg, row['email'], examples)
        return result

    else:
        return np.array([np.nan, np.nan, np.nan, np.nan])




In [5]:
df = pd.read_excel(f"{path}/data/email_classification_annotated_data.xlsx", sheet_name="updated_annotated_data")
df

Unnamed: 0,email,JOB,MEET,OTHER
0,"Hi Saideep,\n\nThanks for your interest in iSp...",True,False,False
1,"Hello, Saideep,\n\nWe received your job applic...",True,False,False
2,"Hi Saideep,\n\nWe're super-pumped that you sha...",True,False,False
3,"Hi Saideep,\n\nThank you for applying to the S...",True,False,False
4,"Hello Saideep,\n\nThank you for your interest ...",True,False,False
5,"Dear Saideep,\n\nThank you for your interest i...",True,False,False
6,"Dear Saideep,\n\nThank you for your interest i...",True,False,False
7,"Hi Saideep,\nThank you for your interest in th...",True,False,False
8,"Hi Saideep,\n\nThanks for your interest in Red...",True,False,False
9,"Dear Saideep,\n\nThank you for giving us the o...",True,False,False


## Email Classification

In [6]:
prompt = """You are an expert email class identification system. Your task is to analyze the provided email and identify if the email falls it each category or not. Your response must be **only** the True/False for each category without any additional text or metadata.

### **Categories**

**1. JOB**
Identify an email as `JOB` if it concerns the status of a specific job application. Recipient of the mail will be informed about the status of their application. This includes notifications that a candidate has:
*   Successfully applied/in the process of applying for a job (application confirmation).
*   Been shortlisted for a position.
*   Been invited to an interview or assessment.
*   Been rejected for a position.
*   Job recommendations or suggestions or ads from job boards, recruiters, companies, etc., do not classify as JOB.

**2. MEET**
Identify an email as `MEET` if the sender is requesting to schedule a meeting, virtual or in-person.
*   Can be a virtual meeting via Zoom, Google Meet, Teams, Video call, etc.
*   Can be an in-person meeting at a specific location.
*   Can be an invitation to job interviews, assessments as well.
*   Cannot be classified as MEET unless it is mentioned explicitly in the email. Future steps of a process or a job application having a meeting do not classify as MEET.

**3. OTHER**
Identify an email as `OTHER` only if it does not fit into the `JOB` or `MEET` categories. This includes, but is not limited to:
*   General job-related discussions that are not about application status (e.g., networking, asking about a role, advertisements, job suggestions or requesting for an interview).
*   Newsletters, marketing emails, personal correspondence, etc.

**Important Rules:**
*   An email cannot be identified as any other class if it classifies as OTHER.
*   An email can be identified as either JOB or either MEET or both JOB and MEET.
*   Possible combinations of classes for email are:
    *   JOB, MEET
    *   JOB
    *   MEET
    *   OTHER

### **Output Format**

Your answer must be three True or False values, one for each JOB, MEET and OTHER in this exact order:
<is a JOB>, <is a MEET>, <is OTHER>"""

In [7]:
data_df = df.copy()
data_df[['mistral_JOB', 'mistral_MEET', 'mistral_OTHER']] = (
    data_df['email']
    .apply(lambda x: classify_email(model2, token, prompt, x))
    .apply(pd.Series)  # convert array to 3 separate columns
)

In [8]:
data_df

Unnamed: 0,email,JOB,MEET,OTHER,mistral_JOB,mistral_MEET,mistral_OTHER
0,"Hi Saideep,\n\nThanks for your interest in iSp...",True,False,False,True,False,False
1,"Hello, Saideep,\n\nWe received your job applic...",True,False,False,True,False,False
2,"Hi Saideep,\n\nWe're super-pumped that you sha...",True,False,False,True,False,False
3,"Hi Saideep,\n\nThank you for applying to the S...",True,False,False,True,False,False
4,"Hello Saideep,\n\nThank you for your interest ...",True,False,False,True,False,False
5,"Dear Saideep,\n\nThank you for your interest i...",True,False,False,True,False,False
6,"Dear Saideep,\n\nThank you for your interest i...",True,False,False,True,False,False
7,"Hi Saideep,\nThank you for your interest in th...",True,False,False,True,False,False
8,"Hi Saideep,\n\nThanks for your interest in Red...",True,False,False,True,False,False
9,"Dear Saideep,\n\nThank you for giving us the o...",True,False,False,True,False,False


## Information extraction

### Prompt-1 test

In [9]:
info_extract_prompt1 = """You are an information extraction assistant.  
You will be given the full text of an email regarding a candidate’s job application status.

Your task is to read the email carefully and extract the following details exactly as they appear in the email (or return empty string "" if the detail is missing):

1. Company Name — The company or organization sending the email.
2. Job Role — The job title or role mentioned in the email.
3. Job ID — The job requisition ID, reference number, or posting number mentioned.
4. Application Status — One of the following categories that best describes the current stage:
   - application incomplete
   - applied
   - assessment
   - interview
   - job offered
   - rejected
   - withdrawn
   - other (if none of the above applies)

### Output format:
<company_name>|<job_role>|<job_id>|<application_status>

Return the extracted information in the format above, with each detail separated by a pipe, without any additional text or metadata. If any detail is not present in the email, return an empty string for that detail (e.g., if the job ID is not mentioned, return <company_name>,<job_role>,,<application_status>)."""

In [10]:
info_data_df = data_df.copy()
info_data_df[['company_name', 'job_role', 'job_id', 'application_status']] = (
    data_df
    .apply(lambda row: job_information_extraction(row, info_extract_prompt1, model1, token), axis=1)
    .apply(pd.Series)  # convert array to 3 separate columns
)
info_data_df

ValueError: Columns must be same length as key

In [11]:
info_data_df = data_df.copy()
info_data_df[['company_name', 'job_role', 'job_id', 'application_status']] = (
    data_df
    .apply(lambda row: job_information_extraction(row, info_extract_prompt1, model2, token), axis=1)
    .apply(pd.Series)  # convert array to 3 separate columns
)
info_data_df

Unnamed: 0,email,JOB,MEET,OTHER,mistral_JOB,mistral_MEET,mistral_OTHER,company_name,job_role,job_id,application_status
0,"Hi Saideep,\n\nThanks for your interest in iSp...",True,False,False,True,False,False,iSpot,Research Data Scientist,,applied
1,"Hello, Saideep,\n\nWe received your job applic...",True,False,False,True,False,False,CHS Career Site,Enterprise Data Scientist,121420.0,applied
2,"Hi Saideep,\n\nWe're super-pumped that you sha...",True,False,False,True,False,False,GoodRx,Sr. Data Analyst,,applied
3,"Hi Saideep,\n\nThank you for applying to the S...",True,False,False,True,False,False,Klaviyo,Software Engineer II - Data Lake,,applied
4,"Hello Saideep,\n\nThank you for your interest ...",True,False,False,True,False,False,Embark,,,applied
5,"Dear Saideep,\n\nThank you for your interest i...",True,False,False,True,False,False,AEP,Data Scientist (Associate - Senior),,rejected
6,"Dear Saideep,\n\nThank you for your interest i...",True,False,False,True,False,False,Chan Zuckerberg Imaging Institute,"Data Scientist, Cryo-ET & AI/ML",,rejected
7,"Hi Saideep,\nThank you for your interest in th...",True,False,False,True,False,False,Astera,Machine Learning Scientist,,rejected
8,"Hi Saideep,\n\nThanks for your interest in Red...",True,False,False,True,False,False,Reddit,"Senior Machine Learning Engineer, App Ads",,rejected
9,"Dear Saideep,\n\nThank you for giving us the o...",True,False,False,True,False,False,PulsePoint,,,rejected


In [12]:
info_data_df = data_df.copy()
info_data_df[['company_name', 'job_role', 'job_id', 'application_status']] = (
    data_df
    .apply(lambda row: job_information_extraction(row, info_extract_prompt1, model3, token), axis=1)
    .apply(pd.Series)  # convert array to 3 separate columns
)
info_data_df

Unnamed: 0,email,JOB,MEET,OTHER,mistral_JOB,mistral_MEET,mistral_OTHER,company_name,job_role,job_id,application_status
0,"Hi Saideep,\n\nThanks for your interest in iSp...",True,False,False,True,False,False,iSpot,Research Data Scientist,,applied
1,"Hello, Saideep,\n\nWe received your job applic...",True,False,False,True,False,False,CHS Career Site,Enterprise Data Scientist,121420.0,applied
2,"Hi Saideep,\n\nWe're super-pumped that you sha...",True,False,False,True,False,False,GoodRx,Sr. Data Analyst,,applied
3,"Hi Saideep,\n\nThank you for applying to the S...",True,False,False,True,False,False,Klaviyo,Software Engineer II - Data Lake,,applied
4,"Hello Saideep,\n\nThank you for your interest ...",True,False,False,True,False,False,Embark,,,applied
5,"Dear Saideep,\n\nThank you for your interest i...",True,False,False,True,False,False,AEP Talent Acquisition,Data Scientist (Associate - Senior),,rejected
6,"Dear Saideep,\n\nThank you for your interest i...",True,False,False,True,False,False,Chan Zuckerberg Imaging Institute,"Data Scientist, Cryo-ET & AI/ML",,rejected
7,"Hi Saideep,\nThank you for your interest in th...",True,False,False,True,False,False,Astera,Machine Learning Scientist Role,,rejected
8,"Hi Saideep,\n\nThanks for your interest in Red...",True,False,False,True,False,False,Reddit,"Senior Machine Learning Engineer, App Ads",,rejected
9,"Dear Saideep,\n\nThank you for giving us the o...",True,False,False,True,False,False,The WebMD Recruiting Team,,,rejected


### Prompt-2 test

In [13]:
info_extract_prompt2 = """You are an information extraction assistant.  
You will be given the full text of an email regarding a candidate’s job application status.

Your task is to read the email carefully and extract the following details exactly as they appear in the email (or return empty string "" if the detail is missing):

1. Company Name — The company or organization sending the email. This referes to the name of the company that is mentioned in the email and does not necessarily refer to the sender's name. For example, if the email is sent by Amazon talent acquisition team or Amazon career site or Amazon hiring manager, etc., the company name is Amazon.
2. Job Role — The job title or role mentioned in the email.
3. Job ID — The job requisition ID, reference number, or posting number mentioned.
4. Application Status — One of the following categories that best describes the current stage:
   - application incomplete
   - applied
   - assessment
   - interview
   - job offered
   - rejected
   - withdrawn
   - other (if none of the above applies)

### Output format:
<company_name>|<job_role>|<job_id>|<application_status>

Return the extracted information in the format above, with each detail separated by a pipe, without any additional text or metadata. If any detail is not present in the email, return an empty string for that detail (e.g., if the job ID is not mentioned, return <company_name>|<job_role>||<application_status>)."""

In [14]:
info_data_df = data_df.copy()
info_data_df[['company_name', 'job_role', 'job_id', 'application_status']] = (
    data_df
    .apply(lambda row: job_information_extraction(row, info_extract_prompt2, model1, token), axis=1)
    .apply(pd.Series)  # convert array to 3 separate columns
)
info_data_df

ValueError: Columns must be same length as key

In [15]:
info_data_df = data_df.copy()
info_data_df[['company_name', 'job_role', 'job_id', 'application_status']] = (
    data_df
    .apply(lambda row: job_information_extraction(row, info_extract_prompt2, model2, token), axis=1)
    .apply(pd.Series)  # convert array to 3 separate columns
)
info_data_df

Unnamed: 0,email,JOB,MEET,OTHER,mistral_JOB,mistral_MEET,mistral_OTHER,company_name,job_role,job_id,application_status
0,"Hi Saideep,\n\nThanks for your interest in iSp...",True,False,False,True,False,False,iSpot,Research Data Scientist,,applied
1,"Hello, Saideep,\n\nWe received your job applic...",True,False,False,True,False,False,CHS,Enterprise Data Scientist,121420.0,applied
2,"Hi Saideep,\n\nWe're super-pumped that you sha...",True,False,False,True,False,False,GoodRx,Sr. Data Analyst,,applied
3,"Hi Saideep,\n\nThank you for applying to the S...",True,False,False,True,False,False,Klaviyo,Software Engineer II - Data Lake,,applied
4,"Hello Saideep,\n\nThank you for your interest ...",True,False,False,True,False,False,Embark,,,applied
5,"Dear Saideep,\n\nThank you for your interest i...",True,False,False,True,False,False,AEP,Data Scientist (Associate - Senior),,rejected
6,"Dear Saideep,\n\nThank you for your interest i...",True,False,False,True,False,False,Chan Zuckerberg Imaging Institute,"Data Scientist, Cryo-ET & AI/ML",,rejected
7,"Hi Saideep,\nThank you for your interest in th...",True,False,False,True,False,False,Astera,Machine Learning Scientist,,rejected
8,"Hi Saideep,\n\nThanks for your interest in Red...",True,False,False,True,False,False,Reddit,"Senior Machine Learning Engineer, App Ads",,rejected
9,"Dear Saideep,\n\nThank you for giving us the o...",True,False,False,True,False,False,PulsePoint,,,rejected


In [16]:
info_data_df = data_df.copy()
info_data_df[['company_name', 'job_role', 'job_id', 'application_status']] = (
    data_df
    .apply(lambda row: job_information_extraction(row, info_extract_prompt2, model3, token), axis=1)
    .apply(pd.Series)  # convert array to 3 separate columns
)
info_data_df

Unnamed: 0,email,JOB,MEET,OTHER,mistral_JOB,mistral_MEET,mistral_OTHER,company_name,job_role,job_id,application_status
0,"Hi Saideep,\n\nThanks for your interest in iSp...",True,False,False,True,False,False,iSpot,Research Data Scientist,,applied
1,"Hello, Saideep,\n\nWe received your job applic...",True,False,False,True,False,False,CHS,Enterprise Data Scientist,121420.0,applied
2,"Hi Saideep,\n\nWe're super-pumped that you sha...",True,False,False,True,False,False,GoodRx,Sr. Data Analyst,,applied
3,"Hi Saideep,\n\nThank you for applying to the S...",True,False,False,True,False,False,Klaviyo,Software Engineer II - Data Lake,,applied
4,"Hello Saideep,\n\nThank you for your interest ...",True,False,False,True,False,False,Embark,,,applied
5,"Dear Saideep,\n\nThank you for your interest i...",True,False,False,True,False,False,AEP,Data Scientist (Associate - Senior),,rejected
6,"Dear Saideep,\n\nThank you for your interest i...",True,False,False,True,False,False,Chan Zuckerberg Imaging Institute,"Data Scientist, Cryo-ET & AI/ML",,rejected
7,"Hi Saideep,\nThank you for your interest in th...",True,False,False,True,False,False,Astera,Machine Learning Scientist Role,,rejected
8,"Hi Saideep,\n\nThanks for your interest in Red...",True,False,False,True,False,False,Reddit,"Senior Machine Learning Engineer, App Ads",,rejected
9,"Dear Saideep,\n\nThank you for giving us the o...",True,False,False,True,False,False,WebMD,,,rejected
