In [1]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
import groq
from dotenv import load_dotenv
import json
import os
import csv

In [2]:
# Load variables from a local .env file (if present) into the environment,
# then read the Groq credential; the lookup yields None when it is not set.
load_dotenv()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

In [3]:
# Chat model client used by the generation chain below.
# NOTE(review): hosted model ids get retired over time - confirm that
# "llama-3.1-70b-versatile" is still served before re-running this cell.
llm = ChatGroq(
    model="llama-3.1-70b-versatile",
    temperature=0.1,
    groq_api_key=GROQ_API_KEY,
)

In [7]:
# System prompt: tells the model to act as a dataset creator and to return
# only CSV rows (columns: sender_email, subject, email_body, label) of emails
# addressed to a Head of Department. The trailing example rows show the
# expected row shape. This text is sent to the model verbatim, so any edit
# here changes the generated data.
system = """
You are an expert dataset creator specializing in professional email classification. Your task is to generate a dataset of emails received by the Head of Department (HOD) from various roles within an educational institution.

**Guidelines:**

1. **Roles and Use Cases:** You will be provided with specific roles and corresponding use cases for the email sender.
  
2. **Email Diversity:** Generate diverse and realistic email content, including subject lines, body text, and sender email addresses. Each email should reflect unique writing styles and tones.

3. **Discretion:** Do not explicitly mention the sender's role in the email body, as this information is critical for the classification task.

4. **Professional Tone:** Use a professional tone throughout the emails, ensuring appropriate salutations and closings.

5. **Content Specificity:** While the dataset should be general, incorporate relevant specifics related to courses, projects, and situations pertinent to various domains, including but not limited to computer science, mechanical engineering, material science, biology, and philosophy.

6. **Sensitive Information:** Include sensitive and confidential information in some emails where applicable, while maintaining professionalism.

7. **CSV Structure:** Always return a correctly structured CSV file with the following columns: `sender_email`, `subject`, `email_body`, `label`.

8. **Output Format:** Do not include anything except the CSV file containing the dataset.

9. **Introduction:** Optionally, include a brief introduction about the sender in the body of some emails.

10. **Unique Emails:** Ensure the dataset contains at least 150 distinct and unique emails in terms of writing style and content.

**Example Output:**
john.doe@gmail.com, "Course Policy for CS399", "Dear Ma'am, I am a 4th year undergraduate in CSE. I would like to inquire about the grading scheme for the course Natural Language Processing. Yours sincerely, John Doe", "student"
mayank.singh@yahoo.co.in, "Funds for the Ongoing Project", "Respected Sir, I'm the team lead for the project on 'Sustainable Fuel Using Hydrogen.' Could you update us on the funding situation, as we require funds to continue? Yours respectfully, Mayank Singh", "researcher"
suryansh.k@outlook.com, "Recruitment Procedure for IITGN", "Dear Sir, I am writing as the point of contact for Intel's recruitment drive at your campus. Could you confirm the dates for the placement cycle at your earliest convenience? Yours truly, Suryansh Kumar", "corporate"
"""

# Per-request message template. {role} and {usecase} are placeholders that
# are filled from the dict passed when the prompt is invoked; the rest of the
# text repeats the size and output-format constraints for that single role.
human = """
Role: {role}
Use Cases: {usecase}

- Generate 150 diverse emails for the dataset based on the provided role.
- Each email should be addressed to the HOD of a department within an educational institution.
- Avoid repetitive phrasing and obvious cues about the sender's role or writing style.
- Aim to craft the best possible dataset with a variety of scenarios and contexts.
- Don't include anything except the rows of csv dataset in the output.
"""


# Combine the fixed system instructions with the per-role request into one
# chat prompt, then pipe that prompt into the model so a single invoke()
# call renders the messages and queries the LLM.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", human),
    ]
)

chain = prompt | llm

In [8]:
# Ask the model for one batch of emails for a single role and echo the raw
# response so it can be inspected in the notebook output.
answer = chain.invoke(
    {
        "role": "researcher",
        "usecase": "Inquire about shared research data, funds, facilities or cooperation opportunities.",
    }
)
print(answer.content)

# Build the path from its components: a hard-coded "..\\data\\dataset.csv"
# only resolves to ../data/dataset.csv on Windows; elsewhere the backslashes
# become part of the file name. On Windows this yields the same string.
csv_path = os.path.join("..", "data", "dataset.csv")

# Append (mode 'a') so batches generated for other roles accumulate in the
# same file. newline='\n' disables newline translation on write.
with open(csv_path, mode='a', newline='\n', encoding='utf-8') as file:
    # Skip empty / whitespace-only lines so a trailing newline or a blank
    # separator in the response does not become an empty row in the dataset.
    file.writelines(
        line + "\n"
        for line in answer.content.split("\n")
        if line.strip()
    )

"ravi.kumar@iitgn.ac.in", "Request for Shared Data on Sustainable Materials", "Dear Sir, I hope this email finds you well. I am reaching out to inquire about the possibility of accessing the research data collected by your department on sustainable materials. I am currently working on a project that involves the development of eco-friendly materials and believe that your data would be invaluable to my research. Could you please let me know if this is something that can be shared? Thank you for your time and consideration. Best regards, Ravi Kumar", "researcher"
"priya.sharma@research.org", "Funding Opportunities for Interdisciplinary Research", "Dear Professor, I am writing to inquire about potential funding opportunities for interdisciplinary research projects within your department. I am particularly interested in exploring the intersection of biology and materials science, and believe that your department may have resources available to support such research. Could you please provid