In [None]:
# Mount the user's Google Drive at /content/drive. The google.colab package is
# only importable inside a Colab runtime.
# NOTE(review): INPUT_FILE is a relative path, so mounting Drive does not by
# itself make it resolve inside the mount — confirm the working directory.
from google.colab import drive
drive.mount('/content/drive')

# Setup

In [None]:
import ast
import csv
import json
import os
from collections import defaultdict

from openai import OpenAI

In [None]:
# Provide the API key through the process environment; OpenAI() is constructed
# without an explicit key, so it has to come from OPENAI_API_KEY.
# `os` is a module and cannot be subscripted — the mapping is `os.environ`.
# setdefault keeps a key that is already exported and only falls back to the
# blank placeholder. Never commit a real key into the notebook.
os.environ.setdefault("OPENAI_API_KEY", " ")
client = OpenAI()

In [None]:
# Text file holding every email; main() splits it on the "**EMAIL_END**" marker.
INPUT_FILE = "PATRA_v1.txt"
# Directory that receives the per-email entity/relation CSVs (created on demand
# by process_email()).
OUTPUT_DIR = "mailCsv"
# Merged outputs written once at the end of main().
FINAL_ENTITIES = "entities.csv"
FINAL_RELATIONS = "relations.csv"

In [None]:
SYSTEM_PROMPT = """
You are an information extraction assistant.
Extract entities and relations from emails following the ontology:

Ontology classes (Neo4j node labels):
Person, Email, Paper, Conference, Journal, Dataset, Method, Task, Metric, MailThread, Meeting, PaperStatus, SubmissionID.

Object properties (Neo4j relationships):
- sentBy (Email, Person/Journal/Conference)
- receivedBy (Email, Person)
- hasAuthor (Paper, Person)
- identifies (SubmissionID, Paper)
- inVenue (SubmissionID, Conference/Journal)
- mentions (Email, Dataset/Method/Metric/Task)
- notifies (Email, Meeting/SubmissionID)
- partOf (Email, MailThread)
- movesTo (PaperStatus/SubmissionID, PaperStatus)
- usedFor (Dataset/Method, Task)
- evaluates (Metric, Task/Method/Dataset)
- uses(Method, Dataset)

Data properties (Neo4j node attributes):
- Person: personName, personEmail, affiliation
- Paper: paperTitle
- Conference: confTitle, confDate, confVenue
- Journal: journalTitle
- SubmissionID: identifier
- PaperStatus: statusType, statusDate
- Email: mailNum, mailDate, subject
- MailThread: threadID, subject
- Meeting: meetDate, meetTime, meetAgenda, meetLink
- Dataset: datasetName
- Method: methodName
- Task: taskName
- Metric: metricName

Rules:
- Every entity must have a stable ID (like p1, e1, s1 …), reusing the same ID for the same entity across emails (registry).
- Output strictly two CSV blocks: Entities block first, then Relations block.
- Entity CSV block headers: id,type,properties
- Relation CSV block headers: start_id,end_id,relation
- All properties must follow the ontology; no extra fields.
- Output JSON-like strings inside the properties column.

Few-shot examples:

Example 1:
Email text input:
---
Thread ID: EMNLP3456AB
Mail ID: EMNLP3456ABH
Date: 20-01-2025
From: Dr. Michael Bradley (michael@cs.stanford.edu)
To: Sunita Sen (scholar.sunita@iacs.res.in), Prof. Ramesh Bhatia (ramesh.bhatia@iacs.res.in), Dr. Ananya Chatterjee (ananya.chatterjee@ox.ac.uk)
Subject: Discussion on Recent NLP Frameworks and Dataset Expansion Progress

Dear Sunita, Prof. Bhatia, and Ananya,

I hope you are all doing well. I wanted to provide some updates and continue our discussions on the advancements we are making with respect
to our recent project submission to EMNLP 2024.

**NLP Framework Integration:**

Sunita, I received your report on the potential NLP frameworks that could be integrated into our study. The comparative analysis you provided between
the traditional transformer models and the newer generative models was insightful.
It seems like GPT-X models could add remarkable depth specifically for cultural nuance interpretation.

- **Proposal:** I suggest that we test a subset of our dataset using these new models to examine their efficacy.
This would involve coupling the NLP frameworks with demographic metadata, potentially uncovering new dimensions in cultural analysis.

**Dataset Expansion Update:**

1. **European and Asian Archives:**
   - Ananya, your efforts in reaching out to archives have been phenomenal. I understand that the British Library and the National Archives of India have shown interest.
   It's exciting to hear that preliminary talks regarding data access are in motion.

2. **Stanford Resources:**
   - I have identified several promising datasets within our Stanford Digital Repository that align with our research focus.
   These include 'Early Modern Embassies' and 'Mediterranean Merchant Records'.
    I am preparing a proposal to potentially collaborate with our Library Science Department for more streamlined data access.

**Team Collaboration:**
- **Recommendations:**
  - We may need to set up a few technical workshops to ensure that our analytical strategies align with any new datasets and frameworks.
  I propose February 5 as a date for the first session, pending everyone’s availability.

Please share your thoughts on the above and let me know if there are specific areas you think we should delve deeper into.

Thank you all for your dedication and the collaborative spirit you bring to this work. I look forward to our continued progress and the innovative insights that lie ahead.

Warm regards,

Michael Bradley
Department of Computer Science
Stanford University

Output CSV:
id,type,properties
t1,MailThread,{"threadID": "EMNLP3456AB", "subject": "Discussion on Recent NLP Frameworks and Dataset Expansion Progress"}
e1,Email,{"mailNum": "EMNLP3456ABH", "mailDate": "2025-01-20T00:00:00"}
pn1,Person,{"personName": "Michael Bradley", "personEmail": "michael@cs.stanford.edu", "affiliation": "Stanford University"}
pn2,Person,{"personName": "Sunita Sen", "personEmail": "scholar.sunita@iacs.res.in"}
pn3,Person,{"personName": "Ramesh Bhatia", "personEmail": "ramesh.bhatia@iacs.res.in"}
pn4,Person,{"personName": "Ananya Chatterjee", "personEmail": "ananya.chatterjee@ox.ac.uk"}
tk1,Task,{"taskName": "NLP Framework Comparative Analysis"}
me1,Method,{"methodName": "GPT-X"}
d1,Dataset,{"datasetName": "Early Modern Embassies"}
d2,Dataset,{"datasetName": "Mediterranean Merchant Records"}
mg1,Meeting,{"meetAgenda": "technical workshops to ensure that our analytical strategies align with any new datasets and frameworks", "meetDate": "2025-02-05"}
start_id,end_id,relation
e1,pn1,sentBy
e1,pn2,receivedBy
e1,pn3,receivedBy
e1,pn4,receivedBy
e1,t1,partOf
e1,tk1,mentions
e1,me1,mentions
e1,d1,mentions
e1,d2,mentions
me1,t1,usedFor
d1,tk1,usedFor
d2,tk1,usedFor
e1,mg1,notifies

Example 2:
Email text input:
---
Thread ID: Y1L43Z
Mail ID: YT6A33
Date: 23-06-2019
From: Sunita Sen (scholar.sunita@iacs.res.in)
To: Prof. Ramesh Bhatia (ramesh.bhatia@iacs.res.in), Dr. Michael Bradley (michael@cs.stanford.edu), Dr. Ananya Chatterjee (ananya.chatterjee@ox.ac.uk)
Subject: Confirmation of Meeting - Discussion on Revisions for ACM Journal Submission

Dear Prof. Bhatia, Michael, and Ananya,

Thank you all for quickly responding to the scheduling poll. Based on the feedback, I am pleased to confirm that our meeting on revising our paper
for the ACM Journal will be held on June 26th, from 3 PM to 4:30 PM IST.

Please join using the Zoom link below:

Zoom Meeting Link: https://zoom.us/j/0987654321

The agenda will include:
- Reviewing the reviewers' comments in detail.
- Discussing enhancements for a multimodal approach and refining feature extraction.
- Establishing a clear plan for addressing the feedback and preparing the revised submission.

Please be prepared with any data or insights you wish to bring to this meeting. It will be our chance to collaboratively ensure our submission meets the high standards required.

Thank you, and I look forward to our continued collaboration.

Warm regards,

Sunita Sen
Indian Association for the Cultivation of Science


Output CSV:
id,type,properties
t2,MailThread,{"threadID": "Y1L43Z", "subject": "Confirmation of Meeting - Discussion on Revisions for ACM Journal Submission"}
e2,Email,{"mailNum": "YT6A33", "mailDate": "2019-06-23T00:00:00"}
pn1,Person,{"personName": "Michael Bradley", "personEmail": "michael@cs.stanford.edu", "affiliation": "Stanford University"}
pn2,Person,{"personName": "Sunita Sen", "personEmail": "scholar.sunita@iacs.res.in", "affiliation": "Indian Association for the Cultivation of Science"}
pn3,Person,{"personName": "Ramesh Bhatia", "personEmail": "ramesh.bhatia@iacs.res.in"}
pn4,Person,{"personName": "Ananya Chatterjee", "personEmail": "ananya.chatterjee@ox.ac.uk"}
mg2,Meeting,{"meetDate": "2019-06-26", "meetTime": "15:00-16:30", "meetLink": "https://zoom.us/j/0987654321", "meetAgenda": "Reviewing the reviewers' comments in detail. Discussing enhancements for a multimodal approach and refining feature extraction. Establishing a clear plan for addressing the feedback and preparing the revised submission."}
start_id,end_id,relation
e2,pn2,sentBy
e2,pn1,receivedBy
e2,pn3,receivedBy
e2,pn4,receivedBy
e2,t2,partOf
e2,mg2,notifies


Example 3:
Email text input:
---
Thread ID: 9K07LI
Mail ID: YT6A35
Date: 10-08-2019
From: ACM Journal on Computing and Cultural Heritage (do-not-reply@acm-jocch.org)
To: Sunita Sen (scholar.sunita@iacs.res.in)
Subject: Acknowledgement of Submission - Paper ID: JOCCH21-TSA039

Dear Ms. Sunita Sen,

We are pleased to inform you that your paper:
Title: "Temporal Sentiment Analysis in Historical Archives"
Author: Sunita Sen, Ramesh Bhatia, Ananya Chatterjee, Michael Bradley

has been successfully submitted to the ACM Journal on Computing and Cultural Heritage. Your submission has been assigned the ID JOCCH21-TSA039.

The peer review process will commence shortly, and you will be notified once a decision has been made. If you have any questions, please do not hesitate to contact us at support@acm-jocch.org.

Thank you for your submission.

Best regards,

Editorial Office
ACM Journal on Computing and Cultural Heritage

Output CSV:
id,type,properties
t3,MailThread,{"threadID": "9K07LI", "subject": "Acknowledgement of Submission - Paper ID: JOCCH21-TSA039"}
e3,Email,{"mailNum": "YT6A35", "mailDate": "2019-08-10T00:00:00"}
pn1,Person,{"personName": "Michael Bradley", "personEmail": "michael@cs.stanford.edu", "affiliation": "Stanford University"}
pn2,Person,{"personName": "Sunita Sen", "personEmail": "scholar.sunita@iacs.res.in", "affiliation": "Indian Association for the Cultivation of Science"}
pn3,Person,{"personName": "Ramesh Bhatia", "personEmail": "ramesh.bhatia@iacs.res.in"}
pn4,Person,{"personName": "Ananya Chatterjee", "personEmail": "ananya.chatterjee@cs.stanford.edu"}
j1,Journal,{"journalTitle": "ACM Journal on Computing and Cultural Heritage"}
pa1,Paper,{"paperTitle": "Temporal Sentiment Analysis in Historical Archives"}
tk2,Task,{"taskName": "Temporal Sentiment Analysis"}
d3,Dataset,{"datasetName": "Historical Archives"}
s1,SubmissionID,{"identifier": "JOCCH21-TSA039"}
ps1,PaperStatus,{"statusType": "Submitted", "statusDate": "2019-08-10"}
start_id,end_id,relation
e3,j1,sentBy
e3,pn2,receivedBy
e3,t3,partOf
pa1,p1,hasAuthor
pa1,p2,hasAuthor
pa1,p3,hasAuthor
pa1,p4,hasAuthor
e3,tk2,mentions
e3,d3,mentions
d3,tk2,usedFor
s1,pa1,identifies
s1,ps1,movesTo
s1,j1,inVenue
e3,s1,notifies

"""

# Information Extraction and Parsing

In [None]:
def call_llm(email_text, model="gpt-4o-mini"):
    """Send one email to the chat model and return its raw text reply.

    Args:
        email_text: Text of a single email; sent as the user message, after
            SYSTEM_PROMPT as the system message.
        model: Name of the chat-completions model. Defaults to the previously
            hard-coded "gpt-4o-mini", so existing one-argument calls behave
            as before.

    Returns:
        The first choice's message content with surrounding whitespace
        removed. If the response carries no text content (``content`` is
        None), an empty string is returned so callers always get a ``str``.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": email_text},
        ],
        # Temperature 0 keeps the extraction as repeatable as the API allows.
        temperature=0,
    )
    content = response.choices[0].message.content
    # `content` may be None; calling .strip() on it would raise AttributeError.
    return (content or "").strip()


In [None]:
def parse_llm_output(output_text):
    """Split the model's reply into entity rows and relation rows.

    The reply is scanned line by line. A line starting with
    ``id,type,properties`` opens the entity section and a line starting with
    ``start_id,end_id,relation`` opens the relation section (case-insensitive,
    spaces ignored). This works whether or not the two sections are separated
    by a blank line — the few-shot examples in the prompt emit them back to
    back — and tolerates preamble lines such as "Output CSV:" or code fences.
    A blank line closes the current section, so free text after a blank line
    is ignored until the next header.

    Args:
        output_text: Raw text returned by the model.

    Returns:
        ``(entities, relations)``: two lists of 3-item lists of stripped
        strings, ``[id, type, properties]`` and
        ``[start_id, end_id, relation]``. Rows that do not split into exactly
        three fields are dropped.
    """
    entities, relations = [], []
    section = None  # None, "entities" or "relations"

    for raw_line in output_text.splitlines():
        line = raw_line.strip()
        if not line:
            section = None
            continue

        header = line.lower().replace(" ", "")
        if header.startswith("id,type,properties"):
            section = "entities"
        elif header.startswith("start_id,end_id,relation"):
            section = "relations"
        elif section == "entities":
            # Split on the first two commas only: the properties column is a
            # JSON-like string that contains commas itself.
            parts = [p.strip() for p in line.split(",", 2)]
            if len(parts) == 3:
                entities.append(parts)
        elif section == "relations":
            parts = [p.strip() for p in line.split(",")]
            if len(parts) == 3:
                relations.append(parts)
    return entities, relations

In [None]:
def write_csv(path, headers, rows):
    """Write ``headers`` followed by every item of ``rows`` to a CSV file.

    The file at ``path`` is opened in "w" mode as UTF-8, so existing content
    is replaced. Cell values are rendered by ``csv.writer``.
    """
    with open(path, "w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        for record in (headers, *rows):
            out.writerow(record)

# Registry formation to prevent duplication

In [None]:
# Module-level state shared by merge_entity(), process_email() and main().
# Nothing in those functions clears these containers, so re-run this cell
# before calling main() again to start from an empty registry.
entity_registry = {}             # stable_id -> {"type": ..., "properties": {...}}
entity_index = defaultdict(dict) # class_type -> unique_key -> stable_id
relations_all = []               # [start_id, end_id, relation] rows from every processed email

In [None]:
def get_unique_key(etype, props):
    """Return the value that identifies an entity within its class.

    Most classes are identified by a single property (e.g. ``personEmail``
    for Person). PaperStatus and Meeting are identified by several properties
    joined with "|".

    Args:
        etype: Ontology class name, e.g. "Person" or "Meeting".
        props: Property dict of the entity.

    Returns:
        The identifying value, or None when the class is unknown or the
        identifying properties are missing. For the composite classes None
        is returned when *all* parts are empty: a bare separator such as "||"
        would be truthy and would merge unrelated, property-less entities
        into a single node.
    """
    single_field = {
        "Person": "personEmail",
        "Paper": "paperTitle",
        "Conference": "confTitle",
        "Journal": "journalTitle",
        "Dataset": "datasetName",
        "Method": "methodName",
        "Task": "taskName",
        "Metric": "metricName",
        "SubmissionID": "identifier",
        "Email": "mailNum",
        "MailThread": "threadID",
    }
    composite_fields = {
        "PaperStatus": ("statusType", "statusDate"),
        "Meeting": ("meetDate", "meetTime", "meetAgenda"),
    }

    if etype in single_field:
        return props.get(single_field[etype])

    if etype in composite_fields:
        parts = [props.get(name, "") for name in composite_fields[etype]]
        if not any(parts):
            return None
        return "|".join(str(part) for part in parts)

    return None

In [None]:
def merge_entity(etype, props):
    """Register an entity in the global registry and return its stable ID.

    If ``get_unique_key`` yields a key that is already indexed for ``etype``,
    the existing entry is reused and only properties it does not have yet are
    copied over from ``props``. Otherwise a new entry is created; its ID is
    the lower-cased first letter of ``etype`` followed by the registry size
    plus one. Entities without a key are always stored as new entries and are
    not indexed.
    """
    dedup_key = get_unique_key(etype, props)

    # Known entity: fill in missing properties and reuse its ID.
    if dedup_key and dedup_key in entity_index[etype]:
        existing_id = entity_index[etype][dedup_key]
        stored_props = entity_registry[existing_id]["properties"]
        for name, value in props.items():
            stored_props.setdefault(name, value)
        return existing_id

    # New entity: allocate the next ID and store a copy of its properties.
    new_id = f"{etype[0].lower()}{len(entity_registry)+1}"
    entity_registry[new_id] = {"type": etype, "properties": props.copy()}
    if dedup_key:
        entity_index[etype][dedup_key] = new_id
    return new_id

In [None]:
def _parse_props(props_str):
    """Decode a properties cell into a dict; return None if it cannot be decoded."""
    text = props_str.strip()
    # Try strict JSON first, then a Python literal (single-quoted dicts).
    for decode in (json.loads, ast.literal_eval):
        try:
            value = decode(text)
        except (ValueError, SyntaxError, TypeError):
            continue
        if isinstance(value, dict):
            return value
    return None


def process_email(email_text, email_num):
    """Extract entities/relations from one email and record them.

    Steps: call the model, parse its reply, merge every entity into the
    global registry (mapping the model's per-email IDs to stable IDs),
    rewrite the relations with those stable IDs, append them to
    ``relations_all`` and write two per-email CSV files into OUTPUT_DIR.

    No ``parse_props`` function is defined in the visible notebook cells, so
    the properties column is decoded by the private ``_parse_props`` helper.
    A cell that cannot be decoded is reported and treated as ``{}``.

    Relations whose start or end ID was not declared in this email's entity
    block are reported and skipped. Passing such an ID through unchanged
    would let a per-email ID like "p1" alias an unrelated stable ID that
    ``merge_entity`` happened to allocate.

    Args:
        email_text: Text of a single email.
        email_num: Sequence number used in the progress message and in the
            per-email file names.
    """
    print(f"Processing email {email_num}...")
    llm_output = call_llm(email_text)
    entities_raw, relations_raw = parse_llm_output(llm_output)

    temp_to_global = {}
    entities_rows = []

    # Decode properties and merge each entity into the global registry.
    for eid, etype, props_str in entities_raw:
        props = _parse_props(props_str)
        if props is None:
            print(f"  Warning: unreadable properties for {eid}: {props_str}")
            props = {}
        stable_id = merge_entity(etype, props)
        temp_to_global[eid] = stable_id
        entities_rows.append([stable_id, etype, props])

    # Rewrite relations with stable IDs; drop those with undeclared endpoints.
    relations_rows = []
    for start_id, end_id, rel in relations_raw:
        if start_id not in temp_to_global or end_id not in temp_to_global:
            print(f"  Warning: skipping relation with undeclared ID: {start_id},{end_id},{rel}")
            continue
        start_global = temp_to_global[start_id]
        end_global = temp_to_global[end_id]
        relations_rows.append([start_global, end_global, rel])
        relations_all.append([start_global, end_global, rel])

    # Write per-email CSVs
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    write_csv(os.path.join(OUTPUT_DIR, f"entities_email{email_num}.csv"),
              ["id","type","properties"], entities_rows)
    write_csv(os.path.join(OUTPUT_DIR, f"relations_email{email_num}.csv"),
              ["start_id","end_id","relation"], relations_rows)

In [None]:
def main():
    """Run the extraction over every email in INPUT_FILE and write the merged CSVs.

    INPUT_FILE is read as UTF-8 and split on the "**EMAIL_END**" marker;
    empty chunks are ignored. Each email is passed to ``process_email`` with
    a 1-based sequence number. Afterwards the contents of ``entity_registry``
    and ``relations_all`` are written to FINAL_ENTITIES and FINAL_RELATIONS.
    """
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        emails = [e.strip() for e in f.read().split("**EMAIL_END**") if e.strip()]

    for i, email_text in enumerate(emails, start=1):
        process_email(email_text, i)

    # Final merged CSVs
    entity_rows = [(eid, data["type"], data["properties"]) for eid, data in entity_registry.items()]
    write_csv(FINAL_ENTITIES, ["id","type","properties"], entity_rows)
    write_csv(FINAL_RELATIONS, ["start_id","end_id","relation"], relations_all)

    print(f"✅ Done! Final entities: {FINAL_ENTITIES}, relations: {FINAL_RELATIONS}")

In [None]:
# Entry point: runs the full pipeline via main() when this code executes as the
# top-level module.
if __name__ == "__main__":
    main()