In [35]:
%pip install transformers open_clip_torch sentence_transformers pysentimiento "accelerate>=0.26.0" --quiet
%pip install git+https://github.com/mlfoundations/open_clip.git --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [36]:
emails_path = '../../../../assets/emails.txt'
# Decode explicitly as UTF-8: the emails contain accented Portuguese text and the
# platform's default encoding is not guaranteed to be UTF-8.
with open(emails_path, 'r', encoding='utf-8') as emails_file:
    # Keep only the non-blank lines, trimmed of surrounding whitespace. The list is
    # stored under the name `file` because the following cell reads it from there.
    file = [line.strip() for line in emails_file if line.strip()]

In [37]:
# The first four non-blank lines of the file are treated as a header; everything
# after them is email content.
header_lines, emails = file[:4], file[4:]

SEPARATOR = "-------------------------------------------------------------------------------"

# Glue the remaining lines back into a single string so it can be cut on the separator.
emails_str = "\n".join(emails)

# One whitespace-trimmed chunk per email; chunks that are empty after trimming are dropped.
blocos = list(filter(None, map(str.strip, emails_str.split(SEPARATOR))))


In [38]:
def email_to_json(bloco, id):
    """Convert one raw email block into a dictionary.

    The block is split on newlines and read positionally: line 0 is the
    ``De: `` (from) line, line 1 the ``Para: `` (to) line, line 2 the ``Data: ``
    (date) line and line 3 the ``Assunto: `` (subject) line. Line 4 is skipped
    and everything from line 5 onwards becomes the body.

    Args:
        bloco: Text of a single email, lines separated by ``"\\n"``.
        id: Identifier stored under the ``"id"`` key of the result.

    Returns:
        A dict with the keys ``id``, ``from``, ``to``, ``date``, ``subject``
        and ``body``.

    Raises:
        IndexError: If the block has fewer than four lines.
    """
    def strip_label(line, label):
        # Remove the label only when it leads the line. A plain str.replace would
        # also delete the same text appearing later in the value (for example a
        # subject such as "Re: Assunto: ...").
        if line.startswith(label):
            line = line[len(label):]
        return line.strip()

    linhas = bloco.split("\n")

    dados = {
        "id": id,
        "from": strip_label(linhas[0], "De: "),
        "to": strip_label(linhas[1], "Para: "),
        "date": strip_label(linhas[2], "Data: "),
        "subject": strip_label(linhas[3], "Assunto: "),
        "body": "\n".join(linhas[5:]).strip()
    }

    return dados


# Parse every block, numbering the emails from 1 in the order they appear.
emails_json = [
    email_to_json(raw_block, position)
    for position, raw_block in enumerate(blocos, start=1)
]


In [46]:
from transformers import pipeline


_SENTIMENT_MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"
_sentiment_classifier = None


def _get_sentiment_classifier():
    """Return the shared sentiment pipeline, building it on first use only."""
    global _sentiment_classifier
    if _sentiment_classifier is None:
        _sentiment_classifier = pipeline(
            "sentiment-analysis",
            model=_SENTIMENT_MODEL_NAME,
            tokenizer=_SENTIMENT_MODEL_NAME,
            device=-1
        )
    return _sentiment_classifier


def sentiment_pipeline(text: str):
    """Classify the sentiment of ``text`` as NEG, NEU or POS.

    The model's label starts with a star count (e.g. ``"1 star"``); that leading
    digit is mapped to a coarse label: 1-2 -> "NEG", 3 -> "NEU", 4-5 -> "POS".

    The underlying pipeline is created once and reused, so the model is not
    reloaded for every email.

    Args:
        text: Text to classify.

    Returns:
        A dict with ``label`` (coarse label), ``raw_label`` (the model's own
        label) and ``score`` (the model's score for that label).
    """
    classifier = _get_sentiment_classifier()

    # truncation=True cuts inputs that exceed the model's maximum sequence
    # length instead of letting a long email fail inside the model.
    result = classifier(text, truncation=True)[0]

    stars = int(result["label"][0])

    if stars <= 2:
        label = "NEG"
    elif stars == 3:
        label = "NEU"
    else:
        label = "POS"

    return {
        "label": label,
        "raw_label": result["label"],
        "score": result["score"]
    }


_zero_shot_classifier = None


def _get_zero_shot_classifier():
    """Return the shared zero-shot pipeline, building it on first use only."""
    global _zero_shot_classifier
    if _zero_shot_classifier is None:
        _zero_shot_classifier = pipeline(
            "zero-shot-classification",
            model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
        )
    return _zero_shot_classifier


def zero_shot_pipeline(text: str, labels):
    """Score ``text`` against each candidate label with a zero-shot classifier.

    The underlying pipeline is created once and reused, so the model is not
    reloaded on every call.

    Args:
        text: Text to classify.
        labels: Candidate labels passed to the classifier; ``multi_label=True``
            is used, so several labels may apply to the same text.

    Returns:
        A dict with ``labels`` and ``scores``, two parallel lists in the order
        the pipeline returns them.
    """
    classifier = _get_zero_shot_classifier()

    result = classifier(text, candidate_labels=labels, multi_label=True)
    return {"labels": result["labels"], "scores": result["scores"]}


def initial_impression_pipeline(emails_json):
    """Attach sentiment, topic scores and a heuristic suspicion score to each email.

    Subject and body are joined with a newline and passed to
    ``sentiment_pipeline`` and ``zero_shot_pipeline``. The suspicion score is a
    weighted sum of four signals:

    * 0.45 when "michael" occurs in the lower-cased sender,
    * 0.15 when "toby" occurs in the lower-cased text,
    * 0.25 times the zero-shot score of the "conspiracy" label,
    * 0.15 when the sentiment label is "NEG".

    Args:
        emails_json: Iterable of email dicts. ``"id"`` is required; ``"subject"``,
            ``"body"`` and ``"from"`` default to an empty string when absent.

    Returns:
        A list with one dict per email holding ``id``, ``from``, ``subject``,
        ``body``, ``sentiment``, ``topics`` and ``suspicion_score``.
    """
    topic_labels = (
        "conspiracy",
        "suspicious",
        "complaint",
        "procedural",
        "work-related",
        "personal",
    )

    def score_one(message):
        subject = message.get("subject", "")
        body = message.get("body", "")
        sender = message.get("from", "").lower()
        text = subject + "\n" + body

        sentiment = sentiment_pipeline(text)
        # A fresh list is handed over on every call.
        topics = zero_shot_pipeline(text, list(topic_labels))

        conspiracy_position = topics["labels"].index("conspiracy")
        conspiracy_score = topics["scores"][conspiracy_position]

        # Binary indicators, kept as 0/1 integers.
        sentiment_neg = int(sentiment["label"] == "NEG")
        sender_flag = int("michael" in sender)
        mentions_toby = int("toby" in text.lower())

        suspicion_score = (
            0.45 * sender_flag +
            0.15 * mentions_toby +
            0.25 * conspiracy_score +
            0.15 * sentiment_neg
        )

        return {
            "id": message["id"],
            "from": message.get("from", ""),
            "subject": subject,
            "body": body,
            "sentiment": sentiment,
            "topics": topics,
            "suspicion_score": suspicion_score,
        }

    return [score_one(message) for message in emails_json]


# Run the scoring pipeline over every parsed email, then dump each result
# followed by a divider line.
scores_json = initial_impression_pipeline(emails_json)

for result in scores_json:
    print(result, "====================================================", sep="\n")


{'id': 1, 'from': 'Toby Flenderson <toby.flenderson@dundermifflin.com>', 'subject': 'Atualização do Formulário de Seguro Dental', 'body': 'Olá a todos.\nA Blue Cross mudou o provedor de seguro dental. Quem tiver dependentes precisa preencher o formulário 12-B na minha mesa até sexta-feira. Se você não preencher, perderá a cobertura de ortodontia.\nPor favor, não desenhem no formulário.', 'sentiment': {'label': 'NEG', 'raw_label': '1 star', 'score': 0.46641939878463745}, 'topics': {'labels': ['work-related', 'procedural', 'complaint', 'personal', 'suspicious', 'conspiracy'], 'scores': [0.021610170602798462, 0.004070137161761522, 0.001913633313961327, 0.0004450864507816732, 0.0003704614646267146, 0.0003320922842249274]}, 'suspicion_score': 0.15008302307105623}
{'id': 2, 'from': 'Phyllis Lapin-Vance <phyllis.vance@dundermifflin.com>', 'subject': 'Guardanapos para a festa de aniversário do Creed', 'body': 'Angela, comprei os guardanapos verdes como você pediu. O recibo foi de $12,50. Vou d

In [48]:
# Same cut-off and comparison (>=) that group_suspicious_with_michael_context
# applies by default, so both cells select the same set of emails.
SUSPICION_THRESHOLD = 0.8

# Index the parsed emails once; each lookup is then O(1) instead of a scan of
# emails_json per result.
_emails_by_id = {email["id"]: email for email in emails_json}

high_suspicion = []

for result in scores_json:
    if result["suspicion_score"] >= SUSPICION_THRESHOLD:
        # Fetch the original email by id to take subject/body from the source data.
        original = _emails_by_id[result["id"]]

        high_suspicion.append({
            **result,
            "subject": original.get("subject", ""),
            "body": original.get("body", "")
        })

for email in high_suspicion:
    print("ID:", email["id"])
    print("Suspicion Score:", email["suspicion_score"])
    print("Subject:", email["subject"])
    print("Body:", email["body"])
    print("====================================================")


ID: 16
Suspicion Score: 0.9998988956212997
Subject: CÓDIGO VERMELHO - O INIMIGO ESTÁ ENTRE NÓS
Body: Dwight, apague este e-mail imediatamente após ler.
Tive um sonho premonitório. O Toby não é apenas chato. Ele é um agente infiltrado da Vance Refrigeration ou talvez da Staples. Eu vi ele anotando coisas no caderno enquanto olhava para mim.
Precisamos iniciar a OPERAÇÃO FÊNIX DOURADA.
Encontre um local seguro no depósito. Precisamos de suprimentos.
ID: 28
Suspicion Score: 0.9995934665203094
Subject: Re: Barulhos no depósito
Body: Cale a boca, Toby. Você é a razão pela qual as pessoas desistem dos seus sonhos.
As armadilhas são para ratos. Ratos gigantes. Ratos espiões.
Não se meta na Operação Fênix.
ID: 44
Suspicion Score: 0.8444017827510834
Subject: Re: Teste de Radônio
Body: Você é o assassino silencioso, Toby. Você.
Vou jogar fora. O ar aqui é puro, temos ar condicionado.
ID: 49
Suspicion Score: 0.8446398884057998
Subject: Reunião de Emergência na Sala de Conferência
Body: Tópico: O 

In [56]:
from datetime import datetime, timedelta

def parse_date(date_str):
    """Parse a ``YYYY-MM-DD HH:MM`` string into a ``datetime``.

    Raises:
        ValueError: If ``date_str`` does not match that format.
    """
    timestamp_format = "%Y-%m-%d %H:%M"
    parsed = datetime.strptime(date_str, timestamp_format)
    return parsed


def group_suspicious_with_michael_context(emails_json, scores_json, threshold=0.8, window_hours=32):
    """Pair each suspicious email with Michael's emails sent around the same time.

    An email is suspicious when its ``suspicion_score`` is at least
    ``threshold``. Its context is every email whose sender contains
    "michael.scott" and whose date lies within ``window_hours`` hours before or
    after the suspicious email (bounds inclusive). A suspicious email sent by
    Michael therefore also appears in its own context.

    Args:
        emails_json: List of email dicts with ``id``, ``from`` and ``date`` keys;
            ``date`` must be parseable by ``parse_date``.
        scores_json: List of score dicts with ``id`` and ``suspicion_score`` keys.
        threshold: Minimum suspicion score for an email to be treated as suspicious.
        window_hours: Half-width of the context window, in hours.

    Returns:
        A list of ``{"suspect_email": ..., "context_emails": [...]}`` dicts, one
        per suspicious email, in ``scores_json`` order. Context emails keep the
        order of ``emails_json``.
    """
    # 1. Map emails by id for fast access.
    email_by_id = {email["id"]: email for email in emails_json}

    # 2. Keep only the suspicious results.
    suspicious = [
        result for result in scores_json
        if result["suspicion_score"] >= threshold
    ]
    if not suspicious:
        return []

    # 3. Parse the dates of Michael's emails once, instead of once per suspicious email.
    michael_emails = [
        (parse_date(email["date"]), email)
        for email in emails_json
        if "michael.scott" in email["from"].lower()
    ]

    window = timedelta(hours=window_hours)
    groups = []

    for sus in suspicious:
        sus_email = email_by_id[sus["id"]]
        sus_date = parse_date(sus_email["date"])

        lower = sus_date - window
        upper = sus_date + window

        # 4. Collect Michael's emails that fall inside the window.
        context = [
            email for email_date, email in michael_emails
            if lower <= email_date <= upper
        ]

        groups.append({
            "suspect_email": sus_email,
            "context_emails": context
        })

    return groups

groups = group_suspicious_with_michael_context(emails_json, scores_json)

for g in groups:
    print("=== EMAIL SUSPEITO ===")
    print(g["suspect_email"]["subject"])
    print(g["suspect_email"]["body"])
    print()

    # The grouping function looks 32 hours to each side of the suspicious email,
    # so the heading states 32 hours (it previously claimed 36).
    print("--- CONTEXTO (MICHAEL ±32 hrs) ---")
    for ctx in g["context_emails"]:
        print(ctx["date"], "-", ctx["subject"])
    print("\n=================================\n")



=== EMAIL SUSPEITO ===
CÓDIGO VERMELHO - O INIMIGO ESTÁ ENTRE NÓS
Dwight, apague este e-mail imediatamente após ler.
Tive um sonho premonitório. O Toby não é apenas chato. Ele é um agente infiltrado da Vance Refrigeration ou talvez da Staples. Eu vi ele anotando coisas no caderno enquanto olhava para mim.
Precisamos iniciar a OPERAÇÃO FÊNIX DOURADA.
Encontre um local seguro no depósito. Precisamos de suprimentos.

--- CONTEXTO (MICHAEL ±36 hrs) ---
2008-04-02 10:00 - CÓDIGO VERMELHO - O INIMIGO ESTÁ ENTRE NÓS
2008-04-02 10:05 - Re: Re: CÓDIGO VERMELHO


=== EMAIL SUSPEITO ===
Re: Barulhos no depósito
Cale a boca, Toby. Você é a razão pela qual as pessoas desistem dos seus sonhos.
As armadilhas são para ratos. Ratos gigantes. Ratos espiões.
Não se meta na Operação Fênix.

--- CONTEXTO (MICHAEL ±36 hrs) ---
2008-04-11 08:00 - Ideia Brilhante / Pedido de Verba
2008-04-11 10:15 - Injustiça
2008-04-12 11:05 - Re: Barulhos no depósito


=== EMAIL SUSPEITO ===
Re: Teste de Radônio
Você é o as

In [None]:
import os
from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

# Load variables from a local .env file into the process environment, then read
# the NVIDIA key from there so it never has to be written in the notebook.
load_dotenv()
API_KEY = os.environ.get("NVIDIA_API_KEY")

# The OpenAI chat client is pointed at NVIDIA's endpoint through `openai_api_base`.
llm = ChatOpenAI(
    openai_api_base="https://integrate.api.nvidia.com/v1",
    openai_api_key=API_KEY,
    model="nvidia/llama-3.3-nemotron-super-49b-v1.5",
)


# System + human prompt used to analyse one cluster of emails.
# The template is formatted f-string style, so the literal braces of the JSON
# skeleton below are doubled ({{ and }}); single braces would be parsed as
# template variables. `{cluster_text}` is the only real placeholder.
AGENT_PROMPT = ChatPromptTemplate.from_messages([
    (
        "system",
        """
        You are an AI Investigation Agent.

        TASK CONTEXT:
        Toby suspects that Michael Scott is conspiring against him.
        Your job is to analyze emails and determine whether this conspiracy is real.

        For each cluster of emails (one suspicious email + contextual emails), you must:

        1. Produce a **narrative explanation** of what seems to be happening.
        2. Extract **evidence**, including quotes, timestamps, tone, and relevance.
        3. Generate **profiles of individuals involved**, focusing on Michael Scott and Toby Flenderson.
        4. Provide a **cluster conclusion** answering:
            “Does this cluster indicate potential conspiracy against Toby?”

        IMPORTANT RULES:
        - Stick ONLY to the content of the emails provided.
        - Do NOT hallucinate nonexistent content.
        - Make the analysis concise but thorough.
        - Assume the reader is a human investigator.

        Return the output in this structure:

        {{
        "narrative": "...",
        "evidence": [ {{ ... }}, ... ],
        "profiles": {{ "michael": "...", "toby": "...", "others": [...] }},
        "cluster_conclusion": "..."
        }}
        """
    ),
    ("human", "{cluster_text}")
])


def analyze_cluster_with_agent(cluster):
    """Ask the LLM to analyse one cluster of emails.

    The suspicious email and its context emails are rendered as plain text and
    sent with ``AGENT_PROMPT``. The prompt is expanded with ``format_messages``
    so the system instructions are sent as a system message and the cluster
    text as a human message, rather than everything being flattened into a
    single human message.

    Args:
        cluster: A dict of the form::

            {
                "suspect_email": {...},
                "context_emails": [ ... ]
            }

            Every email dict must provide ``id``, ``from``, ``date``,
            ``subject`` and ``body``.

    Returns:
        The text content of the model's response.
    """
    def render_email(email):
        # Field lines shared by the suspicious email and every context email.
        return [
            f"ID: {email['id']}",
            f"From: {email['from']}",
            f"Date: {email['date']}",
            f"Subject: {email['subject']}",
            f"Body:\n{email['body']}",
        ]

    lines = ["=== SUSPICIOUS EMAIL ==="]
    lines.extend(render_email(cluster["suspect_email"]))
    lines.append("\n=== CONTEXT EMAILS ===")

    for ctx in cluster["context_emails"]:
        lines.append("--------------------------")
        lines.extend(render_email(ctx))

    cluster_text = "\n".join(lines)

    # One message per role defined in the template (system, then human).
    messages = AGENT_PROMPT.format_messages(cluster_text=cluster_text)
    response = llm(messages)

    return response.content

# System + human prompt used to merge the per-cluster analyses into one final
# report. `{cluster_analyses}` is the only placeholder; it receives the joined
# text of all cluster analyses.
REPORT_PROMPT = ChatPromptTemplate.from_messages([
    (
        "system",
        """
        You are an AI summarization and synthesis expert.

        Your task is to read multiple cluster analyses and produce a structured FINAL REPORT containing:

        1. **Narrative of the full investigation**  
        - Timeline of events  
        - How the conspiracy unfolds  
        - Behavior patterns of Michael Scott  
        - How Toby is affected  

        2. **Summary of Key Evidence**  
        List each cluster's evidence and relevance toward conspiracy.

        3. **Profiles of Individuals Involved**
        - Michael Scott  
        - Toby Flenderson  
        - Any other recurring individuals  

        4. **Final Conclusion**  
        Answer clearly:
        “Is Michael Scott conspiring against Toby?”  
        Provide justification based only on evidence.

        The final output should be well-written, factual, and concise.
        """
    ),
    ("human", "{cluster_analyses}")
])


def generate_final_report(cluster_analysis_texts):
    """Combine the per-cluster analyses into a single final report.

    The analyses are joined with a visible "CLUSTER BREAK" divider and sent
    with ``REPORT_PROMPT``. The prompt is expanded with ``format_messages`` so
    the system instructions are sent as a system message and the analyses as a
    human message, rather than everything being flattened into a single human
    message.

    Args:
        cluster_analysis_texts: Iterable of strings, one analysis per cluster.

    Returns:
        The text content of the model's response.
    """
    joined = "\n\n====== CLUSTER BREAK ======\n\n".join(cluster_analysis_texts)

    # One message per role defined in the template (system, then human).
    messages = REPORT_PROMPT.format_messages(cluster_analyses=joined)
    response = llm(messages)

    return response.content



# `detect_clusters` is not defined anywhere in this notebook. The clusters that
# analyze_cluster_with_agent reads are {"suspect_email", "context_emails"} dicts
# whose emails carry a "date", which is exactly what
# group_suspicious_with_michael_context returns, so build them with it.
clusters = group_suspicious_with_michael_context(emails_json, scores_json)

# One LLM analysis per cluster, in cluster order.
cluster_reports = [analyze_cluster_with_agent(cluster) for cluster in clusters]

final_report = generate_final_report(cluster_reports)

print(final_report)