# In [None]:
import pandas as pd
from textdistance import jaro_winkler
from sentence_transformers import SentenceTransformer, util

# Load prompts from prompts.csv (a relative path, so it is resolved against the
# current working directory). The checks below read the 'Prompt', 'ConceptTag',
# 'Branch' and 'Library' columns of this frame.
df = pd.read_csv('prompts.csv')

# Quick lexical check (fast, identifies exact/similar wording)
def lexical_duplication_check(df, threshold=0.88):
    """Flag pairs of prompts whose wording is nearly identical.

    Every unordered pair of values in ``df['Prompt']`` is scored with
    ``jaro_winkler.normalized_similarity``; a pair is kept only when its
    score is strictly greater than ``threshold``.

    Args:
        df: DataFrame with a ``'Prompt'`` column.
        threshold: Score a pair has to exceed to be reported.

    Returns:
        A list of ``(i, j, similarity)`` tuples with ``i < j``. ``i`` and
        ``j`` are 0-based positions in the ``'Prompt'`` column, not index
        labels of ``df``.

    NOTE(review): values are handed to the similarity function as-is;
    presumably every cell is a string -- confirm how missing cells should
    be treated.
    """
    prompts = df['Prompt'].tolist()
    total = len(prompts)

    # Lazily score each (first, second) pair with first < second, in the same
    # row-major order a nested index loop would visit them.
    scored_pairs = (
        (first, second, jaro_winkler.normalized_similarity(prompts[first], prompts[second]))
        for first in range(total)
        for second in range(first + 1, total)
    )
    return [entry for entry in scored_pairs if entry[2] > threshold]

# Semantic drift check (more nuanced)
def semantic_drift_check(df, concept_col='ConceptTag', branch_col='Branch', lib_col='Library', threshold=0.78):
    """Flag semantically similar prompts that live in different libraries.

    Two rows are compared when they have equal values in ``concept_col`` and
    ``branch_col`` but different values in ``lib_col``. Their ``'Prompt'``
    texts are embedded with the ``all-MiniLM-L6-v2`` sentence-transformer
    model, and the pair is flagged when the cosine similarity of the two
    embeddings is strictly greater than ``threshold``.

    Args:
        df: DataFrame with a ``'Prompt'`` column plus the three columns named
            by ``concept_col``, ``branch_col`` and ``lib_col``.
        concept_col: Column whose value must match within a pair.
        branch_col: Column whose value must match within a pair.
        lib_col: Column whose value must differ within a pair.
        threshold: Cosine similarity a pair has to exceed to be reported.

    Returns:
        A list of ``(label_a, label_b, similarity)`` tuples, where the labels
        are index labels of ``df`` and row ``label_a`` appears before row
        ``label_b`` in ``df``. Each pair is reported once. An empty list is
        returned, without loading the model, when no pair of rows qualifies
        for comparison.
    """
    # Work on a 0..n-1 positional copy: the embedding tensor is positional, so
    # indexing it with the caller's index labels is only correct when the
    # frame happens to carry a default RangeIndex.
    labels = df.index.tolist()
    frame = df.reset_index(drop=True)

    # Collect candidate pairs first so the (expensive) model load and encoding
    # can be skipped entirely when there is nothing to compare.
    candidate_pairs = []
    for pos, row in frame.iterrows():
        is_counterpart = (
            (frame[concept_col] == row[concept_col])
            & (frame[branch_col] == row[branch_col])
            & (frame[lib_col] != row[lib_col])
        )
        # The match condition is symmetric, so keeping only later rows
        # reports each pair once instead of as both (a, b) and (b, a).
        candidate_pairs.extend(
            (pos, other)
            for other in frame.index[is_counterpart.to_numpy()].tolist()
            if other > pos
        )

    if not candidate_pairs:
        return []

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(frame['Prompt'].tolist(), convert_to_tensor=True)

    drift_flags = []
    for pos, other in candidate_pairs:
        similarity = util.cos_sim(embeddings[pos], embeddings[other]).item()
        if similarity > threshold:
            # Translate positions back to the caller's index labels.
            drift_flags.append((labels[pos], labels[other], similarity))

    return drift_flags

# Run checks (both are called with their default thresholds)
lexical_duplicates = lexical_duplication_check(df)
semantic_drifts = semantic_drift_check(df)

# Review flagged pairs manually
# Each lexical entry unpacks as (i, j, similarity) for one pair of prompts.
print("Lexical duplicates flagged:")
for i, j, sim in lexical_duplicates:
    print(f"Rows {i} and {j}: similarity={sim:.2f}")

# Semantic entries unpack the same way; here i and j are used as index labels
# of df to look up each row's 'Library' value for the report.
print("\nSemantic drift flagged:")
for i, j, sim in semantic_drifts:
    print(f"Rows {i} ({df.at[i, 'Library']}) and {j} ({df.at[j, 'Library']}): similarity={sim:.2f}")