In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter

In [3]:
df = pd.read_csv('datasets/paragraphes_clauses_set.csv')

In [4]:


# Assume df is your preprocessed clause-level dataframe
# Columns: ["doc_id", "sample_id", "clause_text", "categories_set", ...]

# -------------------------------
# Step 1. Compute document-level labels
# -------------------------------
# For stratification we need some label per doc.
# We'll use whether the doc has at least one "no-risk" clause vs not,
# or fallback to balancing by doc_id distribution.

doc_labels = (
    df.groupby("doc_id")["categories_set"]
      .apply(lambda sets: any("no-risk" in s for s in sets))
      .astype(int)  # binary label: has_no_risk_clauses
      .reset_index()
)

# -------------------------------
# Step 2. Split by document_id
# -------------------------------
train_docs, temp_docs = train_test_split(
    doc_labels,
    test_size=0.2,   # 80/20 first split
    stratify=doc_labels["categories_set"],  # stratify by no-risk presence
    random_state=42
)

val_docs, test_docs = train_test_split(
    temp_docs,
    test_size=0.5,   # 10% / 10% final
    stratify=temp_docs["categories_set"],
    random_state=42
)

# -------------------------------
# Step 3. Filter original dataframe by split doc_ids
# -------------------------------
train_df = df[df["doc_id"].isin(train_docs["doc_id"])]
val_df   = df[df["doc_id"].isin(val_docs["doc_id"])]
test_df  = df[df["doc_id"].isin(test_docs["doc_id"])]

print("Train docs:", train_df["doc_id"].nunique(), "clauses:", len(train_df))
print("Val docs:", val_df["doc_id"].nunique(), "clauses:", len(val_df))
print("Test docs:", test_df["doc_id"].nunique(), "clauses:", len(test_df))

Train docs: 408 clauses: 5560
Val docs: 51 clauses: 800
Test docs: 51 clauses: 713


In [5]:
train_docs = set(train_df["doc_id"].unique())
val_docs = set(val_df["doc_id"].unique())
test_docs = set(test_df["doc_id"].unique())

# Overlaps
overlap_train_val = train_docs & val_docs
overlap_train_test = train_docs & test_docs
overlap_val_test = val_docs & test_docs

print("Doc ID Overlaps:")
print("Train-Val overlap:", overlap_train_val)
print("Train-Test overlap:", overlap_train_test)
print("Val-Test overlap:", overlap_val_test)

if not overlap_train_val and not overlap_train_test and not overlap_val_test:
    print("✅ No doc_id leakage across splits")
else:
    print("⚠️ Some doc_ids are present in multiple splits!")


Doc ID Overlaps:
Train-Val overlap: set()
Train-Test overlap: set()
Val-Test overlap: set()
✅ No doc_id leakage across splits


In [7]:
def label_distribution(df):
    """Flatten categories_list and count frequencies."""
    all_labels = [label for labels in df["categories_list"] for label in labels]
    return Counter(all_labels)

dist_all = label_distribution(pd.concat([train_df, val_df, test_df]))
dist_train = label_distribution(train_df)
dist_val = label_distribution(val_df)
dist_test = label_distribution(test_df)


KeyError: 'categories_list'

In [None]:
# Load splits
train_df = pd.read_parquet("train.parquet")
val_df = pd.read_parquet("val.parquet")
test_df = pd.read_parquet("test.parquet")

# -----------------------------
# 1. Check for doc_id leakage
# -----------------------------
train_docs = set(train_df["doc_id"].unique())
val_docs = set(val_df["doc_id"].unique())
test_docs = set(test_df["doc_id"].unique())

# Overlaps
overlap_train_val = train_docs & val_docs
overlap_train_test = train_docs & test_docs
overlap_val_test = val_docs & test_docs

print("Doc ID Overlaps:")
print("Train-Val overlap:", overlap_train_val)
print("Train-Test overlap:", overlap_train_test)
print("Val-Test overlap:", overlap_val_test)

if not overlap_train_val and not overlap_train_test and not overlap_val_test:
    print("✅ No doc_id leakage across splits")
else:
    print("⚠️ Some doc_ids are present in multiple splits!")

# -----------------------------
# 2. Label distribution check
# -----------------------------

def label_distribution(df):
    """Flatten categories_list and count frequencies."""
    all_labels = [label for labels in df["categories_list"] for label in labels]
    return Counter(all_labels)

dist_all = label_distribution(pd.concat([train_df, val_df, test_df]))
dist_train = label_distribution(train_df)
dist_val = label_distribution(val_df)
dist_test = label_distribution(test_df)

# Normalize to proportions
def normalize(counter):
    total = sum(counter.values())
    return {k: v/total for k,v in counter.items()}

norm_all = normalize(dist_all)
norm_train = normalize(dist_train)
norm_val = normalize(dist_val)
norm_test = normalize(dist_test)

# Compare distributions visually for a few top categories
def plot_distribution(norm_all, norm_split, split_name):
    categories = list(norm_all.keys())
    all_props = [norm_all[c] for c in categories]
    split_props = [norm_split.get(c, 0) for c in categories]

    plt.figure(figsize=(12,5))
    plt.scatter(all_props, split_props, alpha=0.7)
    plt.plot([0, max(all_props)], [0, max(all_props)], 'r--')  # diagonal
    plt.xlabel("Overall Distribution")
    plt.ylabel(f"{split_name} Distribution")
    plt.title(f"Label Distribution: {split_name} vs Overall")
    plt.show()

plot_distribution(norm_all, norm_train, "Train")
plot_distribution(norm_all, norm_val, "Validation")
plot_distribution(norm_all, norm_test, "Test")

# -----------------------------
# 3. Optional: Print KL divergence (numerical check)
# -----------------------------
from scipy.stats import entropy

def kl_divergence(p, q):
    # p, q are dicts with normalized values
    categories = set(p.keys()) | set(q.keys())
    p_vec = [p.get(c, 0) for c in categories]
    q_vec = [q.get(c, 0) for c in categories]
    return entropy(p_vec, q_vec)

print("KL Divergence Train vs Overall:", kl_divergence(norm_all, norm_train))
print("KL Divergence Val vs Overall:", kl_divergence(norm_all, norm_val))
print("KL Divergence Test vs Overall:", kl_divergence(norm_all, norm_test))