# Jigsaw ACRC - SetFit (Single Cell Solution)

Each step's code is kept together in a single cell to avoid variable-scope issues between cells.

In [None]:
# Make sure sentence-transformers can be imported; pip-install it when it cannot.
try:
    import sentence_transformers
except ImportError:
    print("NOT installed - need to install")
    import subprocess
    import sys

    # Run pip through the current interpreter so the package lands in this environment.
    pip_install = [sys.executable, '-m', 'pip', 'install', '-q', 'sentence-transformers']
    subprocess.check_call(pip_install)
else:
    # Import succeeded: report the version that is already available.
    print(f"sentence-transformers version: {sentence_transformers.__version__}")
    print("Already installed!")

In [None]:
# Imports
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, pairwise
from sentence_transformers import SentenceTransformer
warnings.filterwarnings('ignore')
print("OK")

In [None]:
# Read the train/test CSVs and instantiate the sentence encoder in one cell.
DATA_PATH = '/kaggle/input/jigsaw-agile-community-rules/'

print("Loading data...")
# Both frames come from the same directory; read them in train, test order.
train, test = (pd.read_csv(DATA_PATH + name) for name in ('train.csv', 'test.csv'))
print(f"Train: {len(train)}, Test: {len(test)}")

print("Loading model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print(f"Model loaded (dim={model.get_sentence_embedding_dimension()})")

In [None]:
# Generate ALL embeddings - EVERYTHING IN THIS ONE CELL
print("Generating embeddings (this takes ~3 minutes)...\n")


def _encode_rule_pairs(frame, text_column):
    """Embed one text column of `frame`, each row prefixed with its rule.

    Every row is rendered as "Rule: <rule> Comment: <text>" and the resulting
    list is passed to the module-level `model.encode` with the same settings
    for every column (batch size 32, no progress bar, NumPy output).

    Args:
        frame: DataFrame with a 'rule' column and a `text_column` column.
        text_column: Name of the column holding the comment/example text.

    Returns:
        Whatever `model.encode` returns for the prompt list (requested as a
        NumPy array via `convert_to_numpy=True`).
    """
    prompts = [
        f"Rule: {rule} Comment: {text}"
        for rule, text in zip(frame['rule'], frame[text_column])
    ]
    return model.encode(
        prompts, batch_size=32, show_progress_bar=False, convert_to_numpy=True
    )


# Train
print("[1/10] Train body")
train_body = _encode_rule_pairs(train, 'body')
print(f"  -> Shape: {train_body.shape}")

print("[2/10] Train pos1")
train_pos1 = _encode_rule_pairs(train, 'positive_example_1')

print("[3/10] Train pos2")
train_pos2 = _encode_rule_pairs(train, 'positive_example_2')

print("[4/10] Train neg1")
train_neg1 = _encode_rule_pairs(train, 'negative_example_1')

print("[5/10] Train neg2")
train_neg2 = _encode_rule_pairs(train, 'negative_example_2')

# Test
print("[6/10] Test body")
test_body = _encode_rule_pairs(test, 'body')

print("[7/10] Test pos1")
test_pos1 = _encode_rule_pairs(test, 'positive_example_1')

print("[8/10] Test pos2")
test_pos2 = _encode_rule_pairs(test, 'positive_example_2')

print("[9/10] Test neg1")
test_neg1 = _encode_rule_pairs(test, 'negative_example_1')

print("[10/10] Test neg2")
test_neg2 = _encode_rule_pairs(test, 'negative_example_2')

print(f"\nAll embeddings generated! Shape: {train_body.shape}")

In [None]:
# Compute similarities - ALL IN ONE CELL
print("Computing similarity features...\n")


def _unit_rows(vectors):
    """Return `vectors` as a float64 2-D array with every row scaled to unit L2 norm.

    All-zero rows are left as zeros (their norm is treated as 1), so their
    cosine similarity with anything comes out as 0 instead of NaN.
    """
    mat = np.asarray(vectors, dtype=np.float64)
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0
    return mat / norms


def compute_sims(body, pos1, pos2, neg1, neg2):
    """Build nine cosine-similarity features per row.

    Row i of `body` is compared with row i of each example array. The
    similarities are computed for all rows at once instead of one
    scikit-learn call per row and per example.

    Args:
        body: 2-D array-like of body embeddings, one row per sample.
        pos1, pos2: Embeddings of the two positive examples, row-aligned
            with `body`.
        neg1, neg2: Embeddings of the two negative examples, row-aligned
            with `body`.

    Returns:
        Float array of shape (len(body), 9) with columns:
        sim(pos1), sim(pos2), sim(neg1), sim(neg2), mean of the positive
        sims, mean of the negative sims, max of the positive sims, min of
        the negative sims, and (positive mean - negative mean).

    Raises:
        IndexError: If an example array has fewer rows than `body`
            (the same error type per-row indexing would produce).
    """
    n = len(body)

    # Keep the progress lines of the row-by-row version: one per 500 rows.
    for start in range(0, n, 500):
        print(f"  {start}/{n}")

    if n == 0:
        return np.zeros((0, 9))

    unit_body = _unit_rows(body)

    def row_cosine(examples):
        # Explicit length check: a 1-row array would otherwise broadcast silently.
        if len(examples) < n:
            raise IndexError(
                f"example array has {len(examples)} rows, expected at least {n}"
            )
        # Only the first n rows are paired with body rows; extras are ignored.
        return (unit_body * _unit_rows(examples[:n])).sum(axis=1)

    s_pos1 = row_cosine(pos1)
    s_pos2 = row_cosine(pos2)
    s_neg1 = row_cosine(neg1)
    s_neg2 = row_cosine(neg2)

    avg_pos = (s_pos1 + s_pos2) / 2
    avg_neg = (s_neg1 + s_neg2) / 2

    return np.column_stack([
        s_pos1, s_pos2, s_neg1, s_neg2,
        avg_pos, avg_neg,
        np.maximum(s_pos1, s_pos2),
        np.minimum(s_neg1, s_neg2),
        avg_pos - avg_neg,
    ])

# Similarity features for the training rows.
print("Train:")
X_train_sim = compute_sims(
    body=train_body,
    pos1=train_pos1, pos2=train_pos2,
    neg1=train_neg1, neg2=train_neg2,
)

# Same feature layout for the test rows.
print("\nTest:")
X_test_sim = compute_sims(
    body=test_body,
    pos1=test_pos1, pos2=test_pos2,
    neg1=test_neg1, neg2=test_neg2,
)

print(f"\nDone! Similarity features: {X_train_sim.shape}")

In [None]:
# Combine: body embedding columns followed by the similarity columns.
y_train = train['rule_violation'].values
X_train = np.concatenate([train_body, X_train_sim], axis=1)
X_test = np.concatenate([test_body, X_test_sim], axis=1)
print(f"Final: {X_train.shape}")

In [None]:
# Cross-validation: out-of-fold AUC on train, fold-averaged probabilities on test.
# The fold count lives in one place so the header, the splitter, the per-fold
# label and the averaging divisor cannot drift apart.
N_SPLITS = 5
print(f"{N_SPLITS}-Fold CV\n" + "="*50)

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
oof = np.zeros(len(X_train))    # out-of-fold probability for every training row
preds = np.zeros(len(X_test))   # running mean of the per-fold test probabilities
scores = []                     # validation AUC of each fold

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    print(f"\nFold {fold}/{N_SPLITS}")

    clf = LogisticRegression(
        max_iter=1000, C=1.0, class_weight='balanced',
        random_state=42, n_jobs=-1
    )

    clf.fit(X_train[tr_idx], y_train[tr_idx])
    # Column 1 of predict_proba is the probability of the positive class.
    oof[val_idx] = clf.predict_proba(X_train[val_idx])[:, 1]
    # Each fold contributes an equal share to the final test prediction.
    preds += clf.predict_proba(X_test)[:, 1] / N_SPLITS

    auc = roc_auc_score(y_train[val_idx], oof[val_idx])
    scores.append(auc)
    print(f"  AUC: {auc:.6f}")

print(f"\n{'='*50}")
print(f"Overall: {roc_auc_score(y_train, oof):.6f}")
# \u00b1 is the plus-minus sign; written as an escape so a re-encoding of the
# file cannot turn it into mojibake again.
print(f"Mean: {np.mean(scores):.6f} \u00b1 {np.std(scores):.6f}")

In [None]:
# Submission: pair each test row_id with its value in `preds` and write the CSV.
sub = test[['row_id']].copy()
sub['rule_violation'] = preds
sub.to_csv('submission.csv', index=False)
print(f"Saved! Min: {preds.min():.4f}, Max: {preds.max():.4f}, Mean: {preds.mean():.4f}")
print(sub.head())