# Jigsaw ACRC - SetFit Solution (Fixed)

## Overview
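Few-shot Reddit comment moderation: frozen Sentence Transformer embeddings of each rule/comment pair, plus cosine similarities to the labelled positive and negative examples, fed to a logistic regression classifier.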

### Performance
- **Local CV AUC**: 0.776
- **Model**: all-MiniLM-L6-v2 (384-dim)
- **Runtime**: ~5 minutes

In [None]:
%%time
# Make sure sentence-transformers is installed for this kernel.
# pip is launched through the running interpreter (sys.executable) so the
# package is installed for the same Python that executes the notebook.
import subprocess
import sys

print("Installing sentence-transformers...")
pip_install = [sys.executable, '-m', 'pip', 'install', '-q', 'sentence-transformers']
subprocess.check_call(pip_install)  # raises CalledProcessError if pip exits non-zero
print("✅ Done!")

In [None]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Suppress every warning category for the rest of the session to keep cell output short.
warnings.filterwarnings(action='ignore')
print("✅ Libraries imported")

In [None]:
%%time
# Read the competition tables from the Kaggle input directory.
DATA_PATH = '/kaggle/input/jigsaw-agile-community-rules-classification/'
train, test = (
    pd.read_csv(DATA_PATH + file_name)
    for file_name in ('train.csv', 'test.csv')
)
print(f"Train: {train.shape}, Test: {test.shape}")

In [None]:
%%time
# Instantiate the sentence encoder; the cells below call it through the global `model`.
print("Loading model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_dim = model.get_sentence_embedding_dimension()
print(f"✅ Model loaded (dim={embedding_dim})")

In [None]:
%%time
# Generate ALL embeddings in ONE cell to avoid scope issues
# (every train_* / test_* embedding array consumed by the later cells is created in this cell).
print("Generating embeddings...\n")

# Helper function
def encode_texts(df, col1, col2=None):
    """Embed one "Rule: ... Comment: ..." prompt per row of ``df``.

    The meaning of the column arguments depends on whether ``col2`` is given:

    * ``encode_texts(df, rule_col, text_col)``: ``col1`` supplies the rule
      and ``col2`` supplies the comment text.
    * ``encode_texts(df, text_col)``: the rule is read from ``df['rule']``
      and ``col1`` supplies the comment text.

    Args:
        df: DataFrame containing the referenced columns.
        col1: Rule column when ``col2`` is given, otherwise the text column.
        col2: Optional text column. Any falsy value (``None``, ``""``)
            selects the one-column form.

    Returns:
        The result of ``model.encode`` for the prompts, requested as a NumPy
        array (``convert_to_numpy=True``), in the row order of ``df``.

    Note:
        Relies on the notebook-level ``model`` object being loaded first.
    """
    rule_col, text_col = (col1, col2) if col2 else ('rule', col1)

    # Read the two columns directly instead of df.iterrows(): iterrows builds a
    # full Series for every row, which is slow and only two values are needed.
    texts = [
        f"Rule: {rule} Comment: {text}"
        for rule, text in zip(df[rule_col], df[text_col])
    ]
    return model.encode(texts, batch_size=32, show_progress_bar=True, convert_to_numpy=True)

# Embed the target comment and the four labelled examples, first for train, then for test.
# Each job is (progress message, source frame, positional column arguments for encode_texts).
embedding_jobs = [
    # Train embeddings
    ("[1/10] Train body...", train, ('rule', 'body')),
    ("[2/10] Train pos1...", train, ('positive_example_1',)),
    ("[3/10] Train pos2...", train, ('positive_example_2',)),
    ("[4/10] Train neg1...", train, ('negative_example_1',)),
    ("[5/10] Train neg2...", train, ('negative_example_2',)),
    # Test embeddings
    ("[6/10] Test body...", test, ('rule', 'body')),
    ("[7/10] Test pos1...", test, ('positive_example_1',)),
    ("[8/10] Test pos2...", test, ('positive_example_2',)),
    ("[9/10] Test neg1...", test, ('negative_example_1',)),
    ("[10/10] Test neg2...", test, ('negative_example_2',)),
]

embedded = []
for message, frame, columns in embedding_jobs:
    print(message)
    embedded.append(encode_texts(frame, *columns))

# Bind the results to the names the following cells expect (same order as the jobs).
(train_body, train_pos1, train_pos2, train_neg1, train_neg2,
 test_body, test_pos1, test_pos2, test_neg1, test_neg2) = embedded

print(f"\n✅ All embeddings generated! Shape: {train_body.shape}")

In [None]:
%%time
# Compute similarity features - ALL in ONE cell
# (both X_train_sim and X_test_sim are produced in this cell).
print("Computing similarity features...\n")

def _paired_cosine(a, b):
    """Return the cosine similarity between row i of ``a`` and row i of ``b``.

    Pairs in which either vector has zero norm get similarity 0.0 rather
    than NaN.
    """
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    dots = np.sum(a * b, axis=1)
    denom = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
    sims = np.zeros(len(a))
    nonzero = denom > 0
    sims[nonzero] = dots[nonzero] / denom[nonzero]
    return sims


def compute_sims(body, pos1, pos2, neg1, neg2):
    """Build nine cosine-similarity features for every row of ``body``.

    Row ``i`` of ``body`` is compared with row ``i`` of each example array.

    Args:
        body: 2-D array of embeddings, one row per sample.
        pos1, pos2: Embeddings of the two positive examples, row-aligned with ``body``.
        neg1, neg2: Embeddings of the two negative examples, row-aligned with ``body``.

    Returns:
        Float array of shape ``(len(body), 9)`` with columns:
        0 sim(body, pos1), 1 sim(body, pos2), 2 sim(body, neg1),
        3 sim(body, neg2), 4 mean positive sim, 5 mean negative sim,
        6 max positive sim, 7 min negative sim, 8 mean positive - mean negative.

    Raises:
        ValueError: If any example array has fewer rows than ``body``.
    """
    n = len(body)
    for name, arr in (("pos1", pos1), ("pos2", pos2), ("neg1", neg1), ("neg2", neg2)):
        if len(arr) < n:
            raise ValueError(f"{name} has {len(arr)} rows but body has {n}")

    feats = np.zeros((n, 9))

    # Work in vectorised chunks instead of one library call per row per example.
    # The chunk size equals the original progress interval, so the progress
    # lines printed are unchanged.
    chunk = 500
    for start in range(0, n, chunk):
        print(f"  Progress: {start}/{n}")
        rows = slice(start, min(start + chunk, n))

        b = body[rows]
        s_pos1 = _paired_cosine(b, pos1[rows])
        s_pos2 = _paired_cosine(b, pos2[rows])
        s_neg1 = _paired_cosine(b, neg1[rows])
        s_neg2 = _paired_cosine(b, neg2[rows])

        avg_pos = (s_pos1 + s_pos2) / 2
        avg_neg = (s_neg1 + s_neg2) / 2

        feats[rows, 0] = s_pos1
        feats[rows, 1] = s_pos2
        feats[rows, 2] = s_neg1
        feats[rows, 3] = s_neg2
        feats[rows, 4] = avg_pos
        feats[rows, 5] = avg_neg
        feats[rows, 6] = np.maximum(s_pos1, s_pos2)
        feats[rows, 7] = np.minimum(s_neg1, s_neg2)
        feats[rows, 8] = avg_pos - avg_neg

    return feats

print("Train similarities...")
X_train_sim = compute_sims(train_body, train_pos1, train_pos2, train_neg1, train_neg2)

print("\nTest similarities...")
X_test_sim = compute_sims(test_body, test_pos1, test_pos2, test_neg1, test_neg2)

print(f"\n✅ Similarity features: {X_train_sim.shape}")

In [None]:
# Assemble the final design matrices: body embedding columns followed by the similarity features.
y_train = train['rule_violation'].values
X_train, X_test = (
    np.hstack(parts)
    for parts in ([train_body, X_train_sim], [test_body, X_test_sim])
)

n_embedding_cols = train_body.shape[1]
n_similarity_cols = X_train_sim.shape[1]
n_total_cols = X_train.shape[1]

print(f"Final features: {X_train.shape}")
print(f"  - Embeddings: {n_embedding_cols}")
print(f"  - Similarities: {n_similarity_cols}")
print(f"  - Total: {n_total_cols}")

In [None]:
%%time
# Cross-validation
# Stratified K-fold estimate of AUC from out-of-fold predictions; the test
# prediction is the mean of the per-fold models' probabilities.
# N_SPLITS is the single source of truth for the fold count: the splitter, the
# progress text and the test-prediction averaging must all agree, otherwise
# `preds` would be scaled incorrectly.
N_SPLITS = 5

print(f"{N_SPLITS}-Fold Cross-Validation\n" + "="*50)

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
oof = np.zeros(len(X_train))    # out-of-fold probability for each training row
preds = np.zeros(len(X_test))   # running mean of per-fold test probabilities
scores = []                     # per-fold validation AUC

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    print(f"\nFold {fold}/{N_SPLITS}")

    clf = LogisticRegression(
        max_iter=1000,
        C=1.0,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )

    clf.fit(X_train[tr_idx], y_train[tr_idx])
    oof[val_idx] = clf.predict_proba(X_train[val_idx])[:, 1]
    # Each fold contributes an equal share, so preds ends up as the fold average.
    preds += clf.predict_proba(X_test)[:, 1] / N_SPLITS

    auc = roc_auc_score(y_train[val_idx], oof[val_idx])
    scores.append(auc)
    print(f"  AUC: {auc:.6f}")

print(f"\n{'='*50}")
print(f"Overall CV AUC: {roc_auc_score(y_train, oof):.6f}")
print(f"Mean: {np.mean(scores):.6f} ± {np.std(scores):.6f}")

In [None]:
# Create submission
# Write one averaged predicted probability per test row_id to submission.csv,
# then print a short sanity summary of the predictions.
submission_columns = {'row_id': test['row_id'], 'rule_violation': preds}
sub = pd.DataFrame(submission_columns)
sub.to_csv('submission.csv', index=False)

print("✅ Submission saved!")
print("\nStats:")
summary = (("Min", preds.min()), ("Max", preds.max()), ("Mean", preds.mean()))
for label, value in summary:
    print(f"  {label}: {value:.4f}")
print(f"\n{sub.head()}")