# Jigsaw Agile Community Rules Classification (ACRC) - Baseline Submission

**Model**: TF-IDF + LightGBM  
**CV AUC**: 0.614

In [None]:
import warnings

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier, early_stopping
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

print('Libraries loaded successfully!')

## Load Data

In [None]:
# Read the three competition CSV files: training rows, test rows, and the
# sample submission template.
train = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/train.csv')
test = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')
sample_sub = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/sample_submission.csv')

# Print (rows, columns) of each frame as a quick sanity check on the load.
for label, frame in (
    ("Train", train),
    ("Test", test),
    ("Sample submission", sample_sub),
):
    print(f"{label} shape: {frame.shape}")

## Feature Engineering

In [None]:
def create_combined_text(
    df,
    columns=(
        'body',
        'rule',
        'positive_example_1',
        'positive_example_2',
        'negative_example_1',
        'negative_example_2',
    ),
    sep=' [SEP] ',
):
    """Join several text columns of ``df`` into one string per row.

    Missing values are replaced with empty strings before joining, so every
    row contains exactly ``len(columns) - 1`` separators even when some
    fields are absent.

    Args:
        df: DataFrame holding every column named in ``columns``.
        columns: Column names to concatenate, in output order. Defaults to
            the body, the rule, and the four example columns.
        sep: String inserted between consecutive columns.

    Returns:
        A Series aligned with ``df.index`` containing the joined text.

    Raises:
        ValueError: If ``columns`` is empty.
        KeyError: If a named column is absent from ``df``.
    """
    columns = list(columns)
    if not columns:
        raise ValueError("columns must name at least one column")

    # Cast after filling so numeric or mixed-type columns concatenate
    # instead of raising TypeError on `number + str`.
    pieces = [df[name].fillna('').astype(str) for name in columns]

    combined = pieces[0]
    for piece in pieces[1:]:
        combined = combined + sep + piece
    return combined

# Target labels are taken directly from the training frame.
y_train = train['rule_violation']

# Build the single-string model input for both splits with the same helper,
# so train and test text are assembled identically.
X_train_text, X_test_text = (
    create_combined_text(frame) for frame in (train, test)
)

print(f"Training samples: {len(X_train_text)}")
print(f"Test samples: {len(X_test_text)}")

## TF-IDF Vectorization

In [None]:
# TF-IDF settings, gathered in one place so they are easy to tune.
tfidf_params = dict(
    max_features=10000,       # cap on vocabulary size
    ngram_range=(1, 3),       # unigrams through trigrams
    min_df=2,                 # ignore terms seen in fewer than 2 documents
    max_df=0.95,              # ignore terms seen in more than 95% of documents
    strip_accents='unicode',
    lowercase=True,
    stop_words='english',
    sublinear_tf=True,        # use 1 + log(tf) instead of raw term frequency
)
vectorizer = TfidfVectorizer(**tfidf_params)

# Learn vocabulary and IDF weights from the training text only, then apply
# the fitted vectorizer unchanged to the test text.
# NOTE(review): the fit uses all training rows before the CV split, so
# validation folds influence the vocabulary/IDF used during cross-validation.
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

print(f"TF-IDF features: {X_train_tfidf.shape[1]}")

## Model Training with Cross-Validation

In [None]:
# Stratified K-Fold with a fixed seed so the splits are reproducible.
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Upper bound on boosting rounds; early stopping normally ends training sooner.
max_boost_rounds = 1000
# Stop once validation AUC has not improved for this many consecutive rounds.
early_stopping_rounds = 50

# Hyper-parameters are identical for every fold, so build them once.
lgbm_params = dict(
    objective='binary',
    metric='auc',
    learning_rate=0.05,
    num_leaves=31,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    n_estimators=max_boost_rounds,
    random_state=42,
    verbose=-1,
)

# Out-of-fold predictions (one per training row) and fold-averaged test
# predictions (one per test row).
oof_preds = np.zeros(len(X_train_text))
test_preds = np.zeros(len(X_test_text))

cv_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_tfidf, y_train), 1):
    print(f"\nFold {fold}/{n_folds}")

    # Split data: sparse matrix rows by position, labels by position.
    X_tr = X_train_tfidf[train_idx]
    y_tr = y_train.iloc[train_idx]
    X_val = X_train_tfidf[val_idx]
    y_val = y_train.iloc[val_idx]

    # Train model. Without the early-stopping callback the eval_set would be
    # evaluated but never acted on, and all `max_boost_rounds` trees would be
    # fitted regardless of validation performance.
    # NOTE: the same fold both selects the stopping round and is scored below,
    # so the reported fold AUC is mildly optimistic.
    model = LGBMClassifier(**lgbm_params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        callbacks=[
            early_stopping(stopping_rounds=early_stopping_rounds, verbose=False)
        ],
    )
    print(f"Fold {fold} best iteration: {model.best_iteration_}")

    # Predict: fill this fold's slice of the OOF vector and add this model's
    # share of the averaged test prediction.
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    test_preds += model.predict_proba(X_test_tfidf)[:, 1] / n_folds

    # Calculate AUC on the held-out fold.
    fold_auc = roc_auc_score(y_val, oof_preds[val_idx])
    cv_scores.append(fold_auc)
    print(f"Fold {fold} AUC: {fold_auc:.6f}")

# Overall CV score: AUC over all OOF predictions at once, plus the mean and
# spread of the per-fold AUCs.
overall_auc = roc_auc_score(y_train, oof_preds)
print(f"\n{'='*50}")
print(f"Overall CV AUC: {overall_auc:.6f}")
print(f"Mean CV AUC: {np.mean(cv_scores):.6f} (+/- {np.std(cv_scores):.6f})")
print(f"{'='*50}")

## Generate Submission

In [None]:
# Pair every test row id with its fold-averaged predicted probability.
submission = test[['row_id']].assign(rule_violation=test_preds)

# Write the file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")
print(f"\nSubmission shape: {submission.shape}")
print(f"\nFirst 5 rows:")
print(submission.head())

# Summary statistics of the predicted probabilities as a sanity check.
predicted = submission['rule_violation']
print(f"\nPrediction statistics:")
for label, value in (
    ("Min", predicted.min()),
    ("Max", predicted.max()),
    ("Mean", predicted.mean()),
    ("Median", predicted.median()),
):
    print(f"  {label}: {value:.6f}")