# Jigsaw Reddit Competition - Preprocessing & Baseline Model

This notebook implements:
1. Robust Cross-Validation using StratifiedGroupKFold
2. Reddit-Specific Text Preprocessing
3. Feature Engineering (text statistics and subreddit encoding; rule embeddings are planned but not yet implemented here)
4. Baseline Model with TF-IDF + LightGBM
5. Comprehensive Evaluation Metrics

In [1]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings

# ML libraries
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.preprocessing import LabelEncoder 
import lightgbm as lgb

# NLP libraries
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from tqdm import tqdm
warnings.filterwarnings('ignore')
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


## 1. Load Data

In [6]:
# Read both competition splits from the local Data/ directory.
train_df, test_df = pd.read_csv('Data/train.csv'), pd.read_csv('Data/test.csv')

# Quick sanity summary: frame sizes and how many distinct rules the train split covers.
print(
    f"Train shape: {train_df.shape}",
    f"Test shape: {test_df.shape}",
    f"Unique rules in train: {train_df['rule'].nunique()}",
    sep="\n",
)

Train shape: (2029, 9)
Test shape: (10, 8)
Unique rules in train: 2


## 2. Reddit Text Preprocessing

In [7]:
def clean_reddit_text(text):
    """Clean Reddit-specific formatting from a single comment body.

    Steps, in order: strip bold/italic/strikethrough markers, replace quoted
    lines with ``[QUOTE]``, replace ``/u/name`` and ``/r/name`` mentions with
    ``[USER]`` / ``[SUBREDDIT]``, replace http(s) URLs with ``[URL]``, and
    collapse all whitespace runs to single spaces.

    Args:
        text: Raw comment body. Missing values (``None`` / NaN) are accepted.

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    if pd.isna(text):
        return ""

    text = str(text)

    # Handle Reddit markdown (bold must be stripped before italic, since both use '*')
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # Bold
    text = re.sub(r'\*(.*?)\*', r'\1', text)      # Italic
    text = re.sub(r'~~(.*?)~~', r'\1', text)      # Strikethrough

    # Handle quotes: a quoted line starts with '>' either raw or HTML-escaped
    # as '&gt;'; optional leading spaces/tabs are tolerated.
    text = re.sub(r'^[ \t]*(?:&gt;|>).*$', '[QUOTE]', text, flags=re.MULTILINE)

    # Normalize mentions. Usernames may contain hyphens, so '-' is part of the
    # match; otherwise '/u/foo-bar' would be left as '[USER]-bar'.
    text = re.sub(r'/u/[\w-]+', '[USER]', text)
    text = re.sub(r'/r/\w+', '[SUBREDDIT]', text)

    # Handle URLs (runs after mentions; a URL containing a mention placeholder
    # is still swallowed whole because the placeholder has no whitespace)
    text = re.sub(r'http[s]?://\S+', '[URL]', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Add a cleaned copy of the comment body to both splits; the raw 'body' column is kept.
for frame in (train_df, test_df):
    frame['body_cleaned'] = frame['body'].apply(clean_reddit_text)

## 3. Feature Engineering

In [None]:
# Text features
def get_text_features(text):
    """Extract simple surface statistics from a comment body.

    Args:
        text: Raw comment body. Missing values (``None`` / NaN) are treated
            as the empty string.

    Returns:
        A dict that always has the same five keys, so that a DataFrame built
        from many results never gets NaN-filled columns for missing bodies:
        ``char_count``, ``word_count`` (whitespace-separated tokens),
        ``exclamation_count``, ``question_count`` and ``caps_ratio``
        (uppercase characters / total characters, 0 for empty text).
    """
    text = "" if pd.isna(text) else str(text)
    n_chars = len(text)
    n_upper = sum(1 for c in text if c.isupper())

    return {
        'char_count': n_chars,
        'word_count': len(text.split()),
        'exclamation_count': text.count('!'),
        'question_count': text.count('?'),
        # Guard the division: empty (or missing) text has no ratio to compute.
        'caps_ratio': n_upper / n_chars if n_chars else 0
    }

# Apply text features (computed on the raw body so casing/punctuation are preserved)
train_text_features = pd.DataFrame(train_df['body'].apply(get_text_features).tolist())
test_text_features = pd.DataFrame(test_df['body'].apply(get_text_features).tolist())

# Encode subreddit. The encoder is fitted on train only.
subreddit_encoder = LabelEncoder()
train_df['subreddit_encoded'] = subreddit_encoder.fit_transform(train_df['subreddit'])

# LabelEncoder.transform raises on labels it has not seen during fit, so the
# test split is encoded through an explicit lookup instead: known subreddits
# get the same integer code as in train (their position in classes_), and
# subreddits that only occur in test get the sentinel -1.
subreddit_to_code = {name: code for code, name in enumerate(subreddit_encoder.classes_)}
test_df['subreddit_encoded'] = (
    test_df['subreddit'].map(subreddit_to_code).fillna(-1).astype(int)
)

print("Feature engineering completed")

Feature engineering completed


## 4. Cross-Validation Setup

In [None]:
# Setup StratifiedGroupKFold: folds are grouped by rule (a rule never appears
# in both the train and validation side of a fold) and stratified by label.
groups = train_df['rule']
y = train_df['rule_violation']

# Each group lands in exactly one validation fold, so there can never be more
# non-empty validation folds than distinct groups. Cap the requested 5 splits
# at the number of rules instead of producing folds with nothing to validate on.
n_splits = min(5, groups.nunique())
if n_splits < 2:
    raise ValueError("Grouped cross-validation needs at least 2 distinct rules in train_df['rule']")

sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Defensive: drop any fold whose validation side is still empty, since
# downstream training and AUC scoring cannot handle it.
cv_folds = [
    (fold_train_idx, fold_val_idx)
    for fold_train_idx, fold_val_idx in sgkf.split(train_df, y, groups)
    if len(fold_val_idx) > 0
]
print(f"Created {len(cv_folds)} folds")

# Analyze folds: show which rules end up on each side of every split
for i, (train_idx, val_idx) in enumerate(cv_folds):
    train_rules = set(train_df.iloc[train_idx]['rule'])
    val_rules = set(train_df.iloc[val_idx]['rule'])
    print(f"Fold {i+1}: Train rules: {train_rules}, Val rules: {val_rules}")

## 5. TF-IDF Vectorization

In [None]:
# Two complementary TF-IDF views of the cleaned text:
# character 2-4 grams and word uni/bi-grams with English stop words removed.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 4),
    max_features=10000,
)
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    max_features=20000,
    stop_words='english',
)

# Vocabulary and IDF weights are learned on train, then reused unchanged for test.
# (Tuple elements evaluate left to right, so each fit happens before its transform.)
train_char_tfidf, train_word_tfidf = (
    char_vectorizer.fit_transform(train_df['body_cleaned']),
    word_vectorizer.fit_transform(train_df['body_cleaned']),
)
test_char_tfidf, test_word_tfidf = (
    char_vectorizer.transform(test_df['body_cleaned']),
    word_vectorizer.transform(test_df['body_cleaned']),
)

print(f"Character TF-IDF shape: {train_char_tfidf.shape}")
print(f"Word TF-IDF shape: {train_word_tfidf.shape}")

## 6. Combine Features

In [None]:
from scipy.sparse import hstack

# Combine all features column-wise: char TF-IDF, word TF-IDF, text statistics
# and the encoded subreddit.
# CSR is requested explicitly because the cross-validation loop selects rows
# with integer index arrays (train_combined[train_idx]); without a format
# argument hstack may hand back a COO matrix, which does not support that.
train_combined = hstack([
    train_char_tfidf,
    train_word_tfidf,
    train_text_features.values,
    train_df[['subreddit_encoded']].values
], format='csr')

test_combined = hstack([
    test_char_tfidf,
    test_word_tfidf,
    test_text_features.values,
    test_df[['subreddit_encoded']].values
], format='csr')

# Both splits must expose the same feature columns, one row per example.
assert train_combined.shape[0] == len(train_df)
assert test_combined.shape[0] == len(test_df)
assert train_combined.shape[1] == test_combined.shape[1]

print(f"Combined training features shape: {train_combined.shape}")

## 7. LightGBM Baseline Model

In [None]:
# LightGBM parameters: binary classifier evaluated with AUC
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'verbose': -1,
    'random_state': 42
}

# Accumulators filled by the cross-validation loop:
#   models          - one booster per fold
#   fold_scores     - validation AUC per fold
#   oof_predictions - out-of-fold prediction for every training row
models = []
fold_scores = []
oof_predictions = np.zeros(len(train_df))

for fold_idx, (train_idx, val_idx) in enumerate(cv_folds):
    print(f"Training fold {fold_idx + 1}...")

    # Slice this fold's features and labels
    X_train_fold, y_train_fold = train_combined[train_idx], y.iloc[train_idx]
    X_val_fold, y_val_fold = train_combined[val_idx], y.iloc[val_idx]

    # Train with early stopping monitored on the fold's validation set;
    # log_evaluation(0) silences the per-iteration metric output.
    model = lgb.train(
        params=lgb_params,
        train_set=lgb.Dataset(X_train_fold, label=y_train_fold),
        num_boost_round=1000,
        valid_sets=[lgb.Dataset(X_val_fold, label=y_val_fold)],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
    )
    models.append(model)

    # Predict the held-out rows with the best iteration found by early stopping
    val_pred = model.predict(X_val_fold, num_iteration=model.best_iteration)
    oof_predictions[val_idx] = val_pred

    # Score the fold
    fold_auc = roc_auc_score(y_val_fold, val_pred)
    fold_scores.append(fold_auc)

    print(f"Fold {fold_idx + 1} AUC: {fold_auc:.4f}")

# Overall score: AUC over all out-of-fold predictions, with the spread of per-fold AUCs
overall_auc = roc_auc_score(y, oof_predictions)
print(f"\nOverall CV AUC: {overall_auc:.4f} (+/- {np.std(fold_scores):.4f})")

## 8. Model Evaluation

In [None]:
# Two-panel evaluation figure: per-fold AUC (left) and out-of-fold score distribution (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
cv_ax, dist_ax = axes

# Left panel: one bar per fold, folds numbered from 1
cv_ax.bar(range(1, len(fold_scores)+1), fold_scores)
cv_ax.set_title('Cross-Validation AUC Scores')
cv_ax.set_xlabel('Fold')
cv_ax.set_ylabel('AUC')

# Right panel: overlaid histograms of predicted probability, split by true label
for class_value, class_name in ((0, 'Non-violation'), (1, 'Violation')):
    dist_ax.hist(oof_predictions[y==class_value], alpha=0.7, label=class_name, bins=50)
dist_ax.set_title('Prediction Distribution')
dist_ax.set_xlabel('Predicted Probability')
dist_ax.legend()

plt.tight_layout()
plt.show()

# Text summary of the baseline
print(
    f"\nBaseline Model Summary:",
    f"- Cross-validation AUC: {overall_auc:.4f}",
    f"- Standard deviation: {np.std(fold_scores):.4f}",
    f"- Feature count: {train_combined.shape[1]}",
    sep="\n",
)

## 9. Generate Test Predictions

In [None]:
# Generate test predictions as the unweighted mean over all fold models
if not models:
    # Without this guard the file would silently contain all-zero predictions.
    raise RuntimeError("No trained fold models available; run the cross-validation cell first")

test_predictions = np.zeros(len(test_df))

for model in models:
    test_pred = model.predict(test_combined, num_iteration=model.best_iteration)
    test_predictions += test_pred / len(models)

# Create submission.
# Prefer the dataset's own 'row_id' column when it exists: the positional
# DataFrame index only equals the true ids if they happen to be 0..n-1.
# NOTE(review): assumes test.csv ships a 'row_id' column - falls back to the
# index (the previous behaviour) when it does not.
row_ids = test_df['row_id'] if 'row_id' in test_df.columns else test_df.index
submission = pd.DataFrame({
    'row_id': np.asarray(row_ids),
    'rule_violation': test_predictions
})

submission.to_csv('baseline_submission.csv', index=False)
print(f"Submission saved with {len(submission)} predictions")
print(f"Prediction range: {test_predictions.min():.4f} - {test_predictions.max():.4f}")

In [None]:


Open questions for follow-up work:
- What patterns distinguish violations from non-violations across both rules?
- Which features transfer between different rule types?
- How can we build rule-agnostic violation detectors?
- What do the provided rule examples teach us about generalization?