# Jigsaw ACRC - Baseline v2 (Feature Engineering + Ensemble)

**Improvements:**
- Text length & word count features
- Character-level features (uppercase, digits, punctuation ratios)
- Average word length, sentence count
- Larger TF-IDF vocabulary (up to 15,000 word n-gram features)
- LightGBM + XGBoost ensemble
- Improved hyperparameters

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from scipy.sparse import hstack, csr_matrix
import warnings
warnings.filterwarnings('ignore')

# Single seed shared by NumPy, the CV splitter and both model configs below,
# so that repeated runs of the notebook use the same random state.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
print('Libraries loaded!')

## Load Data

In [None]:
# Read the three competition CSVs in one pass: train, test, sample submission.
print("Loading data...")
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/jigsaw-agile-community-rules/train.csv',
        '/kaggle/input/jigsaw-agile-community-rules/test.csv',
        '/kaggle/input/jigsaw-agile-community-rules/sample_submission.csv',
    )
)

# Quick sanity check on the sizes of the two modelling frames.
for split_name, split_frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} shape: {split_frame.shape}")

## Feature Engineering

In [None]:
def _char_ratio(text, predicate):
    """Return the share of characters in ``text`` satisfying ``predicate``.

    The denominator is ``len(text) + 1`` so an empty string yields 0 instead
    of dividing by zero.
    """
    return sum(1 for ch in text if predicate(ch)) / (len(text) + 1)


def _mean_word_len(text):
    """Return the mean length of whitespace-separated words, or 0 if none."""
    words = text.split()
    return np.mean([len(word) for word in words]) if words else 0


def create_text_features(df):
    """Create additional hand-crafted numeric features from the text columns.

    Missing values in the text columns are treated as empty strings. The
    cleaning is done on local copies, so ``df`` itself is not modified.

    Args:
        df: DataFrame containing the columns ``body``, ``rule``,
            ``positive_example_1``, ``positive_example_2``,
            ``negative_example_1`` and ``negative_example_2``.

    Returns:
        DataFrame indexed like ``df`` with 13 columns, in this order:
        six character-length columns (``body_len``, ``rule_len``,
        ``pos1_len``, ``pos2_len``, ``neg1_len``, ``neg2_len``), two word
        counts (``body_words``, ``rule_words``), three character-class
        ratios of the body (``body_upper_ratio``, ``body_digit_ratio``,
        ``body_punct_ratio``), ``body_avg_word_len`` and ``body_sentences``.
    """
    text_cols = ['body', 'rule', 'positive_example_1', 'positive_example_2',
                 'negative_example_1', 'negative_example_2']

    # Clean into local Series instead of assigning back into ``df``: the
    # caller's frame must not change as a side effect of feature extraction.
    text = {col: df[col].fillna('').astype(str) for col in text_cols}
    body = text['body']
    rule = text['rule']

    features = pd.DataFrame(index=df.index)

    # Length features (characters per field)
    length_columns = (
        ('body_len', 'body'),
        ('rule_len', 'rule'),
        ('pos1_len', 'positive_example_1'),
        ('pos2_len', 'positive_example_2'),
        ('neg1_len', 'negative_example_1'),
        ('neg2_len', 'negative_example_2'),
    )
    for feature_name, col in length_columns:
        features[feature_name] = text[col].str.len()

    # Word count features (whitespace-separated tokens)
    features['body_words'] = body.str.split().str.len()
    features['rule_words'] = rule.str.split().str.len()

    # Character-class ratios of the body
    features['body_upper_ratio'] = body.apply(
        lambda x: _char_ratio(x, str.isupper))
    features['body_digit_ratio'] = body.apply(
        lambda x: _char_ratio(x, str.isdigit))
    # "Punctuation" here means anything that is neither alphanumeric nor
    # whitespace.
    features['body_punct_ratio'] = body.apply(
        lambda x: _char_ratio(
            x, lambda ch: not (ch.isalnum() or ch.isspace())))

    # Avg word length
    features['body_avg_word_len'] = body.apply(_mean_word_len)

    # Sentence count, approximated as the number of literal '.' plus one
    features['body_sentences'] = body.str.count(r'\.') + 1

    return features

# Run the same feature builder over both splits so their columns line up.
print("Creating text features...")
train_text_feats, test_text_feats = map(create_text_features, (train_df, test_df))
print(f"Created {train_text_feats.shape[1]} additional features")

In [None]:
def create_combined_text(df):
    """Combine all text fields into one string per row.

    The fields are joined in the order body, rule, positive_example_1,
    positive_example_2, negative_example_1, negative_example_2, separated by
    the literal marker ``' [SEP] '``. Missing values are treated as empty
    strings. Cleaning happens on local copies, so ``df`` is not modified.

    Args:
        df: DataFrame containing the six text columns listed above.

    Returns:
        Series indexed like ``df`` holding the concatenated text.
    """
    text_cols = ['body', 'rule', 'positive_example_1', 'positive_example_2',
                 'negative_example_1', 'negative_example_2']

    # Build cleaned copies rather than writing back into the caller's frame.
    parts = [df[col].fillna('').astype(str) for col in text_cols]

    combined = parts[0]
    for part in parts[1:]:
        combined = combined + ' [SEP] ' + part
    return combined

# One concatenated document per row, for train and test alike.
train_text, test_text = (
    create_combined_text(split_frame) for split_frame in (train_df, test_df)
)
print("Combined text created")

## TF-IDF Vectorization

In [None]:
print("Creating TF-IDF features...")
# Word-level TF-IDF over uni- to tri-grams of the combined text.
tfidf = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    # One or more word characters. In a raw string the backslash must be
    # single: r'\\w{1,}' would match a literal backslash followed by 'w'
    # characters, so ordinary words would never become tokens.
    token_pattern=r'\w{1,}',
    stop_words='english'
)

# Fit the vocabulary on the training text only, then reuse it for test.
train_tfidf = tfidf.fit_transform(train_text)
test_tfidf = tfidf.transform(test_text)
print(f"TF-IDF shape: {train_tfidf.shape}")

In [None]:
# Append the hand-crafted columns to the right of the TF-IDF columns.
train_extra = csr_matrix(train_text_feats.values)
test_extra = csr_matrix(test_text_feats.values)

# The stacked matrices are converted to dense arrays; the CV cells below
# index them by row and take len() of them.
X_train = hstack([train_tfidf, train_extra]).toarray()
X_test = hstack([test_tfidf, test_extra]).toarray()

# Target column
y_train = train_df['rule_violation']

print(f"Final feature shape: {X_train.shape}")

## Cross-Validation Training

In [None]:
# Stratified splitter shared with the XGBoost cell below.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
# Take the fold count from the splitter so the test-prediction average
# cannot drift out of sync with n_splits.
n_splits = skf.get_n_splits()

print(f"LightGBM - {n_splits}-Fold CV...\n")

# Out-of-fold predictions for train, fold-averaged predictions for test.
lgbm_oof = np.zeros(len(y_train))
lgbm_test = np.zeros(len(X_test))
lgbm_scores = []

lgbm_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 63,
    'learning_rate': 0.03,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_child_samples': 20,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'verbose': -1,
    'random_state': RANDOM_STATE,
    'n_estimators': 2000,
    'early_stopping_rounds': 100
}

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    X_train_fold = X_train[train_idx]
    X_val_fold = X_train[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]

    # The validation fold is passed as eval_set; together with
    # 'early_stopping_rounds' in the params it is the data the AUC is
    # monitored on during fitting.
    model = LGBMClassifier(**lgbm_params)
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        eval_metric='auc'
    )

    lgbm_oof[val_idx] = model.predict_proba(X_val_fold)[:, 1]
    # Each fold model contributes an equal share to the test prediction.
    lgbm_test += model.predict_proba(X_test)[:, 1] / n_splits

    fold_auc = roc_auc_score(y_val_fold, lgbm_oof[val_idx])
    lgbm_scores.append(fold_auc)
    print(f"Fold {fold} AUC: {fold_auc:.6f}")

# AUC over the full set of out-of-fold predictions.
lgbm_auc = roc_auc_score(y_train, lgbm_oof)
print(f"\nLightGBM Overall CV AUC: {lgbm_auc:.6f}")

In [None]:
# Reuse the splitter from the LightGBM cell and take the fold count from it,
# so the test-prediction average always matches the number of folds.
n_splits = skf.get_n_splits()

print(f"\nXGBoost - {n_splits}-Fold CV...\n")

# Out-of-fold predictions for train, fold-averaged predictions for test.
xgb_oof = np.zeros(len(y_train))
xgb_test = np.zeros(len(X_test))
xgb_scores = []

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'booster': 'gbtree',
    'max_depth': 7,
    'learning_rate': 0.03,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'random_state': RANDOM_STATE,
    'n_estimators': 2000,
    'early_stopping_rounds': 100,
    'verbosity': 0
}

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    X_train_fold = X_train[train_idx]
    X_val_fold = X_train[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]

    # The validation fold is passed as eval_set; together with
    # 'early_stopping_rounds' in the params it is the data the AUC is
    # monitored on during fitting.
    model = XGBClassifier(**xgb_params)
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        verbose=False
    )

    xgb_oof[val_idx] = model.predict_proba(X_val_fold)[:, 1]
    # Each fold model contributes an equal share to the test prediction.
    xgb_test += model.predict_proba(X_test)[:, 1] / n_splits

    fold_auc = roc_auc_score(y_val_fold, xgb_oof[val_idx])
    xgb_scores.append(fold_auc)
    print(f"Fold {fold} AUC: {fold_auc:.6f}")

# AUC over the full set of out-of-fold predictions.
xgb_auc = roc_auc_score(y_train, xgb_oof)
print(f"\nXGBoost Overall CV AUC: {xgb_auc:.6f}")

## Ensemble & Submission

In [None]:
# Fixed-weight blend of the two models: 60% LightGBM, 40% XGBoost.
lgbm_weight, xgb_weight = 0.6, 0.4
ensemble_oof = lgbm_weight * lgbm_oof + xgb_weight * xgb_oof
ensemble_test = lgbm_weight * lgbm_test + xgb_weight * xgb_test

# Score the blend on the out-of-fold predictions.
ensemble_auc = roc_auc_score(y_train, ensemble_oof)

banner = '=' * 70
print(banner)
print("ENSEMBLE RESULTS")
print(banner)
for model_name, model_auc in (
    ("LightGBM", lgbm_auc),
    ("XGBoost", xgb_auc),
    ("Ensemble", ensemble_auc),
):
    print(f"{model_name} CV AUC: {model_auc:.6f}")
print(banner)

In [None]:
# Write the blended test predictions into a copy of the sample submission.
# NOTE(review): assumes sample_submission rows are in the same order as
# test_df - confirm.
submission = sample_submission.assign(rule_violation=ensemble_test)
submission.to_csv('submission.csv', index=False)

print("\nSubmission saved: submission.csv")
print("\nPrediction stats:")
prediction_stats = (
    ("Min", ensemble_test.min()),
    ("Max", ensemble_test.max()),
    ("Mean", ensemble_test.mean()),
    ("Median", np.median(ensemble_test)),
)
for stat_name, stat_value in prediction_stats:
    print(f"  {stat_name}: {stat_value:.6f}")
print("\nDone!")