# Plan to Medal: Detecting Insults in Social Commentary

Goal: Optimize AUC-ROC with robust text modeling under time limits.

Milestones:
- Data loading & sanity checks (schema, target balance)
- Baseline: TF-IDF (word + char) + Logistic Regression with 5-fold Stratified CV
- Iterate: tune n-grams, min_df, C, optional class weights
- Add simple preprocessing (lowercase, URL/user/number normalization)
- Error analysis: top coefficients, misclassified examples
- Finalize and generate submission.csv

Checkpoints for Expert Review:
- After plan (this cell)
- After initial EDA
- After baseline CV results
- After tuning trials (if needed)
- Before final submission

Time Management:
- Baseline within ~15 minutes
- Iterative tuning in parallel with light EDA
- Log timing per fold; interrupt if slow

In [2]:
import os, sys, time, re, html, gc
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Start the wall-clock timer for this cell and report where we are running.
start_time = time.time()
print('Working directory:', os.getcwd())

# Input files, resolved relative to the working directory.
train_path = 'train.csv'
test_path = 'test.csv'
sample_sub_path = 'sample_submission_null.csv'

# Read the three CSVs with pandas defaults.
train, test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, sample_sub_path)
)

# Report the shape of every frame, then its column names.
print('\nLoaded files:')
for label, frame in (
    ('train.shape:', train),
    ('test.shape:', test),
    ('sample_submission.shape:', sample_sub),
):
    print(label, frame.shape)

print('\nColumns:')
for label, frame in (
    ('train.columns:', train),
    ('test.columns:', test),
    ('sample_submission.columns:', sample_sub),
):
    print(label, frame.columns.tolist())

# Target column: the first train column whose name is "insult", ignoring case.
target_col = next((col for col in train.columns if col.lower() == 'insult'), None)
if target_col is None:
    raise ValueError('Target column Insult not found in train.csv')

# Text column: prefer a well-known column name (case-insensitive); if none is
# present, fall back to the first object-dtype column of train.
known_text_names = ('comment','comment_text','text','content','quotation')
text_col_candidates = [col for col in train.columns if col.lower() in known_text_names]
if text_col_candidates:
    text_col = text_col_candidates[0]
else:
    obj_cols = [col for col in train.columns if train[col].dtype == 'object']
    if not obj_cols:
        raise ValueError('No object/text column found in train.csv')
    text_col = obj_cols[0]

print(f"\nDetected target_col='{target_col}', text_col='{text_col}'")

# Work out the submission layout from the sample file: one prediction column
# (named "insult", case-insensitive) plus identifier columns (all the others).
insult_pred_col = next(
    (col for col in sample_sub.columns if col.lower() == 'insult'),
    None,
)
if insult_pred_col is None:
    raise ValueError('Could not find Insult column in sample_submission_null.csv')

submission_id_cols = [col for col in sample_sub.columns if col != insult_pred_col]

# Identifier values are copied from test later on, so each one must exist there.
missing_in_test = [col for col in submission_id_cols if col not in test.columns]
if missing_in_test:
    raise ValueError(f'Submission ID columns {missing_in_test} not present in test.csv')

print(f"Submission ID columns: {submission_id_cols}, submission target col='{insult_pred_col}'")

# Basic sanitation: make sure the text column exists in both frames and holds
# plain Python strings with no missing values.
for df, name in [(train,'train'),(test,'test')]:
    if text_col not in df.columns:
        raise ValueError(f"Text column '{text_col}' not in {name} columns")
    # Fill missing values BEFORE casting to str. Casting first turns NaN into
    # the literal text 'nan', which leaves fillna('') with nothing to replace
    # and feeds a spurious 'nan' token to the vectorizers.
    df[text_col] = df[text_col].fillna('').astype(str)

# Class balance of the target, as absolute counts and as fractions.
y = train[target_col].values
target_series = pd.Series(y)
print('\nTarget distribution:')
print(target_series.value_counts(normalize=False).rename('count'))
print(target_series.value_counts(normalize=True).rename('ratio'))

# Number of train rows whose comment text repeats an earlier row.
dup_train = train[text_col].duplicated().sum()
print(f"\nDuplicate comments within train: {dup_train}")

# Exact-match overlap between distinct train comments and distinct test comments.
train_text_set = set(train[text_col].tolist())
test_text_set = set(test[text_col].astype(str).tolist())
overlap = len(train_text_set.intersection(test_text_set))
print(f"Exact text overlap train<->test: {overlap}")

# Peek at the first few rows of each frame.
print('\nSample train rows:')
print(train.head(3)[[text_col, target_col]])
print('\nSample test rows (ID cols + text):')
cols_to_show = [*submission_id_cols, text_col]
print(test.head(3)[cols_to_show])

# Wall-clock time since start_time was set.
elapsed = time.time() - start_time
print(f"\nData load & checks done in {elapsed:.2f}s")

Working directory: /app/agent_run_states/detecting-insults-in-social-commentary-spray-20250910-034902

Loaded files:
train.shape: (3947, 3)
test.shape: (2647, 2)
sample_submission.shape: (2647, 3)

Columns:
train.columns: ['Insult', 'Date', 'Comment']
test.columns: ['Date', 'Comment']
sample_submission.columns: ['Insult', 'Date', 'Comment']

Detected target_col='Insult', text_col='Comment'
Submission ID columns: ['Date', 'Comment'], submission target col='Insult'

Target distribution:
0    2898
1    1049
Name: count, dtype: int64
0    0.734229
1    0.265771
Name: ratio, dtype: float64

Duplicate comments within train: 12
Exact text overlap train<->test: 0

Sample train rows:
                                             Comment  Insult
0                               "You fuck your dad."       1
1  "i really don't understand your point.\xa0 It ...       0
2  "A\\xc2\\xa0majority of Canadians can and has ...       0

Sample test rows (ID cols + text):
              Date                  

In [None]:
import gc
import html
import re
import time

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer

# Reproducibility / CV configuration: one seed value feeds both NumPy's global
# RNG and the scikit-learn objects that take `random_state`.
random_state = 42
folds = 5
np.random.seed(random_state)

# Preprocessing: minimal normalization.
# Patterns are compiled once at module level and looked up by normalize_text.
url_re = re.compile(r'https?://\S+|www\.\S+')
user_re = re.compile(r'@\w+')
num_re = re.compile(r'\d+')


def normalize_text(series):
    """Return a list with every string of *series* lightly normalized.

    For each string, in this order: HTML entities are unescaped, URLs are
    replaced by ``' _URL_ '``, ``@``-mentions by ``' _USER_ '`` and runs of
    digits by ``' _NUM_ '``. Placeholders are padded with spaces; any
    repeated spaces that result are left in place.

    Args:
        series: Iterable of strings.

    Returns:
        A new list of normalized strings, in input order.
    """
    # Order matters: URLs go first so that digits or '@' inside a URL are
    # swallowed by _URL_ instead of being rewritten separately.
    substitutions = (
        (url_re, ' _URL_ '),
        (user_re, ' _USER_ '),
        (num_re, ' _NUM_ '),
    )

    def _normalize_one(text):
        text = html.unescape(text)
        for pattern, placeholder in substitutions:
            text = pattern.sub(placeholder, text)
        return text

    return [_normalize_one(text) for text in series]

print('\n=== Baseline TF-IDF + Logistic Regression (word+char) ===')

# Settings common to both TF-IDF views; they differ only in analyzer and
# n-gram span (word 1-2 grams vs. character 3-5 grams).
tfidf_shared = dict(min_df=2, sublinear_tf=True, lowercase=True)
word_vect = TfidfVectorizer(analyzer='word', ngram_range=(1,2), **tfidf_shared)
char_vect = TfidfVectorizer(analyzer='char', ngram_range=(3,5), **tfidf_shared)

# Combine the word-level and character-level feature blocks.
features = FeatureUnion(
    transformer_list=[('word', word_vect), ('char', char_vect)],
    n_jobs=1,
)

# Classifier applied on top of the combined TF-IDF features.
logreg = LogisticRegression(
    solver='saga',
    C=3.0,
    max_iter=2000,
    class_weight='balanced',
    random_state=random_state,
    n_jobs=-1,
    verbose=0,
)

# We'll handle normalization outside the vectorizers to ensure consistency
X_text = normalize_text(train[text_col].tolist())
y = train[target_col].values.astype(int)

# Stratified K-fold CV producing out-of-fold (OOF) probabilities for every
# training row; AUC is reported per fold and over the full OOF vector.
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)
oof = np.zeros(len(train), dtype=float)
fold_times = []
for fold, (trn_idx, val_idx) in enumerate(skf.split(X_text, y), 1):
    t0 = time.time()
    print(f'Fold {fold}/{folds}: train={len(trn_idx)} val={len(val_idx)}')
    # X_text is a plain list, so index it element-wise.
    X_trn = [X_text[i] for i in trn_idx]
    X_val = [X_text[i] for i in val_idx]
    y_trn, y_val = y[trn_idx], y[val_idx]

    # Pipeline does not copy its steps, so wrapping the shared `features` /
    # `logreg` objects would refit the very same instances in every fold (and
    # again in the final fit). Clone them so each fold owns an independent,
    # unfitted copy with identical hyperparameters.
    pipe = Pipeline([
        ('feats', clone(features)),
        ('clf', clone(logreg)),
    ])
    pipe.fit(X_trn, y_trn)
    # Column 1 of predict_proba is taken as the score for the insult class.
    val_pred = pipe.predict_proba(X_val)[:, 1]
    oof[val_idx] = val_pred
    auc = roc_auc_score(y_val, val_pred)
    dt = time.time() - t0
    fold_times.append(dt)
    # flush=True so per-fold progress shows up immediately in long runs.
    print(f'  Fold {fold} AUC: {auc:.5f} | time: {dt:.2f}s', flush=True)
    gc.collect()

oof_auc = roc_auc_score(y, oof)
print(f'OOF AUC: {oof_auc:.5f}')
print('Fold times (s):', [round(t,2) for t in fold_times], ' | total:', round(sum(fold_times),2))

# Fit the final pipeline on every training row.
print('\nTraining final model on full training data...')
t0 = time.time()
final_pipe = Pipeline(steps=[('feats', features), ('clf', logreg)])
final_pipe.fit(X_text, y)
print(f'Final training done in {time.time()-t0:.2f}s')

# Score the test set using the same normalization as the training text.
X_test_text = normalize_text(test[text_col].astype(str).tolist())
test_pred = final_pipe.predict_proba(X_test_text)[:, 1]

# Assemble the submission: prediction column first, then the identifier
# columns in sample-submission order. Identifier values are taken from test
# and aligned on a 0..n-1 row index.
ordered_cols = [insult_pred_col, *submission_id_cols]
submission_data = {insult_pred_col: test_pred}
submission_data.update({col: test[col] for col in submission_id_cols})
sub = pd.DataFrame(submission_data, index=pd.RangeIndex(len(test_pred)))
sub = sub.loc[:, ordered_cols]

sub.to_csv('submission.csv', index=False)
print('Saved submission.csv with shape:', sub.shape)
print(sub.head(3))


=== Baseline TF-IDF + Logistic Regression (word+char) ===
Fold 1/5: train=3157 val=790
