# Detecting Insults in Social Commentary - Plan and Experiment Log

## Plan
- Load data (train.csv, test.csv); inspect columns, sizes, target distribution.
- Baseline: TF-IDF (word + char n-grams) -> Logistic Regression (linear) with class_weight='balanced'.
- Cross-validation: StratifiedKFold (5 folds), AUC-ROC per fold; log timing.
- Iterate: try feature tweaks (char_wb 3-5, word 1-2, sublinear TF, min_df), tune C and regularization.
- Train on full train with chosen setup; generate test predictions and save submission.csv.
- Keep concise logs; avoid long blocking; interrupt if too slow.

## Experiment Log
- v0: Baseline TF-IDF(word 1-2, char_wb 3-5, sublinear_tf, min_df=2) + LogisticRegression(saga, l2, C=4.0, class_weight='balanced'). 5-fold CV AUC target: >0.78.

---

In [1]:
# Baseline: TF-IDF (word+char) + Logistic Regression with 5-fold CV
import os, re, time, sys, math, random
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

pd.set_option('display.max_colwidth', 200)

def normalize_text(s: str) -> str:
    if not isinstance(s, str):
        s = '' if pd.isna(s) else str(s)
    s = s.lower()
    # URLs
    s = re.sub(r'https?://\S+|www\.\S+', ' URL ', s)
    # @mentions
    s = re.sub(r'@[A-Za-z0-9_]+', ' USER ', s)
    # numbers
    s = re.sub(r'\d+', ' NUM ', s)
    return s

print('Loading data...')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
text_col = 'Comment'
target_col = 'Insult'
id_col = 'id' if 'id' in train.columns else ('Id' if 'Id' in train.columns else train.columns[0])
print('Train shape:', train.shape, ' Test shape:', test.shape)
print('Columns:', train.columns.tolist())
print('Target distribution:')
print(train[target_col].value_counts(normalize=True))

X_text = train[text_col].astype(str).apply(normalize_text).values
y = train[target_col].values.astype(int)
X_test_text = test[text_col].astype(str).apply(normalize_text).values

# Config
n_splits = 5
seed = 42
word_params = dict(ngram_range=(1,2), min_df=2, strip_accents='unicode', lowercase=True, sublinear_tf=True, analyzer='word')
char_params = dict(ngram_range=(3,5), min_df=2, strip_accents='unicode', lowercase=True, sublinear_tf=True, analyzer='char_wb')
C_val = 4.0

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
oof = np.zeros(len(train), dtype=float)
fold_times = []
fold_aucs = []

print('Starting 5-fold CV...')
t0 = time.time()
for fold, (tr_idx, va_idx) in enumerate(skf.split(X_text, y), 1):
    fstart = time.time()
    X_tr, X_va = X_text[tr_idx], X_text[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    # Vectorizers fit on training fold only
    word_vec = TfidfVectorizer(**word_params)
    char_vec = TfidfVectorizer(**char_params)
    Xw_tr = word_vec.fit_transform(X_tr)
    Xw_va = word_vec.transform(X_va)
    Xc_tr = char_vec.fit_transform(X_tr)
    Xc_va = char_vec.transform(X_va)
    X_tr_mat = sparse.hstack([Xw_tr, Xc_tr], format='csr')
    X_va_mat = sparse.hstack([Xw_va, Xc_va], format='csr')

    clf = LogisticRegression(solver='saga', penalty='l2', C=C_val, max_iter=5000, n_jobs=-1, random_state=seed)
    clf.fit(X_tr_mat, y_tr)
    va_pred = clf.predict_proba(X_va_mat)[:, 1]
    oof[va_idx] = va_pred
    auc = roc_auc_score(y_va, va_pred)
    fold_aucs.append(auc)
    ftime = time.time() - fstart
    fold_times.append(ftime)
    print(f'Fold {fold}/{n_splits}: AUC={auc:.5f} | time={ftime:.2f}s | tr_n={len(tr_idx)} va_n={len(va_idx)}', flush=True)

cv_auc = roc_auc_score(y, oof)
elapsed = time.time() - t0
print(f'CV AUC (OOF): {cv_auc:.5f} | mean_fold={np.mean(fold_aucs):.5f} ± {np.std(fold_aucs):.5f} | total_time={elapsed:.2f}s')

# Train on full data for baseline submission
print('Training full model for submission...')
wf = TfidfVectorizer(**word_params)
cf = TfidfVectorizer(**char_params)
Xw_full = wf.fit_transform(X_text)
Xc_full = cf.fit_transform(X_text)
X_full = sparse.hstack([Xw_full, Xc_full], format='csr')
Xw_test = wf.transform(X_test_text)
Xc_test = cf.transform(X_test_text)
X_test = sparse.hstack([Xw_test, Xc_test], format='csr')
full_clf = LogisticRegression(solver='saga', penalty='l2', C=C_val, max_iter=5000, n_jobs=-1, random_state=seed)
full_train_start = time.time()
full_clf.fit(X_full, y)
print(f'Full fit time: {time.time()-full_train_start:.2f}s')
test_pred = full_clf.predict_proba(X_test)[:, 1]

sub = pd.DataFrame({
    id_col: test[id_col].values,
    'Insult': test_pred
})
sub_path = 'submission.csv'
sub.to_csv(sub_path, index=False)
print('Saved submission to', sub_path)

# Log to experiment section for traceability
print('\n--- Baseline Summary ---')
print(f'Params: word {word_params}, char {char_params}, C={C_val}')
print(f'OOF AUC: {cv_auc:.5f}, folds: {fold_aucs}')

Loading data...
Train shape: (3947, 3)  Test shape: (2647, 2)
Columns: ['Insult', 'Date', 'Comment']
Target distribution:
Insult
0    0.734229
1    0.265771
Name: proportion, dtype: float64
Starting 5-fold CV...


Fold 1/5: AUC=0.90745 | time=5.93s | tr_n=3157 va_n=790


Fold 2/5: AUC=0.90729 | time=5.36s | tr_n=3157 va_n=790


Fold 3/5: AUC=0.91049 | time=5.16s | tr_n=3158 va_n=789


Fold 4/5: AUC=0.90757 | time=5.94s | tr_n=3158 va_n=789


Fold 5/5: AUC=0.89790 | time=5.68s | tr_n=3158 va_n=789


CV AUC (OOF): 0.90588 | mean_fold=0.90614 ± 0.00429 | total_time=28.08s
Training full model for submission...


Full fit time: 5.48s


KeyError: 'Insult'

In [2]:
# Fix submission: detect id column from test and save submission.csv
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Determine id column from test (not train)
id_col = 'id' if 'id' in test.columns else ('Id' if 'Id' in test.columns else test.columns[0])
print('Using id column:', id_col)

def ensure_test_pred():
    global test_pred
    if 'test_pred' in globals():
        return
    # Recompute minimal pipeline if needed
    wf = TfidfVectorizer(**word_params)
    cf = TfidfVectorizer(**char_params)
    Xw_full = wf.fit_transform(X_text)
    Xc_full = cf.fit_transform(X_text)
    X_full = sparse.hstack([Xw_full, Xc_full], format='csr')
    Xw_test = wf.transform(X_test_text)
    Xc_test = cf.transform(X_test_text)
    X_test_mat = sparse.hstack([Xw_test, Xc_test], format='csr')
    clf = LogisticRegression(solver='saga', penalty='l2', C=C_val, max_iter=5000, n_jobs=-1, random_state=seed)
    clf.fit(X_full, y)
    globals()['test_pred'] = clf.predict_proba(X_test_mat)[:, 1]

ensure_test_pred()
sub = pd.DataFrame({id_col: test[id_col].values, 'Insult': test_pred})
sub_path = 'submission.csv'
sub.to_csv(sub_path, index=False)
print('Saved submission to', sub_path, 'with shape', sub.shape)

Using id column: Date
Saved submission to submission.csv with shape (2647, 2)


In [3]:
# Create submission matching sample format: columns ['Insult','Date','Comment']
assert 'test_pred' in globals(), 'test_pred missing; run previous cells to compute predictions.'
required_cols = ['Insult', 'Date', 'Comment']
sub3 = pd.DataFrame({
    'Insult': test_pred,
    'Date': test['Date'].values,
    'Comment': test['Comment'].values
})[required_cols]
sub3_path = 'submission.csv'
sub3.to_csv(sub3_path, index=False)
print('Saved submission with shape', sub3.shape, 'and columns', sub3.columns.tolist())

Saved submission with shape (2647, 3) and columns ['Insult', 'Date', 'Comment']
