In [None]:
%%bash
pip install -q pandas scikit-learn numpy matplotlib seaborn


### Data Augmentation
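Apply simple token-dropout augmentation to expand the training data, then evaluate its impact by comparing a model trained on the augmented set against a baseline trained on the original data.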


In [None]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# Load the comments table and derive a combined aspect/label key used only
# for stratification.
df = pd.read_csv(Path('../../data/comments.csv'))
df['stratify_key'] = df['aspect'] + '_' + df['label']

# 80/20 split stratified on the combined key. Each resulting part gets a
# `text_with_aspect` column via `assign`, which returns a new frame rather
# than writing into the split outputs in place.
train_df, test_df = (
    part.assign(text_with_aspect='Aspect: ' + part['aspect'] + ' | ' + part['comment'])
    for part in train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=df['stratify_key'],
    )
)
print(train_df.head())


In [None]:
import random
# Seed the module-level generator: token_dropout below draws from
# random.random(), so a fixed seed makes the augmented text repeatable
# whenever this cell is re-run from the top.
random.seed(42)

def token_dropout(text, drop_prob=0.1, rng=None):
    """Randomly delete whitespace-separated tokens from ``text``.

    One uniform draw in [0, 1) is made per token; the token is kept only when
    the draw is strictly greater than ``drop_prob``. Surviving tokens are
    re-joined with single spaces, so runs of whitespace in the input are
    collapsed whenever at least one token survives.

    Args:
        text: Input string; tokens are obtained with ``str.split()``.
        drop_prob: Threshold the per-token draw must exceed for the token to
            be kept (0.1 by default).
        rng: Optional object exposing a ``random()`` method (for example a
            ``random.Random`` instance). When omitted, the module-level
            ``random`` generator is used, so ``random.seed(...)`` keeps
            controlling the outcome exactly as before.

    Returns:
        The text with some tokens removed, or the original ``text`` unchanged
        when no token survives (including when ``text`` has no tokens).
    """
    draw = random.random if rng is None else rng.random
    kept = [token for token in text.split() if draw() > drop_prob]
    # Never return an empty sample: fall back to the untouched input.
    return ' '.join(kept) if kept else text

# Create a noisy twin of every training row by running token dropout (0.2)
# over its model input text; all other columns, including the label, are
# carried over unchanged.
# NOTE(review): dropout is applied to the full `text_with_aspect` string, so
# tokens of the "Aspect: ... |" prefix can be removed too - confirm intended.
augmented = train_df.assign(
    text_with_aspect=train_df['text_with_aspect'].apply(token_dropout, drop_prob=0.2)
)

# Stack the original rows on top of their augmented copies with a fresh index.
combined = pd.concat((train_df, augmented), ignore_index=True)
print(f"Original train size: {len(train_df)}, after augmentation: {len(combined)}")


In [None]:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import classification_report

    def _make_text_classifier():
        """Return a fresh, unfitted TF-IDF (1- and 2-grams) + logistic regression pipeline."""
        return Pipeline([
            ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
            ('clf', LogisticRegression(max_iter=400)),
        ])

    # Baseline: fit on the original training rows only.
    baseline = _make_text_classifier()
    baseline.fit(train_df['text_with_aspect'], train_df['label'])
    base_preds = baseline.predict(test_df['text_with_aspect'])
    print('Baseline (no augmentation)')
    print(classification_report(test_df['label'], base_preds))

    # Same architecture, fit on original + token-dropout rows; both models are
    # scored on the same untouched test split so the reports are comparable.
    aug_model = _make_text_classifier()
    aug_model.fit(combined['text_with_aspect'], combined['label'])
    aug_preds = aug_model.predict(test_df['text_with_aspect'])
    # The header string was split across two physical lines (unterminated
    # literal); the intended leading newline is written as an escape.
    print('\nWith augmentation')
    print(classification_report(test_df['label'], aug_preds))
