# Baseline

In [1]:
import os
from typing import Any, Dict

import numpy as np
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

## Utilities

In [2]:
def train_validate_test_logistic_regression_model(train_dict: Dict[str, Any],
                                                  dev_dict: Dict[str, Any],
                                                  C: float, random_seed: int) -> Dict[str, float]:
    """Train and validate logistic regression model with tfidf word features"""

    # define tfidf vectorizer
    vectorizer = TfidfVectorizer(
        max_features=None,
        encoding='utf-8',
        tokenizer=nltk.word_tokenize,
        ngram_range=(1, 1),
    )

    # fit vectorizer
    vectorizer.fit(train_dict['text'])

    train_X = vectorizer.transform(train_dict['text'])
    dev_X = vectorizer.transform(dev_dict['text'])

    # Define Logistic Regression model
    model = LogisticRegression(
        solver='liblinear',
        random_state=random_seed,
        verbose=False,
        C=C,
    )
    # Fit the model to training data
    model.fit(
        train_X,
        train_dict['labels']
    )

    # make prediction using the trained model
    train_pred = model.predict(train_X)
    dev_pred = model.predict(dev_X)

    # compute F1 scores
    train_f1 = f1_score(y_pred=train_pred, y_true=train_dict['labels'], average='macro', labels=['0', '1'])
    dev_f1 = f1_score(y_pred=dev_pred, y_true=dev_dict['labels'], average='macro', labels=['0', '1'])

    return {
        'train_f1': train_f1,
        'dev_f1': dev_f1,
    }


def pick_best_dev_score(scores_dict: Dict[float, Dict[str, float]]) -> Dict[str, float]:
    best_val = {'dev_f1': -1}
    for k, val in scores_dict.items():
        if val['dev_f1'] > best_val['dev_f1']:
            best_val = val
    return best_val

## Load data

In [3]:
DATA_DIR = os.path.join('../data/GermEval21_Toxic_Train')
assert os.path.isdir(DATA_DIR)
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), encoding='utf-8', sep=',')
dev_df = pd.read_csv(os.path.join(DATA_DIR, 'dev.csv'), encoding='utf-8', sep=',')

train_di = {
    'text': train_df['comment_text'],
    'labels': train_df['Sub3_FactClaiming'].astype(str),
}
dev_di = {
    'text': dev_df['comment_text'],
    'labels': dev_df['Sub3_FactClaiming'].astype(str),
}

## Train and evaluate

In [4]:
scores_dict = {}
for c in [1.0, 2.0, 3.0, 4.0, 5.0]:
    scores_dict[c] = train_validate_test_logistic_regression_model(
        train_dict=train_di,
        dev_dict=dev_di,
        C=c,
        random_seed=123,
    )



In [5]:
scores_dict

{1.0: {'train_f1': 0.7965483264649844, 'dev_f1': 0.6945666752947266},
 2.0: {'train_f1': 0.8710244063146564, 'dev_f1': 0.7032396716888818},
 3.0: {'train_f1': 0.918281801734961, 'dev_f1': 0.7041393143690737},
 4.0: {'train_f1': 0.944626334665374, 'dev_f1': 0.7064309184791112},
 5.0: {'train_f1': 0.9614401616689945, 'dev_f1': 0.7095785657641327}}

In [6]:
pick_best_dev_score(scores_dict)


{'train_f1': 0.9614401616689945, 'dev_f1': 0.7095785657641327}

## Train using 5-fold cross-validation data

In [7]:
CROSS_VALIDATION_DATA_DIR = os.path.join('../data/cross_validation')
results_dict = {}
for fold_name in ['fold_A', 'fold_B', 'fold_C', 'fold_D', 'fold_E']:
    print(f'*** {fold_name} ***')
    data_dir = os.path.join(CROSS_VALIDATION_DATA_DIR, fold_name)
    assert os.path.isdir(data_dir)
    train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'), encoding='utf-8', sep=',')
    dev_df = pd.read_csv(os.path.join(data_dir, 'dev.csv'), encoding='utf-8', sep=',')

    train_di = {
        'text': train_df['comment_text'],
        'labels': train_df['Sub3_FactClaiming'].astype(str),
    }
    dev_di = {
        'text': dev_df['comment_text'],
        'labels': dev_df['Sub3_FactClaiming'].astype(str),
    }

    scores_dict = {}
    for c in [1.0, 2.0, 3.0, 4.0, 5.0]:
        scores_dict[c] = train_validate_test_logistic_regression_model(
            train_dict=train_di,
            dev_dict=dev_di,
            C=c,
            random_seed=123,
        )

    results_dict[fold_name] = scores_dict

*** fold_A ***




*** fold_B ***




*** fold_C ***




*** fold_D ***




*** fold_E ***




In [8]:
fold_names = ['fold_A', 'fold_B', 'fold_C', 'fold_D', 'fold_E']
train_f1_means = []
train_f1_stds = []
dev_f1_means = []
dev_f1_stds = []
Cs = []
for c in [1.0, 2.0, 3.0, 4.0, 5.0]:
    Cs.append(c)
    train_f1_means.append(
        np.mean([results_dict[fold_name][c]['train_f1'] for fold_name in fold_names])
    )
    train_f1_stds.append(
        np.std([results_dict[fold_name][c]['train_f1'] for fold_name in fold_names])
    )
    dev_f1_means.append(
        np.mean([results_dict[fold_name][c]['dev_f1'] for fold_name in fold_names])
    )
    dev_f1_stds.append(
        np.std([results_dict[fold_name][c]['dev_f1'] for fold_name in fold_names])
    )
table_dict = {
    'C': Cs,
    'train_f1': [f'{train_f1_mean:0.3f} ± {train_f1_std:0.2f}' for train_f1_mean, train_f1_std in
                 zip(train_f1_means, train_f1_stds)],
    'dev_f1': [f'{dev_f1_mean:0.3f} ± {dev_f1_std:0.2f}' for dev_f1_mean, dev_f1_std in zip(dev_f1_means, dev_f1_stds)],
}

In [9]:
pd.DataFrame(table_dict)

Unnamed: 0,C,train_f1,dev_f1
0,1.0,0.800 ± 0.00,0.688 ± 0.00
1,2.0,0.876 ± 0.00,0.701 ± 0.01
2,3.0,0.919 ± 0.00,0.709 ± 0.01
3,4.0,0.946 ± 0.00,0.707 ± 0.01
4,5.0,0.964 ± 0.00,0.711 ± 0.01
