# Бейзлайны для задачи "Question Answering for Yes/No-Questions"

Для оценки моделей воспользуемся следующими метриками:
- AUROC
- F1
- precision
- recall
- accuracy

In [48]:
import json
import gensim
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_auc_score, 
    f1_score, 
    precision_score, 
    recall_score, 
    accuracy_score
)

# Relative locations of the two JSON Lines splits read by this notebook.
PATH_TRAIN = "../data/train.jsonl"
# The file named dev.jsonl is what the rest of the notebook treats as the test split.
PATH_TEST = "../data/dev.jsonl"


def read_data(path):
    """Load a JSON Lines file into a DataFrame.

    Args:
        path: Path to a text file holding one JSON object per line.

    Returns:
        pandas.DataFrame with one row per non-blank line; columns are the
        keys of the parsed objects.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Explicit UTF-8 so non-ASCII text is decoded the same way regardless of
    # the platform's default encoding.
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            line = line.strip()
            if not line:
                # Blank lines (e.g. trailing newlines) carry no record.
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)


In [58]:
def get_scores(y_true, y_score, thr=0.5):
    """Compute binary-classification metrics for scores against labels.

    Args:
        y_true: Ground-truth binary labels (array-like).
        y_score: Score for the positive class, one per label (array-like;
            plain Python lists are accepted).
        thr: Decision threshold. A score strictly greater than ``thr`` is
            turned into a positive prediction.

    Returns:
        dict with keys "auroc", "f1", "precision", "recall" and "accuracy".
        AUROC is computed from the raw scores; the other four use the
        thresholded predictions.
    """
    # Coerce to an ndarray so the element-wise comparison below also works
    # when the caller passes a plain list.
    y_score = np.asarray(y_score)
    y_pred = (y_score > thr).astype(int)
    return {
        "auroc": roc_auc_score(y_true=y_true, y_score=y_score),
        "f1": f1_score(y_true=y_true, y_pred=y_pred),
        "precision": precision_score(y_true=y_true, y_pred=y_pred),
        "recall": recall_score(y_true=y_true, y_pred=y_pred),
        "accuracy": accuracy_score(y_true=y_true, y_pred=y_pred),
    }

In [3]:
# Parse both splits into DataFrames: train first, then the dev file used as test.
d_train = read_data(path=PATH_TRAIN)
d_test = read_data(path=PATH_TEST)


# Бейзлайн 1: Наиболее частый ответ

In [None]:
# Most-frequent-answer baseline: predict one constant label for every example.
# Slicing to the first mode before .item() keeps this working when several
# labels tie for most frequent (mode() then returns more than one value).
ans_train = d_train.answer.mode().values[:1].item()
# Reported for comparison only; it is not used to build any prediction.
ans_test = d_test.answer.mode().values[:1].item()
print(f"{ans_train=}, {ans_test=}")
# The constant is taken from train and applied to both splits, so test labels
# never influence the test predictions.
p_train = np.full(d_train.shape[0], ans_train)
p_test = np.full(d_test.shape[0], ans_train)


In [69]:
# Train-split metrics for the constant predictions held in p_train.
get_scores(d_train["answer"].tolist(), p_train)

{'auroc': 0.5,
 'f1': 0.7677929547088426,
 'precision': 0.6231038506417736,
 'recall': 1.0,
 'accuracy': 0.6231038506417736}

In [71]:
# Test-split metrics for the constant predictions held in p_test.
get_scores(d_test["answer"].tolist(), p_test)

{'auroc': 0.5,
 'f1': 0.7667358099189139,
 'precision': 0.6217125382262997,
 'recall': 1.0,
 'accuracy': 0.6217125382262997}

# Бейзлайн 2: Средний ответ

In [73]:
# Mean-answer baseline: score every example with the average label value.
ans_train = d_train.answer.mean().item()
# Reported for comparison only; it is not used to build any prediction.
ans_test = d_test.answer.mean().item()
print(f"{ans_train=}, {ans_test=}")

# The constant score is taken from train and applied to both splits, so test
# labels never influence the test predictions.
p_train = np.full(d_train.shape[0], ans_train)
p_test = np.full(d_test.shape[0], ans_train)

ans_train=0.6231038506417736, ans_test=0.6217125382262997


In [74]:
# Train-split metrics for the constant scores held in p_train.
get_scores(d_train["answer"].tolist(), p_train)

{'auroc': 0.5,
 'f1': 0.7677929547088426,
 'precision': 0.6231038506417736,
 'recall': 1.0,
 'accuracy': 0.6231038506417736}

In [75]:
# Test-split metrics for the constant scores held in p_test.
get_scores(d_test["answer"].tolist(), p_test)

{'auroc': 0.5,
 'f1': 0.7667358099189139,
 'precision': 0.6217125382262997,
 'recall': 1.0,
 'accuracy': 0.6217125382262997}

# Бейзлайн 3: FastText

In [25]:
# Load word vectors from a fastText binary in Facebook's format; `model` is
# used below to average token vectors per example.
# NOTE(review): the file name suggests pretrained Russian 300-d vectors — confirm the source.
model = gensim.models.fasttext.load_facebook_vectors("../data/cc.ru.300.bin")

In [76]:
# Join question and passage into one text column ("tp"), then split each text
# into a list of tokens with gensim's tokenizer.
d_train["tp"] = d_train["question"] + ". " + d_train["passage"]
d_test["tp"] = d_test["question"] + ". " + d_test["passage"]
tokens_train = [list(gensim.utils.tokenize(text)) for text in d_train["tp"]]
tokens_test = [list(gensim.utils.tokenize(text)) for text in d_test["tp"]]

In [77]:
# One row per training example: the mean vector of its tokens, stored as float64.
x_train = np.array(
    [model.get_mean_vector(sample) for sample in tokens_train],
    dtype=np.float64,
)


In [78]:
# One row per test example: the mean vector of its tokens, stored as float64.
x_test = np.array(
    [model.get_mean_vector(sample) for sample in tokens_test],
    dtype=np.float64,
)


In [79]:
# Number of rows (examples) in each feature matrix.
x_train.shape[0], x_test.shape[0]

(9427, 3270)

In [101]:

# An earlier run used min_child_samples=10. The value 0.7706774951912803 was
# noted next to both settings; which metric it refers to is not recorded.
# NOTE(review): confirm the number or drop it.
params = {"min_child_samples": 8}

# Gradient-boosted classifier on the averaged token vectors; every other
# hyperparameter is left at the LightGBM default.
est = lgb.LGBMClassifier(**params)
est.fit(x_train, d_train["answer"].values)


[LightGBM] [Info] Number of positive: 5874, number of negative: 3553
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009960 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 9427, number of used features: 300
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623104 -> initscore=0.502744
[LightGBM] [Info] Start training from score 0.502744


In [102]:
# Train-split metrics, scored with the second predict_proba column.
get_scores(d_train["answer"].values, est.predict_proba(x_train)[:, 1])


{'auroc': 0.9948531939277219,
 'f1': 0.9497360941940722,
 'precision': 0.9079335506908865,
 'recall': 0.9955737146748382,
 'accuracy': 0.9343375411053357}

In [103]:
# Test-split metrics, scored with the second predict_proba column.
get_scores(d_test["answer"].values, est.predict_proba(x_test)[:, 1])


{'auroc': 0.6582806490004656,
 'f1': 0.7660297239915074,
 'precision': 0.673888681359731,
 'recall': 0.8873585833743236,
 'accuracy': 0.6629969418960244}