In [None]:
import warnings

import numpy as np

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

from tqdm.notebook import tqdm

from src import utils

In [None]:
# Register tqdm's progress-bar integration with pandas (nothing visible in
# this part of the notebook uses it yet).
tqdm.pandas()

# Silence every warning for the rest of the session.
# NOTE(review): this also hides scikit-learn convergence warnings.
warnings.filterwarnings(action="ignore")

In [None]:
# Load the numeric matrix from disk. np.loadtxt is called with its default
# settings, so the file is parsed as whitespace-delimited despite the
# ".csv" extension.
baseline_data = np.loadtxt(fname='./data/data_spacy_emb.csv')

In [None]:
# Every column except the last is a feature; the last column is the target.
X = baseline_data[:, :-1]
y = baseline_data[:, -1]

In [None]:
# Hold out 30% of the rows as a test set using the project helper.
# NOTE(review): shuffle=True is requested but no seed is passed here --
# confirm whether utils.split_data is deterministic.
train_part, test_part = utils.split_data(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    validate=False,
)
X_train, y_train = train_part
X_test, y_test = test_part

# Baseline

In [None]:
# Fixed seed for the fold shuffling, so the cross-validation estimate is the
# same on every "Restart Kernel -> Run All" given the same training data.
CV_SEED = 42

# Baseline classifier: logistic regression with scikit-learn defaults.
baseline_model = LogisticRegression(verbose=0)

# Stratified 3-fold CV; shuffling is seeded so fold membership is reproducible.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=CV_SEED)

# One macro-averaged F1 score per fold, computed on the training split only.
f1_cv = cross_val_score(
    estimator=baseline_model,
    X=X_train,
    y=y_train,
    cv=cv,
    verbose=0,
    scoring="f1_macro",
    n_jobs=-1,
)

print("Mean f1 macro score on cv = ", np.mean(f1_cv))

In [None]:
# Refit the baseline on the full training split.
baseline_model = LogisticRegression(C=1, verbose=0)
baseline_model.fit(X_train, y_train)

# Score the held-out test split with macro-averaged F1.
y_pred_test = baseline_model.predict(X_test)
f1_test = f1_score(y_true=y_test, y_pred=y_pred_test, average="macro")

print("f1 macro score on test =", f1_test)