In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import random

In [2]:
from preprocessing import get_data

# Load the three splits; each call is unpacked into a (labels, corpus) pair.
# Later cells treat every corpus entry as a sequence of token strings
# (they are joined with spaces before vectorisation).
# The 'train' split dominates the load time (about 3 minutes in the recorded run).
labels, corpus = get_data('train')
labels_valid, corpus_valid = get_data('dev')
labels_test, corpus_test = get_data('test')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2593169/2593169 [02:56<00:00, 14673.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 33932/33932 [00:02<00:00, 16805.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 34493/34493 [00:02<00:00, 17185.61it/s]


In [3]:
def flatten(t):
    """Concatenate the items of every sub-iterable of ``t`` into one flat list."""
    return [item for sublist in t for item in sublist]


# Preview 20 distinct tokens from the training vocabulary.
# The vocabulary is sorted before sampling because the iteration order of a set
# of strings varies between interpreter sessions, and a dedicated seeded RNG is
# used so the preview is identical on every run without touching the global
# `random` state. `sample` draws without replacement, so no token is repeated.
train_vocabulary = sorted(set(flatten(corpus)))
sorted(random.Random(0).sample(train_vocabulary, k=min(20, len(train_vocabulary))))

['breas',
 'fbdg',
 'ii/phase',
 'interscalenus',
 'iocm',
 'lmvh',
 'meantime',
 'molly',
 'naps',
 'oblong',
 'pardee',
 'pat',
 'pelf',
 'pfmdri',
 'phosphocreatine/adenosine',
 'preferences/perceptions',
 'reversible',
 'wilcoxonmannwhitney',
 'yjhd',
 'zf']

In [4]:
def to_strings(corpus: list) -> list:
    """Turn each tokenised sentence into one string, tokens separated by a single space.

    Returns a new list with one string per entry of ``corpus``, in the same order.
    """
    return [' '.join(tokens) for tokens in corpus]

# Fit TF-IDF on the training split; X is the resulting document-term matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(to_strings(corpus))

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` (converted to a plain list of str so the
# printed sample looks the same) and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Spot-check a random handful of learned features and the matrix dimensions.
print(random.choices(feature_names, k=20))
print(X.shape)

['biglycans', 'estrogenization', 'facebook', 'prolase', 'extramitochondrial', 'yueyueshu', 'bullied', 'cardiopump', 'caplet', 'inconsistently', 'wugong', 'scintigraphy', 'anvisa', 'radiosynthesis', 'mullins', 'ischemic', 'emile', 'zhuhai', 'crowe', 'died']
(2211861, 135141)


In [5]:
# L2-regularised logistic regression trained with the liblinear solver on the
# TF-IDF training matrix; random_state is pinned for repeatable fits.
scikit_log_reg = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
    verbose=1,
)
# `fit` returns the fitted estimator, which later cells use under the name `model`.
model = scikit_log_reg.fit(X, labels)

[LibLinear]

In [6]:
from sklearn.metrics import f1_score

def evaluate(X, y, estimator=None):
    """Print micro-, macro- and weighted-averaged F1 of a classifier on one split.

    Parameters
    ----------
    X : feature matrix accepted by the classifier's ``predict``.
    y : true labels aligned with the rows of ``X``.
    estimator : optional fitted classifier exposing ``predict``. When omitted,
        the notebook-level ``model`` is used, so existing ``evaluate(X, y)``
        calls keep working.

    Returns
    -------
    None. The three scores are written to stdout.
    """
    # Resolve the classifier explicitly so the function is not tied to a single
    # hidden notebook variable and can score any fitted estimator.
    clf = model if estimator is None else estimator
    y_pred = clf.predict(X)
    micro = f1_score(y, y_pred, average='micro')
    macro = f1_score(y, y_pred, average='macro')
    weighted = f1_score(y, y_pred, average='weighted')
    # average='samples' is not reported: scikit-learn documents it as meaningful
    # only for multilabel classification.
    print(f'F1 Score: micro {micro}, macro {macro}, weighted {weighted}')

In [7]:
# Project the held-out splits with the already-fitted vectorizer
# (`transform`, not `fit_transform`, so nothing is re-learned from these splits).
X_valid = vectorizer.transform(to_strings(corpus_valid))
X_test = vectorizer.transform(to_strings(corpus_test))

# Report F1 on the dev split first, then on the test split.
evaluate(X_valid, labels_valid)
evaluate(X_test, labels_test)

F1 Score: micro 0.7963846260196322, macro 0.7217727643915564, weighted 0.7918544950542978
F1 Score: micro 0.7978842437188486, macro 0.723853019785438, weighted 0.7932956511373992
