In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import random
import os
from tqdm import tqdm

In [18]:
def get_data(filename): # train, dev, test
    labels = ['background', 'objective', 'methods', 'results', 'conclusions']
    data = []
    with open(os.path.join('./PubMed_200k_RCT', f'{filename}.txt'), 'r') as f:
        data = f.readlines()
    output_labels = []  # define an empty list to store the labels
    output_sentences = []  # define an empty list to store the sentences

    for line in tqdm(data):
        line = line.split()
        if len(line) >= 2:
            label = line[0].lower()
            if label not in labels:
                continue
            else:
                labelnum = labels.index(label)
                
                output_labels.append(labelnum)
                output_sentences.append(' '.join(line[1:]))
    return output_labels, output_sentences

labels, corpus = get_data('train')
labels_valid, corpus_valid = get_data('dev')
labels_test, corpus_test = get_data('test')

100%|████████████████████████████████████████████████████████████████████| 2593169/2593169 [00:05<00:00, 491758.86it/s]
100%|████████████████████████████████████████████████████████████████████████| 33932/33932 [00:00<00:00, 507800.06it/s]
100%|████████████████████████████████████████████████████████████████████████| 34493/34493 [00:00<00:00, 467369.17it/s]


In [19]:
sorted(random.choices(list(set(corpus)), k=20))

['After 3 years , 87 % of 183 patients on treatment were receiving sildenafil 80 mg tid .',
 'After a 2-week placebo baseline period , children were randomly assigned to lithium or placebo treatment for 6 weeks of placebo .',
 'Average scores for intervention and control group residents were calculated and between-group differences on the CSD skills assessment were evaluated using two-tailed independent sample t tests .',
 'Biochemical profiles and insulin sensitivity index from euglycemic hyperinsulinemic clamp test were assessed before and after treatment .',
 'C31G demonstrated noninferior contraceptive efficacy compared with nonoxynol-9 .',
 'Cytotoxic ( CD8 + ) and regulatory ( forkhead box protein 3 , FOXP3 + ) T cells were quantified using immunohistochemistry ( IHC ) .',
 'Financial support for participation should be considered .',
 'Grade 3/4 toxicity was significantly higher in arm B.',
 'However , implementation of telemonitoring in primary care practices may not yield the 

In [20]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(random.choices(vectorizer.get_feature_names(), k=20))
print(X.shape)
X_test = vectorizer.transform(corpus_test)
X_valid = vectorizer.transform(corpus_valid)

['axillary', 'norovirus', '4830', '5199', '3666', 'outway', 'opened', 'cifx', 'cl', 'bisporus', 'wellcome', '186716', 'ferti', 'reforming', 'wooled', 'hpds', 'ambulators', 'polytetrafluaroethylene', '12cm', 'compas']
(2211861, 180804)


In [43]:
from sklearn.metrics import f1_score

def evaluate(model, X, y):
    y_pred = model.predict(X)
    micro = f1_score(y, y_pred, average='micro')
    macro = f1_score(y, y_pred, average='macro')
    weighted = f1_score(y, y_pred, average='weighted')
    # samples = f1_score(y, y_pred, average='samples')
    print(f'F1 Score: micro {micro}, macro {macro}, weighted {weighted}')

In [23]:
scikit_log_reg = LogisticRegression(solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)
model=scikit_log_reg.fit(X, labels)

evaluate(model, X_valid, labels_valid)
evaluate(model, X_test, labels_test)

F1 Score: micro 0.8243121802848058, macro 0.7572239244299099, weighted 0.8210974554468179
F1 Score: micro 0.8247041670904961, macro 0.7573541905307171, weighted 0.8214023347513191


In [36]:
highest_count = max([labels.count(label) for label in set(labels)])
class_weights = {label: highest_count / labels.count(label) for label in sorted(set(labels))}
class_weights

{0: 3.8958508101622358,
 1: 4.106467810997797,
 2: 1.0604564716172193,
 3: 1.0,
 4: 2.2556356229063272}

In [37]:
scikit_log_reg = LogisticRegression(solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000, class_weight=class_weights)
model=scikit_log_reg.fit(X, labels)

evaluate(model, X_valid, labels_valid)
evaluate(model, X_test, labels_test)

F1 Score: micro 0.8206829807825246, macro 0.7558711820099019, weighted 0.8208605739729451
F1 Score: micro 0.8189739938290442, macro 0.7539315578603097, weighted 0.8188876052065251


In [41]:
scikit_log_reg = LogisticRegression(solver='lbfgs', C=5, penalty='l2', max_iter=1000)
model=scikit_log_reg.fit(X, labels)

evaluate(model, X_valid, labels_valid)
evaluate(model, X_test, labels_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


F1 Score: micro 0.8240702336513204, macro 0.7599997046810416, weighted 0.8224748299930563
F1 Score: micro 0.823212287661479, macro 0.7566691023147788, weighted 0.8213424554175457


In [44]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X, labels)

evaluate(model, X_valid, labels_valid)
evaluate(model, X_test, labels_test)

F1 Score: micro 0.7611986727498963, macro 0.663888710263248, weighted 0.7496920171576062
F1 Score: micro 0.7649950835791544, macro 0.6715430668376375, weighted 0.754122108843251
