In [1]:
import json
import ijson
from tqdm import tqdm
from indexingcode.utils.Preprocessor import Preprocessor
import gensim
import nltk
import xml.etree.ElementTree as ET
from typing import List, Dict
import random

In [2]:
def get_text(node):
    try:
        return node.text
    except AttributeError:
        return None

In [3]:
preprocessor = Preprocessor(gensim.utils.tokenize, nltk.PorterStemmer())

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


In [4]:
et = ET.parse('/Volumes/SanDisk/desc2022.xml')

In [5]:
# load meshes XML
text_to_tree = {}
root = et.getroot()
omit_count = 0
with tqdm(root.findall('DescriptorRecord')) as bar:
    for record in bar:
        tree_text = get_text(record.find('TreeNumberList/TreeNumber'))
        name = get_text(record.find('DescriptorName/String'))
        if name is None or tree_text is None:
            omit_count += 1
            bar.set_description(str(omit_count))
            continue
        # first_tree = tree_text.split('.')[0]
        first_tree = tree_text[0]
        text_to_tree[name] = first_tree

2: 100%|██████████| 30194/30194 [00:00<00:00, 56809.35it/s]


In [None]:
desc_words_set = set()
omit_count = 0
with tqdm(root.findall('DescriptorRecord')) as bar:
    for record in bar:
        name = get_text(record.find('DescriptorName/String'))
        if name is None:
            omit_count += 1
            bar.set_description(str(omit_count))
            continue


In [8]:
with open('/Volumes/SanDisk/BioAsq2021/allMeSH_2021.json', encoding='ISO-8859-1') as bioasq_file:
    with open('/Volumes/SanDisk/compare_meshes_tree.csv', 'w') as wf:
            num_to_text = {}
            num_to_tree = {}
            tree_to_indexes: Dict[str, List[int]] = {}
            omit_count = 0
            i = 0
            for article in tqdm(ijson.items(bioasq_file, 'articles.item'), total=15559157, desc='Num to text dictionary creation'):
                # mesh num to text
                nums: List[int] = preprocessor.preprocess_mesh(article['meshMajor'])
                for num, text in zip(nums, article['meshMajor']):
                    if num not in num_to_text:
                        num_to_text[num] = text
                        if text in text_to_tree:
                            first_tree = text_to_tree[text]
                            num_to_tree[num] = first_tree
                        else:
                            omit_count += 1
                trees = list(filter(lambda x: x is not None, [num_to_tree[n] if n in num_to_tree else None for n in nums]))
                most_freq_tree = max(set(trees), key=trees.count)
                if most_freq_tree not in tree_to_indexes:
                    tree_to_indexes[most_freq_tree] = []
                tree_to_indexes[most_freq_tree].append(i)
                i += 1
                # journal = preprocessor.preprocess_journal(article['journal'])
                # wf.write(f"{most_freq_tree}\n")
                # wf.write(f"{vector};{most_freq_tree};{journal}\n")

Num to text dictionary creation: 100%|██████████| 15559157/15559157 [05:58<00:00, 43342.82it/s]


In [14]:
random.seed(75)
train_set_not_balanced: List[int] = []
test_set_not_balanced: List[int] = []
for tree, indexes in tree_to_indexes.items():
    my_set = random.sample(indexes, int(0.0006*len(indexes)))
    test_set_len = int(0.1*len(my_set))
    random.shuffle(my_set)
    test_set_not_balanced.extend(my_set[:test_set_len])
    train_set_not_balanced.extend(my_set[test_set_len:])

In [15]:
train_set_balanced: List[int] = []
test_set_balanced: List[int] = []
total_len = len(train_set_not_balanced) + len(test_set_not_balanced)
one_tree_set_len = int(total_len / len(tree_to_indexes.keys()))
for tree, indexes in tree_to_indexes.items():
    my_set = random.sample(indexes, min(one_tree_set_len, len(indexes)))
    test_set_len = int(0.1*len(my_set))
    random.shuffle(my_set)
    test_set_balanced.extend(my_set[:test_set_len])
    train_set_balanced.extend(my_set[test_set_len:])

In [16]:
train_s_nb = set(train_set_not_balanced)
test_s_nb = set(test_set_not_balanced)
train_s_b = set(train_set_balanced)
test_s_b = set(test_set_balanced)

In [17]:
include_list_nb = [1 if _ in train_s_nb else 2 if _ in test_s_nb else 0 for _ in tqdm(range(15559157))]
include_list_b = [1 if _ in train_s_b else 2 if _ in test_s_b else 0 for _ in tqdm(range(15559157))]

100%|██████████| 15559157/15559157 [00:04<00:00, 3542527.81it/s]
100%|██████████| 15559157/15559157 [00:04<00:00, 3704997.77it/s]


In [21]:
with open('/Volumes/SanDisk/BioAsq2021/allMeSH_2021.json', encoding='ISO-8859-1') as bioasq_file:
    with open('/Volumes/SanDisk/compare_train_title_b.csv', 'w') as wf_train:
        with open('/Volumes/SanDisk/compare_test_title_b.csv', 'w') as wf_test:
            for article, include_flag in zip(tqdm(ijson.items(bioasq_file, 'articles.item'), total=15559157, desc='Saving new compare file'), include_list_b):
                if include_flag == 0:
                    continue
                nums: List[int] = preprocessor.preprocess_mesh(article['meshMajor'])
                trees = list(filter(lambda x: x is not None, [num_to_tree[n] if n in num_to_tree else None for n in nums]))
                most_freq_tree = max(set(trees), key=trees.count)
                text_vector = [preprocessor.num_to_text[num] for num in preprocessor.preprocess_text(article['title'] or '')]
                # vector = ','.join([str(_) for _ in preprocessor.preprocess_text(article['abstractText'])])
                write_text = f"{' '.join(list(map(str, text_vector)))};{str(most_freq_tree)};{str(article['pmid'])}\n"
                if include_flag == 1:
                    wf_train.write(write_text)
                else:
                    wf_test.write(write_text)

Saving new compare file: 100%|██████████| 15559157/15559157 [02:07<00:00, 122164.01it/s]


In [38]:
# compute tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
with open('/Volumes/SanDisk/compare_train.csv') as f:
    vectors: List[List[str]] = []
    labels: List[str] = []
    journals: List[str] = []
    for line in tqdm(f):
        vector_str, y, journal = line.split(';')
        vector: List[str] = vector_str.split(',')
        vectors.append(vector)
        labels.append(y)
        journals.append(journal)

84019it [00:01, 55795.73it/s]


In [40]:
term_to_documents_count: Dict[str, int] = {}
for vector in vectors:
    unique_words = set(vector)
    for w in unique_words:
        if w not in term_to_documents_count:
            term_to_documents_count[w] = 0
        term_to_documents_count[w] += 1

In [41]:
import math
documents_len = len(vectors)
term_to_idf: Dict[str, float] = {}
for term, belonging_doc_count in term_to_documents_count.items():
    term_to_idf[term] = math.log(documents_len/belonging_doc_count, math.e)

In [81]:
from sklearn.decomposition import IncrementalPCA
import numpy as np
import pandas as pd
term_to_tfidf: Dict[str, List[float]] = {}
tfidf_list = []
pca = IncrementalPCA(20)
i = 0
for term, idf in tqdm(term_to_idf.items()):
    tfidf_list.append([vector.count(term) * idf for vector in vectors])
    i += 1

    if i == 20:
        pca.partial_fit(np.array(pd.DataFrame(tfidf_list)))
        tfidf_list.clear()
        i = 0

  self.noise_variance_ = explained_variance[self.n_components_ :].mean()
  ret = ret.dtype.type(ret / rcount)
  3%|▎         | 3446/119359 [21:14<11:54:35,  2.70it/s]


KeyboardInterrupt: 

In [85]:
# # process few last
# if len(tfidf_list) > 0:
#     pca.partial_fit(np.array(pd.DataFrame(tfidf_list)))
#
# # save pca with pickle
# import pickle
# with open('/Volumes/SanDisk/pca.pkl', 'w') as f:
#     pickle.dump(pca, f)

# export train and test vectors
with open('/Volumes/SanDisk/compare_train.csv') as f:
    with open('/Volumes/SanDisk/compare_train_tfidf.csv', 'w') as wf:
        for line in tqdm(f, total=84019):
            vector_str, y, journal = line.rstrip().split(';')
            vector: List[str] = vector_str.split(',')

            tfidf_vector = [vector.count(w) * term_to_idf[w] for w in vector]
            wf.write(f"{vector_str};{y};{journal};{','.join([str(_) for _ in tfidf_vector])}\n")

100%|██████████| 84019/84019 [00:34<00:00, 2402.43it/s]


In [86]:
with open('/Volumes/SanDisk/compare_test.csv') as f:
    with open('/Volumes/SanDisk/compare_test_tfidf.csv', 'w') as wf:
        for line in tqdm(f):
            vector_str, y, journal = line.rstrip().split(';')
            vector: List[str] = vector_str.split(',')

            tfidf_vector = [vector.count(w) * term_to_idf[w] if w in term_to_idf else None for w in vector]
            tfidf_vector = list(filter(lambda w: w is not None, tfidf_vector))
            wf.write(f"{vector_str};{y};{journal};{','.join([str(_) for _ in tfidf_vector])}\n")

9327it [00:03, 2523.92it/s]


In [34]:
class AbstractIterator:
    def __iter__(self):
        with open('/Volumes/SanDisk/compare_train_title_b.csv') as f:
            for line in tqdm(f, total=9000):
                vector_str, y, pmid = line.rstrip().split(';')
                vector: List[str] = vector_str.split(' ')
                yield gensim.models.doc2vec.TaggedDocument(vector, [i])

model = gensim.models.Doc2Vec(vector_size=20, window=10, epochs=5, dbow_words=0)
sentences_iterator = AbstractIterator()
print("Build vocab")
model.build_vocab(sentences_iterator)
print("Train")
model.train(sentences_iterator, total_examples=model.corpus_count, epochs=model.epochs)

print("Build train abstracts out")
with open('/Volumes/SanDisk/compare_train_title_b.csv') as f:
    with open('/Volumes/SanDisk/compare_train_title_b_d2v.csv', 'w') as wf:
        for line in tqdm(f, total=84019):
            vector_str, y, pmid = line.rstrip().split(';')
            vector: List[str] = vector_str.split(' ')
            d2v_vector = model.infer_vector(vector)
            wf.write(f"{vector_str};{y};{pmid};{','.join([str(_) for _ in d2v_vector])}\n")
print("Build test abstracts out")
with open('/Volumes/SanDisk/compare_test_title_b.csv') as f:
    with open('/Volumes/SanDisk/compare_test_title_b_d2v.csv', 'w') as wf:
        for line in tqdm(f, total=9327):
            vector_str, y, pmid = line.rstrip().split(';')
            vector: List[str] = vector_str.split(' ')
            d2v_vector = model.infer_vector(vector)
            wf.write(f"{vector_str};{y};{pmid};{','.join([str(_) for _ in d2v_vector])}\n")

Build vocab


 93%|█████████▎| 8385/9000 [00:00<00:00, 186997.81it/s]


Train


 93%|█████████▎| 8385/9000 [00:00<00:00, 203476.23it/s]
 93%|█████████▎| 8385/9000 [00:00<00:00, 195240.40it/s]
 93%|█████████▎| 8385/9000 [00:00<00:00, 201296.06it/s]
 93%|█████████▎| 8385/9000 [00:00<00:00, 255228.70it/s]
 93%|█████████▎| 8385/9000 [00:00<00:00, 210763.36it/s]


Build train abstracts out


 10%|▉         | 8385/84019 [00:02<00:22, 3399.00it/s]


Build test abstracts out


 10%|▉         | 930/9327 [00:00<00:02, 2871.10it/s]


In [None]:
# we dont even try Word2Vec
class AbstractIterator:
    def __iter__(self):
        with open('/Volumes/SanDisk/compare_train.csv') as f:
            for line in tqdm(f, total=84019):
                vector_str, y, journal = line.rstrip().split(';')
                vector: List[str] = vector_str.split(',')
                yield vector

model = gensim.models.Word2Vec(vector_size=3, window=10, epochs=5, sg=1)
sentences_iterator = AbstractIterator()
print("Build vocab")
model.build_vocab(sentences_iterator)
print("Train")
model.train(sentences_iterator, total_examples=model.corpus_count, epochs=model.epochs)

print("Build train abstracts out")
with open('/Volumes/SanDisk/compare_train.csv') as f:
    with open('/Volumes/SanDisk/compare_train_w2v.csv', 'w') as wf:
        for line in tqdm(f, total=84019):
            vector_str, y, journal = line.rstrip().split(';')
            vector: List[str] = vector_str.split(',')
            w2v_vector = [model.wv[word] for word in vector]
            wf.write(f"{vector_str};{y};{journal};{','.join([str(_) for _ in d2v_vector])}\n")
print("Build test abstracts out")
with open('/Volumes/SanDisk/compare_test.csv') as f:
    with open('/Volumes/SanDisk/compare_test_d2v.csv', 'w') as wf:
        for line in tqdm(f, total=9327):
            vector_str, y, journal = line.rstrip().split(';')
            vector: List[str] = vector_str.split(',')
            d2v_vector = model.infer_vector(vector)
            wf.write(f"{vector_str};{y};{journal};{','.join([str(_) for _ in d2v_vector])}\n")

In [35]:
from sklearn.preprocessing import StandardScaler
class CorpusIterator:
    def __iter__(self):
        with open('/Volumes/SanDisk/compare_train_title_b_d2v.csv') as f:
            for line in tqdm(f, total=9000):
                simple_vec_str, y, pmid, d2v_vector_str = line.rstrip().split(';')
                d2v_vector: List[float] = [float(_) for _ in d2v_vector_str.split(',')]
                yield d2v_vector, y

class TestCorpusIterator:
    def __iter__(self):
        with open('/Volumes/SanDisk/compare_test_title_b_d2v.csv') as f:
            for line in tqdm(f, total=9327):
                simple_vec_str, y, pmid, d2v_vector_str = line.rstrip().split(';')
                d2v_vector: List[float] = [float(_) for _ in d2v_vector_str.split(',')]
                yield d2v_vector, y


features = []
y = []
for vector, label in CorpusIterator():
    features.append(vector)
    y.append(label)

 93%|█████████▎| 8385/9000 [00:00<00:00, 154036.88it/s]


In [36]:
test_features = []
test_y = []
for vector, label in TestCorpusIterator():
    test_features.append(vector)
    test_y.append(label)

 10%|▉         | 930/9327 [00:00<00:00, 151295.58it/s]


In [37]:
scaler = StandardScaler()
X = scaler.fit_transform(features)
test_X = scaler.transform(test_features)

In [38]:
from sklearn.metrics import accuracy_score

In [39]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(100,), random_state=75)
for i in tqdm(range(10)):
    model.fit(X, y)
accuracy_score(test_y, model.predict(test_X))

100%|██████████| 10/10 [01:19<00:00,  7.91s/it]


0.09462365591397849

In [40]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(random_state=75)
for i in tqdm(range(10)):
    dt_model.fit(X, y)
accuracy_score(test_y, dt_model.predict(test_X))

100%|██████████| 10/10 [00:03<00:00,  2.83it/s]


0.07419354838709677

In [41]:
from sklearn.metrics import classification_report, confusion_matrix
print("MLP")
mlp_predict_y = model.predict(test_X)
print(confusion_matrix(test_y, mlp_predict_y))
print(classification_report(test_y, mlp_predict_y))
print("Decision tree")
dt_predict_y = dt_model.predict(test_X)
print(confusion_matrix(test_y, dt_predict_y))
print(classification_report(test_y, dt_predict_y))

MLP
[[ 4  7  5  6  6  1  2  1  2  5 12  4  6  1  0]
 [ 2  3  6 10  8  5  2  4  2  4  4  2  2  4  4]
 [ 4  1  4  8  1  7  7  4  5  2 10  3  0  4  2]
 [ 7  3  4 13  2  2  6  3  1  4  4  6  1  2  4]
 [ 5  5  1  5  8  4  3  4  1  4  7  4  5  3  3]
 [ 1  5  6  7  4  3  2  6  2  7  2  2  7  5  3]
 [ 5  5  2  3  7  6  8  2  2  2  5  5  3  5  2]
 [ 3  4  3  2  5  5  2  6  6  5  7  3  3  5  3]
 [ 1  3  3  3  3  1  6  5  3  6  6  6  2 10  4]
 [ 1  3  2  7  8  5  6  5  2  4  7  6  2  2  2]
 [ 1  2  6  4  4  3  1  2  3  4 16  3  3  7  3]
 [ 0  7  7  5  1  5  4  4  3  3  6  4  3  7  3]
 [ 2  5  2  4  6  2  2  1  5  4 11  4  2  7  5]
 [ 6  2  2  6  5  2  5  2  3  6  6  5  4  6  2]
 [ 4  3  3  6  4  2  2  2  1  2 11  6  7  5  4]]
              precision    recall  f1-score   support

           A       0.09      0.06      0.07        62
           B       0.05      0.05      0.05        62
           C       0.07      0.06      0.07        62
           D       0.15      0.21      0.17        62
    

In [130]:
from sklearn.preprocessing import StandardScaler


class CorpusIterator:
    def __iter__(self):
        with open('/Volumes/SanDisk/compare_train_tfidf.csv') as f:
            for line in tqdm(f, total=84019):
                simple_vec_str, y, journal, d2v_vector_str = line.rstrip().split(';')
                d2v_vector: List[float] = [float(_) for _ in d2v_vector_str.split(',')]
                yield d2v_vector, y


class TestCorpusIterator:
    def __iter__(self):
        with open('/Volumes/SanDisk/compare_test_tfidf.csv') as f:
            for line in tqdm(f, total=9327):
                simple_vec_str, y, journal, d2v_vector_str = line.rstrip().split(';')
                d2v_vector: List[float] = [float(_) for _ in d2v_vector_str.split(',')]
                yield d2v_vector, y


features = []
y = []
for vector, label in CorpusIterator():
    features.append(vector)
    y.append(label)
test_features = []
test_y = []
for vector, label in TestCorpusIterator():
    test_features.append(vector)
    test_y.append(label)

scaler = StandardScaler()
X = scaler.fit_transform(features)
test_X = scaler.transform(test_features)
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(hidden_layer_sizes=(60,), random_state=75)
for i in tqdm(range(10)):
    model.fit(X, y)
accuracy_score(test_y, model.predict(test_X))
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(random_state=75)
for i in tqdm(range(10)):
    dt_model.fit(X, y)
accuracy_score(test_y, dt_model.predict(test_X))
from sklearn.metrics import classification_report, confusion_matrix

print("MLP")
mlp_predict_y = model.predict(test_X)
print(confusion_matrix(test_y, mlp_predict_y))
print(classification_report(test_y, mlp_predict_y))
print("Decision tree")
dt_predict_y = dt_model.predict(test_X)
print(confusion_matrix(test_y, dt_predict_y))
print(classification_report(test_y, dt_predict_y))

100%|██████████| 84019/84019 [00:04<00:00, 18697.93it/s]
100%|██████████| 9327/9327 [00:00<00:00, 19448.23it/s]


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (84019,) + inhomogeneous part.

84019