In [1]:
"""This file contains the pipeline for training and evaluating the GCN on the data."""
# %load_ext autoreload
# %autoreload
import os
import sys
import json
import random
import numpy as np
import pandas as pd
import datetime
from typeguard import typechecked

from sklearn.model_selection import train_test_split

from tqdm import tqdm
sys.path.append("/home/jovyan/20230406_ArticleClassifier/ArticleClassifier")

import src.general.global_variables as gv

sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname('data_loader.py'), os.path.pardir)))
from src.data.data_loader import DataLoader
from src.data.data_processing import standardise_embeddings

from src.general.utils import cc_path
from src.models.evaluation import Metrics

import matplotlib.pyplot as plt

In [12]:
print('Start loading data...')
loc_dict = {
    'processed_csv': cc_path('data/processed/canary/articles_cleaned.csv'),
    'abstract_embeddings': cc_path('data/processed/canary/embeddings_fasttext_20230410.csv'),
    'scibert_embeddings': cc_path('data/processed/canary/embeddings_scibert_20230413.csv'),
    'keyword_network': cc_path('data/processed/canary/keyword_network_weighted.pickle'),
    'xml_embeddings': cc_path('data/processed/canary/embeddings_xml.ftr'),
    'author_network': cc_path('data/processed/canary/author_network.pickle'), 
    'label_network': cc_path('data/processed/canary/label_network_weighted.pickle')
}
data_loader = DataLoader(loc_dict)
processed_df = data_loader.load_processed_csv()
embedding_df = data_loader.load_scibert_embeddings_csv()
embedding_df = standardise_embeddings(embedding_df)

Start loading data...


In [13]:
embedding_df

Unnamed: 0,pui,d0,d1,d2,d3,d4,d5,d6,d7,d8,...,d758,d759,d760,d761,d762,d763,d764,d765,d766,d767
0,624531411,1.724949,-0.962311,-0.916890,-3.019909,1.156869,0.775545,1.696071,1.341233,-0.070411,...,-2.360899,0.901550,0.111421,0.728343,-2.005042,0.101287,-0.038096,0.202163,1.430674,0.079277
1,625340088,0.554961,3.080485,-0.729940,-1.055520,0.941503,0.080624,-0.799699,-0.848227,3.005049,...,1.328036,-0.011042,0.460629,-0.703980,-1.130742,-3.013882,0.546458,0.892324,0.747129,0.086044
2,625805682,0.565920,3.867995,0.258198,-1.007153,-0.205592,1.003677,-2.211239,-0.545270,1.809024,...,-1.104570,0.165435,1.243758,-0.784265,-2.496688,-0.386428,0.316321,1.064401,2.162855,0.974458
3,626662493,0.589574,0.171245,-0.374974,1.760145,0.714378,-0.054711,0.642225,-0.603529,1.615703,...,2.143460,-0.499742,1.147708,0.905595,0.196390,-1.750528,0.834781,0.664159,2.008654,1.673547
4,626822402,0.765471,-0.337877,-0.183119,-0.232586,-0.986672,-0.270002,1.486768,-0.785418,-1.257400,...,1.043334,-0.751844,-0.779597,1.890243,0.497988,-0.869973,0.084734,0.288078,-0.007600,0.244727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117305,2011621972,-0.933348,1.521798,0.592468,-0.353155,-0.483301,0.013271,0.649759,-1.882091,-0.287263,...,0.379663,0.090271,-0.482317,-1.637399,0.483649,0.212784,0.961884,0.871724,-0.340540,0.512850
117306,2011622024,1.866118,-0.713270,-0.590325,0.520996,0.698065,-0.631587,-0.846515,-0.147998,0.014351,...,1.083385,-0.173565,-1.112911,-0.280100,-0.631747,0.541184,0.231767,-0.922456,-1.388049,0.522586
117307,2011622065,-1.387895,0.017300,0.835669,0.365057,0.516104,0.495457,-0.712975,-0.164661,-0.821042,...,1.264139,-2.331056,-1.067966,0.212543,-0.124560,0.951281,-1.883171,-0.381497,0.123854,0.185559
117308,2011626864,0.349304,-1.349804,-0.876579,-0.755956,0.286240,-1.248593,0.818646,1.510866,-1.088184,...,0.399086,1.030749,-0.073069,0.362131,0.296715,-0.150970,0.633721,0.234453,-0.739582,-0.459597


In [14]:
label_columns = processed_df.loc[:, ~processed_df.columns.isin(
    ['file_name', 'title', 'keywords', 'abstract', 'abstract_2', 'authors', 'organization', 'chemicals',
     'num_refs', 'date-delivered', 'labels_m', 'labels_a'])]

# label_columns = processed_df.loc[:, ['pui', 'human', 'mouse', 'rat', 'nonhuman',
#                                      'controlled study', 'animal experiment']]
label_columns[label_columns.columns.difference(['pui'])] = label_columns[
    label_columns.columns.difference(['pui'])].astype(int)
features = ['file_name', 'pui', 'title', 'keywords', 'abstract', 'abstract_2', 'authors', 'organization',
            'chemicals',
            'num_refs', 'date-delivered', 'labels_m', 'labels_a']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_columns[label_columns.columns.difference(['pui'])] = label_columns[


In [15]:
with open(cc_path("data/pui_idx_mapping.json"), "r") as outfile:
    node_label_mapping = json.load(outfile)
    
with open(cc_path(f'data/train_indices.txt')) as f:
    train_puis = f.read().splitlines()
    train_indices = list(map(node_label_mapping.get, train_puis))
with open(cc_path(f'data/val_indices.txt')) as f:
    val_puis = f.read().splitlines()
    val_indices = list(map(node_label_mapping.get, val_puis))
with open(cc_path(f'data/test_indices.txt')) as f:
    test_puis = f.read().splitlines()
    test_indices = list(map(node_label_mapping.get, test_puis))

In [16]:
embedding_df.set_index('pui', inplace=True)
label_columns.set_index('pui', inplace=True)

In [17]:
train_texts_df = embedding_df.loc[train_puis]
train_texts = train_texts_df[embedding_df.columns.difference(['pui'])].to_numpy()
train_labels = label_columns.loc[train_texts_df.index.to_list(), label_columns.columns.difference(['pui'])].to_numpy().tolist()

val_texts_df = embedding_df.loc[val_puis]
val_texts = val_texts_df[embedding_df.columns.difference(['pui'])].to_numpy()
val_labels = label_columns.loc[val_texts_df.index.to_list(), label_columns.columns.difference(['pui'])].to_numpy().tolist()

test_texts_df = embedding_df.loc[test_puis]
test_texts = test_texts_df[embedding_df.columns.difference(['pui'])].to_numpy()
test_labels = label_columns.loc[test_texts_df.index.to_list(), label_columns.columns.difference(['pui'])].to_numpy().tolist()


## Model training and prediction

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [19]:
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
ovr = OneVsRestClassifier(lr, n_jobs=-1)
ovr.fit(train_texts, train_labels)

In [20]:
y_train_preds = ovr.predict(train_texts)
y_val_preds = ovr.predict(val_texts)
y_test_preds = ovr.predict(test_texts)


In [21]:
from sklearn.metrics import f1_score, recall_score, precision_score

for dataset_name, (dataset_pred, dataset_real) in {'train': (y_train_preds, train_labels), 'val': (y_val_preds, val_labels), 'test': (y_test_preds, test_labels)}.items():
    for metric_name, metric in {'f1_score': f1_score, 'recall': recall_score, 'precision': precision_score}.items():
        for averaging_type in ['macro', 'micro']:
            print(f'{dataset_name}: {averaging_type} - {metric_name}: {metric(dataset_real, dataset_pred, average=averaging_type)}')

train: macro - f1_score: 0.6287088605331834
train: micro - f1_score: 0.7671352088592136
train: macro - recall: 0.5602337917932163
train: micro - recall: 0.7238506381875283
train: macro - precision: 0.801878231662428
train: micro - precision: 0.8159256492698083
val: macro - f1_score: 0.44314485144175575
val: micro - f1_score: 0.7366167462252361
val: macro - recall: 0.3969946274670088
val: micro - recall: 0.6947557582315659
val: macro - precision: 0.5434305381501641
val: micro - precision: 0.7838456507521255
test: macro - f1_score: 0.44670151530110175
test: micro - f1_score: 0.7346048867650795
test: macro - recall: 0.40026358761421077
test: micro - recall: 0.6955468980396418
test: macro - precision: 0.5422019101236327
test: micro - precision: 0.7783104071140753


  _warn_prf(average, modifier, msg_start, len(result))
