In [44]:
"""This file contains the pipeline for training and evaluating the GCN on the data."""
# %load_ext autoreload
# %autoreload
import os
import sys
import json
import random
import numpy as np
import pandas as pd
import datetime
from typeguard import typechecked

from sklearn.model_selection import train_test_split

from tqdm import tqdm
sys.path.append("/home/jovyan/20230406_ArticleClassifier/ArticleClassifier")

import src.general.global_variables as gv

sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname('data_loader.py'), os.path.pardir)))
from src.data.data_loader import DataLoader
from src.data.data_processing import standardise_embeddings

from src.general.utils import cc_path
from src.models.evaluation import Metrics

import matplotlib.pyplot as plt

In [45]:
print('Start loading data...')
loc_dict = {
    'processed_csv': cc_path('data/processed/canary/articles_cleaned.csv'),
    'abstract_embeddings': cc_path('data/processed/canary/embeddings_fasttext_20230410.csv'),
    'scibert_embeddings': cc_path('data/processed/canary/embeddings_scibert_20230413.csv'),
    'keyword_network': cc_path('data/processed/canary/keyword_network_weighted.pickle'),
    'xml_embeddings': cc_path('data/processed/canary/embeddings_xml.ftr'),
    'author_network': cc_path('data/processed/canary/author_network.pickle'), 
    'label_network': cc_path('data/processed/canary/label_network_weighted.pickle')
}
data_loader = DataLoader(loc_dict)
processed_df = data_loader.load_processed_csv()
embedding_df = data_loader.load_embeddings_csv()
embedding_df = standardise_embeddings(embedding_df)

Start loading data...


In [26]:
embedding_df

Unnamed: 0,pui,d0,d1,d2,d3,d4,d5,d6,d7,d8,...,d246,d247,d248,d249,d250,d251,d252,d253,d254,d255
0,624531411,0.523307,1.438530,-0.080917,-0.187375,0.072302,-1.583681,0.250296,0.900716,-0.741329,...,-0.135713,-0.404753,0.530738,-1.523115,-0.309949,-0.956836,0.191369,-0.745007,1.264909,1.375934
1,625340088,0.453272,1.137349,-1.342064,0.234267,1.387469,1.169491,0.813650,0.410952,-0.195154,...,-1.427820,0.143418,-0.211770,-1.288424,0.660130,-2.610597,1.132445,2.322045,0.469497,1.136931
2,625805682,-0.059067,0.170448,-0.223014,0.705250,1.362661,-1.334532,-0.514577,0.166644,-0.142857,...,-0.918942,0.827185,-0.775148,-1.769286,0.425720,-1.537911,0.914770,0.827428,1.122793,1.136876
3,626662493,0.331264,-0.128117,-2.104888,0.236542,1.049195,1.514864,0.991066,-1.626433,0.648361,...,-0.014593,-1.935730,0.909682,-0.389697,1.338018,-0.443178,0.641751,-0.139190,-1.298358,0.807388
4,626822402,-0.288815,-1.844490,0.028553,1.893139,-0.935294,1.106734,-1.491607,-0.352630,-0.468991,...,1.670850,0.319826,-0.376360,-0.205706,-1.133871,-0.471493,-0.035112,0.090738,0.177633,-1.315776
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117305,2011621972,0.410056,-0.997743,0.753250,-0.983777,-1.308745,0.268359,-0.362296,1.848456,-1.967524,...,0.751787,1.455989,0.150169,0.588903,0.036674,0.148446,1.444333,-0.515651,-0.118400,-0.185293
117306,2011622024,1.088784,0.015001,1.156794,-0.696836,0.203408,0.937699,0.986402,2.081369,-1.623275,...,0.612815,1.847444,-0.189883,1.228686,-0.223918,0.413490,1.540787,-1.583825,-1.793567,0.548181
117307,2011622065,0.157201,0.386948,0.687774,0.356504,0.275766,-1.527228,0.734882,-0.341372,0.323361,...,-0.522548,-0.694288,-1.353055,0.980701,-0.020913,-0.653802,-0.842045,-0.416620,0.845871,0.409172
117308,2011626864,-1.163109,-0.616085,0.716920,0.693161,-0.796137,1.830795,-0.697601,-0.946027,-0.068566,...,1.553166,1.017975,0.491364,1.343349,-1.649031,1.036480,-0.450476,-1.386962,-0.871713,-1.482193


In [27]:
label_columns = processed_df.loc[:, ~processed_df.columns.isin(
    ['file_name', 'title', 'keywords', 'abstract', 'abstract_2', 'authors', 'organization', 'chemicals',
     'num_refs', 'date-delivered', 'labels_m', 'labels_a'])]

# label_columns = processed_df.loc[:, ['pui', 'human', 'mouse', 'rat', 'nonhuman',
#                                      'controlled study', 'animal experiment']]
label_columns[label_columns.columns.difference(['pui'])] = label_columns[
    label_columns.columns.difference(['pui'])].astype(int)
features = ['file_name', 'pui', 'title', 'keywords', 'abstract', 'abstract_2', 'authors', 'organization',
            'chemicals',
            'num_refs', 'date-delivered', 'labels_m', 'labels_a']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_columns[label_columns.columns.difference(['pui'])] = label_columns[


In [28]:
with open(cc_path("data/pui_idx_mapping.json"), "r") as outfile:
    node_label_mapping = json.load(outfile)
    
with open(cc_path(f'data/train_indices.txt')) as f:
    train_puis = f.read().splitlines()
    train_indices = list(map(node_label_mapping.get, train_puis))
with open(cc_path(f'data/val_indices.txt')) as f:
    val_puis = f.read().splitlines()
    val_indices = list(map(node_label_mapping.get, val_puis))
with open(cc_path(f'data/test_indices.txt')) as f:
    test_puis = f.read().splitlines()
    test_indices = list(map(node_label_mapping.get, test_puis))

In [29]:
embedding_df.set_index('pui', inplace=True)
label_columns.set_index('pui', inplace=True)

In [30]:
train_texts_df = embedding_df.loc[train_puis]
train_texts = train_texts_df[embedding_df.columns.difference(['pui'])].to_numpy()
train_labels = label_columns.loc[train_texts_df.index.to_list(), label_columns.columns.difference(['pui'])].to_numpy().tolist()

val_texts_df = embedding_df.loc[val_puis]
val_texts = val_texts_df[embedding_df.columns.difference(['pui'])].to_numpy()
val_labels = label_columns.loc[val_texts_df.index.to_list(), label_columns.columns.difference(['pui'])].to_numpy().tolist()

test_texts_df = embedding_df.loc[test_puis]
test_texts = test_texts_df[embedding_df.columns.difference(['pui'])].to_numpy()
test_labels = label_columns.loc[test_texts_df.index.to_list(), label_columns.columns.difference(['pui'])].to_numpy().tolist()


## Model training and prediction

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [41]:
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
ovr = OneVsRestClassifier(lr, n_jobs=-11)
ovr.fit(train_texts, train_labels)

In [42]:
y_train_preds = ovr.predict(train_texts)
y_val_preds = ovr.predict(val_texts)
y_test_preds = ovr.predict(test_texts)


In [43]:
from sklearn.metrics import f1_score, recall_score, precision_score

for dataset_name, (dataset_pred, dataset_real) in {'train': (y_train_preds, train_labels), 'val': (y_val_preds, val_labels), 'test': (y_test_preds, test_labels)}.items():
    for metric_name, metric in {'f1_score': f1_score, 'recall': recall_score, 'precision': precision_score}.items():
        for averaging_type in ['macro', 'micro']:
            print(f'{dataset_name}: {averaging_type} - {metric_name}: {metric(dataset_real, dataset_pred, average=averaging_type)}')

train: macro - f1_score: 0.47038260268676846
train: micro - f1_score: 0.7322604838287705
train: macro - recall: 0.40230362198115394
train: micro - recall: 0.6778775255379382
train: macro - precision: 0.7065758261506867
train: micro - precision: 0.7961303127607393
val: macro - f1_score: 0.39366005045413444
val: micro - f1_score: 0.7222428863678632
val: macro - recall: 0.3414371083815635
val: micro - recall: 0.6675834801667454
val: macro - precision: 0.5426066073272986


  _warn_prf(average, modifier, msg_start, len(result))


val: micro - precision: 0.7866511368815516
test: macro - f1_score: 0.44110422298635804
test: micro - f1_score: 0.729988552921797
test: macro - recall: 0.37869912718318527
test: micro - recall: 0.6738544011271284
test: macro - precision: 0.6348763728404164
test: micro - precision: 0.7963248730964467
