In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from torch.nn import CrossEntropyLoss, MSELoss
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score
from sklearn import metrics

### Load Model

In [2]:
model_dir = 'sancharidan/scibet_expertfinder'
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig

# Load a trained model and vocabulary that you have fine-tuned
model = BertForSequenceClassification.from_pretrained(model_dir)
tokenizer = BertTokenizer.from_pretrained(model_dir)
device = None
n_gpu = 0
if torch.cuda.is_available():
    device = torch.device("cuda") 
    n_gpu = torch.cuda.device_count()

    # Copy the model to the GPU.
    model.to(device)

if n_gpu > 1:
    model = torch.nn.DataParallel(model)

### Predictions

In [3]:
test_data = pd.read_csv('Data/SCIS_0402_gs_int_val_v1.csv')
train_data = pd.read_csv('Data/SCIS_0402_train_v1.csv')
# train_data = train_data[train_data.relation=='researches in'].reset_index()
# train_data = train_data.drop('index', axis = 1)

In [4]:
test_data.head()

Unnamed: 0,head,relation,tail,label
0,Jisun AN,researches in,Computational Social Science,1
1,Jisun AN,researches in,Data Science,1
2,Jisun AN,researches in,News and Social Media Analytics,1
3,Jisun AN,researches in,Anomaly Detection,0
4,Jisun AN,researches in,Cognitive Science,0


In [5]:
max_seq_length = 128
def get_features(head, relation = None, tail = None):
    tokens_head = tokenizer.tokenize(head)
    tokens = ["[CLS]"] + tokens_head + ["[SEP]"]
    segment_ids = [0] * len(tokens)
    
    if relation:
        tokens_relation = tokenizer.tokenize(relation)
        tokens += tokens_relation + ["[SEP]"]
        segment_ids += [1] * (len(tokens_relation) + 1)
    
    if tail:
        tokens_tail = tokenizer.tokenize(tail)
        tokens += tokens_tail + ["[SEP]"]
        segment_ids += [1] * (len(tokens_tail) + 1)
        
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding
    
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    
    return tokens, input_ids, input_mask, segment_ids

In [6]:
import torch.nn.functional as F
def get_predictions(sequences, batch_size):
    input_ids_list = []
    input_mask_list = []
    segment_ids_list = []
    logits = []
    probabilities = []
    for sequence in sequences:
        
        tokens_enc, input_ids, input_mask, segment_ids = get_features(sequence[0], sequence[1], sequence[2])
        input_ids_list.append(input_ids)
        input_mask_list.append(input_mask)
        segment_ids_list.append(segment_ids)

        
    all_input_ids = torch.tensor([input_ids for input_ids in input_ids_list], dtype=torch.long)
    all_input_mask = torch.tensor([input_mask for input_mask in input_mask_list], dtype=torch.long)
    all_segment_ids = torch.tensor([segment_ids for segment_ids in segment_ids_list], dtype=torch.long)

    all_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)

    sampler = SequentialSampler(all_data)
    dataloader = DataLoader(all_data, sampler=sampler, batch_size=batch_size)
    
    for step, batch in enumerate(dataloader):
        print ("Getting predictions for Batch",step)
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_segment_id = batch[2].to(device)
        outputs = model(b_input_ids, 
                        token_type_ids=b_segment_id, 
                        attention_mask=b_input_mask)
        probabilities.append(F.softmax(outputs[0], dim=-1))
#         print (probabilities)
        logits.append(outputs[0])
    # Move logits and labels to CPU
    predictions = torch.cat(logits, dim=0)
    probabilities = torch.cat(probabilities, dim = 0)
#     print (len(predictions))
    predictions = predictions.detach().cpu().numpy()
    probabilities = probabilities.detach().cpu().numpy()
#     print ((predictions, probabilities))
    return predictions, probabilities

#### Test Set Performance

In [7]:
# ------TEST DATA------#
torch.cuda.empty_cache()
list_of_st = test_data.apply(lambda row: [row['head'], row['relation'], row['tail']], axis =1)
list_of_st = list_of_st.tolist()
out_batch_size = 50
tmpdf = pd.DataFrame()
for i in range(0,len(list_of_st),out_batch_size):
    start = i
    end = min(i+out_batch_size,len(list_of_st))
    print ('Getting predictions from ', start, 'to ',end)
    sublist = list_of_st[start:end]
    # call predict function
    predictions, probabilities = get_predictions(sublist, 4)
    predictions = np.argmax(predictions, axis =1)
#     print ((predictions, probabilities))
    # post processing
    for j,sequence in enumerate(sublist):
        tmpdf = pd.concat([tmpdf, pd.DataFrame({'head' : sequence[0], 'relation': sequence[1], 'tail': sequence[2], 'Prediction':predictions[j], 'Pred_Proba':probabilities[j][predictions[j]]}, index =[0])])
        tmpdf = tmpdf.reset_index().drop(['index'], axis=1)
tmpdf['label'] = test_data['label']

Getting predictions from  0 to  50
Getting predictions for Batch 0
Getting predictions for Batch 1
Getting predictions for Batch 2
Getting predictions for Batch 3
Getting predictions for Batch 4
Getting predictions for Batch 5
Getting predictions for Batch 6
Getting predictions for Batch 7
Getting predictions for Batch 8
Getting predictions for Batch 9
Getting predictions for Batch 10
Getting predictions for Batch 11
Getting predictions for Batch 12
Getting predictions from  50 to  100
Getting predictions for Batch 0
Getting predictions for Batch 1
Getting predictions for Batch 2
Getting predictions for Batch 3
Getting predictions for Batch 4
Getting predictions for Batch 5
Getting predictions for Batch 6
Getting predictions for Batch 7
Getting predictions for Batch 8
Getting predictions for Batch 9
Getting predictions for Batch 10
Getting predictions for Batch 11
Getting predictions for Batch 12
Getting predictions from  100 to  150
Getting predictions for Batch 0
Getting predictions 

In [8]:
from sklearn.metrics import confusion_matrix, classification_report
y_true = tmpdf['label']
y_pred = tmpdf['Prediction']
tn, fp, fn, tp = confusion_matrix(y_true,y_pred).ravel()
(tp, tn, fp, fn)

(149, 184, 58, 93)

In [9]:
report = classification_report(y_true, y_pred, output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0,0.66426,0.760331,0.709056,242.0
1,0.719807,0.615702,0.663697,242.0
accuracy,0.688017,0.688017,0.688017,0.688017
macro avg,0.692033,0.688017,0.686376,484.0
weighted avg,0.692033,0.688017,0.686376,484.0


### Retrieval performance

In [10]:
adv_rev = pd.read_pickle('Data/Pred_Expert_v2_Adv_v3_Rev_2811_ACM_v4_robertalarge_n3.pkl')
len(adv_rev), len(adv_rev['tail'].unique())

(430, 430)

In [11]:
tmp_adv_rev = adv_rev.copy()

In [12]:
# adv_rev = adv_rev[adv_rev['list_of_head'].apply(len)>=5]

In [13]:
len(adv_rev)

430

In [14]:
adv_rev['list_of_head'] = adv_rev['list_of_head'].apply(lambda l: [k for k in l if k in train_data['head'].unique()])
adv_rev['len'] = adv_rev['list_of_head'].apply(len)
adv_rev['len'].describe()

count    430.000000
mean       1.867442
std        5.507665
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max       40.000000
Name: len, dtype: float64

In [15]:
adv_rev = adv_rev[adv_rev.len>=5]

In [16]:
len(adv_rev)

53

In [17]:
all_experts = train_data['head'].unique()
total_num_experts = len(all_experts)

In [18]:
import operator
def get_experts(ra, num_experts = 249):
    print ('Getting Experts for ', ra)
    list_of_st = [[expert, 'researches in', ra] for expert in all_experts]
    out_batch_size = 50
    predictions = []
    probabilities = []
    for i in range(0,len(list_of_st),out_batch_size):
        start = i
        end = min(i+out_batch_size,len(list_of_st))
        experts_b = list_of_st[start:end]
        # call predict function
        predictions_b, probabilities_b = get_predictions(experts_b, 4)
        predictions_b = np.argmax(predictions_b, axis =1).tolist()
        probabilities_b = [p[j] for (j,p) in zip(predictions_b, probabilities_b)]
        predictions = predictions + predictions_b
        probabilities = probabilities + probabilities_b
    assert len(list_of_st)==len(predictions)
    
    # create a list of tuples with relevant experts and their probability scores and sort
    res = [(all_experts[i], probabilities[i]) for i in range(len(predictions)) if predictions[i]==1]
    res.sort(key=operator.itemgetter(1),reverse=True)
    return [t[0] for t in res]

In [19]:
import math
eval_dict_list = []
def get_eval_metrics(row):
    """
    Get precision at 1, 5, 10, 15, 20, 25, 30
    """
    eval_dict = {}
    actual_experts = row['list_of_head']#.astype(object)
    pred_experts = get_experts(row['tail'], num_experts = total_num_experts)
#     pred_experts = row['Pred_experts']
    
    prec = []
    rel = []
    rec = []
    ap = []
    i=1
    while(True):
        overlap_i = [exp for exp in pred_experts[:i] if exp in actual_experts]
        num_overlap = len(overlap_i)
        prec_i = float(num_overlap)/i
        rec_i = float(num_overlap)/len(actual_experts)
        rel_i = 0

        
        if pred_experts[i-1] in actual_experts:
            rel_i = 1
        prec = prec + [prec_i]
        if i==5:
            eval_dict['P@5']=prec_i
        rec = rec + [rec_i]
        rel = rel + [rel_i]
        
        ap_i = np.mean([p for (j,p) in enumerate(prec) if rel[j]==1])
        if math.isnan(ap_i):
            ap_i = 0.0
        ap = ap + [ap_i]
        
        if rec_i == 1.0 or i>=len(pred_experts):
            if sum(rel)==0:
                print (row['tail'],len(pred_experts), len(actual_experts))
            eval_dict['Premature'] = False
            eval_dict['AP'] = np.mean([p for (j,p) in enumerate(ap) if rel[j]==1])
            break
        i = i + 1 
        
    # find MRR
    mrr = 0.0
    for i,exp in enumerate(pred_experts):
        if exp in actual_experts:
            mrr += 1.0/(i+1)
            break
    eval_dict['MRR'] = mrr

    eval_dict_list.append(eval_dict)
    return pred_experts

In [20]:
adv_rev['Pred_experts'] = adv_rev.apply(get_eval_metrics, axis =1)

Getting Experts for  ambient intelligence
Getting predictions for Batch 0
Getting predictions for Batch 1
Getting predictions for Batch 2
Getting predictions for Batch 3
Getting predictions for Batch 4
Getting predictions for Batch 5
Getting predictions for Batch 6
Getting predictions for Batch 7
Getting predictions for Batch 8
Getting predictions for Batch 9
Getting predictions for Batch 10
Getting predictions for Batch 11
Getting predictions for Batch 12
Getting predictions for Batch 0
Getting predictions for Batch 1
Getting predictions for Batch 2
Getting predictions for Batch 3
Getting predictions for Batch 4
Getting predictions for Batch 5
Getting predictions for Batch 6
Getting Experts for  applied computing
Getting predictions for Batch 0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Getting predictions for Batch 1
Getting predictions for Batch 2
Getting predictions for Batch 3
Getting predictions for Batch 4
Getting predictions for Batch 5
Getting predictions for Batch 6
Getting predictions for Batch 7
Getting predictions for Batch 8
Getting predictions for Batch 9
Getting predictions for Batch 10
Getting predictions for Batch 11
Getting predictions for Batch 12
Getting predictions for Batch 0
Getting predictions for Batch 1
Getting predictions for Batch 2
Getting predictions for Batch 3
Getting predictions for Batch 4
Getting predictions for Batch 5
Getting predictions for Batch 6
Getting Experts for  artificial intelligence
Getting predictions for Batch 0
Getting predictions for Batch 1
Getting predictions for Batch 2
Getting predictions for Batch 3
Getting predictions for Batch 4
Getting predictions for Batch 5
Getting predictions for Batch 6
Getting predictions for Batch 7
Getting predictions for Batch 8
Getting predictions for Batch 9
Getting predictions for 

IndexError: list index out of range

In [None]:
eval_df = pd.DataFrame(eval_dict_list)
eval_df[['P@5','AP','MRR','P@5']].isnull().sum(axis=0)

In [None]:
eval_df.fillna(0,inplace=True)

In [None]:
eval_df[['P@5','AP','MRR','P@5']].mean(axis=0)

In [None]:
eval_df[['P@5','AP','MRR','P@5']].std(axis=0)