In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, get_cosine_schedule_with_warmup
from seqeval.metrics import classification_report

from torch import cuda

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

  from .autonotebook import tqdm as notebook_tqdm
2023-09-25 18:04:58.317224: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-25 18:04:58.369487: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


cuda


## Launch the NER classifier and evaluate the predictions - CCRoberta-ep10

In [9]:
%%time
#df_test_data = pd.read_csv('data/test_GPT+labels.csv')
df_test_data = pd.read_csv('GPT_results/Human_corrected_annotations+gpt_res.csv')

stat_list = []
for trainset_num in range(1, 11):

    eval_list = []
    model_name = f'ner_model/allenai/scibert_scivocab_uncased_ft_3ep_train_size_10240_trainset_{trainset_num}'
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name,truncation=True, model_max_length=512)
    pipe = pipeline(task="token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="first")
    
    for index, row in df_test_data.iterrows():
    
        #let's remove repeated terms, keeping 1616 unique out of 1660 terms
        expected_list = set(row['plain_text_term'].split(';'))
        while '' in expected_list:
            expected_list.remove('')

        #when using test_GPT+labels.csv
        #extracted_list = pipe(row['sentence'])
        
        #when using test_GPT+labels.csv
        extracted_list = pipe(row['plain_text_def'])
        
        extracted_list = [x['word'].strip() for x in extracted_list] 
        while '' in extracted_list:
            extracted_list.remove('')
        
        num_TP = 0
        num_too_long = 0
        num_cut_off = 0
        num_split_term = 0
        TP_list = []
        ST_list = [] 
        
        for expected in expected_list:
            for extracted in extracted_list:
                
                if extracted.casefold() == expected.casefold():
                    num_TP = num_TP + 1
                    TP_list.append(expected)
                elif extracted.casefold() in expected.casefold():
                    num_cut_off = num_cut_off + 1
                elif expected.casefold() in extracted.casefold():
                    num_too_long = num_too_long + 1
                    
            expected_no_space = expected.replace(" ","")
            extracted_no_space = ("".join(extracted_list)).replace(" ","")
            if expected_no_space.casefold() in extracted_no_space.casefold(): # including TPs
                num_split_term = num_split_term + 1
                ST_list.append(expected)
        
        num_TP = num_TP - (len(TP_list) - len(set(TP_list)))
        num_split_term = num_split_term - (len(ST_list) -len(set(ST_list)))
        
        eval_list.append({'True Term Num' : len(expected_list),
                            'Extracted Term Num': len(extracted_list),
                            'TP': num_TP,
                            'Cut Off': num_cut_off,
                            'Too Long': num_too_long,
                            'Split Term': num_split_term,
                            'extracted': '###'.join(extracted_list)})
    df_eval = pd.DataFrame(eval_list)
    df_eval['expected'] = df_test_data['plain_text_term']    
    df_eval.to_csv(f'GPT_results/scibert_scivocab_uncased_ft_3ep_train_size_10240_trainset_{trainset_num}_first_eval.csv', index=False)

    num_T = df_eval['True Term Num'].sum()
    num_Ex = df_eval['Extracted Term Num'].sum()
    num_ST = df_eval['Split Term'].sum()
    precision = num_ST / num_Ex
    recall = num_ST / num_T
    stat_list.append({
        "model_name": model_name,
        "True Term Num": num_T,
        "Extracted Term Num": num_Ex,
        "True positive": df_eval['TP'].sum(),
        "True positive + split terms": num_ST,
        "Too Long": df_eval['Too Long'].sum(),
        "Cut Off": df_eval['Cut Off'].sum(),
        "precision /correct rate": precision,
        "recall": recall,
        "f1": 2 * precision * recall / ( precision + recall )
    })
df_stat = pd.DataFrame(stat_list)
df_stat.to_csv(f'GPT_results/scibert_scivocab_uncased_ft_3ep_train_size_10240_first_eval_stat.csv', index=False)

CPU times: user 1h 33min 27s, sys: 48.3 s, total: 1h 34min 15s
Wall time: 12min 13s


In [10]:
# Display the summary frame currently bound to `df_stat` (one row per evaluated model).
df_stat

Unnamed: 0,model_name,True Term Num,Extracted Term Num,True positive,True positive + split terms,Too Long,Cut Off,precision /correct rate,recall,f1
0,ner_model/allenai/scibert_scivocab_uncased_ft_...,1552,2534,936,1277,153,1735,0.503946,0.822809,0.625061
1,ner_model/allenai/scibert_scivocab_uncased_ft_...,1552,2182,829,1110,123,1483,0.508708,0.715206,0.594537
2,ner_model/allenai/scibert_scivocab_uncased_ft_...,1552,2543,902,1248,153,1731,0.490759,0.804124,0.609524
3,ner_model/allenai/scibert_scivocab_uncased_ft_...,1552,2471,922,1256,144,1655,0.508296,0.809278,0.62441
4,ner_model/allenai/scibert_scivocab_uncased_ft_...,1552,2428,929,1251,151,1570,0.515239,0.806057,0.628643
5,ner_model/allenai/scibert_scivocab_uncased_ft_...,1552,2584,925,1297,177,1703,0.501935,0.835696,0.627176
6,ner_model/allenai/scibert_scivocab_uncased_ft_...,1552,2432,902,1225,138,1683,0.503701,0.789304,0.61496
7,ner_model/allenai/scibert_scivocab_uncased_ft_...,1552,2476,914,1272,184,1500,0.513732,0.819588,0.631579
8,ner_model/allenai/scibert_scivocab_uncased_ft_...,1552,2655,961,1283,139,1759,0.483239,0.826675,0.609936
9,ner_model/allenai/scibert_scivocab_uncased_ft_...,1552,2525,895,1292,217,1588,0.511683,0.832474,0.633799


In [13]:
# Column-wise mean of the numeric columns of `df_stat`, shown as a one-column frame.
pd.DataFrame(df_stat.mean(numeric_only=True))

Unnamed: 0,0
True Term Num,1552.0
Extracted Term Num,2483.0
True positive,911.5
True positive + split terms,1251.1
Too Long,157.9
Cut Off,1640.7
precision /correct rate,0.504124
recall,0.806121
f1,0.619962


In [15]:
# Load a previously saved summary file (SciBERT, 5 epochs, train size 2048 per
# the file name) and show the mean of each numeric column.
df_stat_sci_2048 = pd.read_csv('GPT_results/scibert_scivocab_uncased_ft_5ep_train_size_2048_first_eval_stat.csv')
pd.DataFrame(df_stat_sci_2048.mean(numeric_only=True))
# NOTE(review): the disabled line below refers to df_stat_10_3, not to the frame
# loaded in this cell - looks like a copy-paste leftover.
#pd.DataFrame(df_stat_10_3.std(numeric_only=True))

Unnamed: 0,0
True Term Num,1552.0
Extracted Term Num,2481.2
True positive,863.2
True positive + split terms,1201.6
Too Long,171.2
Cut Off,1648.9
precision /correct rate,0.485267
recall,0.774227
f1,0.595982


In [None]:
# Load a previously saved summary file (cc_math_roberta_ep10, 3 epochs, train
# size 10240 per the file name) and show the standard deviation of each numeric
# column. Swap the commented line in to show the mean instead.
df_stat_10_3 = pd.read_csv('GPT_results/cc_math_roberta_ep10_ft_3ep_train_size_10240_first_eval_stat.csv')
#pd.DataFrame(df_stat_10_3.mean(numeric_only=True))
pd.DataFrame(df_stat_10_3.std(numeric_only=True))

In [165]:
# Load a previously saved summary file (cc_math_roberta_ep01, 3 epochs, train
# size 10240 per the file name) and show the standard deviation of each numeric
# column. Swap the commented line in to show the mean instead.
df_stat_01_3 = pd.read_csv('GPT_results/cc_math_roberta_ep01_ft_3ep_train_size_10240_first_eval_stat.csv')
#pd.DataFrame(df_stat_01_3.mean(numeric_only=True))
pd.DataFrame(df_stat_01_3.std(numeric_only=True))

  pd.DataFrame(df_stat_01_3.std())


Unnamed: 0,0
True Term Num,0.0
Extracted Term Num,333.799274
True positive,88.298798
True positive + split terms,96.693617
Too Long,37.24767
Cut Off,104.628656
precision /correct rate,0.05742
recall,0.062303
f1,0.042937


In [166]:
# Load a previously saved summary file (roberta-base, 3 epochs, train size 10240
# per the file name) and show the standard deviation of each numeric column.
# Swap the commented line in to show the mean instead.
df_stat_ro_3 = pd.read_csv('GPT_results/roberta-base_ft_3ep_train_size_10240_first_eval_stat.csv')
#pd.DataFrame(df_stat_ro_3.mean(numeric_only=True))
pd.DataFrame(df_stat_ro_3.std(numeric_only=True))

  pd.DataFrame(df_stat_ro_3.std())


Unnamed: 0,0
True Term Num,0.0
Extracted Term Num,85.684499
True positive,29.89686
True positive + split terms,24.253293
Too Long,24.680402
Cut Off,27.790686
precision /correct rate,0.023374
recall,0.015627
f1,0.010583


In [168]:
# Load a previously saved "corrected" summary file (cc_math_roberta_ep01,
# 5 epochs, train size 2048 per the file name) and show the standard deviation
# of each numeric column.
df_stat_01_5 = pd.read_csv('GPT_results/cc_math_roberta_ep01_ft_5ep_train_size_2048_first_eval_stat_corrected.csv')
# NOTE(review): the disabled line below refers to df_stat_ro_3, not to the frame
# loaded in this cell - looks like a copy-paste leftover.
#print(df_stat_ro_3.mean(numeric_only=True))
pd.DataFrame(df_stat_01_5.std(numeric_only=True))

Unnamed: 0,0
True Term Num,0.0
Extracted Term Num,693.232044
True positive,95.847622
True positive + split terms,132.830552
Too Long,65.555405
Cut Off,233.797253
precision /correct rate,0.081552
recall,0.085587
f1,0.044332


In [161]:
# Load a previously saved summary file (cc_math_roberta_ep10, 5 epochs, train
# size 2048 per the file name) and show the standard deviation of each numeric
# column. The commented lines are the mean variants of the same display.
df_stat_10_5 = pd.read_csv('GPT_results/cc_math_roberta_ep10_ft_5ep_train_size_2048_first_eval_stat.csv')
#pd.DataFrame(df_stat_10_5.mean(numeric_only=True))
#print(df_stat_10_5.mean(numeric_only=True))
pd.DataFrame(df_stat_10_5.std(numeric_only=True))

  pd.DataFrame(df_stat_10_5.std())


Unnamed: 0,0
True Term Num,0.0
Extracted Term Num,261.143128
True positive,100.494113
True positive + split terms,120.302628
Too Long,64.220886
Cut Off,80.113531
precision /correct rate,0.060666
recall,0.077515
f1,0.051962


In [163]:
# Load a previously saved "corrected" summary file (roberta-base, 5 epochs,
# train size 2048 per the file name) and show the standard deviation of each
# numeric column. Swap the commented line in to show the mean instead.
df_stat_ro_5 = pd.read_csv('GPT_results/roberta-base_ft_5ep_train_size_2048_first_eval_stat_corrected.csv')
#pd.DataFrame(df_stat_ro_5.mean(numeric_only=True))
pd.DataFrame(df_stat_ro_5.std(numeric_only=True))

  pd.DataFrame(df_stat_ro_5.std())


Unnamed: 0,0
True Term Num,0.0
Extracted Term Num,291.651161
True positive,86.461038
True positive + split terms,118.986694
Too Long,86.441759
Cut Off,46.124108
precision /correct rate,0.049217
recall,0.076667
f1,0.031259


In [4]:
# Mean precision, recall and F1 over the rows of `df_stat`.
# NOTE(review): `df_stat` is assigned by more than one cell in this notebook, so
# the values shown depend on which of them ran last in the kernel.
df_stat['precision /correct rate'].mean(), df_stat['recall'].mean(), df_stat['f1'].mean()

(0.6502595023873526, 0.7186881188118812, 0.6810759139456813)

## Update the test set and re-compute the evaluation results

In [123]:
# Align the old GPT-4 result file with the human-corrected test set.
#   rm_list     - positions of rows in the old file that are absent from the new one
#   update_list - rows (by position in the new file) whose expected terms changed
df_old_res = pd.read_csv('GPT_results/gpt-4_res+eval.csv')
df_updated_test_data = pd.read_csv('GPT_results/Human_corrected_annotations+gpt_res.csv')

old_gpt4_list = df_old_res['gpt-4 terms'].to_list()
old_tt_list = df_old_res['plain_text_term'].to_list()

new_gpt4_list = df_updated_test_data['gpt-4 terms'].to_list()
new_tt_list = df_updated_test_data['plain_text_term'].to_list()

# The new file is the old one with some rows removed, in the same order, so a
# single forward scan is enough: every old row skipped before reaching the next
# new row's 'gpt-4 terms' value is a removed row.
rm_list = []
j = 0
for i, new_gpt4 in enumerate(new_gpt4_list):
    while j < len(old_gpt4_list) and old_gpt4_list[j] != new_gpt4:
        print(j)
        rm_list.append(j)
        j = j + 1
    if j == len(old_gpt4_list):
        raise ValueError(f"row {i} of the updated test set has no matching row in the old results")
    j = j + 1

# Drop the removed rows so old and new lists line up position by position.
removed = set(rm_list)
old_gpt4_list = [value for pos, value in enumerate(old_gpt4_list) if pos not in removed]
old_tt_list = [value for pos, value in enumerate(old_tt_list) if pos not in removed]

update_list = [
    {'idx': i, 'expected': new_tt, 'old_expected': old_tt}
    for i, (old_tt, new_tt) in enumerate(zip(old_tt_list, new_tt_list))
    if old_tt != new_tt
]
        

24
111
198
271
336
337
437
438
439
440
441
442
443
444
445
446
447
448
600
867
868
869
870
871
881


In [124]:
# Display the rows whose expected terms differ between the old and the updated test set.
update_list

[{'idx': 365, 'expected': 'Poincaré lemma', 'old_expected': '(A Poincaré'},
 {'idx': 368, 'expected': 'non-regular', 'old_expected': 'non-'},
 {'idx': 496,
  'expected': 'Bounded difference assumption',
  'old_expected': '(Bounded difference assumption)'},
 {'idx': 505, 'expected': 'reduction', 'old_expected': 'reduction;i.e.'},
 {'idx': 514,
  'expected': 'substring;length',
  'old_expected': 'i;substring;length'},
 {'idx': 583,
  'expected': 'The Dehn-Sommerville Equations',
  'old_expected': '(The Dehn-Sommerville Equations)'},
 {'idx': 584,
  'expected': "McMullen's Upper Bound Theorem",
  'old_expected': "(McMullen's Upper Bound Theorem)"},
 {'idx': 585,
  'expected': 'The Generalized Lower Bound Theorem',
  'old_expected': '(The Generalized Lower Bound Theorem)'},
 {'idx': 586,
  'expected': 'The Generalized Dehn-Sommerville Equations',
  'old_expected': '(The Generalized Dehn-Sommerville Equations)'},
 {'idx': 587,
  'expected': 'Bayer and Ehrenborg',
  'old_expected': '(Bayer a

In [145]:
# Re-score saved predictions against the corrected test set.
# Relies on `rm_list` and `update_list` built by the test-set alignment cell.

# Shared stem of the per-model result files read and written below.
CORRECTED_RUN_NAME = 'cc_math_roberta_ep10_ft_3ep_train_size_10240'


def _score_terms(expected_terms, extracted_terms):
    """Compare the extracted spans of one row against its expected terms.

    All comparisons are case-insensitive (``str.casefold``).

    Parameters
    ----------
    expected_terms : iterable of str
        Gold terms of the row; duplicates are ignored.
    extracted_terms : list of str
        Extracted spans (already stripped, no empty strings).

    Returns
    -------
    dict
        'TP'         number of distinct expected terms matched exactly by
                     at least one extracted span;
        'Cut Off'    number of (expected, extracted) pairs where the
                     extracted span is a proper substring of the expected term;
        'Too Long'   number of pairs where the expected term is a proper
                     substring of the extracted span;
        'Split Term' number of expected terms found inside the concatenation
                     of all extracted spans once spaces are removed
                     (this includes the exact matches).
    """
    expected_terms = set(expected_terms)
    extracted_folded = [term.casefold() for term in extracted_terms]
    # Built once per row: it does not depend on the expected term.
    extracted_joined = "".join(extracted_terms).replace(" ", "").casefold()

    matched = set()
    num_cut_off = 0
    num_too_long = 0
    num_split_term = 0
    for expected in expected_terms:
        expected_folded = expected.casefold()
        for extracted in extracted_folded:
            if extracted == expected_folded:
                # A set keeps a term matched by several spans from counting twice.
                matched.add(expected)
            elif extracted in expected_folded:
                num_cut_off += 1
            elif expected_folded in extracted:
                num_too_long += 1

        if expected.replace(" ", "").casefold() in extracted_joined:
            num_split_term += 1

    return {'TP': len(matched),
            'Cut Off': num_cut_off,
            'Too Long': num_too_long,
            'Split Term': num_split_term}


# Step 1: bring each saved per-row file in line with the corrected test set.
for trainset_num in range(1, 11):

    df_eval = pd.read_csv(f'GPT_results/{CORRECTED_RUN_NAME}_trainset_{trainset_num}_first_eval.csv')
    # Remove the rows that no longer exist, then renumber so that position == row
    # number in the corrected test set.
    df_eval = df_eval.drop([ df_eval.index[i] for i in rm_list]).reset_index(drop=True)
    print(len(df_eval))

    # Write each corrected annotation at its own row. Replacing by value would
    # also rewrite any other row that happens to hold the same old text.
    expected_col = df_eval.columns.get_loc('expected')
    for x in update_list:
        df_eval.iloc[x['idx'], expected_col] = x['expected']

    df_eval.to_csv(f'GPT_results/{CORRECTED_RUN_NAME}_trainset_{trainset_num}_first_eval_tmp.csv',index=False)


# Step 2: recompute the per-row counts and the per-model summary.
stat_list = []

for trainset_num in range(1, 11):

    df_res = pd.read_csv(f'GPT_results/{CORRECTED_RUN_NAME}_trainset_{trainset_num}_first_eval_tmp.csv')
    eval_list =[]

    for expected_raw, extracted_string in zip(df_res['expected'], df_res['extracted']):

        # Terms are ';'-separated; drop repeats and empty fragments.
        expected_list = {term for term in expected_raw.split(';') if term}

        # An empty 'extracted' cell is read back from CSV as NaN.
        if pd.isna(extracted_string):
            extracted_list = []
        else:
            extracted_list = [term.strip() for term in extracted_string.split('###')]
            extracted_list = [term for term in extracted_list if term]

        counts = _score_terms(expected_list, extracted_list)
        eval_list.append({'True Term Num' : len(expected_list),
                          'Extracted Term Num': len(extracted_list),
                          'TP': counts['TP'],
                          'Cut Off': counts['Cut Off'],
                          'Too Long': counts['Too Long'],
                          'Split Term': counts['Split Term'],
                          'expected': expected_raw,
                          'extracted': extracted_string})
    df_eval = pd.DataFrame(eval_list)
    df_eval.to_csv(f'GPT_results/{CORRECTED_RUN_NAME}_trainset_{trainset_num}_first_eval_corrected.csv', index=False)
    num_T = df_eval['True Term Num'].sum()
    num_Ex = df_eval['Extracted Term Num'].sum()
    num_ST = df_eval['Split Term'].sum()
    # Precision and recall are based on the lenient "split term" count.
    precision = num_ST / num_Ex
    recall = num_ST / num_T
    # F1 is reported as 0.0 when nothing at all was recovered.
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    stat_list.append({
        "True Term Num": num_T,
        "Extracted Term Num": num_Ex,
        "True positive": df_eval['TP'].sum(),
        "True positive + split terms": num_ST,
        "Too Long": df_eval['Too Long'].sum(),
        "Cut Off": df_eval['Cut Off'].sum(),
        "precision /correct rate": precision,
        "recall": recall,
        "f1": f1
    })
df_stat = pd.DataFrame(stat_list)
df_stat.to_csv(f'GPT_results/{CORRECTED_RUN_NAME}_first_eval_stat_corrected.csv', index=False)

999
999
999
999
999
999
999
999
999
999


In [146]:
# Mean precision, recall and F1 over the rows of `df_stat`.
# NOTE(review): `df_stat` is assigned by more than one cell in this notebook, so
# the values shown depend on which of them ran last in the kernel.
df_stat['precision /correct rate'].mean(), df_stat['recall'].mean(), df_stat['f1'].mean()

(0.6516446172588075, 0.742590206185567, 0.6924612342211107)

In [1]:
# GPT-4
# F1 (harmonic mean) of the precision / recall values entered below.
p, r = 0.6248288452761296, 0.8820876288659794
2 * p * r / (p + r)

0.7314987977558107

In [2]:
# GPT-3.5
# F1 (harmonic mean) of the precision / recall values entered below.
p, r = 0.19291161956034095, 0.8311855670103093
2 * p * r / (p + r)

0.31314479912610754

In [15]:
#roberta_base testset 10 11367
# F1 (harmonic mean) of the precision / recall values entered below.
p, r = 0.7010250569476082, 0.7617574257425742
2 * p * r / (p + r)


0.7301304863582444

In [1]:
#roberta_base testset 1 11366
# F1 (harmonic mean) of the precision / recall values entered below.
p, r = 0.7470198675496689, 0.698019801980198
2 * p * r / (p + r)

0.7216890595009596