In [1]:
import pandas as pd
import fastparquet

## Testing why merging test_data has indexing issues

In [2]:
# Read the test_data and read all the indexes in it
test = pd.read_parquet('data/test_data.parquet', engine='fastparquet')
print(test.shape)
test_indexes = test.index.tolist()
print(len(test_indexes))
print(test_indexes[0:50])

(20000, 3)
20000
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]


In [3]:
test.head()

Unnamed: 0,index,text,tokens
0,14149,patient on a regular basis show with somatic s...,"[patient, on, a, regular, basis, show, with, s..."
1,8946,Hydropower is a great source of energy product...,"[Hydropower, is, a, great, source, of, energy,..."
2,22378,There has been an exponential growth in the cr...,"[There, has, been, an, exponential, growth, in..."
3,12162,Automotive and aerospace industries have putti...,"[Automotive, and, aerospace, industries, have,..."
4,4879,This paper contributes to attempts of reconsid...,"[This, paper, contributes, to, attempts, of, r..."


In [4]:
test.set_index("index", inplace=True)
test.head()

Unnamed: 0_level_0,text,tokens
index,Unnamed: 1_level_1,Unnamed: 2_level_1
14149,patient on a regular basis show with somatic s...,"[patient, on, a, regular, basis, show, with, s..."
8946,Hydropower is a great source of energy product...,"[Hydropower, is, a, great, source, of, energy,..."
22378,There has been an exponential growth in the cr...,"[There, has, been, an, exponential, growth, in..."
12162,Automotive and aerospace industries have putti...,"[Automotive, and, aerospace, industries, have,..."
4879,This paper contributes to attempts of reconsid...,"[This, paper, contributes, to, attempts, of, r..."


In [5]:
scibert_preds_df = pd.read_parquet(f'data/test_data_predictions_scibert.parquet',
                                         engine='fastparquet')
print(scibert_preds_df.shape)
scibert_preds_df.head()

(20000, 1)


Unnamed: 0_level_0,preds
index,Unnamed: 1_level_1
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."


In [6]:
merged_test_preds = scibert_preds_df.merge(test, left_index=True, right_index=True, how='inner')
print(merged_test_preds.shape)

(20000, 3)


In [None]:
_preds

In [3]:
def merge_test_model_predictions(model_list):
    test_df = pd.read_parquet('data/test_data.parquet', engine='fastparquet')
    if test_df.index.name != "index":
        test_df.set_index("index", inplace=True)

    merged_df = None
    for model in model_list:
        test_preds_df = pd.read_parquet(f'data/test_data_predictions_{model}.parquet',
                                         engine='fastparquet')
        test_preds_df.rename(columns={'preds': f'{model}_preds'}, inplace=True)
        
        if merged_df is None:
            merged_df = test_preds_df.copy(deep=True)
        else:
            merged_df = merged_df.merge(test_preds_df, how='inner', left_index=True, right_index=True)
        print(merged_df.shape)

    merged_test_preds = merged_df.merge(test_df, left_index=True, right_index=True, how='inner')
    print(f"Final Merged File Shape = {merged_test_preds.shape}")
    merged_test_preds.to_parquet('data/merged_test_predictions.parquet')
    
    return merged_test_preds


In [None]:
%%time
merge_test_model_predictions(['scibert', 'roberta'])

In [1]:
import pandas as pd
import fastparquet


def merge_test_model_predictions(model_list):
    test_df = pd.read_parquet('data/test_data.parquet', engine='fastparquet')
    if test_df.index.name != "index":
        test_df.set_index("index", inplace=True)

    merged_df = None
    for model in model_list:
        test_preds_df = pd.read_parquet(f'data/test_data_predictions_{model}.parquet',
                                         engine='fastparquet')
        test_preds_df.rename(columns={'preds': f'{model}_preds'}, inplace=True)
        
        if merged_df is None:
            merged_df = test_preds_df.copy(deep=True)
        else:
            merged_df = merged_df.merge(test_preds_df, how='inner', left_index=True, right_index=True)
        print(merged_df.shape)

    merged_test_preds = merged_df.merge(test_df, left_index=True, right_index=True, how='inner')
    print(f"Final Merged File Shape = {merged_test_preds.shape}")
    merged_test_preds.to_parquet('data/merged_test_predictions.parquet')
    print("NAs in final merged file")
    print(merged_test_preds.isna())
    
    return merged_test_preds


In [2]:

# To merge the test prediction files
contesting_models = ['roberta', 'scibert', 'deberta', 'biomed_roberta', 'cs_roberta']
model_list = contesting_models[:3]
print(model_list)
test_pred_df = merge_test_model_predictions(model_list)

['roberta', 'scibert', 'deberta']
(20000, 1)
(20000, 2)
(20000, 3)
Final Merged File Shape = (20000, 5)
NAs in final merged file
       roberta_preds  scibert_preds  deberta_preds   text  tokens
index                                                            
0              False          False          False  False   False
1              False          False          False  False   False
2              False          False          False  False   False
3              False          False          False  False   False
4              False          False          False  False   False
...              ...            ...            ...    ...     ...
24995          False          False          False  False   False
24996          False          False          False  False   False
24997          False          False          False  False   False
24998          False          False          False  False   False
24999          False          False          False  False   False

[20000 rows 