## Model comparison script

In [1]:
import pandas as pd
import numpy as np
import s3fs
import pyarrow.parquet as pq

In [2]:
import os
import random
import statistics

import fastparquet

In [3]:
# Shared S3 filesystem handle; it is passed as `filesystem=` to the pyarrow parquet readers.
s3 = s3fs.S3FileSystem()

In [4]:
# Models whose predictions are combined. Each name is also used to build the
# `predictions_<model>.parquet` file name and the `<model>_preds` column name.
contesting_models = ['roberta', 'scibert', 'deberta']
# The planning notes at the end of the notebook mention breaking ties with this model;
# no code visible in this notebook reads it.
best_model = 'deberta'

In [5]:
# Per-model scores (the metric is not stated in this notebook).
model_scores = {'roberta': 0.88, 'scibert': 0.87, 'deberta': 0.89}

# Normalize the scores into weights that sum to 1. The denominator is derived from
# `model_scores` itself, so editing a score can no longer leave a stale, hand-typed total.
_total_score = sum(model_scores.values())
model_weights = {name: score / _total_score for name, score in model_scores.items()}

In [6]:
# Display the normalized per-model weights.
model_weights

{'scibert': 0.32954545454545453,
 'roberta': 0.3333333333333333,
 'deberta': 0.3371212121212121}

In [7]:
# Mapping from class name to its integer label id.
class_dict = {
    "human": 0,
    "NLTK_synonym_replacement": 1,
    "chatgpt": 2,
    "summarized": 3,
}
# All label ids in ascending order.
class_labels_list = sorted(class_dict.values())

In [8]:
# Note: when the data lives on S3, call merge_model_predictions() (defined in the next cell) to load dev_data and merge in the predictions of every model in contesting_models.

In [9]:
def merge_model_predictions():
    """Load the dev set from S3 and join every contesting model's predictions onto it.

    For each name in ``contesting_models`` the file
    ``s3://dagpapsubmission/predictions_<model>.parquet`` is read, its ``preds``
    column is renamed to ``<model>_preds``, and the frame is inner-joined to the
    dev data on the index. The shape after each join is printed.

    Returns:
        pandas.DataFrame: the dev data with the prediction columns merged in.
    """
    dev_df = pq.ParquetDataset('s3://dagpapsubmission/data/data_dev_data.parquet', filesystem=s3).read_pandas().to_pandas()

    for model in contesting_models:
        model_df = pq.ParquetDataset(f's3://dagpapsubmission/predictions_{model}.parquet', filesystem=s3).read_pandas().to_pandas()
        # DataFrame.rename takes `columns=` (plural); the previous `column=` keyword
        # raised TypeError before any merge could happen.
        model_df = model_df.rename(columns={'preds': f'{model}_preds'})

        # Inner join on the index: rows missing from either side are dropped.
        dev_df = dev_df.merge(model_df, how='inner', left_index=True, right_index=True)
        print(f"Data shape after merging with {model} model {dev_df.shape}")

    return dev_df

In [10]:
def get_max_repeated_pred(input_df):
    """Combine the three models' token-level predictions by majority vote.

    For every row, the i-th entries of ``roberta_preds``, ``scibert_preds`` and
    ``deberta_preds`` are compared. If exactly one label is the most common, it
    is used. Otherwise (no unique mode) one of the three candidate labels is
    drawn at random, each weighted by its model's entry in ``model_weights``.

    Args:
        input_df: DataFrame with a ``tokens`` column and the three
            ``<model>_preds`` columns. Each prediction cell is indexed from
            ``0`` to ``len(tokens) - 1``.

    Returns:
        The same ``input_df`` object, modified in place: a ``preds`` column is
        added that holds one list of combined labels per row.

    Note:
        Tie-breaking goes through ``random.choices``, so results vary between
        runs unless the ``random`` module is seeded beforehand.
    """
    input_df['preds'] = None
    for index, row in input_df.iterrows():
        n_tokens = len(row['tokens'])
        # Pull the per-row prediction sequences once instead of once per token.
        roberta = row['roberta_preds']
        scibert = row['scibert_preds']
        deberta = row['deberta_preds']

        combined_preds_max = [0] * n_tokens
        for i in range(n_tokens):
            preds_data = [roberta[i], scibert[i], deberta[i]]
            max_repeated = statistics.multimode(preds_data)
            if len(max_repeated) == 1:
                combined_preds_max[i] = max_repeated[0]
            else:
                # No unique majority label: weighted *random* pick among the three
                # candidates (not an average), weights in the same order as preds_data.
                combined_preds_max[i] = random.choices(
                    preds_data,
                    weights=[model_weights['roberta'],
                             model_weights['scibert'],
                             model_weights['deberta']],
                    k=1,
                )[0]
        input_df.at[index, 'preds'] = combined_preds_max
    return input_df
    

### Scratch pad

In [None]:
# Note: Read data from S3 instead of reading from local

In [None]:
# Load the dev data directly from S3 (this cell has no execution count; the local read below is what was run).
dev_df = pq.ParquetDataset('s3://dagpapsubmission/data/data_dev_data.parquet', filesystem=s3).read_pandas().to_pandas()

In [11]:
# Local data folder. Set the DAGPAP24_DATA_DIR environment variable to run the notebook
# on another machine; the original absolute path remains the fallback.
base_folder = os.environ.get(
    "DAGPAP24_DATA_DIR",
    "/Users/gayatri/Documents/Gayatri/US/Self projects/AI Competition/DAGPAP24/data",
)

In [18]:
# Read the dev split from the local data folder, then report its shape and preview it.
dev_df = pd.read_parquet(os.sep.join((base_folder, 'dev_data.parquet')), engine="fastparquet")
print(dev_df.shape)
dev_df.head()

(5000, 2)


Unnamed: 0_level_0,text,tokens
index,Unnamed: 1_level_1,Unnamed: 2_level_1
12313,Phylogenetic networks are a generalization of ...,"[Phylogenetic, networks, are, a, generalizatio..."
3172,Prediction modelling is more closely aligned w...,"[Prediction, modelling, is, more, closely, ali..."
6451,The heat transfer exhibits the flow of heat (t...,"[The, heat, transfer, exhibits, the, flow, of,..."
4351,a common experience during superficial ultraso...,"[a, common, experience, during, superficial, u..."
22694,Code metadata Current code version v1.5.9 Perm...,"[Code, metadata, Current, code, version, v1.5...."


In [19]:
# Check which columns the dev data has before merging the predictions in.
dev_df.columns

Index(['text', 'tokens'], dtype='object')

In [28]:
# Work on a deep copy so that `dev_df` itself is not touched by the merges.
merged = dev_df.copy(deep=True)

for model in contesting_models:
    # Load this model's local predictions and give its `preds` column a model-specific name.
    model_df = pd.read_parquet(
        base_folder + os.sep + f'predictions_{model}.parquet',
        engine="fastparquet",
    ).rename(columns={'preds': f'{model}_preds'})

    # Inner join on the index, so only rows present on both sides survive.
    merged = pd.merge(merged, model_df, how='inner', left_index=True, right_index=True)
    print(f"Data shape after merging with {model} model {merged.shape}")

Data shape after merging with roberta model (5000, 3)
Data shape after merging with scibert model (5000, 4)
Data shape after merging with deberta model (5000, 5)


In [29]:
# Preview the dev data with the three `<model>_preds` columns attached.
merged.head()

Unnamed: 0_level_0,text,tokens,roberta_preds,scibert_preds,deberta_preds
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12313,Phylogenetic networks are a generalization of ...,"[Phylogenetic, networks, are, a, generalizatio...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3172,Prediction modelling is more closely aligned w...,"[Prediction, modelling, is, more, closely, ali...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6451,The heat transfer exhibits the flow of heat (t...,"[The, heat, transfer, exhibits, the, flow, of,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4351,a common experience during superficial ultraso...,"[a, common, experience, during, superficial, u...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
22694,Code metadata Current code version v1.5.9 Perm...,"[Code, metadata, Current, code, version, v1.5....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [93]:
%%time
# get_max_repeated_pred adds the `preds` column to its argument in place and returns that
# same object, so `max_pred_df` is an alias of `merged`, not a copy.
max_pred_df = get_max_repeated_pred(merged)
print(max_pred_df.shape)
# Null count per column.
max_pred_df.isna().sum()

(5000, 6)
CPU times: user 1min 27s, sys: 1.17 s, total: 1min 28s
Wall time: 1min 29s


text             0
tokens           0
roberta_preds    0
scibert_preds    0
deberta_preds    0
preds            0
dtype: int64

In [95]:
# Sanity check of one normalized weight: the 0.88 score divided by the sum of the three scores.
0.88/(0.87+0.88+0.89)

0.3333333333333333

In [94]:
# Write only the combined `preds` column to a parquet file in the local data folder.
max_pred_df[['preds']].to_parquet(base_folder + os.sep + 'predictions_three_models_combined.parquet') # , engine="fastparquet")

In [96]:
# `merged` now carries the `preds` column too, because get_max_repeated_pred modified it in place.
merged.head()

Unnamed: 0_level_0,text,tokens,roberta_preds,scibert_preds,deberta_preds,preds
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12313,Phylogenetic networks are a generalization of ...,"[Phylogenetic, networks, are, a, generalizatio...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3172,Prediction modelling is more closely aligned w...,"[Prediction, modelling, is, more, closely, ali...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6451,The heat transfer exhibits the flow of heat (t...,"[The, heat, transfer, exhibits, the, flow, of,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4351,a common experience during superficial ultraso...,"[a, common, experience, during, superficial, u...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
22694,Code metadata Current code version v1.5.9 Perm...,"[Code, metadata, Current, code, version, v1.5....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [97]:
# Export the full merged frame (inputs, per-model predictions and combined `preds`) as CSV.
merged.to_csv(base_folder + os.sep + 'dev_majority_model_preds.csv')

In [None]:
# Read the dev_data.parquet file
# Read the prediction parquet files from S3 for all the contesting_models
# Create another column that holds, for every token, the count of each predicted label. For example, if the first text has 10 tokens, each of those tokens gets a dict mapping label -> number of models that predicted it
# Create another column that holds the label with the highest count. If there is a tie, resolve it by using the label from the best_model


In [None]:
# Read the input file along with the expected labels
    
# Read the model predictions for the training data and its labels for all the contesting models
    
# Calculate the f1-score for each row in the data (If possible, use the same function as the baseline for this calculation)
    
# Identify the training rows on which all the models agree in their predictions
    # Mark a data row as a conflict if either:
        # a) the models do not all make the same prediction; or
        # b) the models make the same prediction, but that prediction is incorrect.
    
# Resolve conflict rows with the help of another model/ approach
    
# Save this model trained in the second phase to be used on test data

In [None]:
# Test this on the dev_data.parquet

In [None]:
#Weighted Random sample
