## Model comparison script

In [1]:
import pandas as pd
import numpy as np
import s3fs
import pyarrow.parquet as pq
import os
import fastparquet
import random
import statistics
import traceback
import ast
from collections import Counter
import langchain
import json
from langchain.agents import create_openai_functions_agent
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate

In [2]:
s3 = s3fs.S3FileSystem()

In [3]:
contesting_models = ['roberta', 'scibert', 'deberta', 'biomed_roberta', 'cs_roberta']

In [4]:
# TODO: Make changes in this cell after cs_roberta
# Save the predictions_cs_roberta.parquet file to S3 dagpapsubmission

total_score = (0.220629 + 0.22031 + 0.220398 + 0.2204659 + 0.220459 )
model_scores = {'roberta': 0.22031, 'scibert': 0.220629, 'deberta': 0.220398, 'biomed_roberta': 0.2204659,'cs_roberta':  0.220459 }
total_score = sum(list(map(lambda x:model_scores[x],list(model_scores.keys()))))
model_weights = {"scibert": model_scores['scibert']/total_score,
                 "roberta": model_scores['roberta']/total_score,
                 "deberta": model_scores['deberta']/total_score,
                 "biomed_roberta": model_scores['biomed_roberta']/total_score,
                 "cs_roberta": model_scores['biomed_roberta']/total_score}

In [5]:
model_weights

{'scibert': 0.20016023415124842,
 'roberta': 0.19987082924666089,
 'deberta': 0.1999506650824092,
 'biomed_roberta': 0.20001226568749222,
 'cs_roberta': 0.20001226568749222}

In [6]:
# class_dict = {"human":0,"NLTK_synonym_replacement":1,"chatgpt":2,"summarized":3}
# class_labels_list = [0, 1, 2, 3]

In [7]:
def merge_model_predictions():
    dev_df = pq.ParquetDataset('s3://dagpapsubmission/data/data_dev_data.parquet', filesystem=s3).read_pandas().to_pandas()
    dev_df["tokens"] = dev_df.tokens.map(lambda x:ast.literal_eval(x.decode()))
    
    for model in contesting_models[0:-1]:
        model_df = pq.ParquetDataset(f's3://dagpapsubmission/predictions_{model}.parquet', filesystem=s3).read_pandas().to_pandas()
        model_df.rename(columns={'preds': f'{model}_preds'}, inplace=True)
        
        dev_df = dev_df.merge(model_df, how='inner', left_index=True, right_index=True)
        print(f"Data shape after merging with {model} model {dev_df.shape}")
    
    return dev_df

In [8]:
#Load MAG-FOS Taxonomy JSON for different fields of study"
with open('MAG_FOS.json',"r+") as f:
    mag_fox_taxonomy = json.load(f)
mag_fox_taxonomy

{'FOS': [{'field_of_study': 'Art',
   'sub_fields': ['Aesthetics, Art History, Classics, Humanities, Literature, Visual Arts']},
  {'field_of_study': 'Biology',
   'sub_fields': ['Anatomy, Animal Science, Bioinformatics, Botany, Genetics, Immunology, Zoology']},
  {'field_of_study': 'Business',
   'sub_fields': ['Accounting, Actuarial Science, Commerce, Finance, International Trade, Marketing']},
  {'field_of_study': 'Chemistry',
   'sub_fields': ['Biochemistry, Food Science, Mineralogy, Organic Chemistry, Radiochemistry']},
  {'field_of_study': 'Computer Science',
   'sub_fields': ['Algorithm, Artificial Intelligence, Database, Internet Privacy, Parallel Computing']},
  {'field_of_study': 'Economics',
   'sub_fields': ['Accounting, International Trade, Management, Political Economy, Socioeconomics']},
  {'field_of_study': 'Engineering',
   'sub_fields': ['Aeronautics, Control Theory, Nuclear Engineering, Simulation, Systems-Engineering']},
  {'field_of_study': 'Environmental Science',

In [9]:
#Setup and test the LLM Instance for all tasks with respect to this analysis
llm_models = ['gpt-4-turbo-2024-04-09']
llms = list(map(lambda x: ChatOpenAI(model=x, temperature=0),llm_models))
llm_tests = list(map(lambda x:x("who are you, give me your model name and version?"),llms))

ValidationError: 1 validation error for ChatOpenAI
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)

In [10]:
import os
print (os.environ['OPENAI_API_KEY'])

KeyError: 'OPENAI_API_KEY'

In [None]:
def get_max_repeated_pred(input_df):
    input_df['preds'] = None
    input_df['majority_model'] = 'baseline'
    input_df['majority_col'] = None
    for index, row in input_df.iterrows():
        combined_preds_max = [0] * len(row['tokens'])
        majority_model_prediction = ['baseline'] * len(row['tokens'])
        majority_col_val = [0] * len(row['tokens'])
    
        for i in range(len(row['tokens'])):
            preds_data = [row['roberta_preds'][i], row['scibert_preds'][i], row['deberta_preds'][i],
                              row['biomed_roberta_preds'][i], row['cs_roberta_preds'][i]]
            
            max_repeated = statistics.multimode(preds_data)
            if len(max_repeated) != 1:
                # Weighted avg
                combined_preds_max[i] = random.choices(
                    preds_data, weights=[model_weights['roberta'], model_weights['scibert'],
                                         model_weights['deberta'], model_weights['biomed_roberta'], 
                                         model_weights['cs_roberta']],
                    k=1)[0]
                if combined_preds_max[i] == row['deberta_preds'][i]:
                    majority_model_prediction[i] = 'deberta'
                elif combined_preds_max[i] == row['biomed_roberta_preds'][i]:
                    majority_model_prediction[i] = 'biomed_roberta'
                elif combined_preds_max[i] == row['roberta_preds'][i]:
                    majority_model_prediction[i] = 'roberta'
                elif combined_preds_max[i] == row['cs_roberta_preds'][i]:
                    majority_model_prediction[i] = 'cs_roberta'
                else:
                    majority_model_prediction[i] = 'scibert'
                majority_col_val[i] = 0
            else:
                combined_preds_max[i] = max_repeated[0]
                if (row['deberta_preds'][i] == row['biomed_roberta_preds'][i]) and \
                (row['roberta_preds'][i] == row['biomed_roberta_preds'][i]) and \
                (row['roberta_preds'][i] == row['scibert_preds'][i]) and \
                (row['deberta_preds'][i] == row['scibert_preds'][i]) and \
                (row['cs_roberta_preds'][i] == row['scibert_preds'][i]):
                    majority_model_prediction[i] = 'all'
                else:
                    majority_model_prediction[i] = 'majority'
                majority_col_val[i] = 1
                    
        input_df.at[index,'preds'] = combined_preds_max
        input_df.at[index, 'majority_model'] = majority_model_prediction
        input_df.at[index, 'majority_col'] = majority_col_val
        
    return input_df

### Function calls to make the predictions and save the parquet file

In [None]:
%%time
merged_model_dev_predictions = merge_model_predictions()

In [None]:
merged_model_dev_predictions.head()

In [None]:
%%time
majority_vote_df = get_max_repeated_pred(merged_model_dev_predictions)
majority_vote_df.head()

In [None]:
majority_vote_df[['preds']].to_parquet('predictions_four_models.parquet')

In [None]:
majority_vote_df.head()

# End here

### Finding the stats about the majority column

In [None]:
def get_model_stats(row):
    stats_dict = row['stats']
    out_dict = {}
    out_keys =  ['roberta', 'scibert', 'deberta', 'biomed_roberta',  'all']
    out_dict = {key:stats_dict[key] if key in list(stats_dict.keys()) else 0 for key in out_keys}
    
    return out_dict

In [None]:
majority_vote_df['stats'] = majority_vote_df[['majority_model']].map(lambda x: Counter(x))
# majority_vote_df.head()

In [None]:
stats_df = majority_vote_df[['tokens', 'stats', 'majority_col']]
stats_df.head()

In [None]:
stats_df['len_tokens'] = stats_df['tokens'].map(lambda x : len(x))
stats_df.drop(columns=['tokens'], inplace=True)

In [None]:
stats_df['sum_majority'] = stats_df['majority_col'].map(lambda x : sum(x))

In [None]:
stats_df['sum_majority_not_all'] = stats_df['sum_majority'] - stats_df['all']
stats_df.head()

In [None]:
stats_df['pct_majority'] = 100 * stats_df['sum_majority']/stats_df['len_tokens']

In [None]:
stats_df[['roberta', 'scibert', 'deberta', 'biomed_roberta',  'all']] = stats_df.apply(
    lambda x : get_model_stats(x), axis=1, result_type='expand')

In [None]:
stats_df.describe()

In [None]:
stats_df['pct_majority'].hist()

In [None]:
stats_df['roberta'].hist()

In [None]:
stats_df['scibert'].hist()

In [None]:
stats_df['deberta'].hist()

In [None]:
stats_df['biomed_roberta'].hist()

In [None]:
stats_df['all'].hist()

# For local runs

In [None]:
base_folder = "/Users/gayatri/Documents/Gayatri/US/Self projects/AI Competition/DAGPAP24/data"

In [None]:
dev_df = pd.read_parquet(base_folder + os.sep + 'dev_data.parquet', engine="fastparquet")
print(dev_df.shape)
dev_df.head()

In [None]:
dev_df.columns

In [None]:
merged = dev_df.copy(deep=True)

for model in contesting_models:
    model_df = pd.read_parquet(base_folder + os.sep + f'predictions_{model}.parquet', engine="fastparquet")
    model_df.rename(columns={'preds': f'{model}_preds'}, inplace=True)

    merged = merged.merge(model_df, how='inner', left_index=True, right_index=True)
    print(f"Data shape after merging with {model} model {merged.shape}")

In [None]:
merged.head()

In [None]:
%%time
max_pred_df = get_max_repeated_pred(merged)
print(max_pred_df.shape)
max_pred_df.isna().sum()

In [None]:
0.88/(0.87+0.88+0.89)

In [None]:
max_pred_df[['preds']].to_parquet(base_folder + os.sep + 'predictions_three_models_combined.parquet') # , engine="fastparquet")

In [None]:
merged.head()

In [None]:
merged.to_csv(base_folder + os.sep + 'dev_majority_model_preds.csv')

In [98]:
merged.to_parquet(base_folder + os.sep + 'dev_majority_model_preds.parquet')