## Model comparison script

In [79]:
import pandas as pd
import numpy as np
import s3fs
import pyarrow.parquet as pq
import os
import fastparquet
import random
import statistics
import traceback
import ast
from collections import Counter
import langchain
import json
from langchain.agents import create_openai_functions_agent
from langchain_openai import ChatOpenAI
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Dict, List
import re
import string
from openai import RateLimitError
from dotenv import load_dotenv
import time
load_dotenv("../_envvars.txt")

True

In [80]:
s3 = s3fs.S3FileSystem()

In [81]:
contesting_models = ['roberta', 'scibert', 'deberta', 'biomed_roberta', 'cs_roberta']

In [82]:
# TODO: Make changes in this cell after cs_roberta
# Save the predictions_cs_roberta.parquet file to S3 dagpapsubmission

total_score = (0.220629 + 0.22031 + 0.220398 + 0.2204659 + 0.2204 )
model_scores = {'roberta': 0.22031, 'scibert': 0.220629, 'deberta': 0.220398, 'biomed_roberta': 0.2204659,'cs_roberta':  0.220459 }
total_score = sum(list(map(lambda x:model_scores[x],list(model_scores.keys()))))
model_weights = {"scibert": model_scores['scibert']/total_score,
                 "roberta": model_scores['roberta']/total_score,
                 "deberta": model_scores['deberta']/total_score,
                 "biomed_roberta": model_scores['biomed_roberta']/total_score,
                 "cs_roberta": model_scores['biomed_roberta']/total_score}

In [83]:
model_weights

{'scibert': 0.20016023415124842,
 'roberta': 0.19987082924666089,
 'deberta': 0.1999506650824092,
 'biomed_roberta': 0.20001226568749222,
 'cs_roberta': 0.20001226568749222}

In [84]:
# class_dict = {"human":0,"NLTK_synonym_replacement":1,"chatgpt":2,"summarized":3}
# class_labels_list = [0, 1, 2, 3]

In [85]:
def merge_model_predictions():
    dev_df = pq.ParquetDataset('s3://dagpapsubmission/data/data_dev_data.parquet', filesystem=s3).read_pandas().to_pandas()
    dev_df["tokens"] = dev_df.tokens.map(lambda x:ast.literal_eval(x.decode()))
    
    for model in contesting_models:
        model_df = pq.ParquetDataset(f's3://dagpapsubmission/predictions_{model}.parquet', filesystem=s3).read_pandas().to_pandas()
        model_df.rename(columns={'preds': f'{model}_preds'}, inplace=True)
        
        dev_df = dev_df.merge(model_df, how='inner', left_index=True, right_index=True)
        print(f"Data shape after merging with {model} model {dev_df.shape}")
    
    return dev_df

In [86]:
#Load MAG-FOS Taxonomy JSON for different fields of study"
with open('MAG_FOS.json',"r+") as f:
    mag_fos_taxonomy = json.load(f)
mag_fos_taxonomy
major_fields_of_study = list(map(lambda x:x['field_of_study'],mag_fos_taxonomy["FOS"]))
major_fields_of_study_str = ",".join(major_fields_of_study)
sub_areas_within_major_field_of_study_list = list(map(lambda x:{x['field_of_study']:x['sub_fields']},mag_fos_taxonomy["FOS"]))
sub_areas_within_major_field_of_study = {list(fos.keys())[0]:fos[list(fos.keys())[0]] for fos in sub_areas_within_major_field_of_study_list}
sub_areas_within_major_field_of_study_str = "\n".join(f"{k}:{v}" for k,v in sub_areas_within_major_field_of_study.items())

In [58]:
#define Pydantic class for Structured output for article field of study
class ArticleFieldOfStudy(BaseModel):
    major_field_of_study: str = Field(description="The major field of study associated with the text of the article")
    sub_areas_within_major_field_of_study: List[str] = Field(description="A list sub areas within the major field of study associated with the text of the article")
    allied_field_of_study: List[str] = Field(description="List of other major fields of study associated with the text of the article")    

In [87]:
article_fos_dict_schema = convert_to_openai_tool(ArticleFieldOfStudy)

In [88]:
#Setup and test the LLM Instance for all tasks with respect to this analysis
llm_models = ['gpt-4-turbo-2024-04-09', 'gpt-3.5-turbo-0125']
llms = list(map(lambda x: ChatOpenAI(model=x, temperature=0),llm_models))
llm_tests = list(map(lambda x:x.invoke("who are you, give me your model name and version?"),llms))
llm_tests

[AIMessage(content='I am an AI language model created by OpenAI, known as ChatGPT. My responses are generated based on a mixture of licensed data, data created by human trainers, and publicly available data. I do not have a specific model name or version like software might, but I am based on the GPT (Generative Pre-trained Transformer) architecture.', response_metadata={'token_usage': {'completion_tokens': 71, 'prompt_tokens': 19, 'total_tokens': 90}, 'model_name': 'gpt-4-turbo-2024-04-09', 'system_fingerprint': 'fp_46a93fa712', 'finish_reason': 'stop', 'logprobs': None}, id='run-abf8a15d-9957-40aa-b264-6479f1c08f51-0'),
 AIMessage(content='I am an AI digital assistant created by OpenAI. I do not have a specific model name or version as I am constantly learning and updating my capabilities.', response_metadata={'token_usage': {'completion_tokens': 31, 'prompt_tokens': 19, 'total_tokens': 50}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': 'fp_3b956da36b', 'finish_reason': '

In [77]:
? llm.with_structured_output

[0;31mSignature:[0m
 [0mllm[0m[0;34m.[0m[0mwith_structured_output[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mschema[0m[0;34m:[0m [0;34m'Optional[_DictOrPydanticClass]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmethod[0m[0;34m:[0m [0;34m"Literal['function_calling', 'json_mode']"[0m [0;34m=[0m [0;34m'function_calling'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minclude_raw[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m:[0m [0;34m'Any'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'Runnable[LanguageModelInput, _DictOrPydantic]'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Model wrapper that returns outputs formatted to match the given schema.

        Args:
            schema: The output schema as a dict or a Pydantic class. If a Pydantic class
               

In [60]:
#Create a ChatPromptTempate for executing 
system = f'''Given an input text from a scientific article identify relevant information about the text.
            You can make use of the following major fields of study: {major_fields_of_study_str}
            You can also make use of the following sub areas within each major field of study listed above: {sub_areas_within_major_field_of_study_str}
         '''
prompt = ChatPromptTemplate.from_messages(
    [("system", system), ("human", "{input}"),]
)

In [89]:
#construct structured LLMs from input LLMs
structured_llms = list(map(lambda x:x.with_structured_output(article_fos_dict_schema),llms))
structured_article_fos_chains = list(map(lambda x:prompt | x,structured_llms))
structured_article_fos_chains

[ChatPromptTemplate(input_variables=['input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template="Given an input text from a scientific article identify relevant information about the text.\n            You can make use of the following major fields of study: Art,Biology,Business,Chemistry,Computer Science,Economics,Engineering,Environmental Science,Geography,Geology,History,Materials Science,Mathematics,Medicine,Philosophy,Physics,Political Science,Psychology,Sociology\n            You can also make use of the following sub areas within each major field of study listed above: Art:['Aesthetics, Art History, Classics, Humanities, Literature, Visual Arts']\nBiology:['Anatomy, Animal Science, Bioinformatics, Botany, Genetics, Immunology, Zoology']\nBusiness:['Accounting, Actuarial Science, Commerce, Finance, International Trade, Marketing']\nChemistry:['Biochemistry, Food Science, Mineralogy, Organic Chemistry, Radiochemistry']\nComputer Science:['Al

In [None]:
def get_max_repeated_pred(input_df):
    input_df['preds'] = None
    input_df['majority_model'] = 'baseline'
    input_df['majority_col'] = None
    for index, row in input_df.iterrows():
        combined_preds_max = [0] * len(row['tokens'])
        majority_model_prediction = ['baseline'] * len(row['tokens'])
        majority_col_val = [0] * len(row['tokens'])
    
        for i in range(len(row['tokens'])):
            preds_data = [row['roberta_preds'][i], row['scibert_preds'][i], row['deberta_preds'][i],
                              row['biomed_roberta_preds'][i], row['cs_roberta_preds'][i]]
            
            max_repeated = statistics.multimode(preds_data)
            if len(max_repeated) != 1:
                # Weighted avg
                combined_preds_max[i] = random.choices(
                    preds_data, weights=[model_weights['roberta'], model_weights['scibert'],
                                         model_weights['deberta'], model_weights['biomed_roberta'], 
                                         model_weights['cs_roberta']],
                    k=1)[0]
                if combined_preds_max[i] == row['deberta_preds'][i]:
                    majority_model_prediction[i] = 'deberta'
                elif combined_preds_max[i] == row['biomed_roberta_preds'][i]:
                    majority_model_prediction[i] = 'biomed_roberta'
                elif combined_preds_max[i] == row['roberta_preds'][i]:
                    majority_model_prediction[i] = 'roberta'
                elif combined_preds_max[i] == row['cs_roberta_preds'][i]:
                    majority_model_prediction[i] = 'cs_roberta'
                else:
                    majority_model_prediction[i] = 'scibert'
                majority_col_val[i] = 0
            else:
                combined_preds_max[i] = max_repeated[0]
                if (row['deberta_preds'][i] == row['biomed_roberta_preds'][i]) and \
                (row['roberta_preds'][i] == row['biomed_roberta_preds'][i]) and \
                (row['roberta_preds'][i] == row['scibert_preds'][i]) and \
                (row['deberta_preds'][i] == row['scibert_preds'][i]) and \
                (row['cs_roberta_preds'][i] == row['scibert_preds'][i]):
                    majority_model_prediction[i] = 'all'
                else:
                    majority_model_prediction[i] = 'majority'
                majority_col_val[i] = 1
                    
        input_df.at[index,'preds'] = combined_preds_max
        input_df.at[index, 'majority_model'] = majority_model_prediction
        input_df.at[index, 'majority_col'] = majority_col_val
        
    return input_df

### Function calls to make the predictions and save the parquet file

In [62]:
%%time
merged_model_dev_predictions = merge_model_predictions()

Data shape after merging with roberta model (5000, 3)
Data shape after merging with scibert model (5000, 4)
Data shape after merging with deberta model (5000, 5)
Data shape after merging with biomed_roberta model (5000, 6)
Data shape after merging with cs_roberta model (5000, 7)
CPU times: user 2min 1s, sys: 2.37 s, total: 2min 4s
Wall time: 2min 6s


In [65]:
merged_model_dev_predictions.head()

Unnamed: 0_level_0,text,tokens,roberta_preds,scibert_preds,deberta_preds,biomed_roberta_preds,cs_roberta_preds
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12313,Phylogenetic networks are a generalization of ...,"[Phylogenetic, networks, are, a, generalizatio...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3172,Prediction modelling is more closely aligned w...,"[Prediction, modelling, is, more, closely, ali...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6451,The heat transfer exhibits the flow of heat (t...,"[The, heat, transfer, exhibits, the, flow, of,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4351,a common experience during superficial ultraso...,"[a, common, experience, during, superficial, u...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
22694,Code metadata Current code version v1.5.9 Perm...,"[Code, metadata, Current, code, version, v1.5....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [93]:
%%time
input_text = merged_model_dev_predictions['text'].tolist()
#outputs = list(map(lambda x:x.invoke({"input":input_text}),structured_article_fos_chains))
chain = structured_article_fos_chains[0]
chain.invoke(input_text[0])

CPU times: user 15 ms, sys: 43 µs, total: 15.1 ms
Wall time: 2.63 s


{'major_field_of_study': 'Biology',
 'sub_areas_within_major_field_of_study': ['Genetics'],
 'allied_field_of_study': ['Computer Science', 'Mathematics']}

In [96]:
def populate_field_of_study(row):
    input_text = row['text']
    try:
        result = chain.invoke(input_text)
    except RateLimitError as e:
        delay = 30
        print(f"Rate Limit Error Encountered, sleeping for {delay} seconds")
        result = chain.invoke(input_text)
        time.sleep(delay)
    return result            

In [95]:
%%time
merged_model_dev_predictions[['major_field_of_study','sub_areas_within_major_field_of_study','allied_fields_of_study']] = merged_model_dev_predictions.apply(lambda x:populate_field_of_study(x),axis=1,result_type='expand')

NameError: name 'RateLimitError' is not defined

In [None]:
%%time
majority_vote_df = get_max_repeated_pred(merged_model_dev_predictions)
majority_vote_df.head()

In [None]:
majority_vote_df[['preds']].to_parquet('predictions_four_models.parquet')

In [None]:
majority_vote_df.head()

# End here

### Finding the stats about the majority column

In [None]:
def get_model_stats(row):
    stats_dict = row['stats']
    out_dict = {}
    out_keys =  ['roberta', 'scibert', 'deberta', 'biomed_roberta',  'all']
    out_dict = {key:stats_dict[key] if key in list(stats_dict.keys()) else 0 for key in out_keys}
    
    return out_dict

In [None]:
majority_vote_df['stats'] = majority_vote_df[['majority_model']].map(lambda x: Counter(x))
# majority_vote_df.head()

In [None]:
stats_df = majority_vote_df[['tokens', 'stats', 'majority_col']]
stats_df.head()

In [None]:
stats_df['len_tokens'] = stats_df['tokens'].map(lambda x : len(x))
stats_df.drop(columns=['tokens'], inplace=True)

In [None]:
stats_df['sum_majority'] = stats_df['majority_col'].map(lambda x : sum(x))

In [None]:
stats_df['sum_majority_not_all'] = stats_df['sum_majority'] - stats_df['all']
stats_df.head()

In [None]:
stats_df['pct_majority'] = 100 * stats_df['sum_majority']/stats_df['len_tokens']

In [None]:
stats_df[['roberta', 'scibert', 'deberta', 'biomed_roberta',  'all']] = stats_df.apply(
    lambda x : get_model_stats(x), axis=1, result_type='expand')

In [None]:
stats_df.describe()

In [None]:
stats_df['pct_majority'].hist()

In [None]:
stats_df['roberta'].hist()

In [None]:
stats_df['scibert'].hist()

In [None]:
stats_df['deberta'].hist()

In [None]:
stats_df['biomed_roberta'].hist()

In [None]:
stats_df['all'].hist()

# For local runs

In [None]:
base_folder = "/Users/gayatri/Documents/Gayatri/US/Self projects/AI Competition/DAGPAP24/data"

In [None]:
dev_df = pd.read_parquet(base_folder + os.sep + 'dev_data.parquet', engine="fastparquet")
print(dev_df.shape)
dev_df.head()

In [None]:
dev_df.columns

In [None]:
merged = dev_df.copy(deep=True)

for model in contesting_models:
    model_df = pd.read_parquet(base_folder + os.sep + f'predictions_{model}.parquet', engine="fastparquet")
    model_df.rename(columns={'preds': f'{model}_preds'}, inplace=True)

    merged = merged.merge(model_df, how='inner', left_index=True, right_index=True)
    print(f"Data shape after merging with {model} model {merged.shape}")

In [None]:
merged.head()

In [None]:
%%time
max_pred_df = get_max_repeated_pred(merged)
print(max_pred_df.shape)
max_pred_df.isna().sum()

In [None]:
0.88/(0.87+0.88+0.89)

In [None]:
max_pred_df[['preds']].to_parquet(base_folder + os.sep + 'predictions_three_models_combined.parquet') # , engine="fastparquet")

In [None]:
merged.head()

In [None]:
merged.to_csv(base_folder + os.sep + 'dev_majority_model_preds.csv')

In [98]:
merged.to_parquet(base_folder + os.sep + 'dev_majority_model_preds.parquet')