# Test and Evaluation Framework for AI Reasoning

### Prompt Creation 

Load the config file available in this package. This file contains different parameter settings specific to each task. 

In [7]:
import json
import pandas as pd
# Load the task-specific parameter settings from the packaged config file.
# The encoding is pinned to UTF-8 so the JSON is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('configs/config.json', 'r', encoding='utf-8') as openfile:
    config = json.load(openfile)

Set the input and output paths and the task type. Allowed task types: "biomimicry", "DRSM", "Field of Study", "MeSH Descriptors", "SciDocs-Mesh Diseases", "SciDocs-MAG", "Nuclear".<br>
If the task type is DRSM, provide two output paths in array format (one for the gold label type and one for the standard label type).<br>
If the task type is SciDocs-Mesh Diseases or SciDocs-MAG, provide two input paths in array format (the first path containing the data and the second path containing the label and doc id mappings). 

In [2]:
from instructions import instructions
import os

# Input and output paths handed to the instructions generator.
input_dir = "./demo/inputs/sample.jsonl"
out_dir = "./demo/inputs/instructions_sample_train.jsonl"

# Task type; must be one of the allowed task types listed above.
task = "Field of Study"

# Build the generator from the paths, the task type and the loaded config.
iGen = instructions.InstructionsGenerator(
    input_dir,
    out_dir,
    task,
    config,
)

In [3]:
# Get all possible categories from the generator and show them as the
# cell output; `all_cat` is reused when generating the instruction samples.
all_cat = iGen.get_all_categories()
all_cat

['Physics',
 'Biology',
 'Geography',
 'Political science',
 'Business',
 'Psychology',
 'Materials science',
 'Engineering',
 'Linguistics',
 'Geology',
 'Art',
 'Economics',
 'Mathematics',
 'Agricultural and Food sciences',
 'Philosophy',
 'History',
 'Medicine',
 'Sociology',
 'Law',
 'Computer science',
 'Chemistry',
 'Education',
 'Environmental science']

In [None]:
# Generate the instruction samples from the full category list and show
# them as the cell output.
ins_sample = iGen.generate_instructions(all_cat)
ins_sample

In [5]:
# Save the generated instruction samples to file.
iGen.save_to_file(ins_sample)

File(s) saved successfully


### Instruction Finetuning

Import the required files

In [1]:
from finetune import finetune

Set the values of all the parameters for the training

In [2]:
## DEFAULT PARAMETERS

# Whether to train the retriever or not. Recommended: True.
train_retriever = True

# Train only the query encoder; the passage encoder remains frozen. Recommended: True.
query_side_retriever_training = True

# Name of the reader model.
reader_model_type = "google/t5-base-lm-adapt"

# Path of the pretrained model checkpoint.
model_path = "../checkpoint/step-4800/"

# List of paths to train files.
train_data = ["./demo/inputs/instructions_sample_train.jsonl"]

# List of paths to evaluation files.
eval_data = ["./demo/inputs/instructions_sample_test.jsonl"]

# Name of the experiment, i.e. the directory where the instruction tuning
# checkpoint would be saved.
name = "atlas-mlm-gen-S2ROC-220M-fos-instun"

# Parent directory in which the experiment directory is created.
checkpoint_dir = "./experiments"

# Number of finetuning steps.
train_steps = 10000

# Path to passage index and embeddings; please refer to the script
# `model/custom_train_genindex.py`.
load_index_path = "./saved_index"

# Load a subset of the index rather than the entire bulk.
# Recommended: True for low compute scenarios.
load_subset_textindex = True

# Names of the domain indexes that need to be loaded for retrieval.
subset_textindex_name = "Physics,Bio-1,Art,History,Political-Science,Business,Economics,Geology,Computer-Science,Environmental-Science,Engineering,Med-1"

# Task type. For classification use "base". For open domain QA, use "QA".
task = "base"

# Target scores type for loss evaluation.
gold_score_mode = "pdist"

# Number of queries per GPU.
# Global batch size = per_gpu_batch_size * number of GPUs.
per_gpu_batch_size = 2

Set the following to True if you want the model to use the arguments you gave earlier. Otherwise the model falls back to its default parameters.

In [3]:
# True: use the argument values set in the parameters cell above.
# False: the model falls back to its default parameters.
USE_CUSTOM_ARGS = False 

Run the instruction finetuning

In [4]:
# Launch instruction finetuning with the values set in the cells above.
# Every argument is passed positionally, so the order below must be kept.
finetune.InstructionFinetuning.run_instructions_finetune(
    train_retriever,
    query_side_retriever_training,
    reader_model_type,
    model_path,
    train_data,
    eval_data,
    name,
    checkpoint_dir,
    train_steps,
    load_index_path,
    load_subset_textindex,
    subset_textindex_name,
    task,
    gold_score_mode,
    per_gpu_batch_size,
    USE_CUSTOM_ARGS,
)

### Prompting

In [None]:
from finetune import finetune

Set the values of all the parameters for the Evaluation



In [8]:
## DEFAULT PARAMETERS

# Name of the reader model.
reader_model_type = "google/t5-base-lm-adapt"

# Path of the pretrained model checkpoint; please use the model checkpoint
# available in BDC.
model_path = ""

# List of paths to eval files.
eval_data = ["./demo/inputs/instructions_sample_test.jsonl"]

# Name of the experiment, i.e. the directory where the instruction tuning
# checkpoint would be saved.
name = "atlas-mlm-gen-S2ROC-220M-fosinstun-foseval-step9690-adapretrv2"

# Parent directory in which the experiment directory is created.
checkpoint_dir = "./experiments"

# Path to passage index and embeddings.
load_index_path = "./saved_index"

# Load a subset of the index rather than the entire bulk.
# Recommended: True for low compute scenarios.
load_subset_textindex = True

# Task type. For classification use "base". For open domain QA, use "QA".
task = "base"

# Target scores type for loss evaluation.
gold_score_mode = "pdist"

# Number of queries per GPU.
# Global batch size = per_gpu_batch_size * number of GPUs.
per_gpu_batch_size = 2

# Number of queries per GPU for domain searching.
per_gpu_batch_size_domainindex = 600

# Number of domains for each type of query.
no_sel_indices = 3

# Path of the domain search model checkpoint.
index_model_path = "./base/"

Set the following to True if you want the model to use the arguments you gave earlier. Otherwise the model falls back to its default parameters.

In [7]:
# True: use the argument values set in the parameters cell above.
# False: the model falls back to its default parameters.
USE_CUSTOM_ARGS = False 

Now run the evaluation

In [5]:
# Run generation on the evaluation data with the values set in the cells above.
# Every argument is passed positionally, so the order below must be kept.
finetune.Evaluation.run_generation(
    reader_model_type,
    model_path,
    eval_data,
    name,
    checkpoint_dir,
    load_index_path,
    load_subset_textindex,
    task,
    gold_score_mode,
    per_gpu_batch_size,
    per_gpu_batch_size_domainindex,
    no_sel_indices,
    index_model_path,
    USE_CUSTOM_ARGS,
)

### Evaluation

##### Task Performance

In [None]:
# Compute the "accuracy" metric over the generated outputs.
# paper_topic_file: please provide a file that contains the mapping between
# document to scientific domains.
finetune.Evaluation.run_metrics(
    filepath="./demo/inputs/instructions_sample_test_outputs.jsonl",
    outpath="./demo/inputs/",
    metric_name="accuracy",
    task_type="fos",
    paper_topic_file="./demo/inputs/paper_topic_map.json",
    USE_CUSTOM_ARGS=False,
)

In [None]:
# The `.jsonl` extension indicates JSON Lines (one JSON record per line), which
# pandas only parses with lines=True; without it a multi-record file fails with
# a "Trailing data" error.
df_accuracy = pd.read_json(
    "./demo/inputs/instructions_sample_test_outputs_metrics_accuracy.jsonl",
    lines=True,
)
# Show the first rows so the cell has a visible result.
df_accuracy.head()

##### Evidence Generation Performance

In [None]:
# Compute the "topic" metric over the generated outputs.
# paper_topic_file: please provide a file that contains the mapping between
# document to scientific domains. The path is kept relative to the demo inputs
# (the same file the accuracy run uses) rather than a machine-specific
# absolute path.
# outpath carries a trailing slash, matching the accuracy run and the location
# the metrics file is read back from in the next cell.
finetune.Evaluation.run_metrics(
    filepath="./demo/inputs/instructions_sample_test_outputs.jsonl",
    outpath="./demo/inputs/",
    metric_name="topic",
    task_type="fos",
    paper_topic_file="./demo/inputs/paper_topic_map.json",
    USE_CUSTOM_ARGS=False,
)

In [None]:
# The `.jsonl` extension indicates JSON Lines (one JSON record per line), which
# pandas only parses with lines=True; without it a multi-record file fails with
# a "Trailing data" error.
# NOTE(review): the file name ends in "metrics_evidence" while the metrics run
# above uses metric_name="topic" - confirm this is the file that run writes.
df_topic_relevance = pd.read_json(
    "./demo/inputs/instructions_sample_test_outputs_metrics_evidence.jsonl",
    lines=True,
)
# Show the first rows so the cell has a visible result.
df_topic_relevance.head()