In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
################################
# lm-eval-harness (evaulator.py)
################################
import argparse
import json
import logging
import os
import re
import sys
from functools import partial
from pathlib import Path
from typing import Union

import numpy as np

from lm_eval import evaluator, utils
from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.logging_utils import WandbLogger
from lm_eval.tasks import TaskManager, include_path, initialize_tasks
from lm_eval.utils import make_table, simple_parse_args_string

################################
# Own modules
################################
from utils.uploader import get_ConfigurableTask 
from utils.sql import create_dataset_table, register_task, create_task_table, checked_task

# import the PostgreSQL client for Python
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT


## ARGS

In [22]:
def parse_eval_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "--tasks",
        "-t",
        default=None,
        metavar="task1,task2",
        help="To get full list of tasks, use the command lm-eval --tasks list",
    )
    parser.add_argument(
        "--dbname",
        type=str,
        default="postgres",
        help="Name of the database",
    )
    parser.add_argument(
        "--user",
        type=str,
        default="postgres",
        help="Name of the user",
    )
    parser.add_argument(
        "--password",
        type=str,
        default="password",
        help="Password for the user",
    )
    parser.add_argument(
        "--host",
        type=str,
        default="localhost",
        help="Host name",
    )
    parser.add_argument(
        "--port",
        type=str,
        default="5432",
        help="Port number",
    )
    parser.add_argument(
        "--include_path",
        type=str,
        default=None,
        metavar="DIR",
        help="Additional path to include if there are external tasks to include.",
    )    
    parser.add_argument(
        "--verbosity",
        "-v",
        type=str.upper,
        default="INFO",
        metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
        help="Controls the reported logging error level. Set to DEBUG when testing + adding new task configurations for comprehensive log output.",
    )        
    return parser.parse_args()

In [23]:
tasks="arc_challenge"
dbname="test_db"
user="root"
password="root"
host="localhost"
port="5432"
include_path=None
verbosity="DEBUG"
# create manually args
args = argparse.Namespace(
    tasks=tasks,
    dbname=dbname,
    user=user,
    password=password,
    host=host,
    port=port,
    include_path=include_path,
    verbosity=verbosity
)

In [24]:
eval_logger = utils.eval_logger
eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
eval_logger.info(f"Verbosity set to {args.verbosity}")

2024-03-14:21:35:25,685 INFO     [3408151750.py:3] Verbosity set to DEBUG


In [25]:
initialize_tasks(args.verbosity)
task_manager = TaskManager(args.verbosity, include_path=args.include_path)

2024-03-14:21:35:25,709 INFO     [__init__.py:373] lm_eval.tasks.initialize_tasks() is deprecated and no longer necessary. It will be removed in v0.4.2 release. TaskManager will instead be used.


In [26]:
if args.include_path is not None:
    eval_logger.info(f"Including path: {args.include_path}")
    include_path(args.include_path)

In [27]:
if args.tasks is None:
    eval_logger.error("Need to specify task to evaluate.")
    sys.exit()
elif args.tasks == "list":
    eval_logger.info(
        "Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks))
    )
    sys.exit()
else:
    if os.path.isdir(args.tasks):
        import glob

        task_names = []
        yaml_path = os.path.join(args.tasks, "*.yaml")
        for yaml_file in glob.glob(yaml_path):
            config = utils.load_yaml_config(yaml_file)
            task_names.append(config)
    else:
        task_list = args.tasks.split(",")
        task_names = task_manager.match_tasks(task_list)
        for task in [task for task in task_list if task not in task_names]:
            if os.path.isfile(task):
                config = utils.load_yaml_config(task)
                task_names.append(config)
        task_missing = [
            task for task in task_list if task not in task_names and "*" not in task
        ]  # we don't want errors if a wildcard ("*") task name was used

        if task_missing:
            missing = ", ".join(task_missing)
            eval_logger.error(
                f"Tasks were not found: {missing}\n"
                f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
            )
            raise ValueError(
                f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues."
            )

In [28]:
task_dict = get_ConfigurableTask(
    tasks=task_names,
    num_fewshot=None,
    check_integrity=False,
    gen_kwargs=None,
    task_manager= None,
    verbosity= "INFO",
    predict_only= False,    
)

2024-03-14:21:35:29,845 INFO     [uploader.py:87] get_task_dict has been updated to accept an optional argument, `task_manager`Read more here:https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage


In [33]:
# check and connect to the database
try:
    conn = psycopg2.connect(
        dbname=args.dbname,
        user=args.user,
        password=args.password,
        host=args.host,
        port=args.port
    )
    print("Connected to the database")
    # Obtain a DB Cursor
    cursor = conn.cursor()
except Exception as e:
    eval_logger.error("Unable to connect to the database")
    exit(-1)

Connected to the database


In [34]:
create_task_table(connection=conn)

In [36]:
for t in task_dict:
    task_name_i = t
    print(f"task_name_i: {task_name_i}")
    dataset_path = task_dict[t].config.dataset_path
    dataset_name = task_dict[t].config.dataset_name
    table_name = dataset_path + "--" + dataset_name if dataset_name else dataset_path
    print(f"table_name: {table_name}")
    data = task_dict[t].dataset
    print(f"data: {data}")
    sample = data['train'][0]
    print(f"sample: {sample}")
    # check if the task is already registered
    if not checked_task(task_name_i, connection= conn):
        # Register task
        try:
            # Create dataset table
            create_dataset_table(table_name = table_name, 
                                data = data, 
                                connection = conn)
            # Regist task/dataset pair
            register_task(task_name = task_name_i, 
                        dataset_table_name = table_name,
                        connection = conn)
            eval_logger.info(f"Task {task_name_i} registered successfully")
        except Exception as e:
            eval_logger.error(f"Error: {e}")
#            conn.rollback()
#            cursor.close()
#            conn.close()
            exit(-1)
    else:
        eval_logger.info(f"Task {task_name_i} already registered")

task_name_i: arc_challenge
table_name: allenai/ai2_arc--ARC-Challenge
data: DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 1119
    })
    test: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 1172
    })
    validation: Dataset({
        features: ['id', 'question', 'choices', 'answerKey'],
        num_rows: 299
    })
})
sample: {'id': 'Mercury_SC_415702', 'question': 'George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?', 'choices': {'text': ['dry palms', 'wet palms', 'palms covered with oil', 'palms covered with lotion'], 'label': ['A', 'B', 'C', 'D']}, 'answerKey': 'A'}
Creating table allenai/ai2_arc--ARC-Challenge
columns: {'__id': 'INTEGER', '__split': 'TEXT', 'id': 'TEXT', 'question': 'TEXT', 'choices': 'JSON', 'answerKey': 'TEXT'}
column_definitions: [Composed([Identifier('__id'), SQL(' '), SQL('INTEGER')]), Composed([I

2024-03-14:21:40:23,417 INFO     [3791980201.py:24] Task arc_challenge registered successfully


Row inserted
Inserting row 2112 in split test
current_row: {'id': 'MCAS_2014_8_16', 'question': 'Just before sunset, a student recorded the temperature of the air and the soil at the same location. The student found that the air and the soil had the same temperature. Which of the following statements describes how the air and soil temperatures most likely changed after sunset on that day?', 'choices': {'text': ['The soil temperature increased more quickly than the air temperature.', 'The air temperature increased more quickly than the soil temperature.', 'The soil temperature decreased more quickly than the air temperature.', 'The air temperature decreased more quickly than the soil temperature.'], 'label': ['A', 'B', 'C', 'D']}, 'answerKey': 'D', '__id': 2112, '__split': 'test'}
Row inserted
Inserting row 2113 in split test
current_row: {'id': 'Mercury_7138460', 'question': 'Reef-building corals work together with photosynthetic algae that live in their tissues. Which type of energy i