diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 4f4fd79..9838595 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -20,7 +20,7 @@ jobs:
       uses: actions/checkout@v2
     - uses: actions/setup-python@v5
       with:
-        python-version: '3.9'
+        python-version: '3.10'
     - name: Create dummy keys.cfg
       run: touch keys.cfg
     - name: Install uv
diff --git a/.gitignore b/.gitignore
index e2abe2b..91edc7b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,10 +2,13 @@
 keys.cfg
 **/test_result/**
 **/output/**
-**/eval_results/**
+**/eval_results*/**
 eval/logs/**
 *.h5
-
+logs/**
+**/logs/**
+**/tmp/**
+integration/**


 # -------
diff --git a/README.md b/README.md
index 2e02133..0bbe231 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,8 @@ This repo contains the evaluation code for the paper "[SciCode: A Research Codin
 ## 🔔News
 
+**[2025-01-24]: SciCode has been integrated with [`inspect_ai`](https://inspect.ai-safety-institute.org.uk/) for easier and faster model evaluations.**
+
 **[2024-11-04]: Leaderboard is on! Check [here](https://scicode-bench.github.io/leaderboard/). We have also added Claude Sonnet 3.5 (new) results.**
 
 **[2024-10-01]: We have added OpenAI o1-mini and o1-preview results.**
 
@@ -54,6 +56,19 @@ SciCode sources challenging and realistic research-level coding problems across
 4. Run `eval/scripts/gencode_json.py` to generate new model outputs (see the [`eval/scripts` readme](eval/scripts/)) for more information
 5. Run `eval/scripts/test_generated_code.py` to evaluate the unittests
+
+## Instructions to evaluate a new model using `inspect_ai` (recommended)
+
+SciCode has been integrated with `inspect_ai` for easier and faster model evaluation compared with the methods above. You need to run the first three steps in the [above section](#instructions-to-evaluate-a-new-model), then go to the `eval/inspect_ai` directory, set up the corresponding API key, and run the following command:
+
+```bash
+cd eval/inspect_ai
+export OPENAI_API_KEY=your-openai-api-key
+inspect eval scicode.py --model openai/gpt-4o --temperature 0
+```
+
+For more detailed information on using `inspect_ai`, see the [`eval/inspect_ai` readme](eval/inspect_ai/).
+
 
 ## More information and FAQ
 
 More information, including a [FAQ section](https://scicode-bench.github.io/faq/), is provided on our [website](https://scicode-bench.github.io/).
diff --git a/eval/inspect_ai/README.md b/eval/inspect_ai/README.md
new file mode 100644
index 0000000..5e3bebf
--- /dev/null
+++ b/eval/inspect_ai/README.md
@@ -0,0 +1,40 @@
+## **SciCode Evaluation using `inspect_ai`**
+
+### 1. Set Up Your API Keys
+
+Users can follow [`inspect_ai`'s official documentation](https://inspect.ai-safety-institute.org.uk/#getting-started) to set up the corresponding API keys, depending on the types of models they would like to evaluate.
+
+### 2. Set Up Command Line Arguments if Needed
+
+In most cases, once the API key is set up, users can directly start the SciCode evaluation with the following command:
+
+```bash
+inspect eval scicode.py --model <your_model> --temperature 0
+```
+
+However, there are some additional command line arguments that could be useful as well:
+
+- `--max_connections`: Maximum number of concurrent API connections to the evaluated model.
+- `--limit`: Limits the number of samples to evaluate from the SciCode dataset.
+- `-T input_path=<input_json_path>`: This is useful when the user wants to switch to another JSON dataset (e.g., the dev set).
+- `-T output_dir=<output_dir>`: This changes the default output directory (`./tmp`).
+- `-T with_background=True/False`: Whether to include the problem background.
+- `-T mode=normal/gold/dummy`: Besides the standard `normal` mode, this provides two additional modes (`gold` and `dummy`) for sanity checks.
+  - `normal` mode is the standard mode for evaluating a model
+  - `gold` mode loads the gold answers; it can only be used on the dev set
+  - `dummy` mode does not call any real LLMs and generates dummy outputs
+
+For example, users can run five samples on the dev set, with background and in `gold` mode, as follows:
+
+```bash
+inspect eval scicode.py \
+    --model openai/gpt-4o \
+    --temperature 0 \
+    --limit 5 \
+    -T input_path=../data/problems_dev.jsonl \
+    -T output_dir=./tmp/dev \
+    -T with_background=True \
+    -T mode=gold
+```
+
+For more information regarding `inspect_ai`, we refer users to its [official documentation](https://inspect.ai-safety-institute.org.uk/).
\ No newline at end of file
diff --git a/eval/inspect_ai/scicode.py b/eval/inspect_ai/scicode.py
new file mode 100644
index 0000000..dc4d744
--- /dev/null
+++ b/eval/inspect_ai/scicode.py
@@ -0,0 +1,415 @@
+import copy
+import time
+import shutil
+import subprocess
+from typing import Any
+from pathlib import Path
+from inspect_ai import Task, task
+from inspect_ai.dataset import json_dataset, Sample
+from inspect_ai.solver import solver, TaskState, Generate
+from inspect_ai.scorer import scorer, mean, metric, Metric, Score, Target
+from scicode.parse.parse import extract_function_name, get_function_from_code
+from scicode.gen.models import generate_dummy_response, extract_python_script
+
+BACKGOUND_PROMPT_TEMPLATE = Path("../data", "multistep_template.txt").read_text()
+DEFAULT_PROMPT_TEMPLATE = Path("../data", "background_comment_template.txt").read_text()
+
+class ScicodePromptingAssistant:
+    def __init__(
+        self,
+        output_dir: Path,
+        prompt_dir: Path,
+        with_background: bool,
+    ):
+        self.output_dir = output_dir
+        self.prompt_dir = prompt_dir
+        self.with_background = with_background
+        self.previous_llm_code = []
+
+    def _get_background_dir(self):
+        return "with_background" if self.with_background else "without_background"
+
+    def register_previous_response(
+        self,
+        prob_data: dict,
+        response: str,
+        previous_code: str,
+        num_steps: int,
+    ):
+        self.previous_llm_code[num_steps - 1] = extract_python_script(response)
+        self.save_response_with_steps(
+            prob_data,
+            response,
+            previous_code,
+            num_steps,
+        )
+
+    def save_response_with_steps(
+        self,
+        prob_data: dict,
+        response: str,
+        previous_code: str,
+        num_steps: int
+    ) -> None:
+        output_dir = Path(
+            self.output_dir,
+            self._get_background_dir()
+        )
+        output_dir.mkdir(parents=True, exist_ok=True)
+        prob_id = prob_data["problem_id"]
+        output_file_path = output_dir / f"{prob_id}.{num_steps}.py"
+        python_code = extract_python_script(response)
+        output_file_path.write_text(f'{previous_code}\n{python_code}', encoding="utf-8")
+
+    @staticmethod
+    def process_problem_code(
+        prob_data: dict,
+        num_steps: int
+    ) -> str:
+        header_docstring = prob_data['sub_steps'][num_steps - 1]['function_header']
+        return_str = prob_data['sub_steps'][num_steps - 1]['return_line']
+        string = f"{header_docstring}\n\n{return_str}"
+        return string
+
+    def process_problem_steps(
+        self,
+        problem_data: dict,
+        num_steps: int
+    ):
+        """Process problem data and return previous steps and next steps"""
+        output_lines = []
+        next_step = []
+        previous_code = []
+        for i in range(num_steps - 1):
+            output_lines.append(problem_data["sub_steps"][i]["step_description_prompt"] + '\n' +
+                                problem_data["sub_steps"][i]["step_background"] if self.with_background
+                                else problem_data["sub_steps"][i]["step_description_prompt"])
+            output_lines.append(self.previous_llm_code[i])
+            previous_code.append(self.previous_llm_code[i])
+            output_lines.append("------")
+
+        next_step.append(problem_data["sub_steps"][num_steps - 1]["step_description_prompt"] + '\n' +
+                         problem_data["sub_steps"][num_steps - 1]["step_background"] if self.with_background
+                         else problem_data["sub_steps"][num_steps - 1]["step_description_prompt"])
+        next_step.append(self.process_problem_code(problem_data, num_steps))
+        output_str = "\n\n".join(output_lines[:-1])  # Remove the last "------"
+        next_step_str = "\n\n".join(next_step)
+        previous_code_str = "\n".join(previous_code)
+        return output_str, next_step_str, previous_code_str
+
+    def generate_prompt_with_steps(
+        self,
+        prob_data: dict,
+        num_steps: int,
+        prompt_template=DEFAULT_PROMPT_TEMPLATE,
+    ):
+        # parse the input file and extract the content
+        problem_steps_str, next_step_str, previous_code_str = self.process_problem_steps(prob_data, num_steps)
+        dependencies = prob_data["required_dependencies"]
+        assert next_step_str
+        return prompt_template.format(
+            problem_steps_str=problem_steps_str,
+            next_step_str=next_step_str,
+            dependencies=dependencies,
+        ), f'{dependencies}\n{previous_code_str}\n'
+
+    def save_prompt_with_steps(
+        self,
+        prob_data: dict,
+        prompt: str,
+        num_steps: int
+    ) -> None:
+        output_dir = Path(
+            self.prompt_dir,
+            self._get_background_dir()
+        )
+        output_dir.mkdir(parents=True, exist_ok=True)
+        output_file_path = output_dir / f"{prob_data['problem_id']}.{num_steps}.txt"
+        output_file_path.write_text(prompt, encoding="utf-8")
+
+    def prepare_final_prompt_with_steps(
+        self,
+        prob_data: dict,
+        num_steps: int,
+        tot_steps: int,
+        prompt_template=DEFAULT_PROMPT_TEMPLATE,
+        *,
+        save: bool = True
+    ):
+        prob_id = prob_data["problem_id"]
+        if num_steps == 1:
+            self.previous_llm_code = [None] * tot_steps
+        else:
+            if len(self.previous_llm_code) != tot_steps:
+                self.previous_llm_code = [None] * tot_steps
+            for prev_step in range(num_steps - 1):
+                if self.previous_llm_code[prev_step] is None:
+                    if (
+                        (prob_id == "13" and prev_step == 5) or
+                        (prob_id == "62" and prev_step == 0) or
+                        (prob_id == "76" and prev_step == 2)
+                    ):
+                        prev_file_path = Path(
+                            "../data",
+                            f"{prob_id}.{prev_step+1}.txt"
+                        )
+                    else:
+                        prev_file_path = Path(
+                            self.output_dir,
+                            self._get_background_dir(),
+                            f"{prob_id}.{prev_step + 1}.py"
+                        )
+                    if prev_file_path.is_file():
+                        prev_file_content = prev_file_path.read_text(encoding='utf-8')
+                        func_name = extract_function_name(
+                            prob_data["sub_steps"][prev_step]["function_header"]
+                        )
+                        function_code = get_function_from_code(
+                            prev_file_content, func_name
+                        )
+                        self.previous_llm_code[prev_step] = function_code
+                    else:
+                        raise Exception(f'Generating problem {prob_id} step {num_steps} ahead of step {prev_step + 1}.')
+
+        prompt, previous_code = self.generate_prompt_with_steps(
+            prob_data,
+            num_steps,
+            prompt_template,
+        )
+        if save:
+            self.save_prompt_with_steps(
+                prob_data,
+                prompt,
+                num_steps,
+            )
+        return prompt, previous_code
+
+class ScicodeEvaluator:
+    def __init__(
+        self,
+        h5py_file: str,
+        code_dir: Path,
+        log_dir: Path,
+        with_background: bool,
+    ):
+        self.h5py_file = h5py_file
+        self.code_dir = code_dir
+        self.log_dir = log_dir
+        self.with_background = with_background
+
+    def _get_background_dir(self):
+        return "with_background" if self.with_background else "without_background"
+
+    def test_code(
+        self,
+        prob_data: dict,
+    ):
+        code_dir = Path(
+            self.code_dir,
"generated_code", + self._get_background_dir() + ) + tmp_dir = Path(f'tmp_{time.time()}') + tmp_dir.mkdir(parents=True, exist_ok=True) + + sub_steps = prob_data["sub_steps"] + problem_id = prob_data["problem_id"] + for idx in range(len(sub_steps)): + if ( + (problem_id == "13" and idx == 5) or + (problem_id == "62" and idx == 0) or + (problem_id == "76" and idx == 2) + ): + continue + step_id = sub_steps[idx]["step_number"] + code_file_path = Path(code_dir, f"{step_id}.py") + assert code_file_path.is_file(), f"Code file {code_file_path} not found." + code_content = code_file_path.read_text(encoding='utf-8') + test_lst = sub_steps[idx]["test_cases"] + assert_file = Path(tmp_dir, f'{step_id}.py') + with open(assert_file, 'w', encoding='utf-8') as f: + f.write(code_content) + f.write(f""" + +from scicode.parse.parse import process_hdf5_to_tuple + +""") + f.write(f"targets = process_hdf5_to_tuple('{step_id}', {len(test_lst)}, '{self.h5py_file}')" + '\n') + for i in range(len(test_lst)): + f.write(f"target = targets[{i}]\n\n") + for line in test_lst[i].split('\n'): + f.write(line + '\n') + + def run_script(script_path): + try: + subprocess.run(['python', script_path], check=True, capture_output=True, + text=True, timeout=1800) + return 0 + except subprocess.CalledProcessError: + return 1 + except subprocess.TimeoutExpired: + return 2 + + total_steps = len(sub_steps) + total_correct = 0 + for idx in range(len(sub_steps)): + if ( + (problem_id == "13" and idx == 5) or + (problem_id == "62" and idx == 0) or + (problem_id == "76" and idx == 2) + ): + continue + step_id = sub_steps[idx]["step_number"] + script_path = Path(tmp_dir, f'{step_id}.py') + logs_dir = Path( + self.log_dir, + "evaluation_logs", + self._get_background_dir() + ) + logs_dir.mkdir(parents=True, exist_ok=True) + logs_file = Path( + logs_dir, + f"{step_id}.log" + ) + if logs_file.is_file(): + with open(logs_file, 'r') as f: + content = f.read().splitlines() + if content[0] == 'pass': + total_correct += 1 + continue + ret = run_script(script_path) + if ret == 0: + with open(logs_file, 'w') as f: + f.write('pass') + total_correct += 1 + elif ret == 1: + with open(logs_file, 'w') as f: + f.write('fail') + else: + with open(logs_file, 'w') as f: + f.write('time out') + + shutil.rmtree(tmp_dir) + problem_correct = 1 if total_correct == total_steps else 0 + return problem_correct, total_correct, total_steps + +def record_to_sample(record): + return Sample( + input="problem_id", + target=record["problem_id"], + id=record["problem_id"], + metadata={ + k: v for k, v in record.items() + } + ) + +def generate_gold_response(prob_data: dict, num_steps: int): + return f"Blah blah\n```python\n{prob_data['sub_steps'][num_steps - 1]['ground_truth_code']}\n```\n" + +@solver +def scicode_solver(**params: dict[str, Any]): + async def solve(state: TaskState, generate: Generate) -> TaskState: + model_name = str(state.model).replace("/", "-") + prompt_assistant = ScicodePromptingAssistant( + output_dir=Path(params["output_dir"], model_name, "generated_code"), + prompt_dir=Path(params["output_dir"], model_name, "prompt"), + with_background=params["with_background"], + ) + prompt_template = BACKGOUND_PROMPT_TEMPLATE if params["with_background"] else DEFAULT_PROMPT_TEMPLATE + sub_steps = state.metadata["sub_steps"] + for idx in range(len(sub_steps)): + prob_id = state.metadata["problem_id"] + if ( + (prob_id == "13" and idx == 5) or + (prob_id == "62" and idx == 0) or + (prob_id == "76" and idx == 2) + ): + continue + prompt, previous_code = 
+                prob_data=state.metadata,
+                num_steps=idx+1,
+                tot_steps=len(sub_steps),
+                prompt_template=prompt_template,
+            )
+            if params["mode"] == "dummy":
+                response_from_llm = generate_dummy_response(prompt)
+            elif params["mode"] == "gold":
+                response_from_llm = generate_gold_response(state.metadata, idx+1)
+            else:
+                # ===Model Generation===
+                state.user_prompt.text = prompt
+                state_copy = copy.deepcopy(state)
+                result = await generate(state=state_copy)
+                response_from_llm = result.output.completion
+                # ===Model Generation===
+            prompt_assistant.register_previous_response(
+                prob_data=state.metadata,
+                response=response_from_llm,
+                previous_code=previous_code,
+                num_steps=idx+1,
+            )
+        return state
+    return solve
+
+@metric
+def sub_problem_correctness() -> Metric:
+    def metric(scores: list[Score]) -> int | float:
+        total_correct = 0
+        total_steps = 0
+        for score in scores:
+            total_correct += score.value["Total Correct"]
+            total_steps += score.value["Total Steps"]
+        return total_correct / total_steps
+    return metric
+
+@scorer(
+    metrics=[{
+        "Problem Correctness": [mean()],
+    }, sub_problem_correctness()]
+)
+def scicode_scorer(**params: dict[str, Any]):
+    async def score(state: TaskState, target: Target):
+        model_name = str(state.model).replace("/", "-")
+        evaluator = ScicodeEvaluator(
+            h5py_file=params["h5py_file"],
+            code_dir=Path(params["output_dir"], model_name),
+            log_dir=Path(params["output_dir"], model_name),
+            with_background=params["with_background"],
+        )
+        problem_correct, total_correct, total_steps = evaluator.test_code(state.metadata)
+        return Score(
+            value={
+                "Problem Correctness": problem_correct,
+                "Total Correct": total_correct,
+                "Total Steps": total_steps,
+            }
+        )
+    return score
+
+@task
+def scicode(
+    input_path: str = '../data/problems_all.jsonl',
+    output_dir: str = './tmp',
+    with_background: bool = False,
+    h5py_file: str = '../data/test_data.h5',
+    mode: str = 'normal',
+):
+    dataset = json_dataset(
+        input_path,
+        record_to_sample
+    )
+    return Task(
+        dataset=dataset,
+        solver=scicode_solver(
+            input_path=input_path,
+            output_dir=output_dir,
+            with_background=with_background,
+            mode=mode,
+        ),
+        scorer=scicode_scorer(
+            input_path=input_path,
+            output_dir=output_dir,
+            with_background=with_background,
+            h5py_file=h5py_file,
+        ),
+    )
diff --git a/pyproject.toml b/pyproject.toml
index a167f24..fbafa16 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,7 +15,7 @@ name = "scicode"
 dynamic = ["version",]
 description = "todo"
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 license = {file = "LICENSE"}
 keywords = ["nlp", "code"]
 authors = [
@@ -37,6 +37,7 @@ dependencies = [
     "scipy",
     "matplotlib",
     "sympy",
+    "inspect-ai",
 ]
 
 # Classifiers help users find your project by categorizing it.
diff --git a/src/scicode/parse/parse.py b/src/scicode/parse/parse.py
index d0d0c9b..9d25221 100644
--- a/src/scicode/parse/parse.py
+++ b/src/scicode/parse/parse.py
@@ -118,9 +118,9 @@ def process_hdf5_datagroup(group):
         return process_hdf5_dict(group)
 
 
-def process_hdf5_to_tuple(step_id, test_num):
+def process_hdf5_to_tuple(step_id, test_num, h5py_file=H5PY_FILE):
     data_lst = []
-    with h5py.File(H5PY_FILE, 'r') as f:
+    with h5py.File(h5py_file, 'r') as f:
        for test_id in range(test_num):
            group_path = f'{step_id}/test{test_id + 1}'
            if isinstance(f[group_path], h5py.Group):
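
For reference, the `h5py_file` parameter added to `process_hdf5_to_tuple` above is what the per-step test scripts written by `ScicodeEvaluator.test_code` pass through. Below is a minimal sketch of what one such generated script looks like: the step ID `1.1`, the function `solution_step`, and the assertions are hypothetical placeholders, while the import and the `process_hdf5_to_tuple(step_id, test_num, h5py_file)` call follow the pattern written by `test_code` (with `h5py_file` defaulting to `../data/test_data.h5` in the `scicode` task).

```python
# Hypothetical per-step test script, e.g. tmp_<timestamp>/1.1.py (sketch only).
import numpy as np


def solution_step(x):
    # Stand-in for the model-generated code that test_code copies in first.
    return np.square(x)


from scicode.parse.parse import process_hdf5_to_tuple

# Gold targets for (hypothetical) step "1.1" with 2 test cases, read from the
# HDF5 file supplied to ScicodeEvaluator via the new h5py_file parameter.
targets = process_hdf5_to_tuple('1.1', 2, '../data/test_data.h5')

target = targets[0]
assert np.allclose(solution_step(np.array([1.0, 2.0])), target)

target = targets[1]
assert np.allclose(solution_step(np.array([-3.0, 0.5])), target)
```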