In [1]:
import os
import sys
import re
import pandas as pd

from tqdm import tqdm
from typing import Any
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

# Put the parent of the current working directory on sys.path so the
# `src` package imported below can be resolved.
# NOTE(review): presumably the notebook is launched from a sub-directory one
# level below the repository root — confirm, since this depends on the cwd.
root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if root not in sys.path:
    sys.path.append(root)

from src.inference import InferenceModel

INFO 05-24 06:45:19 [__init__.py:239] Automatically detected platform cuda.


In [2]:
def process(
    data: "str | list[str]",
    target: str,
    target_values: list[str],
) -> list[str]:
    """Build one classification prompt per input record.

    Args:
        data: A batch (any iterable) of stringified records, or a single
            record passed as a bare string.
        target: Name of the field the model is asked to predict.
        target_values: Allowed answers; they are joined with " or " and
            embedded in the prompt's instruction line.

    Returns:
        A list with one prompt per record, in input order.
    """
    # A bare string is itself iterable, so mapping over it would yield one
    # prompt per character. Treat it as a batch containing a single record.
    records = [data] if isinstance(data, str) else data
    choices = " or ".join(target_values)

    def build_meta_prompt(X_data: Any) -> str:
        # NOTE(review): the instruction line ends with a stray double quote and
        # never closes the leading "**". The text is kept byte-identical here
        # because changing the prompt changes model behaviour; confirm whether
        # it should be cleaned up.
        return f"""Based on the input data, predict the value of '{target}':

{X_data}

**IMPORTANT: Respond with ONLY {choices}. Do not include any explanations, reasoning, or additional text."
"""

    return [build_meta_prompt(record) for record in records]

In [3]:
def parse(
    text: str,
    labels: tuple[str, ...] = ("DEAD", "ALIVE"),
    ignore_case: bool = False,
) -> str:
    """Extract the first class label that occurs in a model response.

    Args:
        text: Raw response text to scan.
        labels: Candidate labels, tried as regex alternatives in this order.
            Defaults to the two labels used by this notebook.
        ignore_case: When True, match labels case-insensitively and return
            the canonical spelling from ``labels``. Defaults to False, which
            keeps the strict, case-sensitive matching.

    Returns:
        The leftmost label found in ``text``, or "ERROR" when none occurs
        (or when ``labels`` is empty).
    """
    if not labels:
        # An empty alternation would match the empty string everywhere.
        return "ERROR"

    # Escape each label so it is matched literally, not as regex syntax.
    pattern = "|".join(re.escape(label) for label in labels)
    match = re.search(pattern, text, re.IGNORECASE if ignore_case else 0)
    if match is None:
        return "ERROR"

    found = match.group(0)
    if ignore_case:
        # Map e.g. "Alive" back to the canonical "ALIVE".
        for label in labels:
            if label.casefold() == found.casefold():
                return label
    return found

In [4]:
class TitanicDataset(Dataset):
    """Titanic CSV exposed as (stringified feature dict, label name) pairs.

    The CSV is read and split into train/test partitions once, at
    construction time; the instance then serves only the selected partition.
    """

    def __init__(
        self,
        path: str,
        train: bool = True,
        shuffle: bool = True,
        train_size: float = 0.8,
    ):
        """Load the CSV at ``path`` and keep one side of a train/test split.

        Args:
            path: Location of the CSV file; it must contain a "Survived"
                column.
            train: Keep the training partition when True, otherwise the
                test partition.
            shuffle: Whether rows are shuffled before splitting. The split
                is stratified on the target only when this is True.
            train_size: Forwarded to ``train_test_split`` as ``train_size``.

        Raises:
            AssertionError: If the target column is missing from the CSV.
        """
        super().__init__()
        self.target = "Survived"
        # Indexed by the target column's value in __getitem__, so value 0
        # maps to "DEAD" and value 1 maps to "ALIVE".
        self.target_values = ["DEAD", "ALIVE"]

        df = pd.read_csv(path)

        assert (
            self.target in df.columns
        ), f"Target column '{self.target}' not found in the dataframe."

        X = df.drop(columns=[self.target])
        y = df[self.target]
        self.feature_labels = X.columns.tolist()

        # train_test_split rejects stratify together with shuffle=False, so
        # only stratify when shuffling; otherwise shuffle=False would always
        # raise. The fixed random_state keeps the split identical between
        # the train=True and train=False instances.
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            train_size=train_size,
            random_state=42,
            shuffle=shuffle,
            stratify=y if shuffle else None,
        )

        self.X = X_train if train else X_test
        self.y = y_train if train else y_test

    def __len__(self):
        """Return the number of rows in the selected partition."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the row at position ``idx``.

        ``features`` is the ``str()`` of the row's column-to-value dict and
        ``label`` is the entry of ``target_values`` selected by the row's
        target value.
        """
        # NOTE(review): assumes the target column holds integer 0/1 values
        # usable as list indices — confirm against the CSV.
        X_dict = str(self.X.iloc[idx].to_dict())
        y_value = self.target_values[self.y.iloc[idx].item()]
        return X_dict, y_value

In [15]:
# Repeatedly score the model on the held-out (train=False) partition.
dataset = TitanicDataset("../dataset/titanic-dataset.csv", train=False)
# shuffle=False keeps the record order identical across passes.
data_loader = DataLoader(dataset, batch_size=32, shuffle=False)
inference_model = InferenceModel("localhost", port=23456)
# Number of repeated evaluation passes over the same partition; each pass
# only runs inference, nothing is trained here.
num_epochs = 20

# One (predictions, true_labels) tuple per pass.
evaluation_results = []

for epoch in range(num_epochs):
    all_predictions = []
    all_true_labels = []

    for X, y in tqdm(data_loader):
        # Take the target name and labels from the dataset rather than
        # repeating the literals, so the prompt cannot drift from the labels
        # the ground truth is expressed in.
        prompts = process(X, dataset.target, dataset.target_values)
        # NOTE(review): assumes generate() returns one response string per
        # prompt, in order — confirm against src.inference.InferenceModel.
        responses = inference_model.generate(prompts, {"n": 1, "max_new_tokens": 32})
        y_preds = [parse(response) for response in responses]

        all_predictions.extend(y_preds)
        all_true_labels.extend(y)

    evaluation_results.append((all_predictions, all_true_labels))

100%|██████████| 6/6 [00:08<00:00,  1.45s/it]
100%|██████████| 6/6 [00:08<00:00,  1.45s/it]
100%|██████████| 6/6 [00:08<00:00,  1.45s/it]
100%|██████████| 6/6 [00:08<00:00,  1.45s/it]
100%|██████████| 6/6 [00:08<00:00,  1.45s/it]
100%|██████████| 6/6 [00:08<00:00,  1.45s/it]
100%|██████████| 6/6 [00:08<00:00,  1.45s/it]
100%|██████████| 6/6 [00:08<00:00,  1.45s/it]
100%|██████████| 6/6 [00:08<00:00,  1.46s/it]
100%|██████████| 6/6 [00:08<00:00,  1.46s/it]
100%|██████████| 6/6 [00:08<00:00,  1.46s/it]
100%|██████████| 6/6 [00:08<00:00,  1.46s/it]
100%|██████████| 6/6 [00:08<00:00,  1.46s/it]
100%|██████████| 6/6 [00:08<00:00,  1.46s/it]
100%|██████████| 6/6 [00:08<00:00,  1.46s/it]
100%|██████████| 6/6 [00:08<00:00,  1.47s/it]
100%|██████████| 6/6 [00:08<00:00,  1.47s/it]
100%|██████████| 6/6 [00:08<00:00,  1.46s/it]
100%|██████████| 6/6 [00:08<00:00,  1.46s/it]
100%|██████████| 6/6 [00:08<00:00,  1.46s/it]


In [16]:
# NOTE(review): this import sits outside the notebook's first import cell;
# consider moving it there so all dependencies are visible up front.
from src.utils import evaluate_multiple_runs

# evaluation_results holds one (predictions, true_labels) tuple per
# evaluation pass; "Performance" is passed as the second argument and
# appears as the heading of the printed summary.
evaluate_multiple_runs(evaluation_results, "Performance")

Run 1/20
Run 2/20
Run 3/20
Run 4/20
Run 5/20
Run 6/20
Run 7/20
Run 8/20
Run 9/20
Run 10/20
Run 11/20
Run 12/20
Run 13/20
Run 14/20
Run 15/20
Run 16/20
Run 17/20
Run 18/20
Run 19/20
Run 20/20

Performance - Summary Statistics across 20 runs:
Accuracy Summary:
  Mean           = 0.7009
  Std Dev        = 0.0272
  95% CI         = [0.6882, 0.7137]
  Min / Max      = 0.6564 / 0.7560
  LaTeX format   = 70.09% ± 2.72%

Precision Summary:
  Mean           = 0.7455
  Std Dev        = 0.0236
  95% CI         = [0.7345, 0.7566]
  Min / Max      = 0.6963 / 0.7844
  LaTeX format   = 74.55% ± 2.36%

Recall Summary:
  Mean           = 0.7009
  Std Dev        = 0.0272
  95% CI         = [0.6882, 0.7137]
  Min / Max      = 0.6564 / 0.7560
  LaTeX format   = 70.09% ± 2.72%

F1 score Summary:
  Mean           = 0.7028
  Std Dev        = 0.0275
  95% CI         = [0.6899, 0.7156]
  Min / Max      = 0.6565 / 0.7583
  LaTeX format   = 70.28% ± 2.75%

Error rate Summary:
  Mean           = 0.0718
  Std Dev 