# Log에서 Dataset 모으기

In [2]:
import os
import sys

# Switch the working directory to the parent folder; the relative "logs/" and
# "data/" paths used by the cells below are resolved against it.
# NOTE: this is not idempotent -- each re-run of this cell moves up one more level.
os.chdir("../")

# Add the relative "scripts" entry to the module search path.
sys.path.append('scripts')

In [3]:
import ast
import json
import re
from datetime import datetime

In [4]:
# (marker substring, result key, value-extracting regex), checked in this order;
# the first marker found in a line decides which field that line fills.
_LOG_FIELDS = (
    ("[INFO] - Question:", "Question", re.compile(r"Question: (.*)")),
    ("[INFO] - Prediction:", "Prediction", re.compile(r"Prediction: (.*)")),
    ("[INFO] - Answer:", "Answer", re.compile(r"Answer: (.*)")),
)


def _line_timestamp(line):
    """Return the leading ``[YYYY-MM-DD HH:MM:SS,fff]`` timestamp of a line, or None.

    Lines that do not start with a bracket, or whose first bracketed token is not
    a timestamp in that format, yield None instead of raising.
    """
    bracket = re.match(r"\[(.*?)\]", line)
    if bracket is None:
        return None
    try:
        return datetime.strptime(bracket.group(1), "%Y-%m-%d %H:%M:%S,%f")
    except ValueError:
        return None


def _parse_logged_literal(text):
    """Turn the textual form of a logged Python literal back into an object.

    ``ast.literal_eval`` is used instead of ``eval`` so that log content is only
    ever parsed as data and never executed as code.

    Raises:
        ValueError: if ``text`` is not a plain Python literal.
    """
    try:
        return ast.literal_eval(text.strip())
    except (ValueError, TypeError, SyntaxError) as exc:
        raise ValueError(
            f"Cannot parse logged value as a Python literal: {text!r}"
        ) from exc


def parse_log(file_path, start_time_str):
    """Collect Question / Prediction / Answer records from an evaluation log.

    Lines are ignored until one is found that carries a timestamp at or after
    ``start_time_str`` and contains "Starting Evaluate script". From that line
    on, every "[INFO] - Question:", "[INFO] - Prediction:" and "[INFO] - Answer:"
    line has the text after the label parsed as a Python literal. As soon as all
    three fields have been seen, the record is stored and a new one is started;
    a field that repeats before the record is complete overwrites the earlier
    value, and a trailing incomplete record is dropped.

    Args:
        file_path: Path of the log file to read.
        start_time_str: Earliest accepted start time, "YYYY-MM-DD HH:MM:SS".

    Returns:
        A list of dicts, each with the keys "Question", "Prediction", "Answer".

    Raises:
        ValueError: if ``start_time_str`` has the wrong format or a logged value
            is not a plain Python literal.
        OSError: if the log file cannot be opened.
    """
    start_time = datetime.strptime(start_time_str, "%Y-%m-%d %H:%M:%S")
    required_keys = ("Question", "Prediction", "Answer")

    results = []
    current_entry = {}
    parsing = False

    # Stream the file line by line rather than loading it all into memory.
    with open(file_path, "r") as log_file:
        for line in log_file:
            if not parsing:
                log_time = _line_timestamp(line)
                if (
                    log_time is None
                    or log_time < start_time
                    or "Starting Evaluate script" not in line
                ):
                    continue
                # Start marker found; once enabled, parsing stays on until EOF.
                parsing = True

            for marker, key, pattern in _LOG_FIELDS:
                if marker in line:
                    value_match = pattern.search(line)
                    if value_match:
                        current_entry[key] = _parse_logged_literal(value_match.group(1))
                    break  # only the first matching marker applies to a line

            # If all fields are collected, save the entry and reset.
            if all(key in current_entry for key in required_keys):
                results.append(current_entry)
                current_entry = {}

    return results

In [4]:
# Parse logs/eval-star-1b.log and store the collected entries as JSON.
log_path = "logs/eval-star-1b.log"
start_time = "2024-12-12 00:02:18"
# Single source of truth for the output name, so the status message below
# always reports the file that was actually written.
save_name = "multi_samples_star-1b"
parsed_data = parse_log(log_path, start_time)

with open(f"data/{save_name}.json", "w") as f:
    json.dump(parsed_data, f)

print(f"Parsing from {log_path} after {start_time}")
print(f"Parsed data saved to data/{save_name}.json")
print(f"Number of samples: {len(parsed_data)}")

Parsing from logs/eval-star-1b.log after 2024-12-12 00:02:18
Parsed data saved to data/multi_samples_star-1b.json
Number of samples: 298


In [5]:
# Parse logs/eval-star-8b.log and store the collected entries as JSON.
log_path = "logs/eval-star-8b.log"
start_time = "2024-12-12 00:00:49"
# Single source of truth for the output name: the status message previously
# reported the star-1b file although the star-8b file was written.
save_name = "multi_samples_star-8b"
parsed_data = parse_log(log_path, start_time)

with open(f"data/{save_name}.json", "w") as f:
    json.dump(parsed_data, f)

print(f"Parsing from {log_path} after {start_time}")
print(f"Parsed data saved to data/{save_name}.json")
print(f"Number of samples: {len(parsed_data)}")

Parsing from logs/eval-star-8b.log after 2024-12-12 00:00:49
Parsed data saved to data/multi_samples_star-1b.json
Number of samples: 274


In [7]:
# Parse logs/test_verifier.log and store the collected entries as JSON.
log_path, start_time, save_name = (
    "logs/test_verifier.log",
    "2024-12-09 23:21:33",
    "multi_samples-1b",
)
parsed_data = parse_log(log_path, start_time)

# Persist the parsed entries under data/<save_name>.json.
with open(f"data/{save_name}.json", "w") as f:
    json.dump(parsed_data, f)

# Three-line run summary, emitted by a single newline-separated print.
print(
    f"Parsing from {log_path} after {start_time}",
    f"Parsed data saved to data/{save_name}.json",
    f"Number of samples: {len(parsed_data)}",
    sep="\n",
)

Parsing from logs/test_verifier.log after 2024-12-09 23:21:33
Parsed data saved to data/multi_samples-1b.json
Number of samples: 437


In [6]:
# Parse logs/test_verifier-8b.log and store the collected entries as JSON.
log_path, start_time, save_name = (
    "logs/test_verifier-8b.log",
    "2024-12-11 14:25:34",
    "multi_samples-8b",
)
parsed_data = parse_log(log_path, start_time)

# Persist the parsed entries under data/<save_name>.json.
with open(f"data/{save_name}.json", "w") as f:
    json.dump(parsed_data, f)

# Three-line run summary, emitted by a single newline-separated print.
print(
    f"Parsing from {log_path} after {start_time}",
    f"Parsed data saved to data/{save_name}.json",
    f"Number of samples: {len(parsed_data)}",
    sep="\n",
)

Parsing from logs/test_verifier-8b.log after 2024-12-11 14:25:34
Parsed data saved to data/multi_samples-8b.json
Number of samples: 1086
