In [21]:
import csv
import json
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
from collections import Counter
import logging
import pandas as pd
from bg_context import class_bg
import importlib
import ollama_class

# Reload the module to apply updates
importlib.reload(ollama_class)

# Re-import the class if necessary
from ollama_class import OllamaLLM
import numpy as np

In [23]:
logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')

In [24]:
ollama_model = OllamaLLM(api_url="http://localhost:11434/api/generate")

In [15]:
# 定义文件路径
data_files = {
    "planning": "output_classification_generated_planning.csv",
    "monitor": "output_classification_enhance_monitoring.csv",
    "evaluating": "output_classification_generated_evaluating.csv",
}

# 定义测试数据数量
NUM_TEST_SAMPLES = 80

MODES = ["basic", "1_ex", "5_ex", "10_ex", "chain_of_thoughts"]

In [13]:
# 创建文件夹结构
def create_experiment_folder(mode):
    base_dir = "classification"
    timestamp_dir = os.path.join(base_dir, mode, time.strftime("%Y%m%d_%H%M%S"))
    os.makedirs(timestamp_dir, exist_ok=True)
    return timestamp_dir

def create_experiment_folder_custom(path, mode):
    base_dir = path
    timestamp_dir = os.path.join(base_dir, mode, time.strftime("%Y%m%d_%H%M%S"))
    os.makedirs(timestamp_dir, exist_ok=True)
    return timestamp_dir

# 读取CSV文件的函数
def read_csv_data(file_path, max_rows=20):
    """
    Reads up to max_rows rows from a CSV file.
    Args:
        file_path (str): Path to the CSV file.
        max_rows (int): Maximum number of rows to read.
    Returns:
        list: A list of dictionaries with "text" and "tag" keys.
    """
    data = []
    try:
        with open(file_path, "r") as csvfile:
            reader = csv.reader(csvfile)
            next(reader)  # Skip the header row
            for i, row in enumerate(reader):
                if i >= max_rows:
                    break
                data.append({"text": row[0], "tag": row[1]})
        return data
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return []

# 初始化混淆矩阵
def initialize_confusion_matrix(tags):
    size = len(tags)
    return np.zeros((size, size), dtype=int)

# 更新混淆矩阵
def update_confusion_matrix(matrix, y_true, y_pred, tags):
    tag_to_index = {tag: idx for idx, tag in enumerate(tags)}
    for true, pred in zip(y_true, y_pred):
        if true in tags and pred in tags:
            matrix[tag_to_index[true], tag_to_index[pred]] += 1
    return matrix

# 绘制混淆矩阵热力图
def generate_multiclass_heatmap(matrix, tags, output_path, mode):
    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues", xticklabels=tags, yticklabels=tags)
    plt.xlabel("Predicted Class")
    plt.ylabel("True Class")
    plt.title(f"Confusion Matrix Heatmap for Mode: {mode}")
    plt.savefig(os.path.join(output_path, "confusion_matrix.png"))
    plt.close()

# 计算分类性能指标
def calculate_class_metrics(matrix, tags):
    metrics = {}
    for idx, tag in enumerate(tags):
        tp = matrix[idx, idx]
        fp = matrix[:, idx].sum() - tp
        fn = matrix[idx, :].sum() - tp
        tn = matrix.sum() - (tp + fp + fn)

        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1_score = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
        accuracy = (tp + tn) / matrix.sum() if matrix.sum() > 0 else 0

        metrics[tag] = {
            "precision": precision,
            "recall": recall,
            "f1_score": f1_score,
            "accuracy": accuracy
        }
    return metrics

def safe_parse_json(response):
    """
    Safely parse a JSON string into a Python dictionary.
    Args:
        response (str): JSON string to parse.
    Returns:
        dict: Parsed dictionary if successful.
    Raises:
        ValueError: If the input is not valid JSON.
    """
    try:
        parsed = json.loads(response)
        if not isinstance(parsed, dict):
            raise ValueError("Parsed JSON is not a dictionary.")
        return parsed
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON format: {response[:100]}") from e

def evaluate_classification(data, mode, model: OllamaLLM, all_tags):
    results = []
    y_true, y_pred = [], []
    start_time = time.time()

    for item in data:
        text = item["text"]
        true_tag = item["tag"]
        try:
            # 调用分类函数
            response = model.classify_metacognition(model="llama3.2", text=text, context=class_bg, mode=mode)
            
            # 安全解析 JSON 响应
            response_dict = safe_parse_json(response)
            predicted_tag = response_dict.get("classification", "na")

            y_true.append(true_tag)
            y_pred.append(predicted_tag)
            results.append({
                "text": text,
                "true_tag": true_tag,
                "predicted_tag": predicted_tag,
                "error": False
            })
        except ValueError as ve:
            # JSON 解析失败
            y_true.append(true_tag)
            y_pred.append("error")
            results.append({
                "text": text,
                "true_tag": true_tag,
                "predicted_tag": "error",
                "error": True,
                "error_message": f"JSON parsing error: {ve}"
            })
        except Exception as e:
            # 其他异常
            y_true.append(true_tag)
            y_pred.append("error")
            results.append({
                "text": text,
                "true_tag": true_tag,
                "predicted_tag": "error",
                "error": True,
                "error_message": f"Unexpected error: {str(e)}"
            })

    total_time = time.time() - start_time
    avg_time_per_sample = total_time / len(data) if data else 0

    # 构建混淆矩阵
    confusion_matrix = initialize_confusion_matrix(all_tags)
    confusion_matrix = update_confusion_matrix(confusion_matrix, y_true, y_pred, all_tags)

    # 计算分类性能
    metrics = calculate_class_metrics(confusion_matrix, all_tags)
    metrics["total_time"] = total_time
    metrics["avg_time_per_sample"] = avg_time_per_sample

    return metrics, results, confusion_matrix

# 保存结果到文件
def save_results(metrics, results, confusion_matrix, output_path, tags):
    os.makedirs(output_path, exist_ok=True)
    
    # 保存混淆矩阵热力图
    generate_multiclass_heatmap(confusion_matrix, tags, output_path, mode="classification")
    
    # 保存分类指标
    metrics_path = os.path.join(output_path, "metrics.json")
    with open(metrics_path, "w") as f:
        json.dump(metrics, f, indent=4)

    # 保存详细结果
    results_path = os.path.join(output_path, "results.json")
    with open(results_path, "w") as f:
        json.dump(results, f, indent=4)

In [17]:
# 主测试逻辑
all_tags = list(data_files.keys()) + ["na"]  # 包括所有可能的标签

for mode in MODES:
    print(f"Processing mode: {mode}")
    mode_output_path = create_experiment_folder(mode)

    # 初始化模式级别混淆矩阵和结果列表
    overall_confusion_matrix = initialize_confusion_matrix(all_tags)
    overall_results = []

    for tag, file_path in data_files.items():
        try:
            print(f"Processing dataset for tag: {tag}")
            # 读取数据
            data = read_csv_data(file_path, max_rows=NUM_TEST_SAMPLES)
            if not data:
                print(f"Warning: No data found in {file_path}, skipping.")
                continue

            # 执行分类评估
            metrics, results, confusion_matrix = evaluate_classification(data, mode, ollama_model, all_tags)

            # 更新模式级别的混淆矩阵
            overall_confusion_matrix = update_confusion_matrix(
                overall_confusion_matrix,
                [r["true_tag"] for r in results],
                [r["predicted_tag"] for r in results],
                all_tags
            )
            overall_results.extend(results)  # 合并所有文件的结果

            # 保存单文件的结果
            save_results(metrics, results, confusion_matrix, mode_output_path, all_tags)
            print(f"Results for tag {tag} saved in {mode_output_path}")
        except Exception as e:
            print(f"Error processing dataset for tag {tag}: {e}")

    # 计算模式级别的总指标
    try:
        overall_metrics = calculate_class_metrics(overall_confusion_matrix, all_tags)

        # 保存模式级别的结果
        generate_multiclass_heatmap(overall_confusion_matrix, all_tags, mode_output_path, mode)
        metrics_path = os.path.join(mode_output_path, "overall_metrics.json")
        with open(metrics_path, "w") as f:
            json.dump(overall_metrics, f, indent=4)

        # 保存合并的详细结果
        results_path = os.path.join(mode_output_path, "overall_results.json")
        with open(results_path, "w") as f:
            json.dump(overall_results, f, indent=4)

        print(f"Overall results and metrics saved for mode: {mode}")
    except Exception as e:
        print(f"Error generating overall results for mode {mode}: {e}")

print("All experiments completed.")

Processing mode: basic
Processing dataset for tag: planning
Results for tag planning saved in classification/basic/20241204_174518
Processing dataset for tag: monitor
Results for tag monitor saved in classification/basic/20241204_174518
Processing dataset for tag: evaluating
Results for tag evaluating saved in classification/basic/20241204_174518
Overall results and metrics saved for mode: basic
Processing mode: 1_ex
Processing dataset for tag: planning
Results for tag planning saved in classification/1_ex/20241204_174937
Processing dataset for tag: monitor
Results for tag monitor saved in classification/1_ex/20241204_174937
Processing dataset for tag: evaluating
Results for tag evaluating saved in classification/1_ex/20241204_174937
Overall results and metrics saved for mode: 1_ex
Processing mode: 5_ex
Processing dataset for tag: planning
Results for tag planning saved in classification/5_ex/20241204_175348
Processing dataset for tag: monitor
Results for tag monitor saved in classific

In [None]:
# def evaluate_classification(data, mode, model: OllamaLLM, tags, classification_method="classify_metacognition"):
#     results = []
#     y_true, y_pred = [], []
#     start_time = time.time()

#     for item in data:
#         text = item["text"]
#         true_tag = item["tag"]
#         try:
#             # 动态调用分类方法
#             response = getattr(model, classification_method)(model="llama3.2", text=text, context=class_bg, mode=mode)
#             predicted_tag = response.get("classification", "na")

#             y_true.append(true_tag)
#             y_pred.append(predicted_tag)
#             results.append({
#                 "text": text,
#                 "true_tag": true_tag,
#                 "predicted_tag": predicted_tag,
#                 "error": False
#             })
#         except Exception as e:
#             y_true.append(true_tag)
#             y_pred.append("error")
#             results.append({
#                 "text": text,
#                 "true_tag": true_tag,
#                 "predicted_tag": "error",
#                 "error": True,
#                 "error_message": str(e)
#             })

#     total_time = time.time() - start_time
#     avg_time_per_sample = total_time / len(data) if data else 0

#     confusion_matrix = initialize_confusion_matrix(tags)
#     confusion_matrix = update_confusion_matrix(confusion_matrix, y_true, y_pred, tags)

#     metrics = calculate_class_metrics(confusion_matrix, tags)
#     metrics["total_time"] = total_time
#     metrics["avg_time_per_sample"] = avg_time_per_sample

#     return metrics, results, confusion_matrix

In [34]:
data_files = {
    "monitor": "output_classification_enhance_monitoring.csv",
    "na": "output_classification_generated_na.csv",
}
NUM_TEST_SAMPLES = 100
MODES = ["basic", "1_ex", "5_ex", "10_ex", "chain_of_thoughts"]
tags = ["monitor", "na"]

class_bg_na_or_monitor = """
The input text is a student's reflection after completing a computer science test on the CompassX platform, 
an online CS learning tool. Students are asked to analyze their performance, 
focusing on what they did well, where they struggled, and how they plan to address any mistakes. 
Responses are often informal, fragmented, and focused on specific programming concepts or errors. 
Despite this, the reflections frequently indicate monitoring behaviors such as identifying mistakes or tracking progress.
"""

base_output_path = "classification_monitor_na"

In [35]:
for mode in MODES:
    print(f"Processing mode: {mode}")
    mode_output_path = create_experiment_folder_custom(base_output_path, mode)

    # 初始化模式级别混淆矩阵和结果列表
    overall_confusion_matrix = initialize_confusion_matrix(tags)
    overall_results = []

    for tag, file_path in data_files.items():
        try:
            print(f"Processing dataset for tag: {tag}")
            data = read_csv_data(file_path, max_rows=NUM_TEST_SAMPLES)
            if not data:
                print(f"Warning: No data found in {file_path}, skipping.")
                continue

            y_true, y_pred = [], []
            results = []
            start_time = time.time()

            for item in data:
                text = item["text"]
                true_tag = item["tag"]
                try:
                    # 调用 classify_monitor_or_na 方法
                    response = ollama_model.classify_monitor_or_na(
                        model="llama3.2", text=text, context=class_bg_na_or_monitor, mode=mode
                    )
                    response = json.loads(response)
                    predicted_tag = response.get("classification", "na")
                    y_true.append(true_tag)
                    y_pred.append(predicted_tag)
                    results.append({
                        "text": text,
                        "true_tag": true_tag,
                        "predicted_tag": predicted_tag,
                        "error": False
                    })
                except Exception as e:
                    y_true.append(true_tag)
                    y_pred.append("error")
                    results.append({
                        "text": text,
                        "true_tag": true_tag,
                        "predicted_tag": "error",
                        "error": True,
                        "error_message": str(e)
                    })

            total_time = time.time() - start_time
            avg_time_per_sample = total_time / len(data) if data else 0

            # 更新混淆矩阵
            confusion_matrix = initialize_confusion_matrix(tags)
            confusion_matrix = update_confusion_matrix(confusion_matrix, y_true, y_pred, tags)
            overall_confusion_matrix = update_confusion_matrix(overall_confusion_matrix, y_true, y_pred, tags)

            # 保存单文件结果
            metrics = calculate_class_metrics(confusion_matrix, tags)
            metrics["total_time"] = total_time
            metrics["avg_time_per_sample"] = avg_time_per_sample

            # 保存结果
            with open(os.path.join(mode_output_path, f"{tag}_results.json"), "w") as f:
                json.dump(results, f, indent=4)
            with open(os.path.join(mode_output_path, f"{tag}_metrics.json"), "w") as f:
                json.dump(metrics, f, indent=4)

        except Exception as e:
            print(f"Error processing dataset for tag {tag}: {e}")

    # 保存模式级别结果
    try:
        overall_metrics = calculate_class_metrics(overall_confusion_matrix, tags)
        generate_multiclass_heatmap(overall_confusion_matrix, tags, mode_output_path, mode)

        # 保存总体结果
        with open(os.path.join(mode_output_path, "overall_results.json"), "w") as f:
            json.dump(overall_results, f, indent=4)
        with open(os.path.join(mode_output_path, "overall_metrics.json"), "w") as f:
            json.dump(overall_metrics, f, indent=4)

        print(f"Overall results and metrics saved for mode: {mode}")
    except Exception as e:
        print(f"Error generating overall results for mode {mode}: {e}")

print("Monitor/NA classification pipeline completed.")

Processing mode: basic
Processing dataset for tag: monitor
Processing dataset for tag: na
Overall results and metrics saved for mode: basic
Processing mode: 1_ex
Processing dataset for tag: monitor
Processing dataset for tag: na
Overall results and metrics saved for mode: 1_ex
Processing mode: 5_ex
Processing dataset for tag: monitor
Processing dataset for tag: na
Overall results and metrics saved for mode: 5_ex
Processing mode: 10_ex
Processing dataset for tag: monitor
Processing dataset for tag: na
Overall results and metrics saved for mode: 10_ex
Processing mode: chain_of_thoughts
Processing dataset for tag: monitor
Processing dataset for tag: na
Overall results and metrics saved for mode: chain_of_thoughts
Monitor/NA classification pipeline completed.
