問題あたりのユーザーをまとめたquestion_user.csvを作成

In [1]:
import os
import pandas as pd
from tqdm import tqdm
import re

# フォルダのパスを指定してください
folder_path = "KT1"

# 自然順ソート用の関数
def natural_sort_key(file_name):
    match = re.match(r"u(\d+)\.csv", file_name)
    return int(match.group(1)) if match else float('inf')

# フォルダ内のファイルリストを取得しソート
file_list = sorted([f for f in os.listdir(folder_path) if f.startswith("u") and f.endswith(".csv")], key=natural_sort_key)

# 問題ごとのユーザーを保存する辞書
question_data = {}

# 各ファイルを処理
for file_name in tqdm(file_list, desc="Processing user files"):
    file_path = os.path.join(folder_path, file_name)
    try:
        # ファイルを読み込む
        user_df = pd.read_csv(file_path)
        user_id = os.path.splitext(file_name)[0]  # ユーザーIDをファイル名から取得

        # 各問題について、ユーザーをセットに追加
        for question_id in user_df["question_id"]:
            if question_id not in question_data:
                question_data[question_id] = set()
            question_data[question_id].add(user_id)
    except Exception as e:
        print(f"Error processing {file_name}: {e}")



Processing user files: 100%|██████████| 784309/784309 [14:35<00:00, 895.85it/s] 


In [2]:
# 辞書をDataFrameに変換し、適切な形式に整える
question_data_expanded = [{"question_id": qid, "users": ",".join(users)} for qid, users in question_data.items()]
question_data_df = pd.DataFrame(question_data_expanded)

# CSVファイルに保存
question_data_df.to_csv("question_user.csv", index=False)
print(f"Results saved to question_user.csv")


Results saved to question_user.csv


In [3]:
print(question_data_df)

      question_id                                              users
0           q5012  u791298,u7403,u405788,u626263,u462020,u375328,...
1           q4706  u494769,u38764,u10681,u309732,u9157,u175301,u2...
2           q4366  u2255,u38780,u263262,u221031,u282781,u363698,u...
3           q4829  u437632,u350916,u402472,u493071,u276687,u57977...
4           q6528  u441006,u540648,u315292,u586866,u650801,u40106...
...           ...                                                ...
12279      q10340                                             u24040
12280      q10341                                             u24040
12281      q10342                                             u24040
12282      q10343                                             u24040
12283      q10552                                             u26999

[12284 rows x 2 columns]


In [4]:
import matplotlib.pyplot as plt

# タグ数の分布を確認
tag_counts = user_tags_df["tag_count"]

# データの要約で頻度分布を作成
frequency = tag_counts.value_counts().sort_index()

# ヒストグラムを描画
plt.figure(figsize=(10, 6))
plt.bar(frequency.index, frequency.values, edgecolor="black", alpha=0.7)
plt.title("Distribution of Tag Counts per User")
plt.xlabel("Number of skills")
plt.ylabel("Number of Users")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# タグ数の分布の要約統計量を表示
print("Tag count distribution summary:")
print(tag_counts.describe())


NameError: name 'user_tags_df' is not defined

In [4]:
import os
import logging
import numpy as np
from EduCDM import EMDINA as DINA

# ファイルの相対パスを設定
current_dir = os.getcwd()
data_file_path = os.path.join(current_dir, 'FrcSub', 'data.txt')
q_matrix_file_path = os.path.join(current_dir, 'FrcSub', 'q.txt')

# ファイルの存在確認
if not os.path.exists(data_file_path):
    raise FileNotFoundError(f"{data_file_path} not found.")
if not os.path.exists(q_matrix_file_path):
    raise FileNotFoundError(f"{q_matrix_file_path} not found.")

# ファイルの読み込み
response_data = np.loadtxt(data_file_path, dtype=int)  # 学生-問題の回答データ
q_matrix = np.loadtxt(q_matrix_file_path, dtype=int)  # Q行列

# データの形状取得
stu_num, prob_num = response_data.shape
_, know_num = q_matrix.shape

# 回答データをDINA形式に変換
R = -1 * np.ones((stu_num, prob_num))
for stu_id in range(stu_num):
    for prob_id in range(prob_num):
        R[stu_id, prob_id] = response_data[stu_id, prob_id]
        print(f"R[{stu_id}, {prob_id}] = {R[stu_id, prob_id]}")

# ログレベルを設定
logging.getLogger().setLevel(logging.INFO)

# DINAモデルの初期化
cdm = DINA(R, q_matrix, stu_num, prob_num, know_num, skip_value=-1)

# モデルの訓練
cdm.train(epoch=10, epsilon=1e-3)  # epochやepsilonは調整可能

# 訓練済みモデルの保存
model_path = os.path.join(current_dir, 'dina_model.params')
cdm.save(model_path)

# モデルの評価 (RMSE, MAE)
rmse, mae = cdm.eval([{'user_id': i, 'item_id': j, 'score': R[i, j]}
                      for i in range(stu_num) for j in range(prob_num) if R[i, j] != -1])
print("RMSE: %.6f, MAE: %.6f" % (rmse, mae))

# 学習済みモデルのパラメータをロード
model_path = "dina_model.params"
cdm.load(model_path)

attributes = []
# 学生ごとのアトリビュート（知識状態）を表示
print("Student Knowledge Attributes:")
for stu_id in range(cdm.stu_num):
    # 学生の状態IDを取得
    state_id = cdm.theta[stu_id]
    # 状態IDに対応する知識状態（バイナリベクトル）
    knowledge_state = cdm.all_states[state_id]
    attributes.append(knowledge_state)
    print(f"Student {stu_id}: State ID = {state_id}, Knowledge State = {knowledge_state}")


R[0, 0] = 0.0
R[0, 1] = 0.0
R[0, 2] = 0.0
R[0, 3] = 1.0
R[0, 4] = 0.0
R[0, 5] = 0.0
R[0, 6] = 1.0
R[0, 7] = 1.0
R[0, 8] = 0.0
R[0, 9] = 1.0
R[0, 10] = 1.0
R[0, 11] = 1.0
R[0, 12] = 0.0
R[0, 13] = 1.0
R[0, 14] = 1.0
R[0, 15] = 1.0
R[0, 16] = 0.0
R[0, 17] = 1.0
R[0, 18] = 1.0
R[0, 19] = 1.0
R[1, 0] = 0.0
R[1, 1] = 1.0
R[1, 2] = 1.0
R[1, 3] = 1.0
R[1, 4] = 0.0
R[1, 5] = 1.0
R[1, 6] = 1.0
R[1, 7] = 1.0
R[1, 8] = 1.0
R[1, 9] = 1.0
R[1, 10] = 1.0
R[1, 11] = 1.0
R[1, 12] = 1.0
R[1, 13] = 1.0
R[1, 14] = 1.0
R[1, 15] = 1.0
R[1, 16] = 1.0
R[1, 17] = 1.0
R[1, 18] = 1.0
R[1, 19] = 1.0
R[2, 0] = 0.0
R[2, 1] = 1.0
R[2, 2] = 1.0
R[2, 3] = 1.0
R[2, 4] = 0.0
R[2, 5] = 1.0
R[2, 6] = 1.0
R[2, 7] = 1.0
R[2, 8] = 0.0
R[2, 9] = 0.0
R[2, 10] = 0.0
R[2, 11] = 0.0
R[2, 12] = 0.0
R[2, 13] = 1.0
R[2, 14] = 1.0
R[2, 15] = 1.0
R[2, 16] = 0.0
R[2, 17] = 0.0
R[2, 18] = 0.0
R[2, 19] = 0.0
R[3, 0] = 1.0
R[3, 1] = 1.0
R[3, 2] = 1.0
R[3, 3] = 1.0
R[3, 4] = 1.0
R[3, 5] = 1.0
R[3, 6] = 0.0
R[3, 7] = 1.0
R[3, 8] = 0.0
R[3,

INFO:root:save parameters to /Users/nagairyousuke/Documents/kashima_lab/KnowledgeTracing/dina_model.params
evaluating: 100%|██████████| 10720/10720 [00:00<00:00, 1260030.79it/s]
INFO:root:load parameters from dina_model.params


RMSE: 0.293777, MAE: 0.179607
Student Knowledge Attributes:
Student 0: State ID = 239, Knowledge State = [1. 1. 1. 0. 1. 1. 1. 1.]
Student 1: State ID = 251, Knowledge State = [1. 1. 1. 1. 1. 0. 1. 1.]
Student 2: State ID = 210, Knowledge State = [1. 1. 0. 1. 0. 0. 1. 0.]
Student 3: State ID = 127, Knowledge State = [0. 1. 1. 1. 1. 1. 1. 1.]
Student 4: State ID = 3, Knowledge State = [0. 0. 0. 0. 0. 0. 1. 1.]
Student 5: State ID = 2, Knowledge State = [0. 0. 0. 0. 0. 0. 1. 0.]
Student 6: State ID = 64, Knowledge State = [0. 1. 0. 0. 0. 0. 0. 0.]
Student 7: State ID = 0, Knowledge State = [0. 0. 0. 0. 0. 0. 0. 0.]
Student 8: State ID = 64, Knowledge State = [0. 1. 0. 0. 0. 0. 0. 0.]
Student 9: State ID = 64, Knowledge State = [0. 1. 0. 0. 0. 0. 0. 0.]
Student 10: State ID = 215, Knowledge State = [1. 1. 0. 1. 0. 1. 1. 1.]
Student 11: State ID = 223, Knowledge State = [1. 1. 0. 1. 1. 1. 1. 1.]
Student 12: State ID = 3, Knowledge State = [0. 0. 0. 0. 0. 0. 1. 1.]
Student 13: State ID = 0,