In [1]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from utils import build_dense_graph

In [2]:
class KTDataset(Dataset):
    """Dataset holding one feature, question and answer sequence per student."""

    def __init__(self, features, questions, answers):
        super().__init__()
        # The three containers are indexed in parallel: entry i of each belongs to the same student.
        self.features, self.questions, self.answers = features, questions, answers

    def __getitem__(self, index):
        """Return the (features, questions, answers) triple stored at ``index``."""
        feature_seq = self.features[index]
        question_seq = self.questions[index]
        answer_seq = self.answers[index]
        return feature_seq, question_seq, answer_seq

    def __len__(self):
        """Number of samples, taken from the length of ``features``."""
        return len(self.features)


In [3]:
def pad_collate(batch):
    """Collate (features, questions, answers) samples into three padded LongTensors.

    Each field is converted to ``torch.LongTensor`` per sample and padded with -1
    to the longest sequence in the batch, batch dimension first.
    """
    padded_fields = []
    for field_sequences in zip(*batch):
        as_tensors = [torch.LongTensor(seq) for seq in field_sequences]
        padded_fields.append(pad_sequence(as_tensors, batch_first=True, padding_value=-1))
    feature_pad, question_pad, answer_pad = padded_fields
    return feature_pad, question_pad, answer_pad

In [4]:
def load_dataset(file_path, batch_size, graph_type, dkt_graph_path=None, train_ratio=0.7, val_ratio=0.2, shuffle=True, model_type='GKT', use_binary=True, res_len=2, use_cuda=True):
    r"""
    Load a knowledge tracing CSV, split it per student and optionally build a concept graph.

    Parameters:
        file_path: input file path of knowledge tracing data (CSV with the columns
            'user_id', 'skill_id' and 'correct')
        batch_size: the size of a student batch
        graph_type: the type of the concept graph
        dkt_graph_path: path of the graph file passed to build_dkt_graph; required when
            model_type is 'GKT' and graph_type is 'DKT'
        train_ratio: fraction of the students assigned to the training split
        val_ratio: fraction of the students assigned to the validation split; the
            remaining students form the test split
        shuffle: whether to shuffle the dataset or not (passed to all three data loaders)
        model_type: a static graph is only built when this is 'GKT'
        use_binary: if True the feature is ``skill * 2 + correct``, otherwise it is
            ``skill * res_len + correct - 1``
        res_len: multiplier used for the feature id when use_binary is False
        use_cuda: whether to use GPU to accelerate training speed (the graph is moved with ``.cuda()``)
    Return:
        concept_num: the number of all concepts(or questions)
        graph: the static graph if graph type is in ['Dense', 'Transition', 'DKT'], otherwise graph is None
        train_data_loader: data loader of the training dataset
        valid_data_loader: data loader of the validation dataset
        test_data_loader: data loader of the test dataset
    Raises:
        KeyError: a required column is missing from the CSV
        ValueError: no student with more than one answer is left after filtering, or
            graph_type is 'DKT' (with model_type 'GKT') and dkt_graph_path is None
    NOTE: stole some code from https://github.com/lccasagrande/Deep-Knowledge-Tracing/blob/master/deepkt/data_util.py
    """
    df = pd.read_csv(file_path)
    for column in ("skill_id", "correct", "user_id"):
        if column not in df.columns:
            raise KeyError(f"The column '{column}' was not found on {file_path}")

    # if not (df['correct'].isin([0, 1])).all():
    #     raise KeyError(f"The values of the column 'correct' must be 0 or 1.")

    # Step 1.1 - Remove questions without skill
    df = df.dropna(subset=['skill_id'])

    # Step 1.2 - Remove users with a single answer
    df = df.groupby('user_id').filter(lambda q: len(q) > 1).copy()
    if df.empty:
        # Fail with a readable message instead of an opaque reduction error further down.
        raise ValueError(f"No user with more than one answer was found on {file_path}")

    # Step 2 - Enumerate skill id
    df['skill'], _ = pd.factorize(df['skill_id'], sort=True)  # we can also use problem_id to represent exercises

    # Step 3 - Cross skill id with answer to form a synthetic feature
    # use_binary: 'correct' is expected to be 0/1; otherwise it is presumably 1..res_len
    # so that the "- 1" makes the feature ids start at 0 (TODO confirm against the dataset).
    if use_binary:
        df['skill_with_answer'] = df['skill'] * 2 + df['correct']
    else:
        df['skill_with_answer'] = df['skill'] * res_len + df['correct'] - 1

    # Step 4 - Convert to one sequence per user id
    feature_list = []
    question_list = []
    answer_list = []
    seq_len_list = []

    # Iterate over the groups explicitly instead of calling groupby.apply for its side
    # effects: older pandas versions evaluate the applied function twice on the first
    # group, which would append the first student twice.
    for _, user_rows in df.groupby('user_id'):
        feature_list.append(user_rows['skill_with_answer'].tolist())
        question_list.append(user_rows['skill'].tolist())
        answer_list.append(user_rows['correct'].eq(1).astype('int').tolist())
        seq_len_list.append(len(user_rows))

    max_seq_len = np.max(seq_len_list)
    print('max seq_len: ', max_seq_len)
    student_num = len(seq_len_list)
    print('student num: ', student_num)
    feature_dim = int(df['skill_with_answer'].max() + 1)
    print('feature_dim: ', feature_dim)
    question_dim = int(df['skill'].max() + 1)
    print('question_dim: ', question_dim)
    concept_num = question_dim

    # print('feature_dim:', feature_dim, 'res_len*question_dim:', res_len*question_dim)
    # assert feature_dim == res_len * question_dim

    kt_dataset = KTDataset(feature_list, question_list, answer_list)
    train_size = int(train_ratio * student_num)
    val_size = int(val_ratio * student_num)
    # The test split takes whatever is left so the three sizes always sum to student_num.
    test_size = student_num - train_size - val_size
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(kt_dataset, [train_size, val_size, test_size])
    print('train_size: ', train_size, 'val_size: ', val_size, 'test_size: ', test_size)

    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=pad_collate)
    valid_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=pad_collate)
    test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=pad_collate)

    graph = None
    if model_type == 'GKT':
        if graph_type == 'Dense':
            graph = build_dense_graph(concept_num)
        elif graph_type == 'Transition':
            # Only the training students' sequences contribute to the transition counts.
            graph = build_transition_graph(question_list, seq_len_list, train_dataset.indices, student_num, concept_num)
        elif graph_type == 'DKT':
            if dkt_graph_path is None:
                raise ValueError("dkt_graph_path must be provided when graph_type is 'DKT'")
            graph = build_dkt_graph(dkt_graph_path, concept_num)
        if use_cuda and graph_type in ['Dense', 'Transition', 'DKT']:
            graph = graph.cuda()
    return concept_num, graph, train_data_loader, valid_data_loader, test_data_loader

In [5]:
def build_transition_graph(question_list, seq_len_list, indices, student_num, concept_num):
    """Build a row-normalised concept transition graph from the selected students.

    Parameters:
        question_list: per-student sequences of concept indices
        seq_len_list: per-student sequence lengths (only the first ``seq_len`` entries are used)
        indices: indices of the students whose sequences are counted
        student_num: total number of students; only indices below it are considered
        concept_num: number of concepts, i.e. the side length of the graph
    Return:
        float32 tensor of shape (concept_num, concept_num). Entry [i, j] is the share of
        transitions i -> j among all transitions leaving i (self transitions excluded);
        rows without any outgoing transition stay all zero.
    """
    graph = np.zeros((concept_num, concept_num))
    selected_students = set(indices)
    for student in range(student_num):
        if student not in selected_students:
            continue
        questions = question_list[student][:seq_len_list[student]]
        # Count every consecutive (previous, following) pair of concepts.
        for prev_concept, next_concept in zip(questions[:-1], questions[1:]):
            graph[prev_concept, next_concept] += 1
    # Self transitions are not kept as edges.
    np.fill_diagonal(graph, 0)
    # Row normalisation: divide in place, skipping rows whose sum is zero so they stay
    # zero. This avoids building a dense diagonal matrix and also works for an empty graph.
    row_sums = graph.sum(axis=1, keepdims=True)
    np.divide(graph, row_sums, out=graph, where=row_sums != 0)
    # convert to tensor
    return torch.from_numpy(graph).float()

In [6]:
def build_dkt_graph(file_path, concept_num):
    """Load a pre-computed concept graph from a text file.

    Parameters:
        file_path: path of a whitespace-separated text matrix readable by ``np.loadtxt``
        concept_num: expected number of rows and columns of the matrix
    Return:
        float32 tensor of shape (concept_num, concept_num)
    Raises:
        AssertionError: the loaded matrix is not concept_num x concept_num
    """
    # ndmin=2 keeps single-row / single-value files two-dimensional; without it
    # np.loadtxt squeezes them and the shape check below fails with an IndexError.
    graph = np.loadtxt(file_path, ndmin=2)
    assert graph.shape[0] == concept_num and graph.shape[1] == concept_num, \
        f"expected a {concept_num}x{concept_num} graph, got shape {graph.shape}"
    graph = torch.from_numpy(graph).float()
    return graph

In [7]:
# Split ratios, shuffle flag and the location of the optional pre-computed DKT graph.
train_ratio = 0.6
val_ratio = 0.2
shuffle = True  # was misspelled "shuggle", which no other cell reads
dkt_graph_dir = 'dkt-graph'
dkt_graph = 'dkt_graph.txt'

In [8]:
# Location of the interaction log that is loaded below.
dataset_path = './data/assistment_test15.csv'
# The DKT graph file appears to be needed only when the DKT graph type is used;
# fall back to None when the file is not present.
dkt_graph_path = os.path.join(dkt_graph_dir, dkt_graph)
dkt_graph_path = dkt_graph_path if os.path.exists(dkt_graph_path) else None

In [9]:
# Show the resolved DKT graph path (None when the file does not exist).
print(dkt_graph_path)

None


In [10]:
# Module-level stand-ins for the arguments of load_dataset, so that its body can be
# executed step by step in the cells below.
file_path = dataset_path
batch_size = 128
graph_type = 'Dense'
dkt_graph_path = None  # rebinds the path resolved in an earlier cell
train_ratio = 0.7      # rebinds the value assigned in an earlier cell
val_ratio = 0.2
shuffle = True
model_type = 'GKT'
use_binary = True
res_len = 2
use_cuda = True

In [11]:
# def load_dataset
"""
    Parameters:
        file_path: input file path of knowledge tracing data
        batch_size: the size of a student batch
        graph_type: the type of the concept graph
        shuffle: whether to shuffle the dataset or not
        use_cuda: whether to use GPU to accelerate training speed
    Return:
        concept_num: the number of all concepts(or questions)
        graph: the static graph if graph type is in ['Dense', 'Transition', 'DKT'], otherwise graph is None
        train_data_loader: data loader of the training dataset
        valid_data_loader: data loader of the validation dataset
        test_data_loader: data loader of the test dataset
"""

"\n    Parameters:\n        file_path: input file path of knowledge tracing data\n        batch_size: the size of a student batch\n        graph_type: the type of the concept graph\n        shuffle: whether to shuffle the dataset or not\n        use_cuda: whether to use GPU to accelerate training speed\n    Return:\n        concept_num: the number of all concepts(or questions)\n        graph: the static graph is graph type is in ['Dense', 'Transition', 'DKT'], otherwise graph is None\n        train_data_loader: data loader of the training dataset\n        valid_data_loader: data loader of the validation dataset\n        test_data_loader: data loader of the test dataset\n"

In [12]:
df = pd.read_csv(file_path)
# Required columns paired with the error raised when one is missing (checked in this order).
column_errors = (
    ("skill_id", f"The column 'skill_id' was not found on {file_path}"),
    ("correct", f"The column 'correct' was not found on {file_path}"),
    ("user_id", f"The column 'user_id' was not found on {file_path}"),
)
for column_name, message in column_errors:
    if column_name not in df.columns:
        raise KeyError(message)
df

Unnamed: 0,order_id,assignment_id,user_id,assistment_id,problem_id,original,correct,attempt_count,ms_first_response,tutor_mode,...,hint_count,hint_total,overlap_time,template_id,answer_id,answer_text,first_action,bottom_hint,opportunity,opportunity_original
0,35450204,220674,70363,33159,51444,1,0,2,25390,tutor,...,0,3,42000,30799,,88,0,,1,1.0
1,35450295,220674,70363,33110,51395,1,1,1,4859,tutor,...,0,3,4859,30059,,41,0,,2,2.0
2,35450311,220674,70363,33196,51481,1,0,14,19813,tutor,...,3,4,124564,30060,,65,0,0.0,3,3.0
3,35450555,220674,70363,33172,51457,1,1,1,16031,tutor,...,0,4,16031,30060,,12,0,,4,4.0
4,35450573,220674,70363,33174,51459,1,1,1,15047,tutor,...,0,4,15047,30060,,6,0,,5,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1882,24521097,266769,80119,46959,85315,0,1,1,12062,tutor,...,0,4,12062,29915,,28,0,,2,
1883,24521264,266769,80119,47021,85499,0,0,1,2328,tutor,...,2,2,9156,30525,,,1,1.0,3,
1884,24521277,266769,80119,47021,85500,0,1,1,5500,tutor,...,0,2,5500,30525,,15,0,,4,
1885,24521289,266769,80119,47021,85501,0,0,2,9547,tutor,...,0,3,13813,30525,,4,0,,5,


In [13]:
# Step 1.1 - Drop every interaction that has no skill assigned
df = df.dropna(subset=['skill_id'])
df

Unnamed: 0,order_id,assignment_id,user_id,assistment_id,problem_id,original,correct,attempt_count,ms_first_response,tutor_mode,...,hint_count,hint_total,overlap_time,template_id,answer_id,answer_text,first_action,bottom_hint,opportunity,opportunity_original
0,35450204,220674,70363,33159,51444,1,0,2,25390,tutor,...,0,3,42000,30799,,88,0,,1,1.0
1,35450295,220674,70363,33110,51395,1,1,1,4859,tutor,...,0,3,4859,30059,,41,0,,2,2.0
2,35450311,220674,70363,33196,51481,1,0,14,19813,tutor,...,3,4,124564,30060,,65,0,0.0,3,3.0
3,35450555,220674,70363,33172,51457,1,1,1,16031,tutor,...,0,4,16031,30060,,12,0,,4,4.0
4,35450573,220674,70363,33174,51459,1,1,1,15047,tutor,...,0,4,15047,30060,,6,0,,5,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1544,32071815,271622,70740,36072,56268,1,0,2,126423,tutor,...,0,4,128338,32893,,4,0,,7,7.0
1545,32071916,271622,70740,36025,56221,1,1,1,43181,tutor,...,0,4,43181,32892,,3,0,,8,8.0
1546,32071935,271622,70740,36105,56301,1,0,3,54782,tutor,...,0,4,57417,32906,,1,0,,9,9.0
1547,32071971,271622,70740,36127,56323,1,0,5,75579,tutor,...,0,4,85061,32906,,3,0,,10,10.0


In [14]:
# Step 1.2 - Keep only users that answered more than once
users = df.groupby('user_id')
df = users.filter(lambda user_rows: len(user_rows) > 1).copy()
df

Unnamed: 0,order_id,assignment_id,user_id,assistment_id,problem_id,original,correct,attempt_count,ms_first_response,tutor_mode,...,hint_count,hint_total,overlap_time,template_id,answer_id,answer_text,first_action,bottom_hint,opportunity,opportunity_original
0,35450204,220674,70363,33159,51444,1,0,2,25390,tutor,...,0,3,42000,30799,,88,0,,1,1.0
1,35450295,220674,70363,33110,51395,1,1,1,4859,tutor,...,0,3,4859,30059,,41,0,,2,2.0
2,35450311,220674,70363,33196,51481,1,0,14,19813,tutor,...,3,4,124564,30060,,65,0,0.0,3,3.0
3,35450555,220674,70363,33172,51457,1,1,1,16031,tutor,...,0,4,16031,30060,,12,0,,4,4.0
4,35450573,220674,70363,33174,51459,1,1,1,15047,tutor,...,0,4,15047,30060,,6,0,,5,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1544,32071815,271622,70740,36072,56268,1,0,2,126423,tutor,...,0,4,128338,32893,,4,0,,7,7.0
1545,32071916,271622,70740,36025,56221,1,1,1,43181,tutor,...,0,4,43181,32892,,3,0,,8,8.0
1546,32071935,271622,70740,36105,56301,1,0,3,54782,tutor,...,0,4,57417,32906,,1,0,,9,9.0
1547,32071971,271622,70740,36127,56323,1,0,5,75579,tutor,...,0,4,85061,32906,,3,0,,10,10.0


In [15]:
# Inspect the raw skill ids before they are re-indexed.
df['skill_id']

0         1.0
1         1.0
2         1.0
3         1.0
4         1.0
        ...  
1544    375.0
1545    375.0
1546    375.0
1547    375.0
1548    375.0
Name: skill_id, Length: 1549, dtype: float64

In [16]:
# Step 2 - Enumerate skill id
# pd.factorize encodes the values as an enumerated type / categorical variable;
# with sort=True the integer codes follow the sorted order of the skill ids.
df['skill'], _ = pd.factorize(df['skill_id'], sort=True)  # we can also use problem_id to represent exercises

In [17]:
# Inspect the integer skill codes produced by pd.factorize.
df['skill']

0        0
1        0
2        0
3        0
4        0
        ..
1544    73
1545    73
1546    73
1547    73
1548    73
Name: skill, Length: 1549, dtype: int64

In [18]:
# Step 3 - Cross skill id with answer to form a synthetic feature
# use_binary: (0,1); !use_binary: (1,2,3,4,5,6,7,8,9,10,11,12). Either way, the correct result index is guaranteed to be 1
df['skill_with_answer'] = (
    df['skill'] * 2 + df['correct']
    if use_binary
    else df['skill'] * res_len + df['correct'] - 1
)

In [19]:
# Inspect the combined skill/answer feature ids.
df['skill_with_answer']

0         0
1         1
2         0
3         1
4         1
       ... 
1544    146
1545    147
1546    146
1547    146
1548    147
Name: skill_with_answer, Length: 1549, dtype: int64

In [20]:
# Step 4 - Convert to a sequence per user id
# Four independent accumulators; each receives one entry per user.
feature_list, question_list, answer_list, seq_len_list = [], [], [], []

In [21]:
def get_data(series):
    """Append one user's feature, question and answer sequences (and their length) to the module-level lists."""
    feature_list.append(series['skill_with_answer'].tolist())
    question_list.append(series['skill'].tolist())
    answer_list.append(series['correct'].eq(1).astype('int').tolist())
    seq_len_list.append(series['correct'].shape[0])

# Call get_data once per user with an explicit loop. groupby.apply is not meant for
# side effects: older pandas versions evaluate the function twice on the first group,
# which would append the first user twice.
for user_id, user_rows in df.groupby('user_id'):
    get_data(user_rows)
max_seq_len = np.max(seq_len_list)
print('max seq_len: ', max_seq_len)
student_num = len(seq_len_list)
print('student num: ', student_num)
feature_dim = int(df['skill_with_answer'].max() + 1)
print('feature_dim: ', feature_dim)
question_dim = int(df['skill'].max() + 1)
print('question_dim: ', question_dim)
concept_num = question_dim

max seq_len:  368
student num:  15
feature_dim:  148
question_dim:  74


In [22]:
kt_dataset = KTDataset(feature_list, question_list, answer_list)
# Train and validation sizes are truncated to whole students; the test split takes the rest.
train_size, val_size = int(train_ratio * student_num), int(val_ratio * student_num)
test_size = student_num - train_size - val_size
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    kt_dataset,
    [train_size, val_size, test_size],
)
print('train_size: ', train_size, '// val_size: ', val_size, '// test_size: ', test_size)

train_size:  10 // val_size:  3 // test_size:  2


In [23]:
# One loader per split, all built with the same batching, shuffling and padding settings.
train_data_loader, valid_data_loader, test_data_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=shuffle, collate_fn=pad_collate)
    for split in (train_dataset, val_dataset, test_dataset)
)

# A static graph is only built for the GKT model; for any other model it stays None.
graph = None
if model_type == 'GKT':
    if graph_type == 'Dense':
        graph = build_dense_graph(concept_num)
    elif graph_type == 'Transition':
        graph = build_transition_graph(question_list, seq_len_list, train_dataset.indices, student_num, concept_num)
    elif graph_type == 'DKT':
        graph = build_dkt_graph(dkt_graph_path, concept_num)
    if graph_type in ('Dense', 'Transition', 'DKT') and use_cuda:
        graph = graph.cuda()

print("graph : ", graph)

graph :  tensor([[0.0000, 0.0137, 0.0137,  ..., 0.0137, 0.0137, 0.0137],
        [0.0137, 0.0000, 0.0137,  ..., 0.0137, 0.0137, 0.0137],
        [0.0137, 0.0137, 0.0000,  ..., 0.0137, 0.0137, 0.0137],
        ...,
        [0.0137, 0.0137, 0.0137,  ..., 0.0000, 0.0137, 0.0137],
        [0.0137, 0.0137, 0.0137,  ..., 0.0137, 0.0000, 0.0137],
        [0.0137, 0.0137, 0.0137,  ..., 0.0137, 0.0137, 0.0000]],
       device='cuda:0')


In [24]:
# Print everything load_dataset would return, one labelled line per value.
print("RETURN VALUES")
for label, value in (
    ("concept_num : ", concept_num),
    ("graph : ", graph),
    ("train_data_loader : ", train_data_loader),
    ("valid_data_loader : ", valid_data_loader),
    ("test_data_loader : ", test_data_loader),
):
    print(label, value)

RETURN VALUES
concept_num :  74
graph :  tensor([[0.0000, 0.0137, 0.0137,  ..., 0.0137, 0.0137, 0.0137],
        [0.0137, 0.0000, 0.0137,  ..., 0.0137, 0.0137, 0.0137],
        [0.0137, 0.0137, 0.0000,  ..., 0.0137, 0.0137, 0.0137],
        ...,
        [0.0137, 0.0137, 0.0137,  ..., 0.0000, 0.0137, 0.0137],
        [0.0137, 0.0137, 0.0137,  ..., 0.0137, 0.0000, 0.0137],
        [0.0137, 0.0137, 0.0137,  ..., 0.0137, 0.0137, 0.0000]],
       device='cuda:0')
train_data_loader :  <torch.utils.data.dataloader.DataLoader object at 0x7f97f0d646d0>
valid_data_loader :  <torch.utils.data.dataloader.DataLoader object at 0x7f97f0d64fa0>
test_data_loader :  <torch.utils.data.dataloader.DataLoader object at 0x7f97f0d64eb0>


In [25]:
# Check the graph dimensions (to be compared with concept_num).
graph.shape

torch.Size([74, 74])

In [26]:
# Sum of all entries of the graph.
graph.sum()

tensor(74.0000, device='cuda:0')