In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import random
from torch.optim import lr_scheduler

# Run on the first CUDA GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
# Alternative selection using the MPS backend (kept disabled):
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [None]:
class MYDataset(Dataset):
    """Map-style dataset that serves items straight from an indexable container.

    The wrapped object is stored as ``self.data``; length and item lookup are
    delegated to it unchanged.
    """

    def __init__(self, data):
        # Only a reference is kept; nothing is copied or transformed here.
        self.data = data

    def __len__(self):
        """Return ``len`` of the wrapped container."""
        size = len(self.data)
        return size

    def __getitem__(self, id):
        """Return whatever the wrapped container yields for index ``id``."""
        item = self.data[id]
        return item

In [None]:
# Size of the question vocabulary. Question ids read from the data files are
# used directly as indices, so they are assumed to lie in [0, questions_counter)
# (presumably the number of distinct questions in the dataset -- TODO confirm).
questions_counter = 124

def load_data(path, split=False):
    """Parse a student-interaction file into a padded float tensor.

    The file is read as consecutive 3-line records:
      1. the number of interactions of the student,
      2. comma-separated question ids,
      3. comma-separated answers (``1`` marks a correct answer).

    Every interaction becomes a row of ``questions_counter * 2 + 2`` values:
      * ``[0, questions_counter)``: one-hot of the question id,
      * ``[questions_counter, 2 * questions_counter)``: the same one-hot, set
        only when the answer was correct,
      * ``[-2]``: id of the next question in the record (0 when there is none),
      * ``[-1]``: answer to that next question (0 when there is none).

    Only the latest interaction per question id is kept, ordered by last
    occurrence. Students with fewer than two kept interactions are dropped, and
    each remaining student is padded with all-zero rows up to
    ``questions_counter`` rows. Tokens that cannot be parsed as integers are
    skipped. Question ids are not range-checked.

    Args:
        path: Path of the text file to read.
        split: When true, shuffle the students with ``random.shuffle`` and
            return ``(train, validation)`` where validation holds the last
            ``len(data) // 80`` students (empty when there are fewer than 80).

    Returns:
        A tensor of shape ``(students, questions_counter,
        questions_counter * 2 + 2)``, or a pair of such tensors when ``split``
        is true.
    """
    # Two one-hot halves, +1 for the next question id, +1 for y.
    width = questions_counter * 2 + 2

    with open(path, 'r') as file:
        lines = file.readlines()

    data = []
    # Walk complete 3-line records only: a trailing blank line or a truncated
    # final record is ignored instead of raising IndexError.
    for start in range(0, len(lines) - 2, 3):
        questions = lines[start + 1].split(',')
        answers = lines[start + 2].split(',')
        try:
            declared_count = int(lines[start])
        except ValueError:
            # Without a usable header no interaction of this record can be kept.
            continue

        interactions = {}
        for i, v in enumerate(questions):
            try:
                question_id = int(v)
                interaction = [0.0] * width
                interaction[question_id] = 1.0
                if int(answers[i]) == 1:
                    interaction[question_id + questions_counter] = 1.0

                if i < declared_count - 1:
                    interaction[-2] = int(questions[i + 1])
                    # Set y
                    interaction[-1] = int(answers[i + 1])
            except ValueError:
                # Non-numeric token (e.g. produced by a trailing comma): skip it.
                continue

            # Re-insert so a repeated question moves to the end of the sequence.
            interactions.pop(question_id, None)
            interactions[question_id] = interaction

        # Ignore students with 1 interaction
        if len(interactions) < 2:
            continue

        interactions_list = list(interactions.values())

        # Padding
        interactions_list.extend(
            [0.0] * width for _ in range(questions_counter - len(interactions_list))
        )
        data.append(interactions_list)

    if split:
        random.shuffle(data)
        size = len(data)
        holdout = size // 80
        split1 = data[: size - holdout]
        split2 = data[size - holdout:]
        return torch.tensor(split1), torch.tensor(split2)

    return torch.tensor(data)

# Read the training file once, hold out the validation split, and move both
# tensors to the selected device right away.
training_data, validation_data = (
    part.to(device)
    for part in load_data('assistments2009/train.csv', split=True)
)

print('training_data.shape', training_data.shape)
print('validation_data.shape', validation_data.shape)

# From here on both names refer to Dataset wrappers rather than raw tensors.
training_data, validation_data = MYDataset(training_data), MYDataset(validation_data)

In [None]:

# Define model
class NeuralNetwork(nn.Module):
    """LSTM that maps a sequence of interaction encodings to per-question probabilities.

    Pipeline: LSTM -> dropout -> linear read-out -> sigmoid.

    The linear read-out is required because LSTM outputs are bounded to
    (-1, 1); applying the sigmoid to them directly confines every prediction
    to roughly (0.27, 0.73) in eval mode, so confident probabilities could
    never be produced.

    ``batch_first`` is left at its default, so ``forward`` accepts an unbatched
    ``(seq_len, 2 * num_questions)`` tensor or a ``(seq_len, batch,
    2 * num_questions)`` tensor and returns the same leading dimensions with
    ``num_questions`` values in (0, 1) on the last axis.

    Args:
        dropout: Dropout probability applied to the LSTM outputs.
        num_questions: Number of distinct questions; defaults to the
            module-level ``questions_counter`` when omitted.
    """

    def __init__(self, dropout, num_questions=None):
        super().__init__()
        if num_questions is None:
            num_questions = questions_counter
        self.rnn = nn.LSTM(num_questions * 2, hidden_size=num_questions)
        self.dropout = nn.Dropout(dropout)
        # Unbounded logits per question, so the sigmoid can cover all of (0, 1).
        self.out = nn.Linear(num_questions, num_questions)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-question probabilities for every step of ``x``."""
        x, _ = self.rnn(x)
        x = self.dropout(x)
        return self.sigmoid(self.out(x))

# model = NeuralNetwork(dropout=0.25)
# x = torch.rand(questions_counter, questions_counter * 2)
# print(model(x).shape)

In [None]:
def _next_step_loss(model, student, loss_fn):
    """Loss of the next-answer predictions for one student, or None if nothing can be scored.

    ``student`` is one padded sequence: all columns but the last two are model
    input, column ``-2`` is the id of the next question and column ``-1`` the
    answer to it. Only rows that are followed by another real (non-padding)
    row are scored; padding rows and the student's last interaction carry the
    placeholder label (question 0, answer 0) and must not reach the loss.
    Assumes real rows always have a non-zero input and that every real row
    except the last has a next-question label -- verify against the loader.
    """
    inputs = student[:, :-2]
    next_ids = student[:, -2].long()
    truth = student[:, -1]
    assert truth[-1] == 0

    is_real = inputs.abs().sum(dim=1) > 0
    has_next = is_real & torch.cat([is_real[1:], is_real.new_zeros(1)])
    rows = has_next.nonzero(as_tuple=True)[0]
    if rows.numel() == 0:
        return None

    pred = model(inputs)
    # Keep only the probability predicted for the question actually asked next.
    return loss_fn(pred[rows, next_ids[rows]], truth[rows])


def train(params, loss_fn):
    """Train a ``NeuralNetwork`` on ``training_data`` with early stopping on ``validation_data``.

    One optimisation step is taken per student. After every epoch the mean
    validation loss is computed; whenever it improves on the best value by
    more than 1e-5 the model's ``state_dict`` is saved to a file named after
    the joined ``params`` values plus ``'chkpnt.pt'``. Training stops early on
    a non-improving epoch once ``params['patience']`` consecutive
    non-improving epochs have already been counted. The learning rate is
    multiplied by ``params['gamma']`` after every epoch.

    Args:
        params: Dict providing ``'dropout'``, ``'lr'``, ``'gamma'``,
            ``'epochs'`` and ``'patience'``. All values, including any extra
            keys, become part of the checkpoint file name.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor, e.g. ``nn.BCELoss()``.

    Raises:
        ValueError: If ``validation_data`` is empty, since early stopping and
            checkpoint selection would have nothing to measure.
    """
    print(params)
    if len(validation_data) == 0:
        raise ValueError('validation_data is empty; cannot compute a validation loss')

    model = NeuralNetwork(dropout=params['dropout']).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
    scheduler = lr_scheduler.StepLR(optimizer, gamma=params['gamma'], step_size=1)
    epochs = params['epochs']
    v_loss = 1_000_000
    model_path = ''
    no_change_counter = 0
    for epoch in range(epochs):
        print(f'\nEpoch {epoch + 1}\n-------------------------------')
        lr = optimizer.param_groups[0]['lr']

        model.train()
        train_losses = []
        for student in training_data:
            optimizer.zero_grad()
            loss = _next_step_loss(model, student, loss_fn)
            if loss is None:
                continue
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        model.eval()
        v_losses = []
        with torch.no_grad():
            for student in validation_data:
                loss = _next_step_loss(model, student, loss_fn)
                if loss is not None:
                    v_losses.append(loss.item())

        # Report epoch means as plain floats rather than the last student's loss.
        train_loss = sum(train_losses) / len(train_losses) if train_losses else float('nan')
        v_loss_epoch = sum(v_losses) / len(v_losses) if v_losses else float('nan')
        print(f'lr {lr:8f} train loss {train_loss:.8f} val loss {v_loss_epoch:.8f}')

        if v_loss - v_loss_epoch > 0.00001:
            v_loss = v_loss_epoch
            no_change_counter = 0
            model_path = '-'.join(str(value) for value in params.values()) + 'chkpnt.pt'
            torch.save(model.state_dict(), model_path)
        elif no_change_counter > params['patience'] - 1:
            break
        else:
            no_change_counter += 1

        scheduler.step()
    print('v_loss', v_loss, model_path)

# Tune the model
# for loss in [nn.BCELoss() ]:
#     for lr in [5e-2,1e-2,1e-3,1e-4,1e-5]:
#         for gamma in [0.75, 0.9]:
#             for dropout in [ 0.1, 0.25]:
#                 print(loss)
#                 train(params = {'lr':lr, 'gamma': gamma, 'epochs':10, 'patience':3, 'verbose': False, 'dropout' : dropout,  }, loss_fn = loss)

# Single training run with the currently selected hyper-parameters.
train(
    loss_fn=nn.BCELoss(),
    params={
        'lr': 1e-5,
        'gamma': 0.75,
        'epochs': 5,
        'patience': 3,
        'verbose': False,
        'dropout': 0.25,
    },
)

In [None]:
from sklearn.metrics import roc_auc_score

# Path of a checkpoint written by train(). While this is empty the AUC below is
# computed for a freshly initialised (untrained) model.
CHECKPOINT_PATH = ''

# The model lives on `device`, so the test tensor has to be moved there too.
testing_data = load_data('assistments2009/test.csv').to(device)
model = NeuralNetwork(dropout=0.25).to(device)
if CHECKPOINT_PATH:
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))

y_true = []
y_scores = []

model.eval()
with torch.no_grad():
    for student in testing_data:
        inputs = student[:, :-2]
        next_ids = student[:, -2].long()
        truth = student[:, -1]
        assert truth[-1] == 0

        # Score a row only when another real interaction follows it: padding
        # rows (all-zero input) and the student's last interaction have no
        # next answer to predict.
        is_real = inputs.abs().sum(dim=1) > 0
        has_next = is_real & torch.cat([is_real[1:], is_real.new_zeros(1)])
        rows = has_next.nonzero(as_tuple=True)[0]
        if rows.numel() == 0:
            continue

        pred = model(inputs)
        # One scalar per scored step: the probability predicted for the
        # question that was actually asked next, paired with its answer.
        y_true.extend(truth[rows].tolist())
        y_scores.extend(pred[rows, next_ids[rows]].tolist())

auc = roc_auc_score(y_true, y_scores)
print("AUC:", auc)