In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#utils.py

import torch
import numpy as np
import pandas as pd #HDKIM
from torch.autograd import Variable

def subsequent_mask(size):
    """Build the causal ("no peeking ahead") attention mask.

    Args:
        size: sequence length.

    Returns:
        A boolean tensor of shape (1, size, size). Entry [0, i, j] is True
        when j <= i (position i may look at position j) and False when
        j > i (a subsequent position).
    """
    # The lower triangle (diagonal included) is exactly the set of allowed
    # positions, so build it directly instead of negating the upper triangle.
    allowed = np.tril(np.ones((1, size, size), dtype=bool))
    return torch.from_numpy(allowed)

def make_std_mask(x, pad):
    """Create a mask to hide padding and future words.

    Args:
        x: tensor of token ids whose last dimension is the sequence axis.
        pad: id that marks a padded position.

    Returns:
        The element-wise AND of "position is not padding" (expanded with a
        trailing singleton axis) and the causal mask from `subsequent_mask`.
    """
    not_padding = (x != pad).unsqueeze(-1)
    # Match the causal mask's dtype/device to the padding mask before combining.
    causal = subsequent_mask(x.size(-1)).type_as(not_padding.data)
    return not_padding & Variable(causal)

In [None]:
!nvcc --version
!rm -rf ~/.nv/

In [None]:
# multihead_attn.py

import torch
import torch.nn as nn
import numpy as np
import math
import torch.nn.functional as F
import copy
from torch.nn import LayerNorm


def clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of `module`."""
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas


def attention(query, key, value, key_masks=None, query_masks=None, future_masks=None, dropout=None, infer=False):
    """Compute scaled dot-product attention, add a residual and layer-normalise.

    Args:
        query: tensor of shape [nbatches, h, T_q, d_k].
        key: tensor of shape [nbatches, h, T_k, d_k].
        value: tensor with the same shape as `key`.
        key_masks: accepted for interface compatibility; currently unused.
        query_masks: accepted for interface compatibility; currently unused.
        future_masks: optional mask; after a leading axis is added it must
            broadcast against the [nbatches, h, T_q, T_k] score tensor.
            Positions where it equals 0 are excluded from attention.
        dropout: optional callable (e.g. ``nn.Dropout``) applied to the
            attention weights before they are multiplied with `value`.
        infer: accepted for interface compatibility; currently unused.

    Returns:
        A pair ``(outputs, p_attn)``: ``outputs`` is
        ``layer_norm(p_attn' @ value + query)`` over the last axis (where
        ``p_attn'`` is `p_attn` after optional dropout) and ``p_attn`` holds
        the softmax attention weights before dropout.
    """
    d_k = query.size(-1)
    # scores shape = [nbatches, h, T_q, T_k] == p_attn shape
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if future_masks is not None:
        # Follow the scores' device instead of forcing CUDA.
        blocked = future_masks.unsqueeze(0).to(scores.device) == 0
        scores = scores.masked_fill(blocked, -1e9)

    p_attn = F.softmax(scores, dim=-1)
    weights = p_attn if dropout is None else dropout(p_attn)

    # Residual connection around the attention output.
    outputs = torch.matmul(weights, value) + query

    # The original built a brand-new LayerNorm(d_k) on every call, i.e. a
    # normalisation with weight=1 and bias=0 that is never trained. The
    # functional form computes the same values without allocating a module
    # per call and without requiring a GPU.
    return F.layer_norm(outputs, (d_k,)), p_attn


class MultiHeadedAttention(nn.Module):
    """Multi-head attention block built on top of `attention`.

    Query, key and value are each linearly projected, passed through ReLU,
    split into `h` heads of width `d_model // h`, attended, re-concatenated
    and finally layer-normalised.
    """

    def __init__(self, h, d_model, dropout=0.2, infer=False):
        """Take in model size and number of heads.

        Args:
            h: number of attention heads; must divide `d_model`.
            d_model: model (embedding) width.
            dropout: dropout probability applied to the attention weights.
            infer: stored and forwarded to `attention`.
        """
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        # Only the first three projections are used in forward(); the fourth
        # is kept so that parameter names in the state_dict stay unchanged.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        # No explicit .cuda() here: the layer follows the parent module when
        # the model is moved (model.cuda() / model.to(device)), and the block
        # can now also be constructed on CPU-only machines.
        self.layernorm = LayerNorm(d_model)
        self.infer = infer

    def forward(self, query, key, value, key_masks=None, query_masks=None, future_masks=None):
        """Run multi-head attention.

        Args:
            query, key, value: tensors whose first axis is the batch and whose
                last axis has width `d_model`.
            key_masks, query_masks, future_masks: forwarded to `attention`.

        Returns:
            Layer-normalised tensor of shape [nbatches, T_q, d_model]. The
            attention weights of the last call are stored in `self.attn`.
        """
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        #    k v shape = [nbatches, h, T_k, d_k],  d_k * h = d_model
        #    q shape   = [nbatches, h, T_q, d_k]
        query, key, value = \
            [F.relu(l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2), inplace=True)
             for l, x in zip(self.linears, (query, key, value))]

        # 2) Apply attention on all the projected vectors in batch.
        x, self.attn = attention(query, key, value, query_masks=query_masks,
                                 key_masks=key_masks, future_masks=future_masks,
                                 dropout=self.dropout, infer=self.infer)

        # 3) "Concat" the heads back together using a view.
        x = x.transpose(1, 2).contiguous() \
            .view(nbatches, -1, self.h * self.d_k)
        return self.layernorm(x)

In [None]:
!pip install ../input/humanfriendly82/humanfriendly-8.2-py2.py3-none-any.whl

In [None]:
!pip  install ../input/coloredlogs140/coloredlogs-14.0-py2.py3-none-any.whl

In [None]:
#wordtest.py

import logging
import coloredlogs
import pickle

# Module-level logger. Note that the logger name is the literal string
# '__file__', not the value of the __file__ variable.
logger = logging.getLogger('__file__')
# Install coloredlogs on this logger at INFO level.
coloredlogs.install(level='INFO', logger=logger)

def pickle_io(path, mode='r', obj=None):
    """
    Convenient pickle load and dump.

    Args:
        path: file to read from or write to.
        mode: 'r' or 'rb' to load, 'w' or 'wb' to dump. The file is always
            opened in binary mode.
        obj: object to dump; ignored when loading.

    Returns:
        The unpickled object when loading, None when dumping.

    Raises:
        ValueError: if `mode` is not one of the four supported values
            (previously such a call silently did nothing).
    """
    if mode in ('rb', 'r'):
        logger.info("Loading obj from {}...".format(path))
        with open(path, 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code; only load
            # files that come from a trusted source.
            obj = pickle.load(f)
        logger.info("Load obj successfully!")
        return obj
    if mode in ('wb', 'w'):
        logger.info("Dumping obj to {}...".format(path))
        with open(path, 'wb') as f:
            pickle.dump(obj, f)
        logger.info("Dump obj successfully!")
        return None
    raise ValueError(
        "Unsupported mode {!r}; expected one of 'r', 'rb', 'w', 'wb'".format(mode))

class WordTestResource(object):
    """In-memory view of a pickled word-test resource dictionary.

    Always exposes `id2index`, `index2id` and `num_skills` (the number of
    entries in `id2index`). With `verbose=True` the vocabulary tables are
    exposed as well and `words_by_rank` is sorted in place by its 'rank' key.
    """

    def __init__(self, resource_path, verbose=False):
        resource = pickle_io(resource_path, mode='r')

        self.id2index = resource['id2index']
        self.index2id = resource['index2id']
        self.num_skills = len(self.id2index)

        if not verbose:
            return

        self.word2id = resource['word2id']
        self.id2all = resource['id2all']
        # rank0 already be set to a large number
        self.words_by_rank = resource['words_by_rank']
        self.pos2id = resource['pos2id']

        def _rank_of(entry):
            return entry[u'rank']

        # Sorts the list held by the resource itself, not a copy.
        self.words_by_rank.sort(key=_rank_of)
        self.id_by_rank = []
        for entry in self.words_by_rank:
            self.id_by_rank.append(entry[u'word_id'])

def str2bool(s):
    """Convert the exact strings 'True' / 'False' to the matching bool.

    Raises:
        ValueError: for any other value.
    """
    table = {'True': True, 'False': False}
    if s not in table:
        raise ValueError('Not a valid boolean string')
    return table[s]

In [None]:
#config.py

class DefaultConfig(object):
    """Hyper-parameters shared by the dataset, model and run loop."""

    # --- model ---
    model = 'SAKT'
    state_size = 200      # embedding / hidden width
    num_heads = 5         # attention heads
    max_len = 50          # interactions stored per sample
    dropout = 0.1

    # --- data ---
    # Original CSV inputs; unused now that Data is built from a DataFrame.
    #train_data = "../input/assist2015files/assist2015_train.csv"  # train_data_path
    #test_data = "../input/assist2015files/assist2015_test.csv"
    batch_size = 4        # HDKIM: was 256

    # --- optimisation ---
    max_epoch = 5         # was 10
    lr = 3e-3
    lr_decay = 0.9
    max_grad_norm = 1.0
    weight_decay = 0      # L2 regularisation factor


# Shared configuration instance read by the other cells.
opt = DefaultConfig()

In [None]:
!pip install ../input/prefetchgenerator101/prefetch_generator-1.0.1

In [None]:
# dataset.py

import csv
import torch
import time
import itertools
import numpy as np
#from config import DefaultConfig
#from wordtest import WordTestResource
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from prefetch_generator import BackgroundGenerator

import joblib #HDKIM

class Data(Dataset):
    """Windowed question/answer sequences for the SAKT model.

    Every row of `df` describes one student through the columns
    ``num_ques`` (number of interactions), ``ques`` (question ids) and
    ``ans`` (0/1 correctness). Each student is cut into samples of
    ``opt.max_len`` stored positions; because `__getitem__` shifts by one,
    every sample yields ``opt.max_len - 1`` prediction targets.

    Samples are stored in `self.students` as tuples
    ``(length, problems, correct)`` in this order per student:

    1. a head sample with the first ``min(max_len - 1, num_ques)``
       interactions, right-aligned and left-padded (problems with 0,
       correctness with 1);
    2. zero or more full samples whose first position repeats the last
       interaction of the previous sample (context only) followed by the
       next ``max_len - 1`` interactions;
    3. if interactions remain, a tail sample with them, right-aligned and
       left-padded like the head sample.

    NOTE(review): `max_skill_num` is derived from the largest question id in
    this particular `df`, so the "answered correctly" offset applied in
    `__getitem__` varies between datasets - confirm it matches the offset
    the model was trained with.
    """

    def __init__(self, df, train=True):
        """Build the samples.

        Args:
            df: DataFrame with columns ``num_ques``, ``ques`` and ``ans``.
            train: when falsy, the sample list is padded with copies of the
                last sample up to a multiple of ``opt.batch_size``.
        """
        self.students = []
        self.max_skill_num = 0
        seq_len = opt.max_len
        step = seq_len - 1  # prediction targets contributed by one sample

        for _, row in df.iterrows():
            num_ques = int(row['num_ques'])
            ques = [int(q) for q in row['ques']]
            ans = [int(a) for a in row['ans']]
            if ques:
                self.max_skill_num = max(max(ques), self.max_skill_num)

            # Head sample: the FIRST `step` interactions. (Taking the last
            # `step` ones, as before, duplicated the tail of long students,
            # dropped their first interactions and broke the chronological
            # order the following windows rely on.)
            head_len = min(step, num_ques)
            self._append_padded(head_len, ques[:head_len], ans[:head_len])

            if num_ques > step:
                start_idx = step
                while step + start_idx <= num_ques:
                    lo, hi = start_idx - 1, start_idx + step
                    problems = np.array(ques[lo:hi], dtype=np.int64)
                    correct = np.array(ans[lo:hi], dtype=np.int64)
                    self.students.append((seq_len, problems, correct))
                    start_idx += step

                left_num_ques = num_ques - start_idx
                if left_num_ques > 0:
                    self._append_padded(left_num_ques,
                                        ques[-left_num_ques:],
                                        ans[-left_num_ques:])

        if not train and self.students:
            # Pad with copies of the last sample so that every batch is full.
            remainder = len(self.students) % opt.batch_size
            if remainder:
                filler = self.students[-1]
                self.students.extend([filler] * (opt.batch_size - remainder))

        print(len(self.students))

    @staticmethod
    def _right_aligned(values, fill):
        """Return an int64 array of length opt.max_len holding `values` at its end."""
        out = np.full(opt.max_len, fill, dtype=np.int64)
        if len(values) > 0:  # `out[-0:]` would address the whole array
            out[-len(values):] = values
        return out

    def _append_padded(self, length, ques, ans):
        """Store one left-padded sample (problems padded with 0, answers with 1)."""
        problems = self._right_aligned(ques, 0)
        correct = self._right_aligned(ans, 1)
        self.students.append((length, problems, correct))

    def __getitem__(self, index):
        """Return ``(x, problems, correct)`` for one sample.

        ``x`` holds the interaction ids of positions 0..n-2: the question id,
        plus ``max_skill_num + 1`` when that question was answered correctly.
        ``problems`` and ``correct`` are the question ids and labels of
        positions 1..n-1, i.e. the prediction targets.
        """
        _, problems, correct = self.students[index]
        x = problems[:-1].copy()
        # we assume max_skill_num + 1 = num_skills because skill index starts from 0 to max_skill_num
        x += (correct[:-1] == 1) * (self.max_skill_num + 1)
        return x, problems[1:], correct[1:]

    def __len__(self):
        """Number of stored samples (including inference padding)."""
        return len(self.students)


    
class DataLoaderX(DataLoader):
    """DataLoader whose iterator is wrapped in a BackgroundGenerator."""

    def __iter__(self):
        base_iterator = super().__iter__()
        return BackgroundGenerator(base_iterator)


class DataPrefetcher():
    """Fetch the next batch and copy it to `device` on a side CUDA stream.

    One batch is always held ready in `self.batch`; `next()` hands it out and
    immediately starts loading the following one. `self.batch` is None once
    the underlying loader is exhausted.
    """

    def __init__(self, loader, device):
        self.loader = iter(loader)
        self.device = device
        self.stream = torch.cuda.Stream()
        # With Amp, it isn't necessary to manually convert data to half.
        self.preload()

    def preload(self):
        """Pull one batch from the loader and start its transfer to the device."""
        self.batch = next(self.loader, None)
        if self.batch is None:
            return
        # Issue the copies on the dedicated stream; next() synchronises with it.
        with torch.cuda.stream(self.stream):
            for idx, item in enumerate(self.batch):
                self.batch[idx] = item.to(device=self.device, non_blocking=True)

    def next(self):
        """Return the preloaded batch (or None) and begin loading the next one."""
        # Make the current stream wait until the pending copies are done.
        torch.cuda.current_stream().wait_stream(self.stream)
        ready = self.batch
        self.preload()
        return ready

In [None]:
# student_model.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import math
#from config import DefaultConfig
#from utils import subsequent_mask
from torch.autograd import Variable
#from multihead_attn import MultiHeadedAttention
from torch.nn import LayerNorm

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to the input, then apply dropout.

    Args:
        state_size: width of the encoded vectors (the code fills even and odd
            columns pairwise, so an even width is expected).
        dropout: dropout probability applied to the sum.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, state_size, dropout=0.1, max_len=50):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, state_size)
        position = torch.arange(0.0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0.0, state_size, 2) *
                             -(math.log(10000.0) / state_size))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Kept as a plain attribute (not a parameter/buffer) so the module's
        # state_dict is unchanged; shape is (1, max_len, state_size).
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        """Return dropout(x + pe[:, :x.size(1)]).

        `x` is indexed as (batch, position, state_size) by the slicing below.
        """
        # `self.pe` is not moved by .cuda()/.to(), so bring the needed slice
        # to the input's device explicitly to avoid a device mismatch.
        pe = self.pe[:, :x.size(1)].to(device=x.device)
        return self.dropout(x + pe)


class student_model(nn.Module):
    """SAKT-style knowledge-tracing model.

    Past interactions (keys/values) and the questions to predict (queries)
    are embedded, combined through one multi-head attention block and a
    two-layer feed-forward network, and projected to one logit per skill.
    """

    def __init__(self, num_skills, state_size, num_heads=2, dropout=0.2, infer=False):
        """
        Args:
            num_skills: number of distinct questions/skills.
            state_size: embedding and hidden width.
            num_heads: number of attention heads.
            dropout: dropout probability used throughout the model.
            infer: forwarded to the attention block.
        """
        super(student_model, self).__init__()
        self.infer = infer
        self.num_skills = num_skills
        self.state_size = state_size
        # we use the (num_skills * 2 + 1) as key padding_index
        self.embedding = nn.Embedding(num_embeddings=num_skills*2+1,
                                      embedding_dim=state_size)
        # Learned position embedding, one vector per input position.
        self.position_embedding = nn.Embedding(num_embeddings=opt.max_len-1,
                                               embedding_dim=state_size)
        # we use the (num_skills + 1) as query padding_index
        self.problem_embedding = nn.Embedding(num_embeddings=num_skills+1,
                                              embedding_dim=state_size)
        self.multi_attn = MultiHeadedAttention(h=num_heads, d_model=state_size,
                                               dropout=dropout, infer=self.infer)
        self.feedforward1 = nn.Linear(in_features=state_size, out_features=state_size)
        self.feedforward2 = nn.Linear(in_features=state_size, out_features=state_size)
        self.pred_layer = nn.Linear(in_features=state_size, out_features=num_skills)
        self.dropout = nn.Dropout(dropout)
        # A single LayerNorm instance is shared by every call site in forward().
        self.layernorm = LayerNorm(state_size)

    def forward(self, x, problems, target_index):
        """
        Args:
            x: integer tensor (batch, opt.max_len - 1) of past interaction ids.
            problems: integer tensor of the same shape with the question ids
                to predict.
            target_index: sequence of indices into the flattened
                (batch * (opt.max_len - 1) * num_skills) logits selecting the
                logit of each target question.

        Returns:
            1-D tensor with the selected logits, one per entry of `target_index`.
        """
        x = self.embedding(x)
        # Build position ids on the same device as the embedded input instead
        # of forcing CUDA.
        positions = torch.arange(x.size(1), device=x.device).unsqueeze(0)
        x = x + self.position_embedding(positions)
        problems = self.problem_embedding(problems)
        x = self.dropout(x)
        res = self.multi_attn(query=self.layernorm(problems), key=x, value=x,
                              key_masks=None, query_masks=None, future_masks=None)
        outputs = F.relu(self.feedforward1(res))
        outputs = self.dropout(outputs)
        outputs = self.dropout(self.feedforward2(outputs))
        # Residual connection
        outputs = outputs + self.layernorm(res)
        outputs = self.layernorm(outputs)
        logits = self.pred_layer(outputs)

        # Flatten to (batch * (max_len - 1), num_skills) and then to 1-D so
        # that `target_index` can address individual logits.
        logits = logits.contiguous().view(logits.size(0) * (opt.max_len - 1), -1)
        logits = logits.contiguous().view(-1)
        index = torch.as_tensor(target_index, dtype=torch.long, device=logits.device)
        return torch.gather(logits, 0, index)

In [None]:
# run.py

import time
import torch
import numpy as np
import torch.nn as nn
#from dataset import DataPrefetcher
#from config import DefaultConfig
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn import metrics
from sklearn.metrics import r2_score

def _flat_target_index(problems, num_skills):
    """Return, for every target question, the index of its logit in the
    model's flattened output.

    `problems` is a (batch, seq) tensor of question ids. Flat position `p`
    (row-major over batch and seq) with question id `s` maps to
    ``p * num_skills + s``.
    """
    skills = problems.detach().cpu().numpy().reshape(-1)
    positions = np.arange(skills.size, dtype=np.int64)
    return (positions * num_skills + skills).tolist()


def run_epoch(m, dataloader, optimizer, scheduler, criterion, num_skills,
                  epoch_id=None, writer=None, is_training=True):
    """Run one pass over `dataloader` on the GPU and compute metrics.

    Args:
        m: the model; called as ``m(x, problems, target_index)``.
        dataloader: yields batches ``[x, problems, correctness]``.
        optimizer, scheduler, criterion: used only when `is_training`.
            The scheduler is stepped once per batch.
        num_skills: number of skills (width of the model's output layer).
        epoch_id: unused.
        writer: unused.
        is_training: when True the model is trained; otherwise it is only
            evaluated with gradients disabled.

    Returns:
        ``(rmse, auc, r2, acc, pred_labels)`` where `pred_labels` is the list
        of sigmoid outputs in dataloader order.
    """
    epoch_start_time = time.time()
    if is_training:
        m.train()
    else:
        m.eval()
    m.cuda()
    actual_labels = []
    pred_labels = []
    num_batch = len(dataloader)
    prefetcher = DataPrefetcher(dataloader, device='cuda')
    batch = prefetcher.next()
    k = 0

    with torch.set_grad_enabled(is_training):
        while batch is not None:
            x, problems, correctness = batch
            x = x.long()
            problems = problems.long()
            correctness = correctness.view(-1).float()
            target_index = _flat_target_index(problems, num_skills)

            logits = m(x, problems, target_index)
            pred = torch.sigmoid(logits)

            if is_training:
                loss = criterion(pred, correctness.cuda())
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(m.parameters(), opt.max_grad_norm)
                # The previous `xm.optimizer_step(...)` referenced torch_xla's
                # `xm`, which is never imported here, so training raised
                # NameError. Everything else in this function targets CUDA,
                # so the regular optimizer step is used.
                optimizer.step()
                scheduler.step()
            else:
                # Positions whose question id equals `num_skills` are treated
                # as padding: the first `num_to_throw` predictions of each row
                # are overwritten with the labels so they do not affect metrics.
                num_problems = problems.size(1)
                actual_num_problems = torch.sum(problems != num_skills, dim=1)
                for row in range(x.size(0)):
                    num_to_throw = int(num_problems - actual_num_problems[row])
                    start = row * num_problems
                    pred[start:start + num_to_throw] = correctness[start:start + num_to_throw]

            actual_labels += list(np.array(correctness.cpu()))
            pred_labels += list(np.array(pred.data.cpu()))

            k += 1
            if k % 500 == 0:
                print('\r batch{}/{}'.format(k, num_batch), end='')
            if k >= num_batch:
                break
            batch = prefetcher.next()

    rmse = sqrt(mean_squared_error(actual_labels, pred_labels))
    fpr, tpr, thresholds = metrics.roc_curve(actual_labels, pred_labels, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    r2 = r2_score(actual_labels, pred_labels)
    acc = metrics.accuracy_score(actual_labels, np.array(pred_labels) >= 0.5)
    epoch_end_time = time.time()
    print('Epoch costs %.2f s' % (epoch_end_time - epoch_start_time))
    return rmse, auc, r2, acc, pred_labels

In [None]:
# main.py

import torch
import torch.nn as nn
import torch.optim as optim
#from dataset import Data
#from dataset import DataLoaderX
#from config import DefaultConfig
#from student_model import student_model
#from run import run_epoch

def _collect_user_predictions(preds, seq_lens, step):
    """Pick the real (non-padding) predictions out of the flat model output.

    `preds` holds `step` predictions per dataset sample, in sample order.
    For a user with `num` interactions the dataset emits: a head sample
    (right-aligned, `min(step, num)` real targets), then - only if
    ``num > step`` - full samples while at least another `step` interactions
    remain, and finally a right-aligned tail sample for the remainder. This
    walks the samples in the same way and returns the real predictions
    concatenated in user order.
    """
    pieces = []
    sample = 0  # index of the next dataset sample to consume
    for num in seq_lens:
        num = int(num)
        head = min(step, num)
        end = (sample + 1) * step
        pieces.append(preds[end - head:end])
        sample += 1
        if num > step:
            covered = step  # interactions already accounted for
            while step + covered <= num:
                pieces.append(preds[sample * step:(sample + 1) * step])
                sample += 1
                covered += step
            left = num - covered
            if left > 0:
                end = (sample + 1) * step
                pieces.append(preds[end - left:end])
                sample += 1
    return np.concatenate(pieces) if pieces else np.array([])


if __name__ == '__main__':

    # num_skills = largest question_id in questions.csv (13522) + 1
    num_skills = 13523
    m = student_model(num_skills=num_skills, state_size=opt.state_size,
                      num_heads=opt.num_heads, dropout=opt.dropout, infer=False)

    PATH = '../input/sakt-self-attentive-knowledge-tracing/sakt_model_auc_920.pkl'
    m.load_state_dict(torch.load(PATH))
    m.eval()

    torch.backends.cudnn.benchmark = True
    best_auc = 0
    # run_epoch requires these arguments even though inference does not use them.
    optimizer = optim.Adam(m.parameters(), lr=opt.lr, weight_decay=opt.weight_decay)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=opt.lr_decay)
    criterion = nn.BCELoss()

    import riiideducation

    env = riiideducation.make_env()
    iter_test = env.iter_test()

    for (test_df, sample_prediction_df) in iter_test:
        # Keep question rows only, ordered per user by time.
        test_df = test_df[test_df.content_type_id == False].reset_index(drop=True)
        test_df = test_df.sort_values(['user_id', 'timestamp'], ascending=True).reset_index(drop=True)

        if test_df.empty:
            # Nothing to predict in this batch (no question rows); still
            # answer the environment so that iteration can continue.
            test_df['answered_correctly'] = 0.65
            env.predict(test_df[['row_id', 'answered_correctly']])
            continue

        num_ques = test_df[['user_id', 'content_id']].groupby('user_id')['content_id'].count().to_frame()
        num_ques.columns = ["num_ques"]
        ques = test_df[['user_id', 'content_id']].groupby('user_id')['content_id'].apply(list).to_frame()
        ques.columns = ['ques']
        num_ques = num_ques.merge(ques, on="user_id", how="left")
        # creating dummy answers
        num_ques["ans"] = [[1] * len(que) for que in num_ques.ques]

        test_dataset = Data(num_ques, train=False)
        test_loader = DataLoaderX(test_dataset, batch_size=opt.batch_size, num_workers=4, pin_memory=True)

        epoch = opt.max_epoch

        rmse, auc, r2, acc, preds = run_epoch(m, test_loader, optimizer, scheduler, criterion,
                                       num_skills=num_skills, epoch_id=epoch, is_training=False)

        # Re-assemble one prediction per test row. The previous loop counted
        # windows differently from Data (its loop condition and remainder
        # were off by max_len - 1), so any user with more than max_len - 1
        # rows produced a length mismatch and the whole batch fell back to
        # the constant below.
        final_preds = _collect_user_predictions(preds, num_ques.num_ques, opt.max_len - 1)

        if len(final_preds) == test_df.shape[0]:
            test_df['answered_correctly'] = final_preds
        else:
            # Safety net: constant prediction if the lengths still disagree.
            test_df['answered_correctly'] = 0.65

        env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])
    print("mission completed!")