In [None]:
import numpy as np
import pandas as pd
import random
import gc

from tqdm.notebook import tqdm

# Seed both the stdlib generator and NumPy's global generator. The only random
# draw visible in this notebook is np.random.shuffle, which the stdlib seed
# alone does not control.
random.seed(1)
np.random.seed(1)

In [None]:
!pip install adabelief-tf

In [None]:
# Load the three competition tables.
# Only four columns of train.csv are read, selected by position; the three
# named below get compact integer dtypes and the remaining one keeps whatever
# dtype pandas infers.
train_dtypes = {
    'user_id': 'int32',
    'content_id': 'int16',
    'user_answer': 'int8',
}
train_df = pd.read_csv(
    "../input/riiid-test-answer-prediction/train.csv",
    usecols=[2, 3, 4, 6],
    dtype=train_dtypes,
)
questions_df = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')
lectures_df = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')

In [None]:
# Re-index content into one contiguous id space: questions take 1..n_questions
# (in questions_df row order) and lectures continue directly after them.
# Id 0 is produced by neither map.
n_questions = questions_df.shape[0]
question_id_map = {
    qid: new_id for new_id, qid in enumerate(questions_df["question_id"], start=1)
}
lecture_id_map = {
    lid: new_id
    for new_id, lid in enumerate(lectures_df["lecture_id"], start=n_questions + 1)
}
questions_df["content_id"] = questions_df["question_id"].map(question_id_map)
lectures_df["content_id"] = lectures_df["lecture_id"].map(lecture_id_map)

# Rows with user_answer == -1 are translated with the lecture map, all other
# rows with the question map. The two masks are disjoint, so no row is
# translated twice within one run of this cell.
# NOTE(review): re-running this cell would remap already-remapped ids.
is_lecture = train_df["user_answer"] == -1
is_question = ~is_lecture
train_df.loc[is_question, "content_id"] = train_df.loc[is_question, "content_id"].map(question_id_map)
train_df.loc[is_lecture, "content_id"] = train_df.loc[is_lecture, "content_id"].map(lecture_id_map)

# choice_id = content_id * 4 + answer; negative answers contribute 0, so such
# rows land on slot 0 of their content id.
answer_slot = train_df["user_answer"].astype(np.int32).clip(lower=0)
train_df["choice_id"] = train_df["content_id"].astype(np.int32) * 4 + answer_slot


In [None]:
import tensorflow as tf
# Lookup tables indexed by choice_id (content_id * 4 + slot): one "part" value
# and one tag list per slot. Slots that no question or lecture fills keep
# part 0 and an empty tag list.
n_choice_slots = (lectures_df["content_id"].max() + 1) * 4
choice_parts = [0] * n_choice_slots
choice_tags = [[] for _ in range(n_choice_slots)]

# Questions: the `tags` cell is a whitespace-separated string of ints (or NaN).
for _, question in questions_df.iterrows():
    raw_tags = question["tags"]
    question_tags = [] if pd.isna(raw_tags) else [int(tag) for tag in raw_tags.split()]
    first_slot = question["content_id"] * 4
    for slot in range(first_slot, first_slot + 4):
        # Each slot gets its own copy of the tag list.
        choice_tags[slot] = list(question_tags)
        choice_parts[slot] = question["part"]

# Lectures: exactly one tag each, taken from the `tag` column.
for _, lecture in lectures_df.iterrows():
    lecture_tags = [lecture["tag"]]
    first_slot = lecture["content_id"] * 4
    for slot in range(first_slot, first_slot + 4):
        choice_tags[slot] = list(lecture_tags)
        choice_parts[slot] = lecture["part"]

choice_parts = tf.constant(choice_parts)
# Pad on the right with -1, then shift everything by +1: padding becomes 0 and
# every real tag value t is stored as t + 1.
padded_tags = tf.keras.preprocessing.sequence.pad_sequences(
    choice_tags, dtype="int16", value=-1, padding="post"
)
choice_tags = padded_tags + 1

In [None]:
# Sentinel choice_id that opens every user's sequence: the first value past
# the last slot of the highest content id.
start_of_records = (lectures_df["content_id"].max() + 1) * 4
n_users = train_df["user_id"].nunique()

# records[k] is the int32 choice_id history of the k-th user (in groupby
# order), prefixed with the sentinel; records_ixs maps user_id -> k.
records = []
records_ixs = {}
for user_id, user_rows in tqdm(train_df.groupby("user_id"), total=n_users):
    records_ixs[user_id] = len(records)
    history = np.concatenate([[start_of_records], user_rows["choice_id"].values])
    records.append(history.astype(np.int32))

In [None]:
from gc import collect
# The per-user records have been extracted, so drop the raw frame and ask the
# garbage collector to run.
del train_df
gc.collect()

In [None]:
# Convert each user's choice_id history into a list of string tokens, one per
# interaction, where the token is the content id (choice_id // 4 discards the
# answer slot).
# NOTE(review): every record already starts with start_of_records, so each
# sentence begins with the start token twice - confirm this is intended.
start_token = str(start_of_records // 4)
sentences = [
    [start_token] + [str(choice_id // 4) for choice_id in rec]
    for rec in tqdm(records)
]

In [None]:
from gensim.models import word2vec
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Skip-gram (sg=1) word2vec over the content-id sentences, 64-dim vectors.
#
# One constant feeds both the constructor's `iter` and train()'s `epochs`.
# train() runs the number of epochs it is handed, so a different value on the
# constructor would be silently ignored.
# NOTE(review): this cell used to declare iter=30 while training for 5 epochs;
# 5 is kept so the resulting embeddings are unchanged. Raise W2V_EPOCHS if 30
# was the intent.
# NOTE(review): `size` / `iter` look like gensim 3.x keyword names - confirm
# the pinned gensim version before upgrading.
W2V_EPOCHS = 5

w2v_model = word2vec.Word2Vec(
    size=64,
    min_count=1,   # keep every token, however rare
    window=3,
    iter=W2V_EPOCHS,
    sample=1e-5,
    ns_exponent=.5,
    workers=4,
    sg=1,
)
w2v_model.build_vocab(sentences, progress_per=10000)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=W2V_EPOCHS, report_delay=1)

In [None]:
# Dense embedding table: entry k is the vector for token str(k), for
# k = 0 .. max content_id + 1 (the last index is the start-of-record token).
# Tokens missing from the vocabulary fall back to the mean of all learned
# vectors; that mean is loop-invariant, so compute it once.
vocab = w2v_model.wv.vocab
fallback_vec = w2v_model.wv.vectors.mean(axis=0)
w2v_vec = [
    w2v_model.wv[token] if token in vocab else fallback_vec
    for token in map(str, range(lectures_df["content_id"].max() + 2))
]

In [None]:
import umap
embs = np.array(w2v_vec)

# Fit UMAP on a random permutation of all embedding rows. The permutation
# length is taken from `embs` itself instead of a hard-coded row count, so the
# cell keeps working if the number of questions or lectures changes.
ixs = np.arange(len(embs))
np.random.shuffle(ixs)

mapper = umap.UMAP(n_neighbors=15, n_components=2, metric="cosine", verbose=True).fit(embs[ixs])

import matplotlib.pyplot as plt
# choice_parts holds four identical entries per content id; every 4th entry
# gives one part value per content id. Converted to NumPy once, outside the loop.
content_parts = choice_parts.numpy()[::4]
umap_emb = mapper.transform(embs)

# One scatter series per part value 1..7.
for part in range(1, 8):
    # Integer row indices: content_parts is shorter than umap_emb (which also
    # has a row for the start-of-record token), so a boolean mask would not fit.
    ix = np.flatnonzero(content_parts == part)
    plt.scatter(umap_emb[ix, 0], umap_emb[ix, 1], s=5, label=part)

plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.legend(title="part")
plt.tight_layout()

In [None]:
# Rebuild the embedding table as a single array: row k is the vector for token
# str(k), k = 0 .. max content_id + 1. Tokens absent from the vocabulary get
# the mean of all learned vectors, computed once rather than per missing token.
vocab = w2v_model.wv.vocab
fallback_vec = w2v_model.wv.vectors.mean(axis=0)
w2v_vec = np.array([
    w2v_model.wv[token] if token in vocab else fallback_vec
    for token in map(str, range(lectures_df["content_id"].max() + 2))
])

# NOTE(review): despite the .npy extension this file is a pickle written by
# pandas, not NumPy's .npy format. The filename and format are left unchanged
# so existing readers keep working; load it with pd.read_pickle.
pd.to_pickle(w2v_vec, "word2vec_weight.npy")