## About

This notebook generates the LDA features used in the [9th place LightGBM](https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/210354) model, computed here from the first 20M rows of data.

In the actual solution, I ran this script on GCP with all rows. Due to [this behavior](https://github.com/pandas-dev/pandas/issues/26314), the pandas version must be 0.21 or earlier to run this script with all rows.

In [None]:
import gc

import pandas as pd
import numpy as np

from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Read only the columns needed below, capped at the first 20M rows.
df = pd.read_feather(
    '/kaggle/input/riiidtrainfeather/train.f',
    columns=['user_id', 'content_id', 'content_type_id', 'answered_correctly'],
).head(20_000_000)
q = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')

# Keep rows with content_type_id == 0 and answered_correctly == 0
# (presumably question rows that were answered incorrectly). The label column
# is constant after this filter, so it is dropped.
incorrect_rows = (df['content_type_id'] == 0) & (df['answered_correctly'] == 0)
df = df[incorrect_rows].drop('answered_correctly', axis=1)
del incorrect_rows
gc.collect()

In [None]:
# Count the rows in `df` for every (content_id, user_id) pair. Any non-null
# column works as the counting target; 'content_type_id' is used here.
mat = df.groupby(['content_id', 'user_id'])['content_type_id'].count()
# Saturate at the uint8 maximum before narrowing the dtype: a plain
# astype(np.uint8) silently wraps counts above 255 (e.g. 256 -> 0).
mat = mat.clip(upper=np.iinfo(np.uint8).max).astype(np.uint8)
# Pivot user_id into columns (one row per content_id); pairs that never
# occur in `df` are filled with 0.
mat = mat.unstack(fill_value=0)

In [None]:
# Number of LDA components, i.e. the number of feature columns produced per row of `mat`.
LDA_DIM = 10

# Fixed random_state so repeated runs use the same seed.
lda = LatentDirichletAllocation(n_components=LDA_DIM, random_state=0)
# Fit on the count matrix and get one LDA_DIM-wide vector per row of `mat`.
transformed = lda.fit_transform(mat)

In [None]:
# Wrap the LDA output in a float32 frame with named feature columns, then
# append the row labels of `mat` (its content_id index) as 'question_id'.
transformed_df = (
    pd.DataFrame(
        transformed,
        columns=[f'lda_item_inc_{i}' for i in range(LDA_DIM)],
    )
    .astype(np.float32)
    .assign(question_id=mat.index.tolist())
)

In [None]:
# Left-join onto the full question list so every question_id in `q` gets a
# row; ids with no match in `transformed_df` get NaN features.
qmat = q[['question_id']].merge(transformed_df, on='question_id', how='left')
qmat.head()

In [None]:
# Set the final column names explicitly (key first, then the LDA features)
# and write the result in feather format.
qmat.columns = ['question_id', *(f'lda_item_inc_{i}' for i in range(LDA_DIM))]
qmat.to_feather('question_lda_incorrect_mat.f')