![](https://images.squarespace-cdn.com/content/5eddaaf244c33b340e4f21bd/1600233681723-SBKXS01CLPA4OZFMJYDZ/Screen+Shot+2020-09-16+at+2.21.10+PM.png?content-type=image%2Fpng)

# RIIID answer correctness prediction challenge

This is my solution notebook for the [RIIID answer correctness prediction](https://www.kaggle.com/c/riiid-test-answer-prediction) competition. It is intended for beginners with little coding experience. Please upvote if you like the notebook; it motivates me to contribute more.

#### *Reference:* https://www.kaggle.com/datafan07/riiid-challenge-eda-baseline-model

###### Loading dependencies

In [None]:
import pandas as pd
import numpy as np
np.random.seed(3)
import matplotlib.pyplot as plt
import seaborn as sns
import riiideducation
import gc

%matplotlib inline

import warnings
warnings.simplefilter('ignore')

In [None]:
# Create the competition environment object. Later cells use it for
# `env.iter_test()` (to receive test batches) and `env.predict(...)` (to submit predictions).
env = riiideducation.make_env()

In [None]:
# Explicit column dtypes for train.csv; handed to read_csv below so the
# columns are parsed with these types instead of pandas' inferred defaults.
types = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}

# Only the first 10**6 rows of train.csv are read.
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    dtype=types,
    nrows=10**6,
    low_memory=False,
)

# Question / lecture metadata and the example test file, read with default dtypes.
questions_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
lectures_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')
test_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_test.csv')

# Preview the first rows of the training sample.
train_df.head()

In [None]:
# Summary statistics for the numeric columns of the loaded training rows.
train_df.describe()

In [None]:
# Number of distinct users in the loaded training rows.
train_df['user_id'].nunique()

we have 3824 unique users

In [None]:
# Column dtypes and non-null counts; used here to spot columns with missing values.
train_df.info()

We have a few missing values in the last two columns.

In [None]:
# Row count per content type (per the data description: 0 = question, 1 = lecture).
train_df['content_type_id'].value_counts()

we have both lectures and questions

In [None]:
# Distribution of the target column, including the -1 placeholder used for lecture rows.
train_df['answered_correctly'].value_counts()

there are 3 possible values for answered correctly column

In [None]:
# Distribution of `user_answer` restricted to lecture rows (content_type_id == 1),
# selected with a single .loc call instead of chained indexing.
train_df.loc[train_df['content_type_id'] == 1, 'user_answer'].value_counts()

When the content is a lecture, `user_answer` is -1 (null); per the data description below, `answered_correctly` is also -1 (null) for lectures.

Files: [https://www.kaggle.com/c/riiid-test-answer-prediction/data](https://www.kaggle.com/c/riiid-test-answer-prediction/data)

train.csv

*    row_id: (int64) ID code for the row.
*    timestamp: (int64) the time in milliseconds between this user interaction and the first event completion from that user.
*    user_id: (int32) ID code for the user.
*    content_id: (int16) ID code for the user interaction
*    content_type_id: (int8) 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.
*   task_container_id: (int16) Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.

*    user_answer: (int8) the user's answer to the question, if any. Read -1 as null, for lectures.
*    answered_correctly: (int8) if the user responded correctly. Read -1 as null, for lectures.
*    prior_question_elapsed_time: (float32) The average time in milliseconds it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.
*    prior_question_had_explanation: (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.

In [None]:
# Preview the first rows of the lecture metadata table.
lectures_df.head()



lectures.csv: metadata for the lectures watched by users as they progress in their education.

*   lecture_id: foreign key for the train/test content_id column, when the content type is lecture (1).
*   part: top level category code for the lecture.
*   tag: one tag code for the lecture. The meaning of the tags will not be provided, but these codes are sufficient for clustering the lectures together.
*   type_of: brief description of the core purpose of the lecture

In [None]:
# Preview the first rows of the question metadata table.
questions_df.head()

questions.csv: metadata for the questions posed to users.

*     question_id: foreign key for the train/test content_id column, when the content type is question (0).
*     bundle_id: code for which questions are served together.
*     correct_answer: the answer to the question. Can be compared with the train user_answer column to check if the user was right.
*     part: the relevant section of the TOEIC test.
*     tags: one or more detailed tag codes for the question. The meaning of the tags will not be provided, but these codes are sufficient for clustering the questions together.

In [None]:
# Collect one "(rows, columns)" report line per loaded table, then emit them
# together, one per line.
shape_report = [
    f'Train data shape:{train_df.shape}',
    f'questions data shape:{questions_df.shape}',
    f'lectures data shape:{lectures_df.shape}',
    f'test data shape:{test_df.shape}',
]
print(*shape_report, sep='\n')

## Data Transform and feature engineering

Encoding tags

In [None]:
from category_encoders import HashingEncoder
# Hash-encode the `tags` column of the question metadata into 6 components.
# Later cells read the resulting columns as col_0 .. col_5 from questions_df.
enc = HashingEncoder(cols = 'tags', n_components = 6)
# NOTE(review): questions_df is overwritten with the encoded frame, so re-running
# this cell operates on already-encoded data — restart the kernel before re-running.
questions_df = enc.fit_transform(questions_df)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Shared label encoder; later cells use it to encode `prior_question_had_explanation`
# for both the training frame and the test batches.
lb_make = LabelEncoder()    

In [None]:
# Question metadata columns joined onto each interaction: the part plus the
# six hashed tag components.
question_features = questions_df[['question_id', 'part', 'col_0', 'col_1', 'col_2',
                                  'col_3', 'col_4', 'col_5']]

# Rows whose target is -1 are dropped before modelling.
has_answer = train_df['answered_correctly'] != -1

train_df = (
    train_df.loc[has_answer]
    .reset_index(drop=True)
    .merge(question_features, how='left', left_on='content_id', right_on='question_id')
    .sort_values('row_id')
)

# Downcast `part` to int8 after the merge.
train_df['part'] = train_df['part'].astype('int8')

In [None]:
# Per-user answer statistics: mean correctness, number of correct answers and
# number of answered rows, indexed by user_id.
usr_ans = (
    train_df.groupby('user_id')['answered_correctly']
    .agg(avg_correct_answer='mean', num_of_correct='sum', total_answers='count')
    .astype({'num_of_correct': 'int16', 'total_answers': 'int16'})
)

train_df = train_df.merge(usr_ans, how='left', on='user_id')

# The same three statistics per content_id; column names carry a `_c` suffix.
# Computed after the user merge, but only `answered_correctly` is aggregated.
cnt_ans = (
    train_df.groupby('content_id')['answered_correctly']
    .agg(avg_correct_answer_c='mean', num_of_correct_c='sum', total_answers_c='count')
    .astype({'num_of_correct_c': 'int32', 'total_answers_c': 'int32'})
)

train_df = train_df.merge(cnt_ans, how='left', on='content_id')

In [None]:
# Build the modelling frame from a copy of the merged training data.
X = train_df.copy()

# Impute missing values. Plain assignment is used instead of
# `X[col].fillna(..., inplace=True)`: the in-place call on a column selection is
# chained assignment and does not modify X under pandas Copy-on-Write.
X['prior_question_elapsed_time'] = X['prior_question_elapsed_time'].fillna(0)
X['prior_question_had_explanation'] = X['prior_question_had_explanation'].fillna(value = False).astype(bool)

# Free the original frame; X is the only copy kept from here on.
del train_df
gc.collect()

# Order rows by user, then split the target off. `y` stays a one-column
# DataFrame because the fitting cells below pass it through unchanged.
X = X.sort_values(['user_id'])
y = X[["answered_correctly"]]
X = X.drop(["answered_correctly"], axis=1)

In [None]:
# Fit the shared encoder on the boolean explanation flag and store the codes
# as a compact int8 column.
X["prior_question_had_explanation_enc"] = (
    lb_make.fit_transform(X["prior_question_had_explanation"]).astype('int8')
)

In [None]:
# Columns fed to the model, in training order.
# NOTE(review): 'avg_correct_answer' is listed twice, so X carries that column
# twice. The prediction cell selects the same list; if the duplicate is removed,
# both places must change together.
feature_columns = [
    'user_id', 'content_id', 'question_id', 'avg_correct_answer', 'task_container_id',
    'num_of_correct', 'avg_correct_answer', 'total_answers', 'avg_correct_answer_c',
    'num_of_correct_c', 'total_answers_c', 'prior_question_elapsed_time',
    'prior_question_had_explanation_enc', 'part',
    'col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5',
]
X = X[feature_columns]

## Training LGBM

In [None]:
from lightgbm import LGBMClassifier

# Hyper-parameters for the gradient-boosted binary classifier.
# NOTE(review): in LightGBM, row subsampling (`subsample`) typically only takes
# effect together with a non-zero `subsample_freq` — confirm against the LightGBM docs.
lgbm_params = {
    'boosting_type': 'gbdt',
    'colsample_bytree': 0.64,
    'learning_rate': 0.01,
    'objective': 'binary',
    'random_state': 500,
    'reg_alpha': 0.8,
    'reg_lambda': 1,
    'subsample': 0.6,
}

model = LGBMClassifier(**lgbm_params)

## GridSearchCV

In [None]:
# This import was commented out, which made the cell fail with
# `NameError: name 'GridSearchCV' is not defined` on a fresh kernel.
from sklearn.model_selection import GridSearchCV

# Candidate values per hyper-parameter: 2*1*2*2*2*2 = 32 combinations,
# each evaluated with 4-fold cross-validation (128 fits in total).
gridParams = {'learning_rate': [0.01, 0.1],
              'random_state' : [500], 
              'colsample_bytree' : [0.62, 0.64], 
              'subsample' : [0.6,0.65], 
              'reg_alpha' : [0.8,1], 
              'reg_lambda' : [0.9,1]}

# n_jobs=-1 runs the fits in parallel on all available cores.
grid = GridSearchCV(model, gridParams, verbose=1, cv=4, n_jobs=-1)

grid.fit(X, y)

# Report the best combination and its cross-validated score.
print(grid.best_params_)
print(grid.best_score_)

In [None]:
# Fit the classifier on the full training features and target.
# NOTE(review): this cell does not reference `grid`; the grid-search results above
# are only printed, not applied to `model` here.
model.fit(X, y)

In [None]:
# Turn the group keys (user_id / content_id) from index levels into ordinary
# columns so the lookup tables can be merged onto test batches by column.
# Note: re-running this cell resets the index again, so run it once per kernel.
usr_ans = usr_ans.reset_index()
cnt_ans = cnt_ans.reset_index()

## Making predictions on test set

In [None]:
# Question metadata joined onto every batch (same columns as the training merge).
question_meta = questions_df[['question_id', 'part', 'col_0', 'col_1', 'col_2',
                              'col_3', 'col_4', 'col_5']]

# Use exactly the columns (and order) the model was fitted on, so the test
# features cannot drift from the training feature selection.
model_features = list(X.columns)

iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:

    # Attach the per-user and per-content statistics computed on the training data.
    test_df = test_df.merge(usr_ans, how = 'left', on = 'user_id')
    test_df = test_df.merge(cnt_ans, how = 'left', on = 'content_id')

    # Plain left merge, as in training. The previous
    # `pd.merge_ordered(..., fill_method='ffill')` forward-filled every missing
    # value from the preceding row, so unseen users/contents and lecture rows
    # inherited another row's features and the defaults below were rarely applied.
    test_df = test_df.merge(question_meta, how='left',
                            left_on='content_id', right_on='question_id')

    # Impute missing values by assignment; in-place fillna on a column selection
    # is chained assignment and does not modify the frame under Copy-on-Write.
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(value = False).astype(bool)
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(0)
    # Unseen users / contents get a neutral 0.5 correctness rate ...
    test_df['avg_correct_answer'] = test_df['avg_correct_answer'].fillna(0.5)
    test_df['avg_correct_answer_c'] = test_df['avg_correct_answer_c'].fillna(0.5)
    # ... and every remaining gap (counts, question metadata on lecture rows) becomes -1.
    test_df = test_df.fillna(value = -1)

    # Reuse the encoder fitted on the training data. Refitting per batch would map
    # a batch containing a single value to code 0, whatever that value is.
    test_df["prior_question_had_explanation_enc"] = lb_make.transform(test_df["prior_question_had_explanation"])

    # Probability of the positive class (answered correctly).
    y_pred = model.predict_proba(test_df[model_features])[:, 1]

    test_df['answered_correctly'] = y_pred
    # Only question rows (content_type_id == 0) are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

**Thank you !!! Happy learning :)**