**Welcome!** This notebook explains a baseline model for the Riiid challenge:

In [None]:
import riiideducation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
# Create the competition environment. Its iter_test() and predict() methods
# are used further down to stream the test batches and submit predictions.
env = riiideducation.make_env()

The training dataset does not fit into RAM unless you use Google Cloud Storage. The test dataset, on the other hand, cannot be accessed directly; instead, the organisers of this competition provide a module for handling the data in batches. It is explained in this [Notebook](https://www.kaggle.com/sohier/competition-api-detailed-introduction). There are also more efficient ways to download and store the training data than reading the CSV into pandas (see this [Notebook](https://www.kaggle.com/rohanrao/riiid-with-blazing-fast-rid)). Still, we simply resort to reading CSV files into pandas: we load the dataset that contains statistics on one specific answer given by a user to a question. Unfortunately, there are users in the test set for which we do not have data in this dataset:

In [None]:
# Precomputed feature tables from the "flatten" input dataset.
# group3 is later joined to each test batch via task_container_id (matched
# against group3's row index).
group3 = pd.read_csv("../input/flatten/group3.csv")
# question2 is later joined to each test batch on question_id == content_id.
question2 = pd.read_csv("../input/flatten/question2.csv",usecols=[1,2,3,4,5])
# Per-user tables; both are merged on user_id further down.
results_u2_final = pd.read_csv("../input/flatten/results_u2_final.csv",usecols = [1,2])
results_u_final = pd.read_csv("../input/flatten/results_u_final.csv",usecols = [1,2,3])
#diff_score = pd.read_csv("../input/for-predicton-6/diff_score.csv",usecols = [1,2,3])
#part_score = pd.read_csv("../input/for-predicton-6/part_score.csv",usecols = [1,2,3])
# dp: twelve columns are read, the column labelled "part_score_7" is dropped,
# and the remaining eleven columns are relabelled with the original names
# shifted left by one position.
# NOTE(review): presumably this compensates for a header that is misaligned by
# one column in dp.csv -- confirm against the file.
dp = pd.read_csv("../input/flatten/dp.csv",usecols = [1,2,3,4,5,6,7,8,9,10,11,12])
# `columns` is reused in the prediction loop to select the dp columns.
columns = dp.columns
dp = dp.drop(columns = "part_score_7")
dp.columns = columns[1:]


In [None]:
# Preview the first rows of the per-user table (notebook display only).
results_u_final.head()

In [None]:
#question2 = question2.astype({"quest_pct":"float16",'content_id':"category"})
#group3 = group3.astype({"avg_questions":"float32","avg_questions_seen":"float32",'task_container_id':"category"})
#results_u_final = results_u_final.astype({"answered_correctly_user":"float32","answered_user":"float32","user_id":"category"})
#results_u2_final = results_u2_final.astype({"user_id":"category"})
#part_score = part_score.astype({"user_id":"category","part":"category","part_score":"float32"})
#diff_score = diff_score.astype({"user_id":"category","difficulty":"category","diff_score":"float32"})

#question2 = question2.astype({"quest_pct":"float16",'question_id':"category"})
#group3 = group3.astype({"avg_questions":"float32","avg_questions_seen":"float32",'task_container_id':"category"})
#results_u_final = results_u_final.astype({"answered_correctly_user":"float32","answered_user":"float32","user_id":"category"})
#results_u2_final = results_u2_final.astype({"user_id":"category"})

In [None]:
# Hard-coded fallback values used in the prediction loop to fill missing
# features. NOTE(review): presumably these are means computed on the training
# data -- confirm where the numbers come from.
# Fills missing 'explanation_mean_user'.
prior_mean_user = 0.5664264045515732
# Fills missing 'quest_pct'.
content_mean = 0.7094600658138017
# Fills missing 'prior_question_elapsed_time'.
elapsed_mean = 13238.587890625

In [None]:
#import pickle
 
#lb_make = pickle.load(open('../input/for-predicton-6/lb_make.pickle', 'rb'))

In [None]:
# Load an already-saved LightGBM booster from its text model file; it is only
# used for prediction in this notebook (no training happens here).
model = lgb.Booster(model_file='../input/flatten/lgb_classifier_1.txt')

In [None]:
#diffs = [0.165512,0.341864,0.544930,0.741148,0.898677]
#parts = [0.815105,0.745296,0.744598,0.724918,0.666118,0.699596,0.717958]

In [None]:
# Combine the three per-user tables into a single frame keyed by user_id.
# Outer joins keep users that appear in only some of the tables.
results_u3_final = (
    results_u_final
    .merge(results_u2_final, how="outer", on="user_id")
    .merge(dp, how="outer", on="user_id")
)
# Column-wise medians of the combined table, used later as fill values.
medians = results_u3_final.median()
# Notebook display of the user_id entry of the medians.
medians["user_id"]

# Predict

We create an iterator over the test set using the function provided by the competition organisers. For each batch from this iterator, we do the following: (1) we add the features that we computed, (2) we replace missing data in the same way as we did for the training set, (3) we predict the target, and (4) we submit the predictions with the function provided by the competition organisers:

In [None]:
# Iterator over the test set; the prediction loop unpacks each item as a
# (test_df, sample_prediction_df) pair.
iter_test = env.iter_test()

def fillpart(data, part_averages=None):
    """Return a copy of ``data`` with a ``part_score`` column looked up from ``part``.

    Each value ``p`` in the ``part`` column is replaced by
    ``part_averages[p - 1]`` (parts are numbered from 1) and the result is
    stored in the ``part_score`` column.

    Args:
        data: DataFrame with a ``part`` column. It is not modified.
        part_averages: Sequence of per-part scores, index 0 belonging to
            part 1. When omitted, the module-level ``parts`` list is used;
            note that in this notebook ``parts`` only exists in a
            commented-out cell, so it must be defined before relying on the
            default.

    Returns:
        A new DataFrame with the ``part_score`` column set. Rows whose
        ``part`` is missing or outside ``1..len(part_averages)`` get NaN.
    """
    if part_averages is None:
        part_averages = parts

    # A dict lookup tolerates float-typed and NaN part values (which appear
    # after a left merge) and never wraps around to the end of the list the
    # way ``part_averages[0 - 1]`` would.
    lookup = dict(enumerate(part_averages, start=1))

    _data = data.copy()
    # Assign to the *column*; ``.loc['part_score', :]`` would address a row
    # labelled 'part_score' instead.
    _data['part_score'] = _data['part'].map(lookup)
    return _data

def filldiff(data, diff_averages=None):
    """Return a copy of ``data`` with a ``diff_score`` column looked up from ``difficulty``.

    Each value ``d`` in the ``difficulty`` column is replaced by
    ``diff_averages[d - 1]`` (difficulty levels are numbered from 1) and the
    result is stored in the ``diff_score`` column.

    The earlier version read the ``part`` column and wrote to ``part_score``,
    which duplicated the part lookup and indexed the difficulty table with
    part numbers; it now uses the difficulty column and its own output column.

    Args:
        data: DataFrame with a ``difficulty`` column. It is not modified.
        diff_averages: Sequence of per-difficulty scores, index 0 belonging
            to difficulty 1. When omitted, the module-level ``diffs`` list is
            used; note that in this notebook ``diffs`` only exists in a
            commented-out cell, so it must be defined before relying on the
            default.

    Returns:
        A new DataFrame with the ``diff_score`` column set. Rows whose
        ``difficulty`` is missing or outside ``1..len(diff_averages)`` get NaN.
    """
    if diff_averages is None:
        diff_averages = diffs

    # A dict lookup tolerates float-typed and NaN difficulty values and never
    # wraps around to the end of the list the way ``diff_averages[0 - 1]`` would.
    lookup = dict(enumerate(diff_averages, start=1))

    _data = data.copy()
    # Assign to the *column*; ``.loc['diff_score', :]`` would address a row
    # labelled 'diff_score' instead.
    _data['diff_score'] = _data['difficulty'].map(lookup)
    return _data

In [None]:
# Model inputs, in the order they are passed to the booster.
feature_columns = [
    'answered_correctly_user', 'explanation_mean_user', 'quest_pct', 'avg_questions_seen',
    'prior_question_elapsed_time', 'part', 'difficulty',
    "diff_score_0", "diff_score_1", "diff_score_2",
    "part_score_1", "part_score_2", "part_score_3", "part_score_4",
    "part_score_5", "part_score_6", "part_score_7",
]

# Column -> constant used when the value is missing after the merges.
fill_defaults = {
    'explanation_mean_user': prior_mean_user,
    'answered_correctly_user': 0.65,
    'quest_pct': content_mean,
    'part': 5,
    'avg_questions_seen': 1,
    'prior_question_elapsed_time': elapsed_mean,
    'difficulty': 3,
}

# Columns that came from dp; missing values are filled with the medians of
# the combined per-user table.
median_columns = list(columns[1:])

for (test_df, sample_prediction_df) in iter_test:
    # Cap container ids at 9999 before looking them up in group3.
    # NOTE(review): presumably group3 has no rows beyond 9999 -- confirm.
    test_df['task_container_id'] = test_df.task_container_id.mask(test_df.task_container_id > 9999, 9999)

    # Attach the precomputed features. group3 is matched on its row index,
    # so this relies on the index position equalling task_container_id.
    # NOTE(review): verify that against how group3.csv was written.
    test_df = pd.merge(test_df, group3, left_on=['task_container_id'], right_index=True, how="left")
    test_df = pd.merge(test_df, question2, left_on='content_id', right_on='question_id', how='left')
    test_df = pd.merge(test_df, results_u3_final, on='user_id', how='left')

    # Fill gaps left by the merges (e.g. users or questions without a row in
    # the feature tables). Assign the result back instead of calling
    # fillna(inplace=True) on a selected column: the in-place form operates
    # on a temporary object and is not guaranteed to touch test_df.
    for column, default in fill_defaults.items():
        test_df[column] = test_df[column].fillna(default)

    # Selecting several columns returns a copy, so an in-place fillna on that
    # selection never reached test_df; write the filled block back explicitly.
    test_df[median_columns] = test_df[median_columns].fillna(medians)

    test_df['answered_correctly'] = model.predict(test_df[feature_columns])

    # Submit only rows with content_type_id == 0 (presumably question rows,
    # as opposed to lectures -- confirm against the competition data description).
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

In [None]:
#print(a)

# Acknowledgement
I am grateful to Takamotoki for inspiring me with this notebook: https://www.kaggle.com/takamotoki/lgbm-iii-part2