In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

import lightgbm as lgb

import riiideducation

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
# Directories that contain no files contribute nothing to the output.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Notebook by Braysen Goodwin

## Heavily based on: https://www.kaggle.com/erikbruin/riiid-comprehensive-eda-baseline by Erik Bruin

# Read In Data

In [None]:
%%time

# Location of the pickled training frame, relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
train_pickle_path = "../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip"
train = pd.read_pickle(train_pickle_path)

# Report (rows, columns) so the load can be sanity-checked at a glance.
print("Train size:", train.shape)

In [None]:
# Display the first rows of the freshly loaded frame to inspect its columns and values.
train.head()

# Preprocess Data

In [None]:
# Columns fed to the model, in the order the model is trained and queried with.
features = ['content_id', 'timestamp', 'content_type_id', 'task_container_id', 'prior_question_elapsed_time', 'prior_question_had_explanation']


def _encodeAsInt32(series, missingValue=-1):
    """
    converts a column to an int32 numpy array, writing `missingValue` wherever
      the column holds a missing entry (None, NaN or pd.NA).

    Only the non-missing entries are cast, so the same code path handles bool,
      nullable 'boolean', numeric and object columns, and no Python lambda is
      invoked per row.
    """
    present = series.notna().to_numpy()
    encoded = np.full(len(series), missingValue, dtype='int32')
    encoded[present] = series[present].astype('int32').to_numpy()
    return encoded


def setupDataframes(dataframe, garbageCollect=False, includeLabels = False):
    """
    normalizes the dataframe by filling the missing values with the special value -1,
      converting to the correct type, and only keeping the columns listed in `features`.
      The dataframe passed in is left unmodified.
    
    params:
      dataframe - the pandas dataframe to normalize; must contain every column in
        `features` (and 'answered_correctly' when includeLabels is True)
      garbageCollect = False - whether to garbage collect after memory intensive operations
      includeLabels = False - whether to also return the labels
    
    returns:
          data
            data - a pandas dataframe with the correct data to run the model on
        or 
          data, labels
            data - a pandas dataframe with the correct data to run the model on
            labels - the 'answered_correctly' column of the input, one entry per row
    """
    # Take an explicit copy: assigning columns on the bare result of
    # dataframe[features] triggers pandas' SettingWithCopyWarning.
    data = dataframe[features].copy()

    data['prior_question_elapsed_time'] = data['prior_question_elapsed_time'].fillna(-1)

    if garbageCollect:
        gc.collect()

    # Missing -> -1, True/False -> 1/0, stored as int32.
    data['content_type_id'] = _encodeAsInt32(data['content_type_id'])
    data['prior_question_had_explanation'] = _encodeAsInt32(data['prior_question_had_explanation'])

    if garbageCollect:
        gc.collect()

    if includeLabels:
        return data, dataframe['answered_correctly']

    return data
    
    
    
    
    
    

In [None]:
# Swap the raw frame for its model-ready feature frame and pull out the target column.
# `train` is rebound here, so this cell cannot be re-run without reloading the data first.
train, labels = setupDataframes(
    dataframe=train,
    includeLabels=True,
    garbageCollect=True,
)

# The raw frame is no longer referenced by `train`; collect so its memory can be reclaimed.
gc.collect()

# Show the first rows of the processed features.
train.head()

In [None]:
# Display the first target values returned alongside the processed features.
labels.head()

In [None]:
# Number of leading rows used for training; every row after that is held out for validation.
trainingCount = 90_000_000

# Wrap each positional partition of the features and labels in a LightGBM Dataset.
train_dataset = lgb.Dataset(
    train.iloc[:trainingCount],
    label=labels.iloc[:trainingCount],
)
valid_dataset = lgb.Dataset(
    train.iloc[trainingCount:],
    label=labels.iloc[trainingCount:],
)

# Make and train the model

In [None]:
%%time
# `verbose_eval=` and `early_stopping_rounds=` were deprecated in LightGBM 3.3 and removed
# from `lgb.train` in 4.0, so the same behaviour is requested through callbacks instead.
# `log_evaluation` only exists from 3.3 on; earlier releases name it `print_evaluation`.
logEvaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# Binary classifier scored by AUC. Up to 10000 boosting rounds, stopping once the
# validation metric has not improved for 8 consecutive rounds; metrics are logged
# every 50 rounds.
model = lgb.train(
    {'objective': 'binary', 'metric': 'auc'},
    train_dataset,
    num_boost_round=10000,
    valid_sets=[train_dataset, valid_dataset],
    callbacks=[
        lgb.early_stopping(stopping_rounds=8),
        logEvaluation(period=50),
    ],
)

In [None]:
# Run a garbage collection pass after training, before the submission loop starts.
gc.collect()

# Create Submission

In [None]:
# Create the competition environment object; its iter_test() and predict() methods
# are used by the submission loop in the next cell.
env = riiideducation.make_env()

In [None]:
# Consume the test batches one at a time: preprocess, score, then submit.
# `sample_prediction_df` is yielded by the iterator but not needed here.
for test_df, sample_prediction_df in env.iter_test():
    test_features = setupDataframes(test_df)
    test_df['answered_correctly'] = model.predict(test_features[features])

    # Only rows with content_type_id == 0 are submitted
    # (presumably the question rows -- confirm against the competition data description).
    submit_mask = test_df['content_type_id'] == 0
    env.predict(test_df.loc[submit_mask, ['row_id', 'answered_correctly']])