In [None]:
# load libraries
import numpy as np  # linear algebra
import pandas as pd # general data functionality

## Prepare training data

In [None]:
print('Preparing training data...')

# Column dtypes used when reading the training file; narrow types keep memory low.
data_types_dict = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    # float32 rather than float16: the largest finite float16 is 65504, so any
    # larger elapsed time would be stored as inf and later be imputed away.
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean'
}

# Read only the first million rows of the training file.
Data = pd.read_csv('../input/riiid-test-answer-prediction/train.csv',
                   nrows=10**6,
                   dtype=data_types_dict)

In [None]:
# Preview the example test file (at most 1,000 rows are read).
# NOTE(review): `Data_questions` is reassigned to the questions.csv table in a
# later cell, so this frame only serves as a quick look at the test layout.
example_test_csv = '/kaggle/input/riiid-test-answer-prediction/example_test.csv'
Data_questions = pd.read_csv(example_test_csv, nrows=1000)
Data_questions.head()

In [None]:
# Content-level prior means
# (approach from https://www.kaggle.com/lgreig/simple-lgbm-baseline).
# The table is loaded from a saved CSV instead of being recomputed here; per the
# original author's note it was built from the entire training sample.
question_priors_csv = '../input/riid-questions-priorscsv/question_priors.csv'
results_c = pd.read_csv(question_priors_csv)

# Mean of the per-content prior column; the submission loop uses it to fill
# rows whose content_id has no entry in the priors table.
content_prior = results_c['answered_correctly_content']
ansered_correctly_unconditional = content_prior.mean()

In [None]:
# Create data regarding questions
# (https://www.kaggle.com/jsylas/riiid-lgbm-starter)
# Only columns 0, 1, 3 and 4 of questions.csv are read.
# bundle_id is read as int16 (same width as question_id): int8 cannot hold ids
# above 127.  NOTE(review): assumes bundle ids share the question_id range and
# that read_csv narrows integers without a range check — confirm against the file.
Data_questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv',
                             usecols=[0,1,3,4],
                             dtype={'question_id': 'int16','part': 'int8','bundle_id': 'int16','tags': 'str'})

# Split the space-separated tag string into one column per tag.  Assigning six
# names raises if the split yields a different number of columns, exactly as before.
tag_columns = ['tags1', 'tags2', 'tags3', 'tags4', 'tags5', 'tags6']
tag = Data_questions["tags"].str.split(" ", n = 10, expand = True)
tag.columns = tag_columns

# Convert every tag column to numeric in one pass (unparseable/missing -> NaN)
# and use 0 as the "no tag" value.  Assigning the result avoids calling
# fillna(inplace=True) on a column selection, which does not reliably update
# the parent frame.
tag = tag.apply(pd.to_numeric, errors='coerce').fillna(0)

# Replace the raw tag string by the numeric tag columns.
Data_questions = pd.concat([Data_questions.drop(columns=['tags']), tag], axis=1)

In [None]:
# Left-join the content priors and then the question metadata onto every
# training row; both joins are keyed on the row's content_id.
Data = (
    Data
    .merge(results_c, how='left', on='content_id')
    .merge(Data_questions, how='left', left_on='content_id', right_on='question_id')
)

In [None]:
# remove invalid 'answered_correctly' entries and keep only questions
is_scored_question = (Data['answered_correctly'] != -1) & (Data['content_type_id'] == 0)
# .copy() gives an independent frame so the column assignments below are
# unambiguous writes rather than writes into a slice of the merged frame.
Data = Data.loc[is_scored_question].copy()

# impute missing values
# Results are assigned back instead of using fillna(inplace=True) on a column
# selection: that chained form is not guaranteed to modify `Data`, and a
# remaining NA would make the integer cast at the end of this cell fail.
Data['prior_question_had_explanation'] = Data['prior_question_had_explanation'].fillna(False)
Data = Data.replace([np.inf, -np.inf], np.nan)
Data['prior_question_elapsed_time'] = Data['prior_question_elapsed_time'].fillna(0)

# drop unwanted columns (identifiers and raw answers are not used as features)
Data = Data.drop(columns=['user_id', 'row_id', 'user_answer', 'question_id', 'task_container_id',
                          'content_id', 'content_type_id'])

# encode the explanation flag as 0/1 integers
Data['prior_question_had_explanation'] = Data['prior_question_had_explanation'].astype(int)

In [None]:
# Target vector y: the correctness flag of every remaining row
target_column = 'answered_correctly'
labels = np.array(Data[target_column])

# Regressor matrix X: everything except the target
Data = Data.drop(columns=[target_column])
# remember the column order; the submission loop selects test columns with it
feature_list = Data.columns.tolist()
# scikit-learn is given a plain numpy array
Data = np.array(Data)

In [None]:
# scikit-learn helper that splits arrays into training and testing subsets
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(Data, labels, test_size=0.25, random_state=42)
train_features, test_features, train_labels, test_labels = split_parts

## Fit the model

In [None]:
print('Fitting the model...')

# Import the models we are using
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier

# instantiate the random forest classifier (10 trees, depth 3)
model_rf = RandomForestClassifier(n_estimators = 10,
                                  max_depth = 3,
                                  random_state =1)

# instantiate the gradient boosting classifier (10 depth-1 trees)
model_gbm = GradientBoostingClassifier(n_estimators = 10,
                                       max_depth = 1,
                                       random_state =1)

# instantiate the neural network (two hidden layers: 50 and 25 units)
model_nn = MLPClassifier(hidden_layer_sizes = [50,25],
                         random_state=1)

# package classifiers together in a soft-voting ensemble, i.e. the ensemble
# averages the predicted class probabilities of the three models
model_voting = VotingClassifier(estimators=[('rf',  model_rf),
                                            ('gbm', model_gbm),
                                            ('nn',  model_nn)],
                                voting = 'soft',
                                n_jobs = 4)

# Fit on the training split only.  Fitting on the full `Data`/`labels` arrays
# would include the held-out rows and make the evaluation on
# `test_features`/`test_labels` optimistic.
model_voting_fit = model_voting.fit(train_features, train_labels)

In [None]:
from sklearn.metrics import roc_auc_score

# Evaluate the fit of the models on the held-out split.
# AUC is a ranking metric, so score the rows with the predicted probability of
# class 1 (a correct answer) rather than with hard 0/1 predictions.
predictions = model_voting_fit.predict_proba(test_features)[:, 1]
# calculate AUC; the signature is roc_auc_score(y_true, y_score)
roc_auc_score(test_labels, predictions)

In [None]:
# Announce the start of the submission phase
print('Submitting official predictions...')

# Module providing the test environment; the object returned by make_env() is
# used in the next cell through env.iter_test() and env.predict().
# NOTE(review): presumably make_env() may only be called once per kernel
# session — confirm before re-running this cell.
import riiideducation
env = riiideducation.make_env()

In [None]:
iter_test = env.iter_test()

# Fill values for the merged feature columns, which are NaN whenever a test
# row's content_id has no match in the priors or questions tables.
merged_fill_values = {
    'answered_correctly_content': ansered_correctly_unconditional,
    'bundle_id': 0,
    'part': 0,
    'tags1': 0,
    'tags2': 0,
    'tags3': 0,
    'tags4': 0,
    'tags5': 0,
    'tags6': 0,
}

for (test_df, sample_prediction_df) in iter_test:

    # Merge in extra features
    test_df = test_df.merge(results_c, on = 'content_id', how = 'left')
    test_df = test_df.merge(Data_questions, how = 'left', left_on = 'content_id', right_on = 'question_id')

    # Data cleaning
    ## impute missing values the same way as for the training data.
    ## Results are assigned back instead of using fillna(inplace=True) on a
    ## column selection, which is not guaranteed to modify test_df.
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False)
    ## Infinite elapsed times are imputed with 0 (as in training) instead of
    ## dropping the row: a dropped row would never receive a prediction.
    test_df['prior_question_elapsed_time'] = (
        test_df['prior_question_elapsed_time']
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )
    test_df = test_df.fillna(merged_fill_values)

    ## keep the identifiers needed for the submission, then keep only the
    ## model's feature columns (in training order).  .copy() makes the column
    ## writes below operate on an independent frame.
    row_id = test_df['row_id']
    content_type_id = test_df['content_type_id']
    test_df = test_df[feature_list].copy()

    ## encode the explanation flag as 0/1 integers
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].astype(int)

    # Make predictions
    # The model was fitted on a plain numpy array, so a numpy array is passed
    # here as well; column 1 of predict_proba is the probability of class 1.
    test_df['answered_correctly'] = model_voting_fit.predict_proba(test_df.to_numpy())[:,1]
    test_df['row_id'] = row_id
    test_df['content_type_id'] = content_type_id

    # Submit predictions for question rows only (content_type_id == 0)
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])