# Riiid classification using NN & Random Forest

The initial purpose of this notebook was to build a very simple baseline for my own use, to better understand the data and the submission process. As I got a fairly decent score (given the simplicity of the model), I decided to share the notebook in case it is helpful to someone else. The model could be improved by adding more features.

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from numpy import loadtxt
import tensorflow as tf
from sklearn.model_selection import train_test_split
import keras
import pandas as pd
import numpy as np
import riiideducation
import matplotlib.pyplot as plt
from keras.callbacks import EarlyStopping

## Read the data
<p>I read the data in feather format, using the files produced by this notebook: https://www.kaggle.com/aralai/riiid-feather-dataset. It's much faster than reading the CSV files!
If you don't want to use feather, you can replace the following lines with the equivalent <code>pd.read_csv</code> calls.

In [None]:
#%%time
#col_list = ["user_id", "content_id","answered_correctly"]
# Training interactions are read from a separate, slimmed-down feather file
# (presumably only the three columns named in its path -- confirm with train.columns).
train = pd.read_feather('../input/user-id-content-id-answered-correctly/user_id_content_id_answered_correctly.feather')
# The remaining competition tables come from the shared feather dataset.
questions = pd.read_feather('../input/riiid-feather-dataset/questions.feather')
lectures = pd.read_feather('../input/riiid-feather-dataset/lectures.feather')
example_test = pd.read_feather('../input/riiid-feather-dataset/example_test.feather')
example_sample_submission = pd.read_feather('../input/riiid-feather-dataset/example_sample_submission.feather')

In [None]:

# Work on a random 11% subsample of the interactions to keep the run affordable.
# The seed is fixed so that "Restart & Run All" always trains on the same rows.
# NOTE: this cell overwrites `train`; re-running it without reloading the data
# shrinks the sample again.
train = train.sample(frac=0.11, random_state=0)


In [None]:
train

In [None]:
questions.columns

In [None]:
questions.tags.value_counts()

In [None]:
questions.part.value_counts()

In [None]:
train.columns

In [None]:
#df = train["user_id","content_id","answered_correctly"]

# ERFAN: Try to use/build different features; user_id and content_id are probably not very informative to the model on their own.
#df = train[['user_id','content_id','answered_correctly']]


In [None]:
def prepare_features(col_name, data=None):
    """Aggregate per-group answer statistics for one key column.

    Groups the interaction rows by ``col_name`` and returns, for every group,
    how many answers it has, how many of them were correct / incorrect and the
    share of correct answers.

    Parameters
    ----------
    col_name : str
        Column to group by. ``'content_id'`` yields columns prefixed with
        ``question_`` and ``'user_id'`` columns prefixed with ``student_``;
        any other name is used as the prefix unchanged.
    data : pandas.DataFrame, optional
        Frame containing ``col_name`` and ``answered_correctly``. When omitted,
        the notebook-level ``train`` frame is used, so existing
        ``prepare_features(col_name)`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct values of ``col_name`` with the columns
        ``<prefix>_total``, ``<prefix>_correct``, ``<prefix>_incorrect``
        (unsigned integers) and ``<prefix>_correct_ratio`` (float).
    """
    if data is None:
        data = train  # notebook-level training frame

    answers = data[[col_name, 'answered_correctly']]
    # The Riiid data marks lecture rows with answered_correctly == -1. Such rows
    # are not answers: they would inflate the totals, reduce the "correct" sum
    # and can wrap around in the unsigned cast below, so they are excluded.
    answers = answers[answers['answered_correctly'] >= 0]

    stats = answers.groupby(col_name)['answered_correctly'].agg(['count', 'sum'])

    # TODO: add question entropy (or try decomposition methods) for questions,
    #       and user choice entropy for students.
    prefix = {'content_id': 'question', 'user_id': 'student'}.get(col_name, col_name)

    stats.columns = [prefix + '_total', prefix + '_correct']
    stats = stats.astype('uint64')
    stats[prefix + '_incorrect'] = stats[prefix + '_total'] - stats[prefix + '_correct']
    stats[prefix + '_correct_ratio'] = stats[prefix + '_correct'] / stats[prefix + '_total']
    return stats
    

In [None]:
# Per-question answer statistics. The question id is kept both as the index
# (renamed so it does not clash with the column) and as a regular `content_id`
# column that later merges can join on.
questions_dataframe = (
    prepare_features('content_id')
    .assign(content_id=lambda stats: stats.index.tolist())
    .rename_axis("question_index")
)
questions_dataframe

In [None]:
#import matplotlib.pyplot as plt
#fig = plt.figure()
#ax = fig.add_axes([0,0,1,1])
#langs = ['Answered Correctly', 'Answered Incorrectly']
#results = [sum(questions_dataframe['question_correct']),sum(questions_dataframe['question_incorrect'])]
#ax.bar(langs,results)
#plt.show()

In [None]:
#import matplotlib.pyplot as plt

# Pie chart, where the slices will be ordered and plotted counter-clockwise:
#labels = 'Answered Correctly', 'Answered Incorrectly'
#sizes = [sum(questions_dataframe['question_correct']), sum(questions_dataframe['question_incorrect'])]
#explode = (0, 0.1)  # only "explode" the 2nd slice (i.e. 'Hogs')

#fig1, ax1 = plt.subplots()
#ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
       # shadow=True, startangle=90)
#ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

#plt.show()

In [None]:
# Per-student answer statistics. The user id is kept both as the index
# (renamed so it does not clash with the column) and as a regular `user_id`
# column that later merges can join on.
users_dataframe = (
    prepare_features('user_id')
    .assign(user_id=lambda stats: stats.index.tolist())
    .rename_axis("user_index")
)
users_dataframe

In [None]:
# Attach the per-student and per-question statistics to every interaction row.
# Both joins are inner joins, so a row is kept only if its user_id and its
# content_id each have an entry in the corresponding lookup table.
# NOTE(review): the statistics are computed on the same rows they are merged
# onto, so each row's ratio features include its own label -- expect an
# optimistic validation score.
df = (
    train
    .merge(users_dataframe, how='inner', on='user_id')
    .merge(questions_dataframe, how='inner', on='content_id')
)
df

# Replace Null values

In [None]:
#df['prior_question_elapsed_time'].fillna(-1,inplace = True)
#df['prior_question_had_explanation'].fillna(-1,inplace = True)

In [None]:
#df['prior_question_had_explanation'] *= 1 # convert from boolean to numbers

In [None]:
#df['prior_question_elapsed_time'].value_counts()

In [None]:
#df['prior_question_had_explanation'].value_counts()

In [None]:
#feature_list = ['content_id','prior_question_elapsed_time','prior_question_had_explanation', 'question_correct_ratio','student_correct_ratio']
# Columns fed to the network, in input order.
feature_list = ['content_id','question_correct_ratio','student_correct_ratio']

# Feature matrix (one row per interaction) and the binary target.
X = df.loc[:, feature_list].to_numpy()
y = df.loc[:, 'answered_correctly'].to_numpy()

In [None]:
# Make sure the feature matrix is 64-bit floating point before it reaches Keras.
X = X.astype(np.float64)


In [None]:
X

In [None]:
# The target is cast to 64-bit float as well, matching the feature matrix.
y = y.astype(np.float64)

In [None]:
y

In [None]:
# Hold out 10% of the rows for validation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)


In [None]:
X_train

In [None]:
# ERFAN: This structure needs revision - I don't think it's a great fit for this task.
# Fully connected network: three ReLU hidden layers (128 -> 96 -> 32 units)
# followed by a single sigmoid unit that outputs the probability of a correct answer.
model = Sequential([
    Dense(128, input_dim=len(feature_list), activation='relu'),
    Dense(96, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Adam with a small step size. `learning_rate` is the supported keyword; the
# old `lr` alias is deprecated and rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=0.0001)
# Stop training once the validation loss has not improved for 9 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=9)
# ERFAN: added AUC since that's what we are optimizing for - the loss seems static.
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=[tf.keras.metrics.AUC(), 'accuracy'],
)

In [None]:
# Train for at most 30 epochs in mini-batches of 50 rows; the held-out split is
# used as validation data and drives the early-stopping callback.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=50,
    callbacks=[es],
    verbose=1,
)

In [None]:
# Raw outputs of the final sigmoid unit for the held-out rows.
predictions = model.predict(X_test)

In [None]:
# Hard 0/1 labels for the held-out rows. `Sequential.predict_classes` was
# removed from Keras; for a single sigmoid output the documented replacement
# is thresholding the predicted probability at 0.5.
nnpredictions = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
X_test

In [None]:
y_test

In [None]:
predictions

In [None]:
nnpredictions

In [None]:
def prepare_test_features(df, test_df, feature_list, question_stats=None, student_stats=None):
    """Build the model input matrix for one batch of test rows.

    Looks up ``question_correct_ratio`` (by ``content_id``) and
    ``student_correct_ratio`` (by ``user_id``) for every row of ``test_df``.
    Questions or students that are missing from the lookup tables get the mean
    ratio of the respective table instead of NaN, so the network never
    receives NaN inputs for these two features.

    Parameters
    ----------
    df : object
        Unused; kept so existing calls keep working.
    test_df : pandas.DataFrame
        Batch to featurize; must contain ``content_id`` and ``user_id``.
        It is not modified.
    feature_list : list of str
        Columns, in order, that make up the returned matrix.
    question_stats : pandas.DataFrame, optional
        Lookup table with ``content_id`` and ``question_correct_ratio``
        columns. Defaults to the notebook-level ``questions_dataframe``.
    student_stats : pandas.DataFrame, optional
        Lookup table with ``user_id`` and ``student_correct_ratio`` columns.
        Defaults to the notebook-level ``users_dataframe``.

    Returns
    -------
    numpy.ndarray
        Float array with one row per row of ``test_df`` (same order) and one
        column per entry of ``feature_list``.
    """
    if question_stats is None:
        question_stats = questions_dataframe
    if student_stats is None:
        student_stats = users_dataframe

    question_stats = question_stats[['content_id', 'question_correct_ratio']]
    student_stats = student_stats[['user_id', 'student_correct_ratio']]

    # Left joins keep every test row, in its original order.
    merged = test_df.merge(question_stats, how='left', on='content_id')
    merged = merged.merge(student_stats, how='left', on='user_id')

    # Unseen questions / students come back as NaN from the left joins;
    # replace them with the average ratio of the known ones.
    merged = merged.fillna({
        'question_correct_ratio': question_stats['question_correct_ratio'].mean(),
        'student_correct_ratio': student_stats['student_correct_ratio'].mean(),
    })
    return merged[feature_list].to_numpy().astype(float)

## Submission

<code>example_test</code> contains just a few dummy rows that can be used for development. The real submission must be made with the <code>riiideducation</code> package. To do that, we must create an environment and then loop through all the batches provided by <code>iter_test</code>.

In [None]:
# Create the competition environment and obtain the iterator that yields the test batches.
env = riiideducation.make_env()
iter_test = env.iter_test()

For each batch (<code>test_df</code>) yielded by <code>iter_test</code>, we compute a prediction of answering correctly for every row with the trained network (<code>model</code>) and then send the resulting data back to the environment. This last part is done by <code>env.predict</code>. Notice that we must not create any <code>submission.csv</code> file ourselves; this is done automatically by <code>env.predict</code>.

In [None]:
# Score every batch handed out by the competition environment.
for (test_df, sample_prediction_df) in iter_test:
    test_set = prepare_test_features(df, test_df, feature_list)

    # Submit the sigmoid probability rather than a thresholded 0/1 label: the
    # notebook optimizes for AUC, which ranks predictions, and
    # `predict_classes` no longer exists in current Keras releases.
    answered_correctly = model.predict(test_set).ravel()

    # Rows whose features are NaN (e.g. users or questions without statistics)
    # produce NaN outputs; fall back to the neutral probability 0.5 for them.
    answered_correctly = np.where(np.isnan(answered_correctly), 0.5, answered_correctly)

    #answered_correctly = clf.predict(test_set)
    test_df['answered_correctly'] = answered_correctly

    # Only question rows (content_type_id == 0) are sent back to the environment.
    env.predict(test_df.loc[test_df['content_type_id']==0,['row_id','answered_correctly']])