In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import gc
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

In [None]:
#%%time
#read in data (dropped some columns)
#train_all = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv',low_memory=False,nrows=10**6)
#train_q=train_all[train_all.content_type_id==0]
#train_q.fillna(0.0,inplace=True)
#train_q.prior_question_had_explanation=train_q.prior_question_had_explanation.astype(int)


# Draw plot to see if there are any obvious trends
#g = sns.pairplot(train_q,
#                 vars = ['answered_correctly',#'content_id',
#                         'prior_question_elapsed_time' ,'prior_question_had_explanation'],
#                 kind='scatter',
#                 markers = '.')
#plt.show()

# Read Data
Read in the training data and question reference data.

We only keep the columns

**'timestamp', 'user_id', 'content_id', 'content_type_id', 'answered_correctly'**

Then, we only consider the rows where **'content_type_id' == 0**, which means the record is about a question, not a lecture.

In [None]:
%%time
# Compact dtypes for the five train.csv columns that are kept
# (the columns are selected by position in `usecols` below).
dtype = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'answered_correctly': 'int8',
}
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    usecols=[1, 2, 3, 4, 7],
    dtype=dtype,
)
# Keep only rows with content_type_id == 0, then order them by timestamp.
train_df = (
    train_df[train_df['content_type_id'] == 0]
    .sort_values('timestamp')
    .reset_index(drop=True)
)

# Question reference table: columns 0 and 3 of questions.csv.
question_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/questions.csv',
    usecols=[0, 3],
)

# Set up part: missing values become -1 so the column fits in int8.
question_df['part'] = question_df['part'].fillna(-1).astype('int8')
# Rename the key column so it can be joined on 'content_id'.
question_df.columns = ['content_id', 'part']

# Split the training dataset into 10 subsets

In [None]:
%%time
# Split train_df into 10 consecutive row chunks (dropping column 0, the timestamp)
# so the merges further down work on smaller frames.
# iloc's end bound is exclusive, so each chunk ends at (i + 1) * n, and the last
# chunk runs to the end of the frame so the len % 10 remainder rows are kept.
n_rows = train_df.shape[0]
n = n_rows // 10
separated = []
for i in range(10):
    start = i * n
    stop = (i + 1) * n if i < 9 else n_rows
    separated.append(train_df.iloc[start:stop, 1:].copy())

# Group all the training data by **'user_id'**

In [None]:
%%time
def _user_history(user_rows):
    """Return one user's (content_id array, answered_correctly array) pair."""
    return (
        user_rows['content_id'].values,
        user_rows['answered_correctly'].values,
    )


# One entry per user_id, holding that user's full question/answer history.
user_events = train_df[['user_id', 'content_id', 'answered_correctly']]
group = user_events.groupby('user_id').apply(_user_history)

# Keep these two columns around, then release the full training frame.
content_id = train_df['content_id']
IDs = train_df['user_id']
del train_df

# Create a **'user_ref'** to store the information of each user.

It includes:
*  The number of distinct questions the user has attempted.
*  The total number of attempts (a user may attempt the same question several times).
*  The user's historical correct rate.



In [None]:
%%time
# Build one summary row per user from `group`, whose values are
# (content_id array, answered_correctly array) tuples indexed by user_id.
# A single pass over the Series avoids repeated positional lookups, and an
# empty `group` simply yields an empty user_ref.
# NOTE(review): correct_rate is computed over all training rows, so when it is
# merged back onto those same rows it includes each row's own label.
user_rows = []
for user_id, (contents, answers) in zip(group.index, group):
    user_rows.append([
        user_id,
        np.unique(contents).size,  # distinct questions attempted
        contents.size,             # total attempts, repeats included
        answers.mean(),            # fraction answered correctly
    ])

user_ref = pd.DataFrame(
    user_rows,
    columns=['user_id', 'number_of_questions', 'number_of_total_tries', 'correct_rate'],
)

# For each event, we attach the information of the user and the information of the question

In [None]:
%%time
# Enrich each of the 10 chunks with per-user stats and the question's part.
# Chunks are consumed from the front of `separated`, so the list is empty
# once the loop finishes.
new_trains = []
for _ in range(10):
    chunk = separated.pop(0)
    enriched = chunk.merge(user_ref, on='user_id').merge(question_df, on='content_id')
    new_trains.append(enriched)

# Put the 10 subsets back together

In [None]:
%%time
# Stitch the enriched chunks back into one frame with a fresh 0..N-1 index.
# A single pd.concat replaces the repeated DataFrame.append calls: append was
# removed in pandas 2.0 and re-copied the growing frame on every iteration.
final_train = pd.concat(new_trains, ignore_index=True)
# Drop the chunk references so only the combined frame stays in memory.
new_trains.clear()

# Finishing the Setup
Use train_X as the training features. It has the columns

**'user_id', 'content_id', 'content_type_id', 'number_of_questions', 'number_of_total_tries', 'correct_rate', 'part'**

train_Y holds the corresponding target column, **'answered_correctly'**.

In [None]:
%%time
# Separate the target column from the feature columns, then free the merged frame.
target_columns = ['answered_correctly']
feature_columns = [
    'user_id',
    'content_id',
    'content_type_id',
    'number_of_questions',
    'number_of_total_tries',
    'correct_rate',
    'part',
]
train_Y = final_train.loc[:, target_columns]
train_X = final_train.loc[:, feature_columns]
del final_train

# Define a function to deal with test dataset.



In [None]:
def formattest(given):
    """Convert a batch of test rows into the feature layout used for training.

    We assume the test set has the same format as the given sample and that
    `given` contains only question entries.

    Keeps 'user_id', 'content_id' and 'content_type_id' from `given`, then
    left-joins the per-user statistics from the global `user_ref` and the
    question info from the global `question_df`. Values left missing by the
    joins (e.g. a user or question absent from the reference tables) are
    replaced with -1.

    Returns a new DataFrame; `given` is not modified.
    """
    keys = given[['user_id', 'content_id', 'content_type_id']]
    with_user = keys.merge(user_ref, how='left', on='user_id')
    with_question = with_user.merge(question_df, how='left', on='content_id')
    return with_question.fillna(-1)


In [None]:
# Preview the first rows of the feature frame.
train_X.head()
#train_X.shape

In [None]:
# Preview the first rows of the target frame.
train_Y.head()


In [None]:
# Fill missing values with the column means, then convert to numpy arrays.
# fillna returns a new frame, so the result has to be assigned back; the
# original discarded it and therefore performed no imputation at all.
xmean = train_X.mean()
train_X = train_X.fillna(xmean)
# NOTE(review): mean-filling the target would produce non-binary labels if any
# were actually missing — confirm the label column never contains NaN.
ymean = train_Y.mean()
train_Y = train_Y.fillna(ymean)

train_X = train_X.to_numpy()
train_Y = train_Y.to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
# Report the full array shapes before subsampling.
for arr in (train_X, train_Y):
    print(arr.shape)

# Train on rows 1..999999 only.
# NOTE(review): the slice starts at 1, so row 0 is skipped — confirm whether
# the first 1,000,000 rows (starting at 0) were intended.
subset = slice(1, 1000000)
train_X, train_Y = train_X[subset, :], train_Y[subset, :]

for arr in (train_X, train_Y):
    print(arr.shape)

# 60/40 train/hold-out split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_X, train_Y, test_size=0.4, random_state=0
)

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split  
#from sklearn.preprocessing import Imputer 
from sklearn.preprocessing import MinMaxScaler  
from sklearn.preprocessing import label_binarize  
from sklearn.decomposition import PCA 
from sklearn.ensemble import RandomForestClassifier  
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier


#clf = AdaBoostClassifier(n_estimators=500)
#clf.fit(X_train, Y_train)
#y_pred = clf.predict(X_test)
#accuracy_score(Y_test, y_pred)



In [None]:
#clf = GradientBoostingClassifier(n_estimators=500)
#clf.fit(X_train, Y_train)
#y_pred = clf.predict(X_test)
#accuracy_score(Y_test, y_pred)

In [None]:
%%time
# Fit a small, shallow random forest and report hold-out accuracy.
rfc = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=3, max_features=None, random_state=0)
# The targets are (n, 1) column vectors; flatten them to the 1-D shape sklearn
# expects so fit() does not emit a DataConversionWarning.
rfc.fit(X_train, Y_train.ravel())
y_pred = rfc.predict(X_test)
accuracy_score(Y_test.ravel(), y_pred)

In [None]:
# Competition-provided module used to fetch test batches and submit predictions.
import riiideducation
# Create the environment object; predictions are submitted through env.predict().
env = riiideducation.make_env()
# Iterator over test batches; the loop below unpacks each item as
# (test_df, sample_prediction_df).
iter_test = env.iter_test()


In [None]:
# Score each test batch and submit predictions for its question rows.
for (test_df, sample_prediction_df) in iter_test:
    # Only question rows (content_type_id == 0) are scored and submitted.
    is_question = test_df['content_type_id'] == 0

    # Neutral default so the column exists even if a batch has no question rows.
    test_df['answered_correctly'] = 0.5

    if is_question.any():
        features = formattest(test_df[is_question])
        # formattest keeps the row order of its input, so the probabilities line
        # up positionally with the masked rows. Assigning through the mask (not
        # to the whole column) avoids a length mismatch when the batch also
        # contains non-question rows.
        # rfc was fit on a plain ndarray, so it is given an ndarray here too.
        test_df.loc[is_question, 'answered_correctly'] = rfc.predict_proba(features.to_numpy())[:, 1]
        del features

    env.predict(test_df.loc[is_question, ['row_id', 'answered_correctly']])