In [3]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score

from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense

In [4]:
# Read the first 10 million rows of the interaction log. Columns are picked by
# position, while the explicit dtypes are keyed by column name.
train = pd.read_csv(
    'data/train.csv',
    usecols=[1, 2, 3, 4, 5, 7, 8, 9],
    nrows=10_000_000,
    dtype=dict(
        timestamp='int64',
        user_id='int32',
        content_id='int16',
        content_type_id='int8',
        task_container_id='int16',
        answered_correctly='int8',
        prior_question_elapsed_time='float32',
        prior_question_had_explanation='boolean',
    ),
)

In [5]:
# Only the first and fourth columns of the questions file are loaded; their
# dtypes are pinned under the names `question_id` and `part`.
questions_df = pd.read_csv(
    'data/questions.csv',
    usecols=[0, 3],
    dtype=dict(question_id='int16', part='int8'),
)

In [6]:
# Render the loaded questions frame as the cell output.
questions_df

Unnamed: 0,question_id,part
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
13518,13518,5
13519,13519,5
13520,13520,5
13521,13521,5


In [7]:
# Load the lectures file with pandas' inferred dtypes and render it as the
# cell output.
lectures_df = pd.read_csv('data/lectures.csv')
lectures_df

Unnamed: 0,lecture_id,tag,part,type_of
0,89,24584,5,concept
1,100,22243,1,concept
2,185,7035,6,concept
3,192,31458,5,solving question
4,317,19653,5,solving question
...,...,...,...,...
413,32535,20004,5,solving question
414,32570,4358,3,solving question
415,32604,9093,6,concept
416,32625,23993,2,concept


In [8]:
# Positional 90/10 cut of the loaded rows: the leading 90% feeds the aggregate
# statistics, the trailing 10% becomes the rows the model is fitted on.
split_at = int(9 / 10 * len(train))
features_df, train_df = train.iloc[:split_at], train.iloc[split_at:]

In [9]:
# Rows whose `answered_correctly` is -1 are excluded before aggregating.
train_questions_only_df = features_df.loc[features_df['answered_correctly'] != -1]
grouped_by_user_df = train_questions_only_df.groupby('user_id')

# One row per user: summary statistics of that user's `answered_correctly`
# values, named directly through pandas' named aggregation.
user_answers_df = grouped_by_user_df['answered_correctly'].agg(
    mean_user_accuracy='mean',
    questions_answered='count',
    std_user_accuracy='std',
    median_user_accuracy='median',
    skew_user_accuracy='skew',
)

user_answers_df

Unnamed: 0_level_0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
115,0.695652,46,0.465215,1.0,-0.879359
124,0.233333,30,0.430183,0.0,1.328338
2746,0.578947,19,0.507257,1.0,-0.347892
5382,0.672000,125,0.471374,1.0,-0.741648
8623,0.642202,109,0.481566,1.0,-0.601619
...,...,...,...,...,...
196087402,0.357143,14,0.497245,0.0,0.670360
196112832,0.764706,17,0.437237,1.0,-1.372252
196116047,0.640000,25,0.489898,1.0,-0.621247
196120982,0.601562,256,0.490535,1.0,-0.417349


In [10]:
grouped_by_content_df = train_questions_only_df.groupby('content_id')

# One row per content_id: summary statistics of the `answered_correctly`
# values recorded for that content, named through pandas' named aggregation.
content_answers_df = grouped_by_content_df['answered_correctly'].agg(
    mean_accuracy='mean',
    question_asked='count',
    std_accuracy='std',
    median_accuracy='median',
    skew_accuracy='skew',
)

content_answers_df

Unnamed: 0_level_0,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.901274,628,0.298532,1.0,-2.696906
1,0.889571,652,0.313665,1.0,-2.491635
2,0.558317,4064,0.496649,1.0,-0.234957
3,0.779570,2046,0.414638,1.0,-1.349820
4,0.627244,2841,0.483623,1.0,-0.526582
...,...,...,...,...,...
13518,0.782051,78,0.415525,1.0,-1.393293
13519,0.569620,79,0.498293,1.0,-0.286693
13520,0.702703,74,0.460188,1.0,-0.905426
13521,0.782051,78,0.415525,1.0,-1.393293


In [11]:
# Unbind the intermediate slice and both groupby objects in one statement.
del features_df, grouped_by_user_df, grouped_by_content_df

In [12]:
# Name of the label column.
target = 'answered_correctly'

# Model input columns. The order is significant: it fixes the column order of
# the matrix handed to the network.
features = (
    # per-user statistics
    [
        'mean_user_accuracy',
        'questions_answered',
        'std_user_accuracy',
        'median_user_accuracy',
        'skew_user_accuracy',
    ]
    # per-content statistics
    + ['mean_accuracy', 'question_asked', 'std_accuracy', 'median_accuracy']
    # columns taken straight from the interaction rows
    + ['prior_question_elapsed_time', 'prior_question_had_explanation']
    # remaining per-content statistic, kept in last position
    + ['skew_accuracy']
)

In [13]:
# Drop rows whose target is -1, then left-join the per-user and per-content
# statistics so every remaining row keeps its place even without a match.
train_df = (
    train_df.loc[train_df[target] != -1]
    .merge(user_answers_df, how='left', on='user_id')
    .merge(content_answers_df, how='left', on='content_id')
)

train_df = (
    train_df
    # Missing explanation flags become False; the nullable column becomes plain bool.
    .assign(
        prior_question_had_explanation=lambda frame: (
            frame['prior_question_had_explanation'].fillna(value=False).astype(bool)
        )
    )
    # Every remaining missing value (e.g. unmatched joins) is set to 0.5.
    .fillna(value=0.5)
    .loc[:, features + [target]]
    # Infinite values are turned into NaN and then filled with 0.5 as well.
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.5)
)

train_df

Unnamed: 0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,prior_question_elapsed_time,prior_question_had_explanation,skew_accuracy,answered_correctly
0,0.611722,273.0,0.488254,1.0,-0.461015,0.867188,384.0,0.339815,1.0,19000.0,True,-2.172419,1
1,0.611722,273.0,0.488254,1.0,-0.461015,0.533800,429.0,0.499439,1.0,15000.0,True,-0.135984,0
2,0.611722,273.0,0.488254,1.0,-0.461015,0.715385,1040.0,0.451448,1.0,13000.0,True,-0.956033,1
3,0.611722,273.0,0.488254,1.0,-0.461015,0.674593,2271.0,0.468630,1.0,16000.0,True,-0.745776,0
4,0.611722,273.0,0.488254,1.0,-0.461015,0.719101,890.0,0.449691,1.0,16000.0,True,-0.976647,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
981089,0.500000,0.5,0.500000,0.5,0.500000,0.608618,557.0,0.488498,1.0,23000.0,True,-0.446302,1
981090,0.500000,0.5,0.500000,0.5,0.500000,0.453099,4776.0,0.497848,0.0,13000.0,True,0.188495,0
981091,0.500000,0.5,0.500000,0.5,0.500000,0.507086,1623.0,0.500104,1.0,66000.0,True,-0.028372,0
981092,0.500000,0.5,0.500000,0.5,0.500000,0.541939,3517.0,0.498309,1.0,37000.0,True,-0.168422,1


In [14]:
# Hold out 20% of the prepared rows; the fixed random_state makes the split
# repeatable. Note that `train_df` is rebound to the remaining 80%.
train_df, test_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=666,
)

In [15]:
def create_nn(n_features=12):
    """Build and compile the feed-forward binary classifier.

    Layer stack: BatchNormalization -> Dense(200, relu) -> BatchNormalization
    -> Dropout(0.5) -> Dense(30, relu) -> BatchNormalization -> Dropout(0.2)
    -> Dense(1, sigmoid). The model is compiled with the Adam optimizer,
    binary cross-entropy loss and an accuracy metric.

    Args:
        n_features: Number of values in each input row. Defaults to 12, the
            width that was previously hard-coded, so ``create_nn()`` builds
            the same network as before.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential(
        [
            # The shape is given as an explicit tuple (the documented form)
            # rather than a bare positional integer.
            tf.keras.layers.Input(shape=(n_features,)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dense(200, activation="relu"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(30, activation="relu"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(1, activation="sigmoid"),
        ]
    )
    model.compile(optimizer='adam', loss="binary_crossentropy", metrics=['accuracy'])
    return model

In [16]:
nn = create_nn()

# `np.float` was only an alias for the builtin float (i.e. float64) and was
# removed in NumPy 1.24, where referencing it raises AttributeError. Converting
# with an explicit np.float64 yields the same arrays on every NumPy version.
history = nn.fit(
    train_df[features].to_numpy(dtype=np.float64),
    train_df[target].to_numpy(dtype=np.float64),
    validation_split=0.2,  # the last 20% of the passed rows are used for validation
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
