## Notebook Set Up

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import random
import riiideducation
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Load a random ~10% sample of train.csv plus the full lectures/questions tables.
# The sampler is a dedicated, seeded generator so that a kernel restart re-reads
# exactly the same rows (the global `random` state is left untouched).
SAMPLE_SEED = 42
_row_sampler = random.Random(SAMPLE_SEED)
train = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    # Line 0 (the header) is always kept; every other line is skipped when the
    # draw exceeds 0.1, i.e. kept with probability ~0.1.
    skiprows=lambda i: i > 0 and _row_sampler.random() > 0.1,
)
lectures = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')
questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')

In [None]:
# Show the first two rows of the sampled training frame as a quick look at its columns.
train.head(2)

## Data Prep

In [None]:
# Candidate features (only user_success_rate and question_success_rate are computed in the cells below):
# question stats - question_success_rate, part_success_rate
# user general stats - user_success_rate, user_part_success_rate, user_relative_success_rate
# user current stats - lectures_watched, prior_question_elapsed_time, prior_question_had_explanation, prior_group_answers_correct

In [None]:
# Per-user success rate, computed over rows with content_type_id == 0 only
# (presumably the question rows -- confirm against the dataset description).
# The group mean of `answered_correctly` is sum / count of its non-null values,
# so this is the "correct answers / total questions" ratio without the
# intermediate count/sum columns and without passing a NumPy callable to agg().
# Result columns: ['user_id', 'user_success_rate'].
user_general_stats = (
    train.loc[train['content_type_id'] == 0]
    .groupby('user_id')['answered_correctly']
    .mean()
    .rename('user_success_rate')
    .reset_index()
)
user_general_stats.head()

In [None]:
# Per-content_id success rate, computed over rows with content_type_id == 0 only
# (presumably the question rows -- confirm against the dataset description).
# The group mean of `answered_correctly` is sum / count of its non-null values,
# so this is the "correct answers / total questions" ratio without the
# intermediate count/sum columns and without passing a NumPy callable to agg().
# Result columns: ['content_id', 'question_success_rate'].
question_stats = (
    train.loc[train['content_type_id'] == 0]
    .groupby('content_id')['answered_correctly']
    .mean()
    .rename('question_success_rate')
    .reset_index()
)
question_stats.head()

In [None]:
# Build the modelling table: rows with content_type_id == 0, left-joined with
# the per-user rate (on user_id) and then the per-question rate (on content_id).
training = (
    train.loc[train['content_type_id'] == 0]
    .merge(user_general_stats, on='user_id', how='left')
    .merge(question_stats, on='content_id', how='left')
)
training.head()

In [None]:
# Average of each feature, split by the value of answered_correctly.
mine = (
    training.loc[:, ['user_success_rate', 'question_success_rate', 'answered_correctly']]
    .groupby('answered_correctly')
    .mean()
    .reset_index()
)
display(mine)

## Model Build

In [None]:
# Single sigmoid unit over the two success-rate features, i.e. a logistic
# regression trained with SGD on binary cross-entropy.
# Everything is taken from the one `keras` namespace imported from tensorflow,
# so the model and its layer are guaranteed to come from the same Keras
# implementation; the input width is declared with an explicit Input object.
model = keras.Sequential([
    keras.Input(shape=(2,)),
    keras.layers.Dense(1, activation='sigmoid'),
])
# TODO (from the original "#auc" note): consider tracking AUC as well.
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['binary_accuracy'])

In [None]:
# Feature matrix (two success-rate columns) and target column as NumPy arrays.
# `y` is selected with a list of one column, so it stays 2-D: (n_rows, 1).
X = training[['user_success_rate', 'question_success_rate']].values
y = training[['answered_correctly']].values
# Only a cell's last expression is displayed, so show both shapes as one tuple.
X.shape, y.shape

In [None]:
# One shuffled pass over the training arrays in mini-batches of 1024
# (the original cell also noted 16 as an alternative batch size).
model.fit(X, y, batch_size=1024, epochs=1, shuffle=True)

In [None]:
# Loss and metric values of the fitted model on X / y. These are the same
# arrays the model was fitted on, so this is a training-set score, not a
# held-out estimate. Stored under a name that does not shadow the builtin eval().
eval_metrics = model.evaluate(x=X, y=y)

In [None]:
#test = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_test.csv')

#testing = pd.merge(test[test['content_type_id']==0], user_general_stats, on='user_id', how='left')
#testing = pd.merge(testing, question_stats, on='content_id', how='left')
#testing = testing.fillna(testing.median())
#testing.head()

#X_test = testing[['user_success_rate', 'question_success_rate']].values
#X_test.shape

#probs = model.predict(X_test)
#probs = [y for x in probs for y in x]
#preds = list(np.round(probs).astype(int))

#test_ids = list(testing['row_id'].values)
#group_nums = list(testing['group_num'].values)

#output = pd.DataFrame({'row_id': test_ids, 'answered_correctly': list(probs)})#, 'group_num': group_nums})
#output['answered_correctly'] = 0.5

#output.to_csv("submission.csv", index=False)

## Submission

In [None]:
# Iterate over the batches served by the riiideducation environment, attach
# the two success-rate features to each batch, predict, and hand the
# predictions back through env.predict.
env = riiideducation.make_env()
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    # Keep only rows with content_type_id == 0, then left-join the per-user and
    # per-question rates. Users/questions absent from the stats tables get NaN.
    test_df = (
        test_df[test_df['content_type_id'] == 0]
        .merge(user_general_stats, on='user_id', how='left')
        .merge(question_stats, on='content_id', how='left')
    )

    X_test = test_df[['user_success_rate', 'question_success_rate']].values
    if len(X_test):
        # predict returns one column per output unit; flatten to one value per row.
        probs = model.predict(X_test).ravel()
    else:
        # NOTE(review): guard for a batch with no question rows -- Keras predict
        # is expected to reject an empty input; confirm for the installed version.
        probs = np.empty(0)

    test_df['answered_correctly'] = probs
    # Predictions that come back as NaN (presumably rows whose features are NaN)
    # fall back to 0.5.
    # NOTE(review): round(1) coarsens the probabilities to one decimal place;
    # kept as in the original, but confirm it is intended.
    test_df['answered_correctly'] = test_df['answered_correctly'].fillna(0.5).round(1).astype(float)
    # test_df was already filtered to content_type_id == 0 above, so no second filter is needed.
    env.predict(test_df[['row_id', 'answered_correctly']])

In [None]:
# Column dtypes of `test_df` as left by the last iteration of the submission loop.
test_df.dtypes

In [None]:
# First rows of `test_df` as left by the last iteration of the submission loop.
test_df.head()