In [None]:
# useful
import numpy as np
import pandas as pd

# neural nets
import tensorflow as tf
import tensorflow.keras.models as M
import tensorflow.keras.layers as L
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding


# custom
import riiideducation

## DATA CLEANING

In [None]:
# Load only the columns this notebook uses, with compact dtypes so the
# interaction log stays small in memory.
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    usecols=[
        'user_id',
        'content_id',
        'task_container_id',
        'user_answer',
        'answered_correctly',
        'prior_question_elapsed_time',
        'prior_question_had_explanation',
    ],
    dtype={
        'user_id': 'int32',
        'content_id': 'int16',
        # int16, not int8: the competition schema declares this column as
        # int16, and ids above 127 do not fit in int8. Reading it as int8
        # overflows, which corrupts the task-container aggregates and the
        # later join against the (untruncated) test batches.
        'task_container_id': 'int16',
        'user_answer': 'int8',
        'answered_correctly': 'int8',
        'prior_question_elapsed_time': 'float32',
        # Nullable boolean: this column contains missing values.
        'prior_question_had_explanation': 'boolean',
    },
)


In [None]:
# Question metadata table; later cells join it to the interactions on question_id.
question = pd.read_csv(
    filepath_or_buffer='/kaggle/input/riiid-test-answer-prediction/questions.csv',
)

In [None]:
# Drop rows whose answered_correctly is -1 (presumably non-question rows such
# as lectures -- confirm against the data description) and renumber the index.
train_df = train_df[train_df['answered_correctly'].ne(-1)].reset_index(drop=True)
train_df

In [None]:
# Treat a missing "had explanation" flag as False, then convert the nullable
# boolean column to a plain bool column.
train_df['prior_question_had_explanation'] = (
    train_df['prior_question_had_explanation']
    .fillna(False)
    .astype(bool)
)
train_df

In [None]:
# Attach question metadata to every interaction.
# The unused question columns are dropped BEFORE the join rather than after
# it, so they (notably the `tags` column) are never materialised once per
# training row. The resulting frame has the same columns in the same order
# as merging first and dropping afterwards.
train_df = (
    train_df
    .merge(
        question.drop(columns=['correct_answer', 'tags', 'bundle_id']),
        left_on='content_id',
        right_on='question_id',
        how='left',
    )
    # question_id duplicates content_id after the join.
    .drop(columns='question_id')
)
train_df

In [None]:
# Positional 90/10 split: the first 90% of rows are used only to compute the
# aggregate features; the remaining 10% are the rows the network is fit on.
# NOTE(review): the split is by row position, not by user or time -- confirm
# that the held-out rows still find matching aggregates.
split_at = int(9 / 10 * len(train_df))
features_df = train_df.iloc[:split_at]
train_df = train_df.iloc[split_at:]

In [None]:
# Per-user statistics of the target: number of answers, spread and median.
grouped_by_user_df = features_df.groupby('user_id')
user_answers_df = (
    grouped_by_user_df['answered_correctly']
    .agg(['count', 'std', 'median'])
    .rename(columns={
        'count': 'questions_answered',
        'std': 'std_user_accuracy',
        'median': 'median_user_accuracy',
    })
)

In [None]:
# Per-task-container statistics of the target: count, spread and median.
grouped_by_task_container_df = features_df.groupby('task_container_id')
task_container_df = (
    grouped_by_task_container_df['answered_correctly']
    .agg(['count', 'std', 'median'])
    .rename(columns={
        'count': 'questions_tc',
        'std': 'std_tc_accuracy',
        'median': 'median_tc_accuracy',
    })
)

In [None]:
# Per-question (content_id) statistics of the target: count, spread and median.
grouped_by_content_df = features_df.groupby('content_id')
content_answers_df = (
    grouped_by_content_df['answered_correctly']
    .agg(['count', 'std', 'median'])
    .rename(columns={
        'count': 'question_asked',
        'std': 'std_accuracy',
        'median': 'median_accuracy',
    })
)

In [None]:
# Per-question (content_id) statistics of the chosen answer option
# (user_answer), as opposed to the correctness aggregates computed separately.
grouped_by_question_df = features_df.groupby('content_id')
question_df = (
    grouped_by_question_df['user_answer']
    .agg(['count', 'std', 'median'])
    .rename(columns={
        'count': 'number_ca',
        'std': 'std_ca_accuracy',
        'median': 'median_ca_accuracy',
    })
)

In [None]:
import gc

# Release the feature slice together with EVERY GroupBy built from it.
# A GroupBy object keeps a reference to the frame it groups, so leaving any
# of them alive would keep features_df reachable even after `del features_df`.
del features_df
del grouped_by_user_df
del grouped_by_task_container_df
del grouped_by_content_df
del grouped_by_question_df

gc.collect()

In [None]:
# Left-join every aggregate table onto the training rows; rows whose key was
# not seen in the feature slice get NaN for that table's columns.
train_df = (
    train_df
    .merge(user_answers_df, how='left', on='user_id')
    .merge(content_answers_df, how='left', on='content_id')
    .merge(task_container_df, how='left', on='task_container_id')
    .merge(question_df, how='left', on='content_id')
)
train_df

In [None]:
# Model input columns, in the order they are fed to the network.
# There are 12 entries; create_model builds its input layer for 12 features,
# so the two must be kept in sync.
# The user_answer aggregates (number_ca, std_ca_accuracy, median_ca_accuracy)
# are merged into train_df but are not used as inputs.
features = [ 
    # per-user aggregates
    'questions_answered',
    'std_user_accuracy', 
    'median_user_accuracy', 
    # per-task-container aggregates
    'questions_tc', 
    'std_tc_accuracy',
    'median_tc_accuracy',  
    # per-question aggregates
    'question_asked',
    'std_accuracy', 
    'median_accuracy',
    # raw columns from train.csv / questions.csv
    'prior_question_elapsed_time', 
    'prior_question_had_explanation',
    'part'
]

# Binary label column the model is trained to predict.
target = 'answered_correctly'

In [None]:
# Map +/-inf to NaN, then zero-fill every missing value (including aggregates
# that found no match in the left joins).
train_df = train_df.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# Encode the explanation flag as an 8-bit integer so it can sit in the numeric
# feature matrix.
explanation_col = 'prior_question_had_explanation'
train_df[explanation_col] = train_df[explanation_col].astype('int8')
train_df

## DATA VISUALISATION

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Class balance of the target column in train_df.
type_count1 = train_df['answered_correctly'].value_counts()
# x and y must be passed by keyword: seaborn deprecated positional data
# arguments in 0.11 and rejects them from 0.12 on.
sns.barplot(x=type_count1.index.astype('str'), y=type_count1.values, alpha=0.8)
plt.title('correct answers vs wrong answers')
plt.ylabel('Number of Answers', fontsize=12)
plt.xlabel('correct or wrong', fontsize=12)
plt.show()

In [None]:
# How many rows followed a question bundle that came with an explanation.
type_count = train_df['prior_question_had_explanation'].value_counts()
# x and y must be passed by keyword: seaborn deprecated positional data
# arguments in 0.11 and rejects them from 0.12 on.
sns.barplot(x=type_count.index.astype('str'), y=type_count.values, alpha=0.8)
plt.title('Prior Questions with Explanation vs without Explanation')
plt.ylabel('Number of Answers', fontsize=12)
plt.xlabel('Explanation vs No Explanation', fontsize=12)
plt.show()

In [None]:
# Total number of correct answers per `part`.
# `.sum()` replaces `agg(np.sum)`: passing the NumPy callable is deprecated in
# pandas 2.x, while the built-in reducer gives the same totals.
df_grouped = train_df.groupby(['part'])['answered_correctly'].sum().reset_index()

# Horizontal bars, sorted so the part with the most correct answers is on top.
df_grouped.sort_values(by=['answered_correctly'], ascending=True)\
          .plot(kind='barh', x='part', y='answered_correctly',
                figsize=(9,5), legend=False, color='darkblue')
plt.xlabel('\nThe number of correct answers', fontsize=12)
plt.ylabel('TOEIC\n', fontsize=12)
plt.title('\nThe number of correct answers by different parts of TOEIC\n', fontsize=14, fontweight='bold');

## TRAINING

In [None]:
def create_model(n_features=12):
    """Build and compile the feed-forward binary classifier.

    Architecture: input -> BatchNorm -> Dense(120, relu) -> BatchNorm ->
    Dropout(0.5) -> Dense(20, relu) -> BatchNorm -> Dropout(0.5) ->
    Dense(1, sigmoid).

    Args:
        n_features: Number of input columns per example. Defaults to 12,
            the width this notebook has always used, so existing
            ``create_model()`` calls behave as before.

    Returns:
        A compiled ``tf.keras.Sequential`` model using the Adam optimizer,
        binary cross-entropy loss and the accuracy metric.
    """
    model = tf.keras.Sequential([
        # The shape is given as an explicit tuple: a bare int is not a valid
        # shape in Keras 3 and only happened to be accepted by older releases.
        tf.keras.layers.Input(shape=(n_features,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(120, activation="relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(20, activation="relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        # Single sigmoid unit: output is a probability for the positive class.
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])
    model.compile(optimizer='adam', loss="binary_crossentropy", metrics=['accuracy'])
    return model

In [None]:
 from sklearn.model_selection import KFold

In [None]:
# Out-of-fold predictions, one row per training example.
# The row count comes from train_df itself instead of a hard-coded literal,
# so the frame always lines up with the fold indices. The prediction column
# starts as float because it is filled with model probabilities.
n_rows = len(train_df)
res = pd.DataFrame({
    'row_id': np.arange(n_rows),
    'answered_correctly': np.zeros(n_rows, dtype='float64'),
})
models = []

# Materialise the feature matrix and labels once rather than on every fold.
X_all = train_df[features].values
y_all = train_df[target].values

for n, (tr, te) in enumerate(KFold(n_splits=2, random_state=666, shuffle=True).split(X_all)):
    print(f'Fold {n}')

    model = create_model()

    model.fit(
        X_all[tr],
        y_all[tr],
        validation_split=0.2,
        epochs=50,
        batch_size=5120
    )

    # predict() returns shape (n, 1); flatten it to fill a single column.
    res.loc[te, 'answered_correctly'] = model.predict(X_all[te]).ravel()
    models.append(model)

# The dense copies are no longer needed once every fold has been trained.
del X_all, y_all

## PREDICTION

In [None]:
# Create the competition environment and the iterator that yields the test
# batches as (test_df, sample_prediction_df) pairs.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
for (test_df, sample_prediction_df) in iter_test:
    # Rebuild the training-time feature columns for this batch: the aggregate
    # tables first, then the question metadata that supplies `part`.
    test_df = (
        test_df
        .merge(user_answers_df, how='left', on='user_id')
        .merge(content_answers_df, how='left', on='content_id')
        .merge(task_container_df, how='left', on='task_container_id')
        .merge(question_df, how='left', on='content_id')
        .merge(question, left_on='content_id', right_on='question_id', how='left')
    )

    # Same encoding as in training: missing -> False -> 0/1 as int8.
    test_df['prior_question_had_explanation'] = (
        test_df['prior_question_had_explanation']
        .fillna(value=False)
        .astype(bool)
        .astype(np.int8)
    )

    # Unseen users/questions produce NaN aggregates; zero-fill like in training.
    test_df = test_df.replace([np.inf, -np.inf], np.nan).fillna(value=0)

    # Average the probabilities of all fold models.
    batch_inputs = test_df[features].values
    y_preds = [fold_model.predict(batch_inputs) for fold_model in models]
    y_preds = sum(y_preds) / len(y_preds)
    test_df['answered_correctly'] = y_preds

    # Only rows with content_type_id == 0 are submitted.
    is_question_row = test_df['content_type_id'] == 0
    env.predict(test_df.loc[is_question_row, ['row_id', 'answered_correctly']])