In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import warnings
warnings.simplefilter('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.utils import shuffle
import lightgbm as lgb
from lightgbm import LGBMClassifier
import eli5

import riiideducation

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seaborn palette object; the original author earmarked it for heatmaps and other plots.
colorMap1 = sns.color_palette("RdBu_r")
# Palette name; passed as `palette=` to the point plot further down and meant for count plots too.
colorMap2 = 'Blues_r'


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory,
# in the order os.walk visits the folders.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Explicit per-column dtypes handed to read_csv for train.csv.
dtypes = dict(
    row_id="int64",
    timestamp="int64",
    user_id="int32",
    content_id="int16",
    content_type_id="boolean",
    task_container_id="int16",
    user_answer="int8",
    answered_correctly="int8",
    prior_question_elapsed_time="float32",
    prior_question_had_explanation="boolean",
)

# Read the training interactions and preview the first rows.
train = pd.read_csv(
    "/kaggle/input/riiid-test-answer-prediction/train.csv",
    dtype=dtypes,
)
train.head()

In [None]:
# Remove the row identifier and timestamp columns from `train`.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# makes the cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
train = train.drop(columns=['row_id', 'timestamp'], errors='ignore')

In [None]:
# Summary statistics of `train`, rendered with a blue background gradient.
summary_stats = train.describe()
summary_stats.style.background_gradient(cmap='Blues')

In [None]:
# Pairwise correlations between the columns of `train`; the cell displays the
# column for the target, ordered from the largest value to the smallest.
# NOTE(review): rows with answered_correctly == -1 are only filtered out in a
# later cell, so they still take part in this correlation — confirm intended.
corr_matrix = train.corr()
target_corr = corr_matrix["answered_correctly"]
target_corr.sort_values(ascending=False)

In [None]:
# Number of rows per task container, as a two-column frame.
freq_answered_tasks = (
    train['task_container_id']
    .value_counts()
    .rename_axis('task_container_id')
    .reset_index(name='freq')
)

# Bucket every task container by how many rows it has.
# np.select returns the choice of the FIRST condition that holds, so the
# thresholds are listed from highest to lowest; anything below 10000 falls
# through to the default label.
task_freq = freq_answered_tasks['freq'].values
bucket_names = np.select(
    [
        task_freq >= 400000,
        task_freq >= 200000,
        task_freq >= 50000,
        task_freq >= 10000,
    ],
    [
        'very often answered',
        'often answered',
        'normal answered',
        'rare answered',
    ],
    default='very rare answered',
)

# Look the bucket up for each row of `train` through its task_container_id.
bucket_by_task = pd.Series(
    bucket_names,
    index=freq_answered_tasks['task_container_id'].values,
    dtype=object,
)
train['freq_task_id'] = train['task_container_id'].map(bucket_by_task)

In [None]:
N = 30  # number of users to keep

# Row count per user, most frequent first, limited to the top N.
# Appending ' - ' turns user_id into a string so it is treated as a label
# and not sorted as a number (the original author's stated purpose).
user_freq = (
    train['user_id']
    .value_counts()
    .rename_axis('user_id')
    .reset_index(name='count')
    .assign(user_id=lambda counts: counts['user_id'].astype(str) + ' - ')
    .sort_values(['count'], ascending=False)
    .head(N)
)


In [None]:
N = 30  # number of content ids to keep

# Row count per content_id, most frequent first, limited to the top N.
# Appending ' - ' turns content_id into a string so it is treated as a label
# and not sorted as a number.
content_id_freq = (
    train['content_id']
    .value_counts()
    .rename_axis('content_id')
    .reset_index(name='count')
    .assign(content_id=lambda counts: counts['content_id'].astype(str) + ' - ')
    .sort_values(['count'], ascending=False)
    .head(N)
)


In [None]:
# Row count per content_type_id value.
# NOTE(review): despite its name, the 'share' column holds raw counts
# (value_counts is called without normalize=True) — confirm this is intended.
content_type_freq = (
    train['content_type_id']
    .value_counts()
    .rename_axis('content_type_id')
    .reset_index(name='share')
)

In [None]:
# Keep only the rows whose answered_correctly is not -1 and renumber the index.
is_answered = train['answered_correctly'] != -1
train = train.loc[is_answered].reset_index(drop=True)

In [None]:
# Number of rows for each (content_type_id, answered_correctly) combination.
group_keys = ['content_type_id', 'answered_correctly']
train.groupby(group_keys).agg({'answered_correctly': 'count'})

In [None]:
# Row count per task container.
task_ids_freq = (
    train['task_container_id']
    .value_counts()
    .rename_axis('task_container_id')
    .reset_index(name='count')
)

fig, ax = plt.subplots(figsize=(15, 10))

# Point plot of the counts, drawn on the figure created just above.
sns.pointplot(x='task_container_id', y='count', data=task_ids_freq, palette=colorMap2)

# Place x ticks every 1000 between the smallest and the largest id
# (the upper end is exclusive, as with any range).
task_ids = task_ids_freq['task_container_id']
xticks_range = range(min(task_ids), max(task_ids), 1000)
tick_values = list(xticks_range)
plt.xticks(tick_values, tick_values)

In [None]:
# Load the question metadata, the lecture metadata and the example test file,
# in that order.
questions, lectures, test_example = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/riiid-test-answer-prediction/questions.csv",
        "/kaggle/input/riiid-test-answer-prediction/lectures.csv",
        "/kaggle/input/riiid-test-answer-prediction/example_test.csv",
    )
)

In [None]:
# Number of questions per part, most frequent first, limited to the top N
# (N is set in an earlier cell).
# Appending ' - ' turns part into a string so it is treated as a label
# and not sorted as a number.
part_freq = (
    questions['part']
    .value_counts()
    .rename_axis('part')
    .reset_index(name='count')
    .assign(part=lambda counts: counts['part'].astype(str) + ' - ')
    .sort_values(['count'], ascending=False)
    .head(N)
)

In [None]:
N = 30

# How often each complete value of the `tags` column occurs among the
# questions, most frequent first, limited to the top N.
# Appending ' - ' keeps the value a string label.
tags_freq = (
    questions['tags']
    .value_counts()
    .rename_axis('tag')
    .reset_index(name='count')
    .assign(tag=lambda counts: counts['tag'].astype(str) + ' - ')
    .sort_values(['count'], ascending=False)
    .head(N)
)


In [None]:

# Split each question's space-separated `tags` string and give every piece
# its own row.
# Series.explode has no column argument: the original call explode('tags')
# passed the string into `ignore_index`, where it merely acted as a truthy
# flag. Spelling it out as ignore_index=True yields the same frame without
# relying on that accident.
tags = (
    questions['tags']
    .str.split(' ')
    .explode(ignore_index=True)
    .reset_index()
)

# Occurrences of each individual tag, most frequent first, limited to the
# top N (N is set in an earlier cell).
tags_freq = tags['tags'].value_counts().reset_index()
tags_freq.columns = [
    'tag',
    'count'
]

# Add ' - ' to convert tag to str and not sort
tags_freq['tag'] = tags_freq['tag'].astype(str) + ' - '
tags_freq = tags_freq.sort_values(['count'], ascending=False).head(N)

In [None]:
# Number of lectures per part, most frequent first, limited to the top N
# (N is set in an earlier cell). This rebinds the `part_freq` name.
# Appending ' - ' turns part into a string so it is treated as a label
# and not sorted as a number.
part_freq = (
    lectures['part']
    .value_counts()
    .rename_axis('part')
    .reset_index(name='count')
    .assign(part=lambda counts: counts['part'].astype(str) + ' - ')
    .sort_values(['count'], ascending=False)
    .head(N)
)


In [None]:
N = 30

# Number of lectures per tag, most frequent first, limited to the top N.
# Appending ' - ' turns tag into a string so it is treated as a label
# and not sorted as a number.
tags_freq = (
    lectures['tag']
    .value_counts()
    .rename_axis('tag')
    .reset_index(name='count')
    .assign(tag=lambda counts: counts['tag'].astype(str) + ' - ')
    .sort_values(['count'], ascending=False)
    .head(N)
)


In [None]:
# Draw a reproducible random sample holding a tenth of the rows of `train`
# (the size is truncated to a whole number).
n = int(len(train) * 0.1)
train_sample = train.sample(n, random_state=42)

In [None]:
# The metadata frames are not used by the cells below; unbind both names.
del questions, lectures

In [None]:
# Per-user summary of answered_correctly, indexed by user_id:
# mean, median, standard deviation, skewness and number of rows.
user_characteristics = (
    train
    .groupby('user_id')['answered_correctly']
    .agg(
        mean_user_acc='mean',
        median_user_acc='median',
        std_user_acc='std',
        skew_user_acc='skew',
        number_of_answered_q='count',
    )
)

In [None]:
# Per-task-container summary of answered_correctly, indexed by
# task_container_id: mean, median, standard deviation, skewness and row count.
task_container_characteristics = (
    train
    .groupby('task_container_id')['answered_correctly']
    .agg(
        mean_task_acc='mean',
        median_task_acc='median',
        std_task_acc='std',
        skew_task_acc='skew',
        number_of_asked_task_containers='count',
    )
)

In [None]:
# Per-content summary of answered_correctly, indexed by content_id:
# mean, median, standard deviation, skewness and number of rows.
content_characteristics = (
    train
    .groupby('content_id')['answered_correctly']
    .agg(
        mean_acc='mean',
        median_acc='median',
        std_acc='std',
        skew_acc='skew',
        number_of_asked_q='count',
    )
)

In [None]:
# Continue with an independent copy of the sample under the name `df`,
# then unbind the original name.
df = train_sample.copy(deep=True)
del train_sample

In [None]:
# Left-join each aggregate table onto the sample through its key column,
# so every sampled row is kept.
for stats_table, join_key in (
    (user_characteristics, 'user_id'),
    (task_container_characteristics, 'task_container_id'),
    (content_characteristics, 'content_id'),
):
    df = df.merge(stats_table, how='left', on=join_key)

In [None]:
  # Model input columns, grouped by where they come from. The concatenation
  # order is the column order given to the model.
  prior_question_cols = [
      'prior_question_elapsed_time',
      'prior_question_had_explanation',
  ]
  user_stat_cols = [
      'mean_user_acc',
      'median_user_acc',
      'std_user_acc',
      'skew_user_acc',
      'number_of_answered_q',
  ]
  task_stat_cols = [
      'mean_task_acc',
      'median_task_acc',
      'std_task_acc',
      'skew_task_acc',
      'number_of_asked_task_containers',
  ]
  content_stat_cols = [
      'mean_acc',
      'median_acc',
      'std_acc',
      'skew_acc',
      'number_of_asked_q',
  ]
  features = prior_question_cols + user_stat_cols + task_stat_cols + content_stat_cols

  # Column the model is trained to predict.
  target = 'answered_correctly'

In [None]:
# Remove, in place, every column that is neither a feature nor the target.
wanted = set(features + [target])
col_to_drop = [name for name in df.columns if name not in wanted]
df.drop(columns=col_to_drop, inplace=True)

In [None]:
# Missing explanation flags count as False; the column becomes plain bool.
explanation_flag = df['prior_question_had_explanation'].fillna(False)
df['prior_question_had_explanation'] = explanation_flag.astype(bool)

# Every other missing value in the frame is replaced by 0.5.
df = df.fillna(0.5)

In [None]:
# Turn +/- infinity into NaN, then fill with the same 0.5 placeholder used
# for missing values.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0.5)

In [None]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

In [None]:
# Split the features and the target 80/20 with a fixed seed.
X, y = df[features], df[target]
train_df, test_df, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=777,
)

In [None]:
# Hyper-parameters passed to LGBMClassifier as keyword arguments.
params = dict(
    num_leaves=30,
    n_estimators=300,
    min_data_in_leaf=100,
    max_depth=5,
    feature_fraction=1.0,
)
# 'lambda' is a reserved word in Python, so it cannot be written as a keyword
# argument above and is added by key instead.
params['lambda'] = 0.0

In [None]:
# Build the classifier from `params` and fit it on the training split.
# The fit call stays the last expression so the cell still displays the
# fitted estimator.
model = LGBMClassifier(**params)
model.fit(X=train_df, y=y_train)

In [None]:
# Create the competition environment object; its iter_test() and predict()
# methods are used by the cells below.
# NOTE(review): presumably this may only be called once per kernel session —
# confirm against the riiideducation API notes before re-running this cell.
env = riiideducation.make_env()

In [None]:
# Iterator consumed by the prediction loop below; each item unpacks into a
# (test_df, sample_prediction_df) pair.
iter_test = env.iter_test()

In [None]:
# Score every batch served by the competition environment.
# Changes from the original loop: the unused `cols_to_submission` list is gone,
# and the fill / replace / fill sequence is collapsed into one replace + fill
# that produces the same values without inplace mutation.
for (test_df, sample_prediction_df) in iter_test:
    # Attach the aggregates computed on `train`. Left joins keep every test
    # row; ids with no match in an aggregate table get NaN and are filled below.
    for stats_table, join_key in (
        (user_characteristics, 'user_id'),
        (task_container_characteristics, 'task_container_id'),
        (content_characteristics, 'content_id'),
    ):
        test_df = test_df.merge(stats_table, on=join_key, how='left')

    # Same cleaning as for the training frame: missing explanation flags
    # become False, infinities and remaining missing values become 0.5.
    test_df['prior_question_had_explanation'] = (
        test_df['prior_question_had_explanation'].fillna(value=False).astype(bool)
    )
    test_df = test_df.replace([np.inf, -np.inf], np.nan).fillna(0.5)

    # Predicted probability of the positive class (second predict_proba column).
    test_df['answered_correctly'] = model.predict_proba(test_df[features])[:, 1]

    # Only rows with content_type_id == 0 are handed to the environment.
    is_question = test_df['content_type_id'] == 0
    env.predict(test_df.loc[is_question, ['row_id', 'answered_correctly']])

In [None]:
# Selecting two columns needs a list. The original tuple key
# test_df['row_id', 'answered_correctly'] is looked up as a single column
# label and raises KeyError.
# NOTE(review): `test_df` here is whatever the prediction loop left behind,
# i.e. only its last batch, and it still contains rows with
# content_type_id != 0 — confirm this file is really what should be written.
my_submission = test_df[['row_id', 'answered_correctly']]
# you could use any filename. We choose submission here
my_submission.to_csv('submission.csv', index=False)