In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter('ignore')
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.utils import shuffle
import lightgbm as lgb
from lightgbm import LGBMClassifier
import eli5

from catboost import CatBoostClassifier
import xgboost as xgb
import riiideducation
import plotly.express as px


# Colour palette handed to the correlation heatmap as its `cmap`
colorMap1 = sns.color_palette("RdBu_r")
# Palette name handed to the countplot / pointplot calls as their `palette`
colorMap2 = 'Blues_r'


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Number of leading rows of train.csv to load.
subset_size = int(5E6)

# Explicit column dtypes; 'boolean' is pandas' nullable boolean type, so
# prior_question_had_explanation can hold missing values.
train_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}

train_data = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    low_memory=False,
    nrows=subset_size,
    dtype=train_dtypes,
)
train_data.head()

In [None]:
# Load questions.csv from the competition input directory and preview the first rows.
# `questions` is not referenced again by the cells shown in this notebook.
questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
questions.head()

In [None]:
# Load lectures.csv from the competition input directory and preview the first rows.
# `lectures` is not referenced again by the cells shown in this notebook.
lectures = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')
lectures.head()

In [None]:
# Per-column summary statistics of the loaded sample, then its (rows, columns) shape.
print(train_data.describe())
print(train_data.shape)

In [None]:
# Number of missing values in each column of the loaded sample.
train_data.isnull().sum()

In [None]:
# Pairwise correlation between the columns of the loaded sample.
# `corr_matrix` is reused by the heatmap cell that follows.

corr_matrix=train_data.corr()
#corr_matrix['answered_correctly'].sort_values(ascending=True)
corr_matrix

In [None]:
# Annotated heatmap of the correlation matrix, with wide gaps between cells.
heatmap_options = dict(annot=True, linewidths=5, cmap=colorMap1)

plt.figure(figsize=(13, 10))
sns.heatmap(corr_matrix, **heatmap_options)

In [None]:
# Let's check the distribution of prior_question_elapsed_time

# The column is a continuous float, so a histogram is used: a count plot would
# draw one bar per distinct value and produce an unreadable (and slow) figure.
elapsed_time = train_data['prior_question_elapsed_time'].dropna()

fig, ax = plt.subplots(figsize=(8, 8))
ax.hist(elapsed_time, bins=50)
ax.set_xlabel('prior_question_elapsed_time')
ax.set_ylabel('count')
ax.set_title('Distribution of prior_question_elapsed_time')
plt.show()

In [None]:
# Row counts per prior_question_had_explanation value, coloured by
# answered_correctly; rows where the flag is missing are left out.
explanation_known = train_data['prior_question_had_explanation'].notnull()

plt.figure(figsize=(15, 11))
ax = sns.countplot(
    data=train_data[explanation_known],
    x="prior_question_had_explanation",
    hue="answered_correctly",
    palette=colorMap2,
)

In [None]:
# Row counts per content_type_id value.
# (The cell that follows builds its own `ds` table from the same counts, so no
# intermediate frame is created here.)
print(train_data['content_type_id'].value_counts())

In [None]:
# Share of rows for each content_type_id value, shown as a pie chart.
ds = train_data['content_type_id'].value_counts().reset_index()

# reset_index() yields two columns (the value, then its count); they are
# relabelled by position.
ds.columns = [
    'content_type_id',
    'percent'
]

# Counts -> fraction of all rows (values lie in 0..1 despite the column name).
ds['percent'] /= len(train_data)

print(ds)

fig = px.pie(
    ds,
    names='content_type_id',
    values='percent',
    title='Lectures & questions',
    width=800,
    height=500
)

fig.show()

In [None]:
# Fraction of all rows that carry each user_answer value, drawn as a bar chart
# ordered from the rarest to the most common answer.
ds = train_data['user_answer'].value_counts().reset_index()
ds.columns = ['user_answer', 'percent_of_answers']
ds['percent_of_answers'] = ds['percent_of_answers'] / len(train_data)
ds = ds.sort_values(['percent_of_answers'])

fig = px.bar(
    data_frame=ds,
    x='user_answer',
    y='percent_of_answers',
    orientation='v',
    title='Percent of user answers for every option',
    width=500,
    height=400,
)
fig.show()

In [None]:
# How many rows belong to each task_container_id.
task_ids_freq = train_data['task_container_id'].value_counts().reset_index()
print(task_ids_freq)
# Relabel the two columns by position: the id first, then its frequency.
task_ids_freq.columns = ['task_container_id', 'count']

print(task_ids_freq)

fig, ax = plt.subplots(figsize=(15, 10))
sns.pointplot(data=task_ids_freq, x='task_container_id', y='count', palette=colorMap2)

# Label only every 1000th id so the x axis stays legible.
container_ids = task_ids_freq['task_container_id']
tick_values = list(range(int(container_ids.min()), int(container_ids.max()), 1000))
plt.xticks(tick_values, tick_values)

In [None]:
# Positional split of the sample: the first 90% of rows feed the per-user and
# per-question statistics, the remaining 10% become the modelling rows.
# NOTE(review): this split is by row position only. If train.csv rows are
# grouped by user, users in `train` may be largely absent from `features_df`,
# leaving their merged user statistics missing -- check the null counts printed
# after the merge cell.
holdout_start = int(9/10 * len(train_data))
features_df, train = train_data.iloc[:holdout_start], train_data.iloc[holdout_start:]

In [None]:
# (rows, columns) of the slice used to build the aggregate statistics.
features_df.shape

In [None]:
# (rows, columns) of the slice that later becomes the modelling data.
train.shape

In [None]:
# Keep only rows whose answered_correctly is not -1 (presumably the question
# rows, going by the variable name -- confirm), then group them per user.
has_graded_answer = features_df['answered_correctly'].ne(-1)
train_questions_only_df = features_df[has_graded_answer]
grouped_by_user_df = train_questions_only_df.groupby('user_id')
grouped_by_user_df.count()

In [None]:
# Per-user summary statistics of answered_correctly. The order of this list
# matters: a later cell relabels the resulting columns by position.
user_accuracy_stats = ['mean', 'count', 'std', 'median', 'skew']

user_answers_df = grouped_by_user_df.agg({'answered_correctly': user_accuracy_stats}).copy()
user_answers_df

In [None]:

# Flatten the aggregated column labels into feature names. The mapping is
# applied by position, so its order must follow the statistics requested in
# the aggregation cell.
user_stat_feature_names = {
    'mean': 'mean_user_accuracy',
    'count': 'questions_answered',
    'std': 'std_user_accuracy',
    'median': 'median_user_accuracy',
    'skew': 'skew_user_accuracy',
}
user_answers_df.columns = list(user_stat_feature_names.values())
user_answers_df

In [None]:
# Per-content_id summary statistics of answered_correctly, with the aggregated
# columns renamed (by position) to flat feature names.
content_stat_feature_names = {
    'mean': 'mean_accuracy',
    'count': 'question_asked',
    'std': 'std_accuracy',
    'median': 'median_accuracy',
    'skew': 'skew_accuracy',
}

grouped_by_content_df = train_questions_only_df.groupby('content_id')
content_answers_df = grouped_by_content_df.agg(
    {'answered_correctly': list(content_stat_feature_names)}
).copy()
content_answers_df.columns = list(content_stat_feature_names.values())
content_answers_df

In [None]:
# Drop the names of intermediates that are no longer needed now that the
# aggregate tables exist.
del features_df, grouped_by_user_df, grouped_by_content_df

In [None]:
# Model inputs, assembled from the groups they come from. The resulting order
# is the order the columns are listed in throughout the notebook.
_user_stat_features = [
    'mean_user_accuracy',
    'questions_answered',
    'std_user_accuracy',
    'median_user_accuracy',
    'skew_user_accuracy',
]
_content_stat_features = [
    'mean_accuracy',
    'question_asked',
    'std_accuracy',
    'median_accuracy',
]
_prior_question_features = [
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]

features = [
    *_user_stat_features,
    *_content_stat_features,
    *_prior_question_features,
    'skew_accuracy',
]

# Name of the label column.
target = 'answered_correctly'

In [None]:
# Echo the name of the label column.
target

In [None]:
# Discard modelling rows whose label is -1, the same filter applied to the
# statistics slice earlier.
train = train.loc[train[target].ne(-1)]


In [None]:
# Preview the modelling rows after the rows labelled -1 were removed.
train.head()

In [None]:
# Attach the per-user and per-content statistics to every modelling row.
# Left joins keep rows whose user or content has no statistics; those rows get
# missing values in the joined columns.
train = (
    train
    .merge(user_answers_df, how='left', on='user_id')
    .merge(content_answers_df, how='left', on='content_id')
)
train

In [None]:
# Missing values per column after the joins; the joined statistic columns are
# missing wherever a user or content id had no match in the aggregate tables.
train.isnull().sum()

In [None]:
# Treat a missing prior_question_had_explanation as False and convert the
# column from the nullable 'boolean' dtype to plain bool, then re-check nulls.
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].fillna(value=False).astype(bool)
train.isnull().sum()

In [None]:
# Replace every remaining missing value with 0.5 and confirm none are left.
# NOTE(review): the same constant is used for all columns, including the count
# columns (questions_answered, question_asked) and prior_question_elapsed_time,
# not only the accuracy-like ones -- confirm this is intended.
df = train.fillna(value=0.5)
df.isnull().sum()

In [None]:
# Keep only the model inputs and the label; every other column is dropped.
columns_to_keep = features + [target]
col_to_drop = set(train.columns.values.tolist()).difference(columns_to_keep)
print(col_to_drop)
df = df.drop(columns=list(col_to_drop))

In [None]:
# Preview the frame now that it holds only the feature columns and the label.
df.head()

In [None]:
# Turn infinities into missing values and fill them with the same 0.5 constant.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(0.5)
# One-hot encode the boolean flag. With drop_first=True the first level is
# dropped, so `k` holds one indicator column whose label is the remaining
# category value itself (the bool True, not a string) when both values occur.
k = pd.get_dummies(df['prior_question_had_explanation'],drop_first=True)
# The raw flag is removed here; the indicator in `k` is appended in the next cell.
df = df.drop(['prior_question_had_explanation'],axis=1)
df

In [None]:
# Append the indicator column(s) in `k` to the right of the feature frame.
# Not idempotent: running this cell twice appends the indicator twice.
df = pd.concat([df,k],axis=1)
df

In [None]:
# Final check that the model input frame contains no missing values.
df.isnull().sum()

In [None]:
# Seeded random split of the modelling rows: 60% to fit the models, 40% held
# out for validation. `test_df` / `y_test` are the validation part.
model_inputs = df.drop(columns=['answered_correctly'])
train_df, test_df, y_train, y_test = train_test_split(
    model_inputs,
    df[target],
    test_size=0.4,
    random_state=777,
)

In [None]:
# Keyword arguments for the LightGBM classifier, grouped by what they control.
_tree_shape = {
    'num_leaves': 30,
    'n_estimators': 300,
    'min_data_in_leaf': 100,
    'max_depth': 5,
}
_regularisation = {
    'lambda': 0.0,
    'feature_fraction': 1.0,
}

params = {**_tree_shape, **_regularisation}

In [None]:
# Fit the LightGBM classifier on the 60% training part of the split.
model = LGBMClassifier(**params)
model.fit(train_df, y_train)

In [None]:
# Validation ROC-AUC of the LightGBM model, scored on the predicted
# probability of class 1.
lgb_validation_scores = model.predict_proba(test_df)[:, 1]
print('LGB ROC-AUC score: ', roc_auc_score(y_test.values, lgb_validation_scores))


In [None]:
# Settings prepared for a CatBoost classifier.
# NOTE(review): no cell shown in this notebook passes `params_cat` to a model,
# so these values currently have no effect.
params_cat = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'task_type': 'GPU' ,
    'grow_policy': 'Lossguide',
    'iterations': 2500,
    'learning_rate': 4e-2,
    'random_seed': 0,
    'l2_leaf_reg': 1e-1,
    'depth': 15,
    'max_leaves': 10,
    'border_count': 128,
    'verbose': 50,
}

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and feature subsets at random, so it
# is seeded (with the same seed as the train/validation split) to make reruns
# reproduce the same model. n_jobs=-1 builds the trees on all cores; it does
# not change the fitted result.
rfclf = RandomForestClassifier(n_estimators=100, random_state=777, n_jobs=-1)
rfclf.fit(train_df, y_train)
# Hard 0/1 class predictions for the validation rows.
pred = rfclf.predict(test_df)

In [None]:
# Validation ROC-AUC of the random forest. AUC measures how well the scores
# rank positives above negatives, so it must be given the predicted
# probability of class 1; thresholded 0/1 labels collapse the curve to a single
# operating point and are not comparable with the LightGBM score above.
# roc_auc_score is already imported in the first cell.
print(roc_auc_score(y_test, rfclf.predict_proba(test_df)[:, 1]))

In [None]:
# Column dtypes of the validation feature frame.
test_df.dtypes

In [None]:
from xgboost import XGBClassifier
# Fit an XGBoost classifier with 100 trees on the same training part of the split.
xgbclf = XGBClassifier(n_estimators=100)
xgbclf.fit(train_df,y_train)
# Hard class predictions for the validation rows (not probabilities).
xgb_pred = xgbclf.predict(test_df)

In [None]:
# Validation ROC-AUC of the XGBoost model, scored on the predicted probability
# of class 1 rather than on thresholded 0/1 labels, so that it is a true
# ranking metric and comparable with the LightGBM score.
print(roc_auc_score(y_test, xgbclf.predict_proba(test_df)[:, 1]))

In [None]:
# from sklearn.neural_network import MLPClassifier
# mlp = MLPClassifier()
# mlp.fit(train_df,y_train)
# mlp_pred = mlp.predict(test_df)

In [None]:
# print(roc_auc_score(y_test,mlp_pred))


In [None]:
# Show the 20 highest feature weights of the random forest, as reported by eli5.
#eli5.show_weights(model, top=20)
eli5.show_weights(rfclf, top=20)

In [None]:
# Plot LightGBM's feature importances for the fitted `model`.
lgb.plot_importance(model)


In [None]:
# Column labels and dtypes of the validation feature frame.
print(test_df.columns)
test_df.dtypes

In [None]:
# Create the competition environment and the iterator that the submission loop
# below consumes batch by batch.
# NOTE(review): presumably make_env() can only be called once per kernel
# session -- confirm before re-running this cell.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
# Columns the models were fitted on, in training order.
model_columns = list(train_df.columns)
# Any fitted column that is not a named feature is an indicator produced by
# get_dummies during training; its label is the category value it encodes.
indicator_columns = [col for col in model_columns if col not in features]

# The loop variable is named `test_batch` so that it does not overwrite the
# validation frame `test_df` created by the train/validation split.
for (test_batch, sample_prediction_df) in iter_test:
    # Attach the pre-computed per-user and per-content statistics. Ids that were
    # never seen while building the statistics get missing values, filled below.
    test_batch = test_batch.merge(user_answers_df, on="user_id", how="left")
    test_batch = test_batch.merge(content_answers_df, on="content_id", how="left")

    # Same treatment as in training: a missing flag counts as False.
    had_explanation = (
        test_batch['prior_question_had_explanation'].fillna(value=False).astype(bool)
    )

    # Build the design matrix with exactly the training columns, in training
    # order. The indicator is derived directly from the flag instead of calling
    # get_dummies(drop_first=True) per batch, which yields no column at all when
    # a batch happens to contain only one distinct flag value.
    batch_features = test_batch.reindex(columns=model_columns)
    for col in indicator_columns:
        batch_features[col] = (had_explanation == col).astype(int)

    # Same missing-value / infinity handling as the training frame.
    batch_features = batch_features.replace([np.inf, -np.inf], np.nan).fillna(0.5)

    # predict_proba returns one column per class; column 1 is the probability of
    # answered_correctly == 1, which is the value to submit.
    test_batch['answered_correctly'] = rfclf.predict_proba(batch_features)[:, 1]

    # Only rows with content_type_id == 0 are submitted.
    env.predict(test_batch.loc[test_batch['content_type_id'] == 0, ['row_id', 'answered_correctly']])