In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sys
!cp ../input/rapids/rapids.0.15.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

# Rapids Imports
import cudf
import cupy # CuPy is an open-source array library accelerated with NVIDIA CUDA.

In [None]:
%%time
#get the data
import numpy as np
import pandas as pd
# Compact dtypes keep the frame small; the nullable "boolean" dtype is used
# for the two flag columns so that missing values survive parsing.
dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "boolean",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32",
    "prior_question_had_explanation": "boolean"
}

# Number of leading rows of train.csv used in this notebook.
N_ROWS = 1500000

# Let the parser stop after N_ROWS rows instead of loading the whole file
# and slicing afterwards; the resulting frame is the same first N_ROWS rows.
train = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv',
                    dtype=dtypes, nrows=N_ROWS)
print('Loaded dataset!')

## Look at the values in `answered_correctly`
### The reason for checking them is explained further below

In [None]:
# Frequency of each distinct value in the target column.
train["answered_correctly"].value_counts()

## do the same for user_answer 

In [None]:
# Frequency of each distinct value in user_answer.
train["user_answer"].value_counts()

### How many NaN values are there in `prior_question_elapsed_time`?

In [None]:
# Number of missing entries in prior_question_elapsed_time.
train["prior_question_elapsed_time"].isna().sum()

In [None]:
# Missing-value count for every column (isnull is an alias of isna).
train.isnull().sum()

In [None]:
# Rows that have -1 in both answer columns and no prior elapsed time.
erase_mask = (
    train["user_answer"].eq(-1)
    & train["answered_correctly"].eq(-1)
    & train["prior_question_elapsed_time"].isna()
)
# Number of rows matching the mask.
del_ids = erase_mask.sum()

In [None]:
# Share of the current frame that the flagged rows represent, in percent.
erased_pct = del_ids / len(train) * 100
print('percentage Ids that have been erased {:.4f}%'.format(erased_pct))

### Rows with -1 in `user_answer` and `answered_correctly` are lectures, which are not needed for training,
### so drop them

In [None]:
# Drop the rows that carry -1 in both answer columns and have no prior
# elapsed time; they hold no label to learn from.
rows_before = len(train)
print('Train length : ', rows_before)

lecture_mask = (
    (train.user_answer == -1)
    & (train.answered_correctly == -1)
    & train['prior_question_elapsed_time'].isna()
)
n_erased = lecture_mask.sum()

train = train.drop(index=train.index[lecture_mask])

# The percentage is relative to the size BEFORE dropping; dividing by the
# post-drop length would overstate the share of removed rows.
print("We erased  {:.3}% of all data.".format(n_erased / rows_before * 100))


In [None]:
# (rows, columns) of the frame after the drop above.
train.shape  

In [None]:
# Distribution of user_answer on the current (filtered) frame.
train["user_answer"].value_counts()

In [None]:
# Print the (rows, columns) tuple of the current training frame.
print('Train shape: ' ,train.shape)

# preprocessing the data

In [None]:
# Report every column that still contains missing values, with the count
# and its share of all rows.
total = len(train)

for column in train.columns:
    # Count once per column instead of rescanning it for every use.
    n_missing = train[column].isna().sum()
    if n_missing != 0:
        # Fixed-point "{:.2f}": a bare "{:.2}" switches to scientific notation
        # for values >= 10 (e.g. "1.2e+01").
        print("{} has: {:,} ({:.2f}%) missing values.".format(column, n_missing,
                                                              (n_missing / total) * 100))

In [None]:
# Impute missing values, then convert the two flag columns to plain integers.
# Elapsed time is filled with the column mean; the explanation flag is filled
# with its most frequent value (first entry of value_counts()).
elapsed_mean = np.float32(train["prior_question_elapsed_time"].mean())
most_common_explanation = train["prior_question_had_explanation"].value_counts().index[0]

train["prior_question_elapsed_time"] = train["prior_question_elapsed_time"].fillna(elapsed_mean)

explanation_filled = train["prior_question_had_explanation"].fillna(most_common_explanation)
train["prior_question_had_explanation"] = explanation_filled.astype(int)

train["content_type_id"] = train["content_type_id"].astype(int)
train.head()

In [None]:
# Number of distinct users and the average number of rows per user.
# nunique() gives the distinct-user count; summing the unique values of the
# per-user counts (as was done before) does not.
user_row_counts = train['user_id'].value_counts()
print("{} has: {:,} unique user ids,\n{:.0f} is the average number of times user_id appears.".format(
    'user_id', train['user_id'].nunique(), user_row_counts.mean()))
# Open question: is dropping users with < 15 appearances better than dropping
# users with fewer appearances than the mean?
#mean_id = train['user_id'].value_counts().mean()

In [None]:
# Remove users with very few interactions: fewer than MIN_USER_ROWS rows.
MIN_USER_ROWS = 15

# Filter the per-user counts directly. Going through reset_index() and the
# "index"/"user_id" column names depends on the pandas version (value_counts
# names its columns differently from pandas 2.0 on).
user_row_counts = train["user_id"].value_counts()
ids_to_erase = user_row_counts[user_row_counts < MIN_USER_ROWS].index

# .copy() makes new_train an independent frame, so later modifications do not
# raise SettingWithCopyWarning.
new_train = train[~train['user_id'].isin(ids_to_erase)].copy()

print("We erased {} rows meaning {:.3}% of all data.".format(len(train)-len(new_train), (1 - len(new_train)/len(train))*100))
del ids_to_erase


In [None]:
# (rows, columns) left after removing the low-activity users.
new_train.shape

In [None]:
# Drop the reference to the unfiltered frame; only new_train is used from here on.
del train

In [None]:
# Column names currently present in new_train.
new_train.columns

In [None]:
# Preview the first rows of the filtered frame.
new_train.head()

In [None]:
# Drop the columns that are not used as features. The `columns=` keyword
# replaces the positional axis argument (`drop(label, 1)`), which newer pandas
# no longer accepts, and rebinding replaces `inplace=True`.
new_train = new_train.drop(columns=['row_id', 'user_answer', 'user_id'])

In [None]:
# Separate the target from the features. The `columns=` keyword replaces the
# positional axis argument, which newer pandas no longer accepts.
y = new_train['answered_correctly']
new_train = new_train.drop(columns=['answered_correctly'])

In [None]:
from sklearn.model_selection import train_test_split



In [None]:
# Hold out 30% of the rows. A fixed random_state makes the shuffle, and
# everything computed downstream of it, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(new_train, y, test_size=0.3, random_state=0)

In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

In [None]:
from sklearn.model_selection import TimeSeriesSplit, KFold

# Number of cross-validation folds; also used to average the per-fold results.
n_folds = 5
# Plain K-fold is the splitter actually used. The TimeSeriesSplit previously
# assigned here was overwritten on the next line and never took effect.
# n_splits now follows n_folds instead of repeating the literal 5.
folds = KFold(n_splits=n_folds)

In [None]:
# Feature names, in the order they appear in the training frame.
columns = x_train.columns

# split() returns a one-shot generator of (train_index, valid_index) pairs;
# once iterated it is exhausted and must be recreated.
splits = folds.split(x_train, y_train)

# Accumulators filled by the cross-validation loop:
#   y_oof     - out-of-fold predictions, one slot per training row
#   y_preds   - fold-averaged predictions for the hold-out rows
#   score_auc - running mean of the per-fold AUC
y_oof = np.zeros(len(x_train))
y_preds = np.zeros(len(x_test))
score_auc = 0

# One row per feature; per-fold importance columns are added later.
feature_importances = pd.DataFrame()
feature_importances['feature'] = columns

In [None]:
# LightGBM training parameters.
params = {
        # The target is 0/1. Without an explicit objective LightGBM falls back
        # to its default ('regression').
        'objective': 'binary',
        'num_leaves': 64,
        'min_child_weight' : 0.03,
        'max_depth': -1,
        # NOTE(review): 0.04 is a very small fraction for a frame with only a
        # handful of features - confirm this value is intended.
        'feature_fraction': 0.04,
        'learning_rate': 0.006,
        'min_data_in_leaf': 80,
        'metric': 'auc',
        # NOTE(review): per the LightGBM docs bagging needs a non-zero
        # 'bagging_freq' as well, so this setting likely has no effect as is.
        'bagging_fraction': 0.33,
        'boosting_type': 'gbdt',
        'reg_alpha' : 0.3,
        'reg_lambda' : 0.6,
        'verbosity': -1,
        # Was misspelled 'random_sate', so the seed was never applied.
        'random_state' : 0
        }

In [None]:
%%time
# K-fold training: fit one model per fold, record its feature importances,
# store out-of-fold predictions and average the hold-out predictions.

# Reset the accumulators and build a fresh split here, so that re-running
# this cell starts from a clean state. A split generator is exhausted after
# one pass, so reusing one would make a re-run loop zero times.
y_preds = np.zeros(x_test.shape[0])
y_oof = np.zeros(x_train.shape[0])
score_auc = 0

for fold_n, (train_index, valid_index) in enumerate(folds.split(x_train, y_train)):
    x_tr, x_val = x_train[columns].iloc[train_index], x_train[columns].iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

    dtrain = lgb.Dataset(x_tr, label=y_tr)
    dvalid = lgb.Dataset(x_val, label=y_val)

    # Up to 10000 rounds, stopping after 100 rounds without improvement.
    # NOTE(review): `verbose_eval` / `early_stopping_rounds` were removed from
    # lgb.train in LightGBM 4.0; switch to callbacks when upgrading.
    clf = lgb.train(params, dtrain, 10000, valid_sets=[dtrain, dvalid], verbose_eval=200, early_stopping_rounds=100)

    feature_importances[f'fold_{fold_n + 1} '] = clf.feature_importance()

    y_pred_val = clf.predict(x_val)
    y_oof[valid_index] = y_pred_val

    # Compute the fold AUC once and reuse it for the log line and the mean.
    fold_auc = roc_auc_score(y_val, y_pred_val)
    print(f"Fold {fold_n + 1} | AUC : {fold_auc}")
    score_auc += fold_auc / n_folds

    y_preds += clf.predict(x_test) / n_folds

    del x_tr, x_val, y_tr, y_val

# `clf` now refers to the model of the last fold only.
print(f"Mean AUC : {score_auc}")

In [None]:
%%time
print('Loading riiid!')
import riiideducation
#Create the env
env = riiideducation.make_env()
print('Loaded riiideducation!')

#Create the iterator
iter_test = env.iter_test()

# Feature columns passed to the model, in this fixed order.
FEATURES = ['timestamp', 'content_id', 'content_type_id',
            'task_container_id', 'prior_question_elapsed_time',
            'prior_question_had_explanation']

# Impute test batches with statistics of the training features, not of the
# batch itself. This keeps preprocessing consistent with training and avoids
# an IndexError on batches whose explanation flag is entirely missing
# (value_counts() is empty there).
elapsed_fill = np.float32(x_train['prior_question_elapsed_time'].mean())
explanation_fill = bool(x_train['prior_question_had_explanation'].mode().iloc[0])

#Iter and predict
for (test_df, sample_prediction_df) in iter_test:
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(elapsed_fill)
    test_df['prior_question_had_explanation'] = (
        test_df['prior_question_had_explanation'].fillna(explanation_fill).astype(int)
    )
    # NOTE: `clf` is whatever model was trained last (the final CV fold).
    test_df['answered_correctly'] = clf.predict(np.array(test_df[FEATURES]))
    # Only rows with content_type_id == 0 are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])