In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import gc
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from tqdm.notebook import tqdm
import xgboost
from xgboost import XGBClassifier
from xgboost import plot_importance
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from tabulate import tabulate
import missingno as msno 
from IPython.display import display_html
from PIL import Image
import gc
import cv2
from scipy.stats import pearsonr
import tqdm
import seaborn as sns

from skimage.transform import resize
import copy
import re

import warnings
# Silence the noisier warning categories for the rest of the notebook.
for _ignored_category in (DeprecationWarning, UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Colour palette shared by the plotting helpers below; preview it once here.
custom_colors = [
    '#00FFE2',
    '#00FDFF',
    '#00BCFF',
    '#0082FF',
    '#8000FF',
    '#B300FF',
    '#F400FF',
]
sns.palplot(sns.color_palette(custom_colors))

# Seaborn style: white grid background; despine() is applied to the current figure.
sns.set_style("whitegrid")
sns.despine(left=True, bottom=True)

# Use the same tick-label size on both axes.
for _tick_group in ('xtick', 'ytick'):
    plt.rc(_tick_group, labelsize=12)

In [None]:
# Check the CUDA toolkit: `nvcc -V` prints the compiler release and `which nvcc`
# shows which nvcc binary is on PATH. `nvidia-smi` then prints the driver version
# and the current GPU status. Note that neither command reports the cuDNN version.
!nvcc -V && which nvcc
!nvidia-smi

In [None]:
!pip install GPUtil

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Empty the PyTorch CUDA cache, reset the numba context, and report usage.

    GPU utilisation is printed (via GPUtil's ``showUtilization``) once before
    and once after the cleanup so the effect is visible in the cell output.
    Takes no arguments and returns nothing.
    """

    def _report(label):
        # Print a heading followed by the GPUtil utilisation table.
        print(label)
        gpu_usage()

    _report("Initial GPU Usage")

    # Ask PyTorch to release the cached memory it is holding.
    torch.cuda.empty_cache()

    # Select device 0, close the numba CUDA context, then select device 0 again
    # so the device is usable afterwards.
    # NOTE(review): closing the context presumably invalidates GPU memory owned by
    # other libraries in this process — confirm before calling with live GPU frames.
    device_id = 0
    cuda.select_device(device_id)
    cuda.close()
    cuda.select_device(device_id)

    _report("GPU Usage after emptying the cache")

# Clear GPU caches once before any data is loaded onto the device; the helper
# prints utilisation before and after.
free_gpu_cache()

In [None]:
import os
import cudf
import dask_cudf

# Explicit dtypes for every column of train.csv, so cudf does not have to infer them.
train_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}

# Read the CSV on the GPU, split it into 6 dask partitions, then materialise it again.
# NOTE(review): the from_cudf(...).compute() round trip looks like it yields the same
# frame as cudf.read_csv alone — confirm whether the partitioning step is needed.
train = dask_cudf.from_cudf(
    cudf.read_csv('../input/riiid-test-answer-prediction/train.csv', dtype=train_dtypes),
    npartitions=6,
).compute()
print(train.shape)

In [None]:
# First look at the training frame: leading rows, null counts per column,
# number of distinct values per column, and summary statistics.
print(train.head())
print(train.isnull().sum())
cols = train.columns
for col in cols:
    # Format name and count explicitly. Wrapping them in a set literal ({col, n})
    # would print the pair in arbitrary order, because sets are unordered.
    print('Unique values : {}: {:,}'.format(col, train[col].nunique()))

# Last expression of the cell: transposed describe() table (one row per column).
train.describe().T

In [None]:
# Size of the training frame.
print(f"Rows: {len(train):,} \nColumns: {len(train.columns)}")

# Row count, kept for missing-value percentages.
total = len(train)

# Replace missing values with sentinels: -1 for the elapsed time,
# 0 for the "had explanation" flag.
_fill_values = {
    "prior_question_elapsed_time": -1,
    "prior_question_had_explanation": 0,
}
for _column, _sentinel in _fill_values.items():
    train[_column] = train[_column].fillna(_sentinel)

train.head()

In [None]:
def distplot_features(df, feature, title, color = custom_colors[4], categorical=True):
    '''Print summary statistics for a GPU dataframe column and draw its density curve.

    Parameters
    ----------
    df : GPU dataframe that contains `feature`.
    feature : name of the column to plot.
    title : figure title.
    color : line colour for the density curve.
    categorical : when True, the distribution of per-category counts
        (`value_counts`) is plotted; when False, the raw column values are
        plotted, using every 250000th value to keep the plot fast.

    Mean, median and max are printed for the full set of values in both modes.
    Returns nothing.
    '''
    # Pick the GPU-side data, copy it to host memory, then drop the GPU reference.
    series = df[feature]
    if categorical:
        series = series.value_counts()
    values = cupy.asnumpy(series.values)
    del series

    summary = (
        'Mean: {:,}'.format(np.mean(values)),
        '\nMedian: {:,}'.format(np.median(values)),
        '\nMax: {:,}'.format(np.max(values)),
    )
    print(*summary)

    # Raw values are thinned before plotting; counts are plotted in full.
    to_plot = values if categorical else values[::250000]

    plt.figure(figsize=(18, 3))
    sns.distplot(to_plot, hist=False, color=color, kde_kws={'lw': 3})
    plt.title(title, fontsize=15)
    plt.show()

    # Release the host copies explicitly.
    del to_plot, values
    gc.collect()

In [None]:
def barplot_features(df, feature, title, palette = custom_colors[2:]):
    '''Draw a bar plot of how often each category of `feature` occurs.

    Intended for columns with only a handful of distinct values.

    Parameters
    ----------
    df : GPU dataframe that contains `feature`.
    feature : name of the column whose category counts are plotted.
    title : figure title.
    palette : colours passed to seaborn for the bars.

    Returns nothing; the figure is shown and the temporaries are released.
    '''
    # Count once and read both the category labels (index) and the counts from the
    # same result. Reading the labels from the index avoids depending on the column
    # name that reset_index() assigns, which differs between library versions.
    counts = df[feature].value_counts()
    index = cupy.asnumpy(counts.index.values)
    values = cupy.asnumpy(counts.values)
    del counts

    plt.figure(figsize = (18, 3))
    # Honour the caller-supplied palette instead of a hard-coded one.
    sns.barplot(x = index, y = values, palette = palette)
    plt.title(title, fontsize=15)
    plt.show();

    del index, values
    gc.collect()

In [None]:
import cupy
# Continuous columns: plot the density of the raw values.
numerical_features = ['timestamp', 'prior_question_elapsed_time']

for column_name in numerical_features:
    distplot_features(
        train,
        feature=column_name,
        title=column_name + " distribution",
        color=custom_colors[1],
        categorical=False,
    )

In [None]:
# ID-like columns: plot the distribution of how many rows each value has.
categorical_features = ['user_id', 'content_id', 'task_container_id']

for column_name in categorical_features:
    distplot_features(
        train,
        feature=column_name,
        title=column_name + " countplot distribution",
        color=custom_colors[4],
        categorical=True,
    )

In [None]:
# Columns that are drawn as bar plots, one bar per category.
categorical_for_bar = [
    'content_type_id',
    'user_answer',
    'answered_correctly',
    'prior_question_had_explanation',
]

for column_name in categorical_for_bar:
    barplot_features(train, feature=column_name, title=column_name + " barplot")

In [None]:
# Load the question metadata onto the GPU.
questions = cudf.read_csv('../input/riiid-test-answer-prediction/questions.csv')

# Data Information
print("Rows: {:,}".format(len(questions)), "\n" +
      "Columns: {}".format(len(questions.columns)))

# Report every column that has missing values, with its share of all rows.
total = len(questions)

for column in questions.columns:
    # Count the nulls once per column and reuse the result.
    n_missing = int(questions[column].isna().sum())
    if n_missing != 0:
        # Fixed-point formatting: a bare "{:.2}" switches to scientific notation
        # (e.g. "1.2e+01") as soon as the percentage reaches 10.
        print("{} has: {:,} ({:.4f}%) missing values.".format(
            column, n_missing, (n_missing / total) * 100))


# Fill in missing tags with the string "-1", so the fill value is a string
# rather than an integer.
# NOTE(review): assumes `tags` is read as a string column — confirm.
questions["tags"] = questions["tags"].fillna("-1")

questions.head()

In [None]:
# Bar plots for the low-cardinality question columns.
question_bar_columns = ('part', 'correct_answer')
for column_name in question_bar_columns:
    barplot_features(
        questions,
        feature=column_name,
        title=column_name + " - barplot distribution",
    )

In [None]:
# categorical=True: plots the distribution of how many questions share each
# distinct `tags` value, not the tag values themselves.
distplot_features(questions, 'tags', title = "Tags - Count Distribution", color = custom_colors[0], categorical=True)

In [None]:
# Clear GPU caches again now that the exploratory plots are done.
# NOTE(review): the helper closes the numba CUDA context while `train` and
# `questions` are still on the GPU — presumably this is why `train` is reloaded
# in the next cell; confirm.
free_gpu_cache()

In [None]:
# Reload train.csv from scratch with explicit per-column dtypes.
train_dtypes = dict(
    row_id='int64',
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    task_container_id='int16',
    user_answer='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='boolean',
)

# Read on the GPU, split into 6 dask partitions, then materialise as one frame again.
train = dask_cudf.from_cudf(
    cudf.read_csv('../input/riiid-test-answer-prediction/train.csv', dtype=train_dtypes),
    npartitions=6,
).compute()
print(train.shape)

In [None]:
# Group the interactions by user and display one row per user.
# NOTE(review): nth(5) presumably selects the row at 0-based position 5 within
# each user_id group (users with fewer rows are omitted) — confirm against the
# cudf groupby documentation.
train_grouped = train.groupby('user_id')
train_grouped.nth(5)

In [None]:
# Per-user hold-out split: keep only rows with content_type_id == 0, order them by
# timestamp, and hold out the last 5 rows of every user as the validation set.
train = (
    train[train.content_type_id == False]
    .sort_values('timestamp')
    .reset_index(drop=True)
)
print(train.shape)

# The per-user tail is taken on a pandas copy of the frame; the result goes back
# to the GPU and the host-side copy is dropped.
valid_split = train.to_pandas().groupby('user_id').tail(5)
valid_split1 = cudf.from_pandas(valid_split)
del valid_split

# Everything that is not in the validation set (matched on row_id) is training data.
train_split1 = train[~train.row_id.isin(valid_split1.row_id)]

In [None]:
# Drop both split frames.
del train_split1, valid_split1

In [None]:
# !python -m pip --version

In [None]:
# !pip3 install --upgrade pip

In [None]:
# !git clone https://github.com/rapidsai/rapidsai-csp-utils.git
# !bash rapidsai-csp-utils/colab/rapids-colab.sh stable

# import sys, os

# # dist_package_index = sys.path.index('/usr/local/lib/python3.7/dist-packages')
# # sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.7/site-packages'] + sys.path[dist_package_index:]
# # sys.path
# exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

In [None]:
# #split training and testing set
# #created 5 cross validation sets

# val_size = 2500000

# for cv in range(5):
#     valid = train[-val_size:]
#     train = train[:-val_size]
#     # check new users and new contents
#     new_users = len(valid[~valid.user_id.isin(train.user_id)].user_id.unique())
#     valid_question = valid[valid.content_type_id == 0]
#     train_question = train[train.content_type_id == 0]
#     new_contents = len(valid_question[~valid_question.content_id.isin(train_question.content_id)].content_id.unique())    
#     print(f'cv{cv} {train_question.answered_correctly.mean():.3f} {valid_question.answered_correctly.mean():.3f} {new_users} {new_contents}')
#     valid.to_pickle(f'cv{cv+1}_valid.pickle')
#     train.to_pickle(f'cv{cv+1}_train.pickle') 

In [None]:
import pandas as pd
 
# Pre-computed train / validation frames, stored as pickles in a Kaggle dataset.
train_pickle ='../input/pickle-files-generated-xgboost/train_pickle.p'
valid_pickle ='../input/pickle-files-generated-xgboost/valid_pickle.p'

# NOTE(review): unpickling executes arbitrary code from the file — only load
# pickles from a source you trust.
train, valid = (
    pd.read_pickle(pickle_path) for pickle_path in (train_pickle, valid_pickle)
)

In [None]:

!apt update
!apt-get install -y cmake build-essential
!git clone https://github.com/dmlc/xgboost.git /tmp/xgboost
!pushd /tmp/xgboost/python-package
!git submodule init
!git submodule update
!python setup.py install
!popd

from sklearn.metrics import roc_auc_score

# Feature columns fed to the model, and the binary target column.
FEATS = ['row_id', 'user_id', 'content_id', 'content_type_id',
       'prior_question_elapsed_time',
       'prior_question_had_explanation', 'answered_correctly_sum_u', 'count_u',
       'answered_correctly_avg_u', 'answered_correctly_avg_c',
       'prior_question_elapsed_time_mean', 'question_id', 'part']

TARGET = ['answered_correctly']

# Feature / label views of the pre-computed train and validation frames.
X_train = train[FEATS]
y_train = train[TARGET]
X_valid = valid[FEATS]
y_valid = valid[TARGET]


import xgboost
from xgboost import XGBClassifier
from xgboost import plot_importance
import matplotlib.pyplot as plt

# Booster parameters for the native xgboost.train API.
params = {
    'max_depth' : 12,
    'max_leaves' : 2**8,
    'eta':0.6,  # Step-size shrinkage applied after each boosting step to make
                # the boosting process more conservative.
                # NOTE(review): 'eta' and 'learning_rate' (below) are aliases in
                # XGBoost but are set to different values here — keep only one.
    'alpha':0.1,     # L1 regularization term on weights.
    'lambda' : 0.2,  # L2 regularization term on weights.
                     # Increasing this value makes the model more conservative.
    'min_child_weight':1,
    'subsample':0.8,  # Fraction of training rows sampled in every boosting
                      # iteration (default 1); lower values help prevent overfitting.
    'tree_method' : 'gpu_hist',
    'learning_rate': 0.5, # default = 0.3
    'colsample_bytree':0.7, # Fraction of columns sampled once for every tree constructed.
    'eval_metric':'auc',
    'objective' : 'binary:logistic',
    'sample_type': 'weighted',
    'grow_policy' : 'lossguide',
    'n_estimators': 200,  # NOTE(review): this is a scikit-learn wrapper argument;
                          # xgboost.train presumably ignores it and uses
                          # num_boost_round instead — confirm.
    'normalize_type': 'tree',
    'rate_drop': 0.2,    # dropout rate
    'skip_drop': 0.1,    # probability of skipping dropout (if a dropout is
                         # skipped, new trees are added in the same manner as gbtree)
    # NOTE(review): sample_type / normalize_type / rate_drop / skip_drop look like
    # DART-booster settings and feature_selector like a gblinear setting; with
    # 'booster' left at its default they are presumably unused — confirm.
    'feature_selector':'thrifty',
    'deterministic_histogram': 'true',  # Histogram building is not deterministic
    # due to the non-associative aspect of floating point summation. A pre-rounding
    # routine mitigates the issue, which may lead to slightly lower accuracy.
    # 'booster' : 'dart'
    'single_precision_histogram':'true', # single precision to build histograms instead of double precision
    'sampling_method':'gradient_based', # only supported when tree_method is set to gpu_hist
    'predictor': 'gpu_predictor'

}

# Number of boosting rounds to train.
num_round = 50
train_matrix = xgboost.DMatrix(data = X_train, label = y_train)
test_matrix = xgboost.DMatrix(data = X_valid)
# Pass the round count explicitly; without it xgboost.train falls back to its
# default number of rounds and `num_round` has no effect on training.
xgb = xgboost.train(params, dtrain = train_matrix, num_boost_round = num_round)

# Predict with every trained tree. The model holds exactly `num_round` rounds, so
# the deprecated `ntree_limit` argument is not needed.
predicts = xgb.predict(test_matrix)
roc = roc_auc_score(y_valid.astype('int32'), predicts)
print('ROC for XGBoost model')
print(roc)
plot_importance(xgb)