Learning Kaggle. Thanks to Kostiantyn Isaienkov, whose code helped me get started!

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

In [None]:
%%time
used_data_types_dict = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float16',
    'prior_question_had_explanation': 'boolean'
}

df = pd.read_csv(
    '../input/riiid-test-answer-prediction/train.csv',
    usecols = used_data_types_dict.keys(),
    dtype = used_data_types_dict,
    nrows=10**7
)
# Commented out nrows=10**6
# all data runs out of memory, need to fix this
# I think with the chunking syntax or datatable or something

To try RAPIDS: go to **Add Data**, search for "RAPIDS", and add the RAPIDS dataset.
<br />
... still having trouble with this: the import fails with
`ModuleNotFoundError: No module named 'cudf'`.

In [None]:
# Disabled experiment: the RAPIDS / cuDF setup below is wrapped in a string
# literal, so none of it executes (the cell only evaluates to that string).
# Before re-enabling, note that `%%time` appears twice and not on the first line;
# IPython cell magics are only recognized as the first line of a cell, so this
# text would have to be split into separate cells.
'''

%%time

# Import the Rapids suite here - takes abot 1.5 mins

import sys
!cp ../input/rapids/rapids.0.16.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

# Rapids Imports
import cudf
import cupy # CuPy is an open-source array library accelerated with NVIDIA CUDA.


from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

cluster = LocalCUDACluster()
client = Client(cluster)
client

%%time
df = cudf.read_csv('../input/riiid-test-answer-prediction/train.csv')
df.info()
'''

In [None]:
# Basic cleaning: infinities become NaN, the explanation flag becomes a plain
# bool (missing -> False), and every remaining NaN is filled with 0.5.
infinite_values = [np.inf, -np.inf]
df = df.replace(infinite_values, np.nan)

had_explanation = df['prior_question_had_explanation'].fillna(value=False)
df['prior_question_had_explanation'] = had_explanation.astype(bool)

df = df.fillna(0.5)
df.info()

In [None]:
# Per-user answer statistics, computed only on rows whose `answered_correctly`
# is not -1.
train_questions_only_df = df.loc[df['answered_correctly'].ne(-1)]
grouped_by_user_df = train_questions_only_df.groupby('user_id')

user_answers_df = (
    grouped_by_user_df['answered_correctly']
    .agg(['mean', 'count', 'std', 'median', 'skew'])
    .rename(columns={
        'mean': 'mean_user_accuracy',
        'count': 'questions_answered',
        'std': 'std_user_accuracy',
        'median': 'median_user_accuracy',
        'skew': 'skew_user_accuracy',
    })
)
user_answers_df.head(5)

In [None]:
# Per-content answer statistics over the same question-only rows, indexed by
# content_id.
grouped_by_content_df = train_questions_only_df.groupby('content_id')

content_stat_names = {
    'mean': 'mean_accuracy',
    'count': 'question_asked',
    'std': 'std_accuracy',
    'median': 'median_accuracy',
    'skew': 'skew_accuracy',
}
content_answers_df = grouped_by_content_df['answered_correctly'].agg(list(content_stat_names))
content_answers_df.columns = list(content_stat_names.values())
content_answers_df.head(5)

In [None]:
# Per-user span between first and last timestamp, divided by 1000, broadcast to
# every row of that user. Built-in 'max'/'min' transforms replace the per-group
# Python lambda: same values, but computed vectorized instead of once per user.
timestamps_by_user = df.groupby('user_id')['timestamp']
df['timespend'] = (
    timestamps_by_user.transform('max') - timestamps_by_user.transform('min')
) / 1000

In [None]:
# Question metadata; merged onto the interactions later via question_id.
questions_csv = '../input/riiid-test-answer-prediction/questions.csv'
q_df = pd.read_csv(questions_csv)
q_df.info()

In [None]:
# Split the space-separated `tags` string into one column per tag position.
tags = q_df["tags"].str.split(" ", n=10, expand=True)
# Name the columns after however many the split actually produced, rather than
# assuming exactly six (a fixed-length list raises ValueError on any other count).
tags.columns = [f'tags{position}' for position in range(1, tags.shape[1] + 1)]
q_df = pd.concat([q_df, tags], axis=1).drop(['tags'], axis=1)

# Only the first three tag positions are used as model features. Convert them to
# numbers; missing or unparseable tags become -1.
for tag_column in ('tags1', 'tags2', 'tags3'):
    q_df[tag_column] = pd.to_numeric(q_df[tag_column],
                                     errors='coerce',
                                     downcast='integer').fillna(-1)
q_df.head(3)

In [None]:
# Model input columns, grouped by origin. The order matters: the scaler and the
# model both see the columns positionally.
features = (
    # per-user statistics
    ['mean_user_accuracy', 'questions_answered', 'std_user_accuracy',
     'median_user_accuracy', 'skew_user_accuracy']
    # per-content statistics (skew_accuracy comes later in the list)
    + ['mean_accuracy', 'question_asked', 'std_accuracy', 'median_accuracy']
    # raw interaction columns
    + ['prior_question_elapsed_time', 'prior_question_had_explanation']
    + ['skew_accuracy']
    # question metadata
    + ['bundle_id']
    + ['tags%d' % position for position in (1, 2, 3)]
)
# Label column.
target = 'answered_correctly'

In [None]:
# Keep only rows that carry a real label (target != -1).
has_label = df[target].ne(-1)
df = df.loc[has_label]

In [None]:
# Attach per-user stats, per-content stats and question metadata; left joins keep
# every interaction row.
df = (
    df
    .merge(user_answers_df, on='user_id', how='left')
    .merge(content_answers_df, on='content_id', how='left')
    .merge(q_df, left_on='content_id', right_on='question_id', how='left')
)

In [None]:
# Random 80/20 row split with a fixed seed for reproducibility.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Hyper-parameters (found with Optuna).
params = {
    # NOTE(review): per the LightGBM docs, bagging_fraction only takes effect when
    # bagging_freq is non-zero, and it is not set here. Left unchanged so the model
    # stays the same. TODO confirm whether bagging was intended.
    'bagging_fraction': 0.5817242323514327,
    'feature_fraction': 0.6884588361650144,
    'learning_rate': 0.42887924851375825,
    'max_depth': 6,
    # The original passed both min_child_samples=946 and min_data_in_leaf=47.
    # LightGBM documents these as aliases of one setting, and when both are given
    # the canonical name (min_data_in_leaf) is the one that applies. Only that
    # effective value is kept, under the sklearn-API name, so the conflicting pair
    # is gone. TODO confirm 47 (not 946) is the intended value.
    'min_child_samples': 47,
    'n_estimators': 169,
    'num_leaves': 29,
    'random_state': 666
}
model = LGBMClassifier(**params)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only; the fitted `scaler` is reused
# afterwards to transform other data the same way.
scaler = StandardScaler()
train_features = scaler.fit_transform(train_df[features])

In [None]:
# Train on the scaled feature matrix against the label column.
train_labels = train_df[target]
model.fit(train_features, train_labels)

In [None]:
# Hold-out evaluation on the 20% split, scaled with the scaler fitted on train.
test_df = test_df[test_df['answered_correctly'] != -1]
test_features = scaler.transform(test_df[features])

# Hard 0/1 predictions; kept under the name `preds` because the confusion-matrix
# cell reads it.
preds = model.predict(test_features)

# ROC AUC scores how well the model ranks rows, so it needs the predicted
# probability of the positive class. Scoring the thresholded labels reduces the
# curve to a single operating point and misstates the AUC.
pred_probs = model.predict_proba(test_features)[:, 1]
print(roc_auc_score(test_df[target], pred_probs))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the hard predictions on the hold-out split.
# TODO (original note): try to remedy this with random oversampling.
holdout_confusion = confusion_matrix(test_df[target], preds)
print(holdout_confusion)

In [None]:
# Share of hold-out rows whose label is 1.
labels = test_df[target].value_counts()
positive_count = labels[1]
negative_count = labels[0]
print(positive_count / (negative_count + positive_count))

In [None]:
# Competition inference API module. Presumably only importable inside the Kaggle
# competition environment (TODO confirm).
import riiideducation
# Environment handle; the inference loop submits each batch through env.predict().
env = riiideducation.make_env()
# Iterated by the inference loop as (test_df, sample_prediction_df) pairs.
iter_test = env.iter_test()

In [None]:
# Inference loop: build the training-time features for each batch, predict the
# probability of a correct answer, and submit the question rows.
for (test_df, sample_prediction_df) in iter_test:
    test_df = test_df.merge(user_answers_df, how='left',
                           on='user_id')
    test_df = test_df.merge(content_answers_df, how='left',
                           on='content_id')
    test_df = test_df.merge(q_df, how='left',
              left_on='content_id', right_on='question_id')

    # Not part of `features`; kept so the batch frame has the same columns as the
    # training frame.
    test_df['timespend'] = test_df.groupby('user_id')['timestamp'].transform(lambda x: (x.max() - x.min()) / 1000)

    # Same cleaning as at training time: missing explanation flag -> False,
    # missing elapsed time -> 0.5.
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(value=False).astype(bool)
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(0.5)

    # The model was fitted on StandardScaler output, so the batch must go through
    # the same fitted scaler. Feeding raw features puts them on a different scale
    # from the one the trees were split on.
    batch_features = scaler.transform(test_df[features])
    test_df['answered_correctly'] = model.predict_proba(batch_features)[:, 1]

    # Only question rows (content_type_id == 0) are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0,
                           ['row_id', 'answered_correctly']])