In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import gc
import random
from random import sample

import numpy as np # linear algebra
import optuna
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import roc_auc_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
input_files = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show every column and every row when a DataFrame is displayed (no truncation)
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Explicit dtype for each loaded column ('boolean' is pandas' nullable boolean type)
train_dtypes = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}
# Columns are selected by position: 1-5 and 7-9 (positions 0 and 6 are not loaded)
train = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    usecols=[1, 2, 3, 4, 5, 7, 8, 9],
    dtype=train_dtypes,
)

In [None]:
# Distinct user ids as a plain list, ready to be passed to random sampling
user_list = list(train['user_id'].unique())

In [None]:
# Number of users in a 1/15 sample (integer division)
len(user_list)//15

In [None]:
# Draw a reproducible 1/15 subsample of the users. A dedicated, seeded generator
# makes a fresh kernel run (and re-running this cell) select the same users.
USER_SAMPLE_SEED = 42
sample_user_list = random.Random(USER_SAMPLE_SEED).sample(user_list, len(user_list)//15)

In [None]:
# Keep only the rows that belong to the sampled users
train = train[train['user_id'].isin(sample_user_list)]

In [None]:
# Order rows by user and then by timestamp; the per-user cumulative features
# computed further down depend on this row order
train = train.sort_values(['user_id','timestamp'])

In [None]:
# Rebuild a fresh 0..n-1 index after filtering and sorting
train = train.reset_index(drop=True)

In [None]:
# Trigger garbage collection after replacing the full table with the sample
gc.collect()

In [None]:
train.head()

In [None]:
# Replace missing values: 0 for the elapsed time, False for the explanation flag
missing_value_defaults = (
    ('prior_question_elapsed_time', 0),
    ('prior_question_had_explanation', False),
)
for column_name, default_value in missing_value_defaults:
    train[column_name] = train[column_name].fillna(default_value)

In [None]:
# Load the lecture metadata, mark every row with content_type_id 1 and rename
# the columns so that lecture_id becomes the content_id merge key
lectures = (
    pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')
    .assign(content_type_id=1)
    .rename(columns={'lecture_id': 'content_id', 'tag': 'lecture_tag', 'part': 'lecture_part'})
)
# Replace the space in 'solving question' with an underscore
# (presumably for cleaner column names once type_of is one-hot encoded)
lectures['type_of'] = lectures['type_of'].replace('solving question', 'solving_question')

In [None]:
# Frequency-encode the lecture tag: each tag is replaced by the number of
# lectures that carry it
lecture_tag_count =  lectures['lecture_tag'].value_counts().to_dict()
lectures['lecture_tag'] = lectures['lecture_tag'].map(lecture_tag_count)
# lectures['lecture_tag'] = lectures['lecture_tag'].map({3:'most',4:'second',2:'second',6:'third',5:'third',1:'third',7:'third'})

In [None]:
# How many lectures fall into each frequency value
lectures['lecture_tag'].value_counts()

In [None]:
# Fold the frequency values 7 and 5 into the value 6
lectures['lecture_tag'] = lectures['lecture_tag'].apply(lambda x : 6 if ((x==7)|(x==5)) else x)

In [None]:
# Distribution again, after folding
lectures['lecture_tag'].value_counts()

In [None]:
# One-hot encode the categorical lecture attributes. The integer dtype is set
# explicitly: pandas >= 2.0 produces bool dummies by default, whereas the
# per-user cumulative lecture counts computed later need numeric 0/1 columns.
# uint8 matches the default of older pandas versions.
lectures = pd.get_dummies(lectures, columns=['lecture_part','lecture_tag','type_of'], dtype='uint8')

In [None]:
# Inspect the column names produced by the one-hot encoding
lectures.columns

In [None]:
# Names of the lecture feature columns: every column except the two keys
# (content_id, content_type_id) used to merge lectures into train
lectures_column = lectures.columns.to_list()
lectures_column.remove('content_id')
lectures_column.remove('content_type_id')

In [None]:
lectures.head()

In [None]:
# Write the encoded lecture table to lectures.csv in the current working directory
lectures.to_csv('lectures.csv',index=False)

In [None]:
# Attach the lecture features. The left join keeps every train row; rows with
# no matching (content_id, content_type_id) in lectures get NaN
train = pd.merge(train,lectures, on = ['content_id','content_type_id'], how='left')

In [None]:
train.head()

In [None]:
# Rows without a matching lecture get 0 for every lecture feature
train[lectures_column] = train[lectures_column].fillna(0)

In [None]:
train.head()

In [None]:
# Running per-user total of each lecture feature, following the current row order
lecture_cumsum = train.groupby('user_id')[lectures_column].cumsum()

In [None]:
# Store the running totals as int32
for i in lectures_column:
    lecture_cumsum[i] = lecture_cumsum[i].astype('int32')

In [None]:
# Largest running total reached for each lecture feature
lecture_cumsum.max()

In [None]:
# Replace the per-row indicators in train with the running totals
train[lectures_column] = lecture_cumsum

In [None]:
train.head()

In [None]:
# Keep only rows with content_type_id == 0 (lectures carry content_type_id 1);
# the lecture running totals remain as columns on the remaining rows
train = train[train['content_type_id']==0].reset_index(drop=True)

In [None]:
# Per-user running sum of answered_correctly and 0-based position of each row
# within its user
# NOTE(review): answered_correctly is loaded as int8 — confirm the running sum
# is not kept in int8 by the installed pandas (it would overflow past 127)
answered_cumsum = train.groupby('user_id')['answered_correctly'].cumsum()
answered_count = train.groupby('user_id')['answered_correctly'].cumcount()

In [None]:
answered_count.head()

In [None]:
train['answered_cumsum'] = answered_cumsum
train['answered_count'] = answered_count
# Subtract the current row so the total only covers the user's earlier rows
train['answered_cumsum'] = train['answered_cumsum'] - train['answered_correctly']

In [None]:
# Share of the user's earlier answers that were correct (0/0 on a user's first row)
train['user_correctly_rate'] = train['answered_cumsum']/train['answered_count']
# With fewer than 5 earlier answers, use the constant 0.65 instead
train['user_correctly_rate'] = train['user_correctly_rate'].mask((train['answered_count'] < 5), .65)

In [None]:
train.head(100)

In [None]:
# External table keyed by task_container_id (its other columns are not visible here)
task_info = pd.read_csv('../input/avg-questions-seen/task_info.csv')

In [None]:
# Left join keeps every train row
train = pd.merge(train,task_info,on='task_container_id',how = 'left')

In [None]:
train.info()

In [None]:
# External table keyed by question_id
question_to_tag = pd.read_csv('../input/riiid-question-to-tag/question_to_tag.csv')

In [None]:
# The content_id of each train row is matched against question_id
train = pd.merge(train,question_to_tag,left_on = 'content_id',right_on='question_id',how = 'left')

In [None]:
# External table keyed by content_id
answered_correctly_content = pd.read_csv('../input/user-content-correctly/answered_correctly_content.csv')

In [None]:
train = pd.merge(train,answered_correctly_content,on = 'content_id',how = 'left')

In [None]:
# Integer-divide by 3,600,000 — presumably converts milliseconds to whole hours;
# TODO confirm the unit of timestamp
train['timestamp'] = train['timestamp']//3600000

In [None]:
# Pick 1/7 of the sampled users to hold out entirely from training. A dedicated,
# seeded generator keeps the split identical across runs and cell re-runs.
EDGE_SAMPLE_SEED = 2020
edge = random.Random(EDGE_SAMPLE_SEED).sample(sample_user_list, len(sample_user_list)//7)

In [None]:
# All rows of the held-out users form the initial test set; every other row
# stays in train
test = train[train['user_id'].isin(edge)]
train = train[~train.index.isin(test.index)]

In [None]:
# Additionally move the last 8 rows of every remaining user out of train
train_tail = train.groupby('user_id').tail(8)
train = train[~train.index.isin(train_tail.index)]

In [None]:
# Test set = held-out users plus the last rows of the training users
test = pd.concat([test,train_tail])

In [None]:
test.info()

In [None]:
# Separate the target from the model inputs; the key columns and the target
# itself are removed from the feature frames
non_feature_columns = [
    'user_id',
    'content_id',
    'content_type_id',
    'task_container_id',
    'answered_correctly',
    'question_id',
]
y_train, y_test = train['answered_correctly'], test['answered_correctly']
X_train = train.drop(columns=non_feature_columns)
X_test = test.drop(columns=non_feature_columns)

In [None]:
# Trigger garbage collection after building the feature / target frames
gc.collect()

In [None]:
# from  sklearn.model_selection import train_test_split
# X_train,X_test, y_train, y_test =train_test_split(train,target,test_size=0.2, random_state=0)

In [None]:
# Cast the nullable 'boolean' column to plain numpy bool before handing the
# frames to LightGBM. The whole column is reassigned instead of using
# `.loc[:, col] = ...`, because the .loc form writes into the existing column
# in place on current pandas and would keep the old 'boolean' dtype.
for feature_frame in (X_train, X_test):
    feature_frame['prior_question_had_explanation'] = (
        feature_frame['prior_question_had_explanation'].astype('bool')
    )

In [None]:
# Check the dtypes and memory footprint of the training features
X_train.info()

In [None]:
# Remove the notebook-level reference to the full train frame, then collect garbage
del train
# del target
gc.collect()

In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

In [None]:
# Columns LightGBM is told to treat as categorical, identical for both datasets
categorical_columns = ['prior_question_had_explanation','part','tags1','tags2']

# Training and validation datasets; free_raw_data=False keeps the raw frames attached
lgb_train = lgb.Dataset(
    X_train,
    y_train,
    free_raw_data=False,
    categorical_feature=categorical_columns,
)
lgb_eval = lgb.Dataset(
    X_test,
    y_test,
    free_raw_data=False,
    categorical_feature=categorical_columns,
)

In [None]:
# Remove the notebook-level names for the training features and labels,
# then run the garbage collector
del X_train, y_train
gc.collect()

In [None]:
def objective(trial):
    """Optuna objective: train LightGBM with sampled hyperparameters and score it.

    Reads the notebook-level ``lgb_train`` / ``lgb_eval`` datasets and the
    ``X_test`` / ``y_test`` hold-out data.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC of the trained model's predictions on ``X_test``.
    """
    # Hyperparameters sampled by Optuna for this trial
    tuned = {
        'num_leaves': trial.suggest_int('num_leaves', 100, 500),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 4, 30),
        'min_child_weight': trial.suggest_int('min_child_weight', 4, 16),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 80),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 1.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 1.0),
        # presumably LightGBM's shorthand spellings for false / true — TODO confirm
        'is_unbalance': trial.suggest_categorical('is_unbalance', ['-', '+']),
    }
    # Settings shared by every trial
    fixed = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        # NOTE(review): no num_boost_round is passed to lgb.train below; confirm
        # the default number of rounds is large enough for this patience to trigger
        'early_stopping_rounds': 100,
    }

    booster = lgb.train(
        {**tuned, **fixed},
        lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        verbose_eval=20,
    )
    auc = roc_auc_score(y_test, booster.predict(X_test))
    print(f"AUC = {auc}")
    return auc

In [None]:
# Run 20 trials, maximizing the value returned by objective
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

In [None]:
# Summarize the search: number of trials, best value and the sampled
# parameters of the best trial
print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial
print('  Value: {}'.format(trial.value))
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

In [None]:
# Plot the optimization history of the study
from optuna.visualization import plot_optimization_history
plot_optimization_history(study)

In [None]:
# Train the final model with the best hyperparameters found by the study.
# The best trial's params contain only the values Optuna sampled, so the fixed
# settings used inside objective() are re-applied here; without them LightGBM
# would fall back to its default objective instead of binary classification
# with AUC and early stopping.
final_params = {
    **study.best_params,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'early_stopping_rounds': 100,
}
model = lgb.train(final_params, lgb_train, valid_sets=[lgb_eval], verbose_eval=1000)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Display the feature importances of the trained model
lgb.plot_importance(model)
plt.show()

In [None]:
# Save the trained model to model.txt in the current working directory
model.save_model('model.txt')