In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

import riiideducation

# Render every column when a DataFrame is displayed (None = no column limit).
pd.set_option('display.max_columns', None)

In [None]:
# Create the competition environment object. `env` is used at the end of the
# notebook for `env.iter_test()` (test batches) and `env.predict(...)` (submission).
env = riiideducation.make_env()

In [None]:
# Explicit dtype for every column that is read from train.csv.
# 'boolean' is pandas' nullable boolean dtype, so missing flags stay missing.
train_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}

# Only the first 10**7 rows of the file are loaded.
train_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/train.csv',
    dtype=train_dtypes,
    nrows=10**7,
    low_memory=False,
)

# train_df = train_df.query('answered_correctly != -1').reset_index(drop=True)
# train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].astype(float) 

In [None]:
# Display the loaded training rows.
train_df

In [None]:

# Positional 90/10 split of the loaded rows:
#   - the leading 90% is used to compute the aggregate (user/question) statistics,
#   - the trailing 10% becomes the rows the model is trained and validated on.
# NOTE(review): if train.csv is ordered by user, the two parts will share almost
# no users and the per-user features will be unmatched after the merge — TODO confirm.
split_at = int( 9 / 10 * len(train_df) )

features_part_df = train_df.iloc[:split_at]
train_part_df = train_df.iloc[split_at:]

In [None]:
# Display the trailing 10% slice that will be used for model training.
train_part_df

In [None]:

# Drop rows whose `answered_correctly` is -1 before aggregating.
is_answered = features_part_df['answered_correctly'] != -1
train_questions_only_df = features_part_df[is_answered]

grouped_by_user_df = train_questions_only_df.groupby('user_id')

# Per-user statistics; the result has two-level columns (source column, statistic).
user_answers_df = grouped_by_user_df.agg({
    'answered_correctly': ['mean', 'count', 'std', 'median', 'skew'],
    'prior_question_had_explanation': ['mean'],
}).copy()

# Score = mean accuracy * number of answers, i.e. the count of correct answers.
user_score = user_answers_df[('answered_correctly', 'mean')] * (user_answers_df[('answered_correctly', 'count')])
user_answers_df[('test', 'score')] = user_score

# Standardise the score across users: deviation from the mean, population
# standard deviation (divide by N), then rescale to mean 50 / spread 10.
mean = user_answers_df[('test', 'score')].mean()

score_deviation = user_answers_df[('test', 'score')] - mean
user_answers_df[('test', 'deviation')] = score_deviation

user_answers_df[('test', 'square')] = user_answers_df[('test', 'deviation')] * user_answers_df[('test', 'deviation')]

variance = user_answers_df[('test', 'square')].sum() / len(user_answers_df.index)
standard_deviation = np.sqrt(variance)

user_answers_df[('test', 'Deviation_Value')] = (user_answers_df[('test', 'deviation')] * 10 / standard_deviation) + 50

In [None]:
# Flag users whose mean `prior_question_had_explanation` is at least 0.8.
#
# This is a single vectorized comparison instead of an `iterrows()` loop that
# assigned through chained indexing (`df[col][index] = 1`). Chained assignment
# triggers SettingWithCopyWarning and does not write through to the frame when
# pandas copy-on-write is active, which would leave every flag at 0.
# A missing mean compares as "not smart" (0) rather than raising.
explanation_rate = user_answers_df[('prior_question_had_explanation', 'mean')]
user_answers_df[('test', 'smart_user')] = (explanation_rate >= 0.8).fillna(False).astype('int64')

# Flatten the two-level column index into plain feature names. The order must
# match the column order built above: 5 answer statistics, the explanation
# mean, then the derived score columns and the smart_user flag.
user_answers_df.columns = [
    'mean_user_accuracy', 'questions_answered', 'std_user_accuracy',
    'median_user_accuracy', 'skew_user_accuracy',
    'mean_prior_question_had_explanation',
    'user_score', 'score_deviation', 'score_square', 'Deviation_Value',
    'smart_user',
]

In [None]:
# Display the per-user aggregate features.
user_answers_df

In [None]:

questions_df = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')

# Per-question answer statistics, computed from the same filtered rows that
# were used for the per-user aggregates.
grouped_by_content_df = train_questions_only_df.groupby('content_id')
content_answers_df = grouped_by_content_df.agg(
    {'answered_correctly': ['mean', 'count', 'std', 'median', 'skew']}
).copy()
content_answers_df.columns = ['mean_accuracy', 'question_asked', 'std_accuracy', 'median_accuracy', 'skew_accuracy']

# Attach the statistics to the question metadata; questions without any
# matching rows keep NaN statistics (left join).
questions_df = questions_df.merge(
    content_answers_df,
    how='left',
    left_on='question_id',
    right_on='content_id',
)

# Number of questions sharing each bundle_id.
bundle_dict = questions_df['bundle_id'].value_counts().to_dict()

# Count of correct answers per question (mean accuracy * times asked).
questions_df['right_answers'] = questions_df['mean_accuracy'] * questions_df['question_asked']

questions_df['bundle_size'] = questions_df['bundle_id'].map(bundle_dict)

In [None]:
# Display the question metadata with the attached answer statistics.
questions_df

In [None]:

# Pool correct-answer and asked counts over all questions of a bundle, then
# derive the bundle-level accuracy from the pooled totals.
grouped_by_bundle_df = questions_df.groupby('bundle_id')

bundle_answers_df = grouped_by_bundle_df.agg(
    bundle_right_answers=('right_answers', 'sum'),
    bundle_questions_asked=('question_asked', 'sum'),
)

bundle_answers_df['bundle_accuracy'] = (
    bundle_answers_df['bundle_right_answers'] / bundle_answers_df['bundle_questions_asked']
)

In [None]:
# Display the per-bundle totals and accuracy.
bundle_answers_df

In [None]:

# Same pooling as for bundles, but grouped by the question's `part`.
grouped_by_part_df = questions_df.groupby('part')

part_answers_df = grouped_by_part_df.agg(
    part_right_answers=('right_answers', 'sum'),
    part_questions_asked=('question_asked', 'sum'),
)

part_answers_df['part_accuracy'] = (
    part_answers_df['part_right_answers'] / part_answers_df['part_questions_asked']
)

In [None]:
# Display the per-part totals and accuracy.
part_answers_df

In [None]:

# Keep only rows with a real label: drop rows whose `answered_correctly` is -1.
has_label = train_part_df['answered_correctly'].ne(-1)
train_part_df = train_part_df.loc[has_label]

In [None]:
# Attach every aggregate table to the training rows. All joins are left joins,
# so rows without a match keep NaN in the joined columns.
train_part_df = (
    train_part_df
    # user_answers_df: per-user statistics
    .merge(user_answers_df, how='left', on='user_id')
    # questions_df: question metadata + per-question statistics
    .merge(questions_df, how='left', left_on='content_id', right_on='question_id')
    # bundle_answers_df: per-bundle totals (bundle_id comes from questions_df)
    .merge(bundle_answers_df, how='left', on='bundle_id')
    # part_answers_df: per-part totals (part comes from questions_df)
    .merge(part_answers_df, how='left', on='part')
)

In [None]:

# A missing explanation flag is treated as False and stored as a plain bool.
explanation_flag = train_part_df['prior_question_had_explanation'].fillna(value=False)
train_part_df['prior_question_had_explanation'] = explanation_flag.astype(bool)

# Every remaining missing value (e.g. unmatched join columns) becomes -1.
train_part_df = train_part_df.fillna(value=-1)

In [None]:

# Encode the boolean explanation flag as integer labels.
le = LabelEncoder()
le.fit(train_part_df["prior_question_had_explanation"])
train_part_df["prior_question_had_explanation"] = le.transform(
    train_part_df["prior_question_had_explanation"]
)

In [None]:
# Display the training rows after merging, filling and encoding.
train_part_df

In [None]:
# List the available column names (candidates for the feature list below).
train_part_df.columns

In [None]:
# Old feature sets (kept for reference)
# features = [
#     'timestamp','mean_user_accuracy', 'questions_answered','mean_accuracy',
#     'question_asked','prior_question_elapsed_time', 'prior_question_had_explanation',
#     'bundle_size', 'bundle_accuracy','part_accuracy', 'right_answers'
# ]


# features = [
#     'timestamp','prior_question_elapsed_time', 'prior_question_had_explanation',
#     'mean_user_accuracy', 'questions_answered', 'std_user_accuracy',
#     'median_user_accuracy', 'skew_user_accuracy','mean_accuracy',
#     'question_asked', 'std_accuracy', 'median_accuracy', 'skew_accuracy',
#     'bundle_size','bundle_accuracy', 'part_accuracy','user_score',
#     'score_deviation', 'score_square', 'Deviation_Value',
# ]


# Columns taken directly from the interaction row.
row_features = [
    'timestamp',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]

# Per-user aggregate columns.
user_features = [
    'mean_user_accuracy',
    'questions_answered',
    'std_user_accuracy',
    'median_user_accuracy',
    'skew_user_accuracy',
    'mean_prior_question_had_explanation',
    'Deviation_Value',
    'smart_user',
]

# Per-question, per-bundle and per-part aggregate columns.
content_features = [
    'mean_accuracy',
    'question_asked',
    'std_accuracy',
    'median_accuracy',
    'skew_accuracy',
    'bundle_size',
    'bundle_accuracy',
    'part_accuracy',
]

# Model inputs, in the order they are passed to the model.
features = row_features + user_features + content_features

# Label column.
target = 'answered_correctly'

In [None]:
# Split the prepared frame into the model inputs and the label.
X_train = train_part_df.loc[:, features]
y_train = train_part_df.loc[:, target]

In [None]:
# Display the feature matrix passed to the model.
X_train

In [None]:
# Trained boosters; the inference loop at the end averages over this list.
models = []
categorical_features = ['prior_question_had_explanation']

params = {
    'objective': 'binary',
    'max_bin': 300,
    'learning_rate': 0.05,
    'num_leaves': 40
}

# Positional hold-out split: first 90% of the rows for fitting, the rest for
# validation / early stopping. The boundary is derived from the actual number
# of rows instead of a hard-coded row count, so it stays correct when the
# amount of loaded data changes.
train_fraction = 0.9
n_tr = round(len(X_train) * train_fraction)

X_tr = X_train.iloc[:n_tr]
X_val = X_train.iloc[n_tr:]

y_tr = y_train.iloc[:n_tr]
y_val = y_train.iloc[n_tr:]

lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, categorical_feature=categorical_features)

# NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword arguments
# of older LightGBM releases; newer releases expect the equivalent callbacks
# instead — confirm against the installed version.
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=1000,
    early_stopping_rounds=100
)

# Predictions on the hold-out rows, using the best iteration found by early
# stopping. (These are hold-out predictions only, not full out-of-fold ones.)
oof_train = model.predict(X_val, num_iteration=model.best_iteration)

models.append(model)

In [None]:

# Feature importances reported by the trained model, most important first.
importance = pd.DataFrame(
    {'importance': model.feature_importance()},
    index=X_train.columns,
)
result = importance.sort_values(by='importance', ascending=False)

In [None]:
# Display the sorted feature-importance table.
result

In [None]:

# ROC AUC of the hold-out predictions.
roc_auc_score(y_true=y_val, y_score=oof_train)

In [None]:
import lightgbm as lgb
import sklearn.datasets, sklearn.model_selection

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
from sklearn import metrics

# ROC curve and AUC for the hold-out predictions.
auc = metrics.roc_auc_score(y_val, oof_train)
fpr, tpr, thresholds = metrics.roc_curve(y_val, oof_train)

plt.figure(figsize=(10, 8), dpi=100)

# Square axes limited to the unit box.
plt.axis('scaled')
plt.xlim([0, 1])
plt.ylim([0, 1])

# Curve with a filled area underneath; the AUC value is printed bottom-right.
plt.plot(fpr, tpr, 'g')
plt.fill_between(fpr, tpr, facecolor='green', alpha=0.7)
plt.text(0.95, 0.05, 'AUC = %0.4f' % auc, ha='right', fontsize=12, weight='bold', color='blue')

plt.title("AUC & ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")

# Save before show() so the written file contains the rendered figure.
plt.savefig('my_image.png')
plt.show()

   
   

In [None]:
# Iterator over the test batches; each item is unpacked below as
# (test_df, sample_prediction_df).
iter_test = env.iter_test()

In [None]:
# Score every test batch with the same feature pipeline used for training and
# submit the averaged model predictions.
for (test_df, sample_prediction_df) in iter_test:
    # Attach the precomputed aggregates (left joins keep every test row).
    test_df = (
        test_df
        .merge(user_answers_df, how = 'left', on = 'user_id')
        .merge(questions_df, how = 'left', left_on = 'content_id', right_on = 'question_id')
        .merge(bundle_answers_df, how = 'left', on = 'bundle_id')
        .merge(part_answers_df, how = 'left', on = 'part')
    )

    # Same missing-value handling as for the training frame.
    explanation_flag = test_df['prior_question_had_explanation'].fillna(value = False)
    test_df['prior_question_had_explanation'] = explanation_flag.astype(bool)
    test_df = test_df.fillna(value = -1)
    X_test = test_df[features]

    # One prediction vector per trained model, then their element-wise mean.
    y_preds = [
        booster.predict(X_test, num_iteration=booster.best_iteration)
        for booster in models
    ]
    y_preds = sum(y_preds) / len(y_preds)
    test_df['answered_correctly'] = y_preds

    # Submit predictions only for rows with content_type_id == 0.
    is_question = test_df['content_type_id'] == 0
    env.predict(test_df.loc[is_question, ['row_id', 'answered_correctly']])

## discussion