In [15]:
import numpy as np
import pandas as pd

from collections import defaultdict
import datatable as dt
import lightgbm as lgb
from matplotlib import pyplot as plt
# import riiideducation
import random
from sklearn.metrics import roc_auc_score
import gc
import pickle

from sklearn.model_selection import train_test_split

_ = np.seterr(divide='ignore', invalid='ignore')

In [2]:
data_types_dict = {
    'timestamp': 'int64',
    'user_id': 'int32', 
    'content_id': 'int16', 
    'content_type_id':'int8', 
    'task_container_id': 'int16',
    #'user_answer': 'int8',
    'answered_correctly': 'int8', 
    'prior_question_elapsed_time': 'float32', 
    'prior_question_had_explanation': 'bool'
}
target = 'answered_correctly'

In [3]:
train_df = dt.fread('data/train.csv', columns=set(data_types_dict.keys())).to_pandas()

In [4]:
features_df = train_df.iloc[:int(9 /10 * len(train_df))]
train_df = train_df.iloc[int(9 /10 * len(train_df)):]

In [5]:
train_questions_only_df = features_df[features_df['answered_correctly']!=-1]
grouped_by_user_df = train_questions_only_df.groupby('user_id')
user_answers_df = grouped_by_user_df.agg({'answered_correctly': ['mean', 'count', 'std', 'median', 'skew']}).copy()
user_answers_df.columns = [
    'mean_user_accuracy', 
    'questions_answered', 
    'std_user_accuracy', 
    'median_user_accuracy', 
    'skew_user_accuracy'
]

user_answers_df

Unnamed: 0_level_0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
115,0.695652,46,0.465215,1.0,-0.879359
124,0.233333,30,0.430183,0.0,1.328338
2746,0.578947,19,0.507257,1.0,-0.347892
5382,0.672000,125,0.471374,1.0,-0.741648
8623,0.642202,109,0.481566,1.0,-0.601619
...,...,...,...,...,...
1933700710,0.624829,1466,0.484332,1.0,-0.516175
1933703805,0.650000,40,0.483046,1.0,-0.653746
1933711038,0.684211,38,0.471069,1.0,-0.825545
1933715576,0.375000,16,0.500000,0.0,0.571429


In [6]:
grouped_by_content_df = train_questions_only_df.groupby('content_id')
content_answers_df = grouped_by_content_df.agg({'answered_correctly': ['mean', 'count', 'std', 'median', 'skew']}).copy()
content_answers_df.columns = [
    'mean_accuracy', 
    'question_asked', 
    'std_accuracy', 
    'median_accuracy', 
    'skew_accuracy'
]

content_answers_df

Unnamed: 0_level_0,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.908595,6236,0.288207,1.0,-2.836339
1,0.891682,6684,0.310805,1.0,-2.521185
2,0.554656,40499,0.497010,1.0,-0.219949
3,0.779348,20734,0.414696,1.0,-1.347371
4,0.613226,28549,0.487020,1.0,-0.465009
...,...,...,...,...,...
13518,0.789203,778,0.408137,1.0,-1.420839
13519,0.567797,826,0.495682,1.0,-0.274213
13520,0.678524,759,0.467351,1.0,-0.766003
13521,0.822560,789,0.382283,1.0,-1.691836


In [7]:
del features_df
del grouped_by_user_df
del grouped_by_content_df

In [8]:
features = [
    'mean_user_accuracy', 
    'questions_answered',
    'std_user_accuracy', 
    'median_user_accuracy',
    'skew_user_accuracy',
    'mean_accuracy', 
    'question_asked',
    'std_accuracy', 
    'median_accuracy',
    'prior_question_elapsed_time', 
    'prior_question_had_explanation',
    'skew_accuracy'
]

target = 'answered_correctly'

In [9]:
train_df = train_df[train_df[target] != -1]

In [10]:
train_df = train_df.merge(user_answers_df, how='left', on='user_id')
train_df = train_df.merge(content_answers_df, how='left', on='content_id')

In [11]:
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(value=False).astype(bool)
train_df = train_df.fillna(value=0.5)

In [12]:
train_df = train_df[features + [target]]
train_df = train_df.replace([np.inf, -np.inf], np.nan)
train_df = train_df.fillna(0.5)

train_df

Unnamed: 0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,prior_question_elapsed_time,prior_question_had_explanation,skew_accuracy,answered_correctly
0,0.779843,5219.0,0.414392,1.0,-1.351136,0.532146,1291.0,0.499159,1.0,13000.0,True,-0.128999,0
1,0.779843,5219.0,0.414392,1.0,-1.351136,0.584772,2548.0,0.492858,1.0,44000.0,True,-0.344273,1
2,0.779843,5219.0,0.414392,1.0,-1.351136,0.603571,1960.0,0.489280,1.0,22000.0,True,-0.423795,1
3,0.779843,5219.0,0.414392,1.0,-1.351136,0.694888,2504.0,0.460547,1.0,74000.0,True,-0.847011,1
4,0.779843,5219.0,0.414392,1.0,-1.351136,0.765869,1922.0,0.423565,1.0,9000.0,True,-1.256695,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9926946,0.500000,0.5,0.500000,0.5,0.500000,0.738732,3927.0,0.439382,1.0,18000.0,True,-1.087226,1
9926947,0.500000,0.5,0.500000,0.5,0.500000,0.524581,9194.0,0.499423,1.0,14000.0,True,-0.098460,1
9926948,0.500000,0.5,0.500000,0.5,0.500000,0.616455,28174.0,0.486258,1.0,14000.0,True,-0.479018,1
9926949,0.500000,0.5,0.500000,0.5,0.500000,0.660559,5185.0,0.473565,1.0,22000.0,True,-0.678349,0


In [16]:
train_df, test_df = train_test_split(train_df, random_state=666, test_size=0.2)

In [18]:
def create_model(trial):
    num_leaves = trial.suggest_int("num_leaves", 2, 31)
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    min_child_samples = trial.suggest_int('min_child_samples', 100, 1200)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 5, 90)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.0001, 1.0)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0.0001, 1.0)
    
    model = lgb.train(
        num_leaves=num_leaves,
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        min_child_samples=min_child_samples, 
        min_data_in_leaf=min_data_in_leaf,
        learning_rate=learning_rate,
        feature_fraction=feature_fraction,
        random_state=666
    )
    
    return model

def objective(trial):
    model = create_model(trial)
    model.fit(train_df[features], train_df[target])
    score = roc_auc_score(test_df[target].values, model.predict_proba(test_df[features])[:,1])
    return score

# uncomment to use optuna
# study = optuna.create_study(direction="maximize", sampler=sampler)
# study.optimize(objective, n_trials=70)
# params = study.best_params
# params['random_state'] = 666

params = {
    'bagging_fraction': 0.5817242323514327,
    'feature_fraction': 0.6884588361650144,
    'learning_rate': 0.42887924851375825, 
    'max_depth': 6,
    'min_child_samples': 946, 
    'min_data_in_leaf': 47, 
    'n_estimators': 169,
    'num_leaves': 29,
    'random_state': 666
}

#study = optuna.create_study(direction="maximize")
#study.optimize(objective, n_trials=5)
model = LGBMClassifier(**params)
model.fit(train_df[features], train_df[target])
lgb.plot_importance(model, importance_type='gain')
plt.show()
print('LGB score: ', roc_auc_score(test_df[target].values, model.predict_proba(test_df[features])[:,1]))

NameError: name 'LGBMClassifier' is not defined

In [None]:
test_df[:30]

Unnamed: 0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,prior_question_elapsed_time,prior_question_had_explanation,skew_accuracy,answered_correctly
885777,0.5,0.5,0.5,0.5,0.5,0.5083,2530,0.50003,1.0,24672.0,True,-0.033226,1
1726837,0.5,0.5,0.5,0.5,0.5,0.584929,16455,0.492749,1.0,22000.0,True,-0.344755,0
4889874,0.5,0.5,0.5,0.5,0.5,0.885568,5287,0.318365,1.0,16992.0,True,-2.423096,1
4187376,0.5,0.5,0.5,0.5,0.5,0.85742,3605,0.349692,1.0,24000.0,True,-2.045333,0
3569513,0.5,0.5,0.5,0.5,0.5,0.816858,9823,0.386802,1.0,20992.0,True,-1.638682,1
498141,0.5,0.5,0.5,0.5,0.5,0.727284,8100,0.445384,1.0,14000.0,True,-1.020873,1
601084,0.5,0.5,0.5,0.5,0.5,0.527761,6664,0.499266,1.0,46496.0,True,-0.111241,1
4132260,0.5,0.5,0.5,0.5,0.5,0.466591,23916,0.498893,0.0,12000.0,True,0.133942,1
2381427,0.5,0.5,0.5,0.5,0.5,0.873001,5378,0.333003,1.0,18000.0,True,-2.241062,1
3649921,0.5,0.5,0.5,0.5,0.5,0.771446,5036,0.419943,1.0,24336.0,True,-1.293286,1


In [None]:
predicts = model.predict(test_df[features])
predicts[1:30]

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 0], dtype=int8)

In [None]:
test_df_ans = test_df['answered_correctly'][:50]
ans_predicts = test_df_ans.merge(predicts[:50], how='left', on='answered_correctly')

AttributeError: ignored

In [None]:
model

In [None]:
get_params_model = model.get_params(deep=True)
get_params_model

In [None]:
objective = model.objective_
objective

In [None]:
model

In [None]:
print(model.best_score_)

In [None]:
model.classes_

In [None]:
model.evals_result_

In [None]:
model.feature_importances_

In [None]:
lgb.plot_importance(model)
plt.show()