In [None]:
#basic
import numpy as np
import pandas as pd

#Model imports
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

#optuna
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import roc_auc_score

# You can only call make_env() once, so don't lose it!
import riiideducation

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
# Folder that holds every competition CSV read in this cell.
DATA_DIR = '/kaggle/input/riiid-test-answer-prediction'

# Columns to load from train.csv, with compact dtypes to limit memory use.
used_data_types_dict = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'answered_correctly': 'int8',
    # float32, not float16: float16 cannot hold values above 65504 (they turn
    # into inf) and rounds large values coarsely.
    # NOTE(review): elapsed times are presumably milliseconds and can exceed
    # that limit - confirm against the data description.
    'prior_question_elapsed_time': 'float32',
    # Nullable boolean so that missing values survive the load.
    'prior_question_had_explanation': 'boolean'
}

train_df = pd.read_csv(
    f'{DATA_DIR}/train.csv',
    usecols=list(used_data_types_dict),
    dtype=used_data_types_dict,
    #nrows=10**7,
)

# Columns to load from questions.csv (the name is reused from the train.csv
# mapping above, as in the original cell).
used_data_types_dict = {
    'question_id': 'int16',
    'bundle_id': 'int16',
    'correct_answer': 'int8',
    'part': 'int8',
    'tags': 'str',
}

# questions keeps its default positional index; later cells join it to
# content_id through that index.
questions = pd.read_csv(
    f'{DATA_DIR}/questions.csv',
    usecols=list(used_data_types_dict),
    dtype=used_data_types_dict,
)

lectures = pd.read_csv(f'{DATA_DIR}/lectures.csv')
ex = pd.read_csv(f'{DATA_DIR}/example_test.csv')

In [None]:
# Split train_df by row position: the leading 90% of rows (features_df) are used
# to build the aggregate features, the trailing 10% stay in train_df.
split_at = int(9 / 10 * len(train_df))
features_df, train_df = train_df.iloc[:split_at], train_df.iloc[split_at:]

In [None]:
# Drop the lecture rows (answered_correctly == -1) and left-join each remaining
# interaction with the part and tags of its question.
is_question_row = features_df['answered_correctly'] != -1
train_questions_only_df = features_df[is_question_row].merge(
    questions[['part', 'tags']],
    how='left',
    left_on='content_id',
    right_index=True,
)

# Per-user statistics of answered_correctly: mean accuracy and number of answers.
user_stat_columns = {
    'mean': 'user_mean_accuracy',
    'count': 'user_questions_answered',
}
grouped_by_user_df = train_questions_only_df.groupby('user_id')
user_answers_df = grouped_by_user_df['answered_correctly'].agg(list(user_stat_columns))
user_answers_df.columns = list(user_stat_columns.values())

user_answers_df

In [None]:
# Per-question (content_id) statistics of answered_correctly.
content_stat_columns = {
    'mean': 'q_mean_accuracy',
    'count': 'q_question_asked',
    'std': 'q_std_accuracy',
    'median': 'q_median_accuracy',
    'skew': 'q_skew_accuracy',
}
grouped_by_content_df = train_questions_only_df.groupby('content_id')
content_answers_df = grouped_by_content_df['answered_correctly'].agg(list(content_stat_columns))
content_answers_df.columns = list(content_stat_columns.values())

content_answers_df

In [None]:
# Per-tags statistics of answered_correctly. The group key is the whole tags
# string of a question, so each distinct tag combination is its own group.
tags_stat_columns = {
    'mean': 'tags_mean_accuracy',
    'count': 'tags_question_asked',
    'std': 'tags_std_accuracy',
    'median': 'tags_median_accuracy',
    'skew': 'tags_skew_accuracy',
}
grouped_by_tags_df = train_questions_only_df.groupby('tags')
tags_answers_df = grouped_by_tags_df['answered_correctly'].agg(list(tags_stat_columns))
tags_answers_df.columns = list(tags_stat_columns.values())

tags_answers_df

In [None]:
# Per-part statistics of answered_correctly.
part_stat_columns = {
    'mean': 'part_mean_accuracy',
    'count': 'part_questions_answered',
    'std': 'part_std_user_accuracy',
    'skew': 'part_skew_user_accuracy',
}
grouped_by_part_df = train_questions_only_df.groupby('part')
part_answers_df = grouped_by_part_df['answered_correctly'].agg(list(part_stat_columns))
part_answers_df.columns = list(part_stat_columns.values())

part_answers_df

### Filling in accuracy statistics for question_ids that are missing from the feature data

In [None]:
# Questions present in questions.csv that never occur in the feature rows and
# therefore have no row in content_answers_df.
missing_q = questions.index.difference(content_answers_df.index)

# One question has no tags; it is given '27' (per the original note, the most
# frequent tag in its part).
questions['tags'] = questions['tags'].fillna('27')

# The tag statistics are copied by position, so both frames must carry the same
# statistics in the same order (mean, count, std, median, skew).
assert len(tags_answers_df.columns) == len(content_answers_df.columns)

# Give every missing question the statistics of its tags group, in one step.
# - The values are taken with to_numpy() because tags_answers_df uses tags_*
#   column names while content_answers_df uses q_* names: assigning a labelled
#   row with .loc aligns on those labels and would store nothing but NaN.
# - reindex() returns NaN for a tags string that has no statistics instead of
#   raising KeyError; those NaN are handled by the later fillna steps.
# - The construction also works when missing_q is empty.
missing_q_tags = questions.loc[missing_q, 'tags'].to_numpy()
df_copy = pd.DataFrame(
    tags_answers_df.reindex(missing_q_tags).to_numpy(dtype='float64'),
    index=pd.Index(missing_q, name='content_id'),
    columns=content_answers_df.columns,
)

# Round the median to a whole number, as the original cell does when aligning
# this frame with content_answers_df.
df_copy['q_median_accuracy'] = df_copy['q_median_accuracy'].round(decimals=0)
# The tag-level count is not a count for this question; use a constant 1.
df_copy['q_question_asked'] = 1

# Add the unseen questions to the per-question statistics.
# (pd.concat replaces DataFrame.append, which newer pandas no longer provides.)
if not df_copy.empty:
    content_answers_df = pd.concat([content_answers_df, df_copy]).sort_index()

In [None]:
# Drop the intermediate objects that the following cells no longer use.
del (
    features_df,
    grouped_by_user_df,
    grouped_by_content_df,
    grouped_by_tags_df,
    grouped_by_part_df,
    df_copy,
    missing_q,
)

In [None]:
# Model input columns, grouped by where they come from. The concatenation order
# below is the column order handed to the model.
user_features = [
    'user_mean_accuracy',
    'user_questions_answered',
]
question_features = [
    'q_mean_accuracy',
    'q_question_asked',
    'q_std_accuracy',
    'q_median_accuracy',
    'q_skew_accuracy',
]
tags_features = [
    'tags_mean_accuracy',
    'tags_question_asked',
    'tags_std_accuracy',
    'tags_median_accuracy',
    'tags_skew_accuracy',
]
part_features = [
    'part_mean_accuracy',
    'part_questions_answered',
    'part_std_user_accuracy',
    'part_skew_user_accuracy',
]
raw_features = [
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'part',
]

features = [
    *user_features,
    *question_features,
    *tags_features,
    *part_features,
    *raw_features,
]

# Column the model learns to predict.
target = 'answered_correctly'

In [None]:
# Keep question rows only (target != -1), then left-join the question metadata
# and the four aggregate tables onto every interaction.
train_df = (
    train_df[train_df[target] != -1]
    .merge(questions[['part', 'tags']], how='left', left_on='content_id', right_index=True)
    .merge(user_answers_df, how='left', on='user_id')
    .merge(content_answers_df, how='left', on='content_id')
    .merge(part_answers_df, how='left', left_on='part', right_index=True)
    .merge(tags_answers_df, how='left', left_on='tags', right_index=True)
)

# A missing explanation flag counts as False; store the column as plain bool.
had_explanation = train_df['prior_question_had_explanation'].fillna(value=False)
train_df['prior_question_had_explanation'] = had_explanation.astype(bool)

# Fill gaps with 0.5, keep the model columns plus the target, then turn any
# infinite value into 0.5 as well (inf -> NaN -> 0.5).
train_df = (
    train_df
    .fillna(value=0.5)
    .loc[:, features + [target]]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.5)
)

In [None]:
# Hold out 20% of the assembled rows as a validation frame (fixed seed).
train_df, test_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=314,
)

In [None]:
# TPE sampler with a fixed seed (presumably so that a hyper-parameter search is
# repeatable); it is meant to be handed to optuna.create_study.
sampler = TPESampler(seed=314)

def create_model(trial):
    """Build an LGBMClassifier from hyper-parameters sampled by an Optuna trial.

    Every sampled value is forwarded to the classifier under the same name it
    is registered with on the trial, so a model built here matches one built
    later with ``LGBMClassifier(**study.best_params)``.

    Parameters
    ----------
    trial : optuna trial
        Object providing ``suggest_int`` / ``suggest_float``.

    Returns
    -------
    LGBMClassifier
        Unfitted classifier with ``random_state=314``.
    """
    # NOTE(review): the LightGBM parameter reference lists min_child_samples as
    # an alias of min_data_in_leaf, and states that bagging_fraction is only
    # applied when bagging_freq > 0 - confirm the intended search space.
    sampled_params = {
        'num_leaves': trial.suggest_int("num_leaves", 2, 31),
        'n_estimators': trial.suggest_int("n_estimators", 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_samples': trial.suggest_int('min_child_samples', 100, 1200),
        # suggest_float(name, low, high) samples uniformly, like the deprecated
        # suggest_uniform it replaces.
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.30),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 90),
        # Previously sampled but never passed to the model.
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.0001, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.0001, 1.0),
    }

    model = LGBMClassifier(random_state=314, **sampled_params)
    return model

def objective(trial):
    """Optuna objective: fit a sampled model and return its validation ROC AUC.

    The model comes from ``create_model(trial)``, is fitted on ``train_df`` and
    scored on ``test_df`` using the probability of the positive class.
    """
    candidate = create_model(trial)
    candidate.fit(train_df[features], train_df[target])

    positive_proba = candidate.predict_proba(test_df[features])[:, 1]
    return roc_auc_score(test_df[target].values, positive_proba)

In [None]:
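# Hyper-parameter search, currently disabled; the next cell hard-codes a set of
# parameters instead (presumably the result of an earlier search).
# Uncomment the four lines below to run the search again.
#study = optuna.create_study(direction="maximize", sampler=sampler)
#study.optimize(objective, n_trials=15)
#params = study.best_params
#params['random_state'] = 314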

In [None]:
# Hard-coded hyper-parameters for the final model.
# NOTE(review): the LightGBM parameter reference lists min_child_samples as an
# alias of min_data_in_leaf, and applies bagging_fraction only when
# bagging_freq > 0 - confirm which of these values actually take effect.
params = {'num_leaves': 24,
          'n_estimators': 104,
          'max_depth': 7,
          'min_child_samples': 689,
          'learning_rate': 0.2221239593291603,
          'min_data_in_leaf': 28,
          'bagging_fraction': 0.2273386395906522,
          'feature_fraction': 0.7763591512041167,
          # Same seed that create_model and the search cell use, so the final
          # model is built the same way as the models that were scored.
          'random_state': 314}

model = LGBMClassifier(**params)
model.fit(train_df[features], train_df[target])

# ROC AUC on the held-out frame, from the probability of the positive class.
print('LGB score: ', roc_auc_score(test_df[target].values, model.predict_proba(test_df[features])[:,1]))

### Feature importance

In [None]:
# Importance of every model input, most important first.
# The rows are labelled with `features`, the exact column list the model was
# fitted on, instead of relying on the target being the last column of train_df.
feature_importance_df = pd.DataFrame(
    {'importance': model.feature_importances_},
    index=features,
).sort_values(by='importance', ascending=False)

feature_importance_df

### Final Preds

In [None]:
# Create the competition environment. Per the note next to the riiideducation
# import, make_env() can only be called once, so this cell must not be re-run.
env = riiideducation.make_env()
# Iterator that yields (test_df, sample_prediction_df) pairs to the prediction loop.
iter_test = env.iter_test()
# NOTE(review): prior_test_df is not read anywhere in the visible code - confirm
# whether it is still needed.
prior_test_df = None

In [None]:
# Predict every batch served by the competition environment.
for (test_df, sample_prediction_df) in iter_test:
    # Same joins as for the training frame: question metadata first, then the
    # user / question / part / tags aggregate tables.
    test_df = (
        test_df
        .merge(questions[['part', 'tags']], how='left', left_on='content_id', right_index=True)
        .merge(user_answers_df, how='left', on='user_id')
        .merge(content_answers_df, how='left', on='content_id')
        .merge(part_answers_df, how='left', left_on='part', right_index=True)
        .merge(tags_answers_df, how='left', left_on='tags', right_index=True)
    )

    # A missing explanation flag counts as False, as in training.
    test_df['prior_question_had_explanation'] = (
        test_df['prior_question_had_explanation'].fillna(value=False).astype(bool)
    )

    # Clean the model input exactly like the training frame: infinite values
    # become NaN and every NaN becomes 0.5. (This loop used to fill with 0.6 and
    # skipped the inf handling, so the model saw values at prediction time that
    # it was never trained on.) Only the model columns are touched.
    model_input = (
        test_df[features]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0.5)
    )

    test_df['answered_correctly'] = model.predict_proba(model_input)[:, 1]
    # Submit predictions for question rows only (content_type_id == 0).
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])