# Riiid! Answer Correctness Prediction with a LightGBM classifier, feature selection and parameter tuning. A beginner's guide.

In [None]:
import optuna
from sklearn.feature_selection import RFECV
from lightgbm import LGBMClassifier
import pandas as pd
import joblib
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import optuna.integration.lightgbm as lgb
from optuna.samplers import TPESampler
import matplotlib.pyplot as plt

## 1. Data preprocessing.

### 1.1 
### Let's read the data from the attached files (train.csv, questions.csv), loading only the specified columns.

In [None]:
# Columns to load from train.csv, mapped to the dtype each one is parsed as.
used_data_types_dict = dict(
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='boolean',
)

# Only the columns named above are read from the training file.
train_csv_path = '../input/riiid-test-answer-prediction/train.csv'
train_df = pd.read_csv(
    train_csv_path,
    dtype=used_data_types_dict,
    usecols=list(used_data_types_dict),
)

# From questions.csv only the columns at positions 0 and 3 are read
# (parsed below as 'question_id' and 'part').
questions_csv_path = '../input/riiid-test-answer-prediction/questions.csv'
questions_dtypes = {'question_id': 'int16', 'part': 'int8'}
questions_df = pd.read_csv(questions_csv_path, usecols=[0, 3], dtype=questions_dtypes)

### 1.2 
### Attach the data from *questions_df* to *train_df* using the pandas function *merge*. The join is performed on the specified columns: *content_id* in *train_df* and *question_id* in *questions_df*.

In [None]:
# Left-join the question metadata onto every training row, then discard the
# join key from the right-hand side since it duplicates 'content_id'.
train_df = train_df.merge(
    questions_df,
    how='left',
    left_on='content_id',
    right_on='question_id',
)
train_df = train_df.drop(columns=['question_id'])

### 1.3 
### Fill NaN with the mean value for numerical features and with the most common value for boolean ones. To find the most common value in a column, or the distribution of its values, we can use *matplotlib.pyplot* functions such as *pie* (categorical/boolean features) or *hist* (numerical features).


In [None]:
# Missing explanation flags are filled with True. The result is assigned back
# to the column: calling fillna(..., inplace=True) on a column selection is
# deprecated and does not update the frame under pandas Copy-on-Write.
train_df['prior_question_had_explanation'] = (
    train_df['prior_question_had_explanation'].fillna(True)
)

# Treat infinities as missing, then fill every remaining gap with the mean
# of its column.
train_df = train_df.replace([-np.inf, np.inf], np.nan)
train_df = train_df.fillna(train_df.mean())

# Convert the nullable 'boolean' column to a plain bool column now that it
# holds no missing values.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].astype(bool)

In [None]:
# Pie chart of how often 'prior_question_had_explanation' is False vs. True.
labels = [False, True]

# Count with a vectorised sum instead of converting the whole column to a
# Python list twice. The column is plain bool at this point, so the number of
# False values is simply the remainder.
had_explanation = train_df['prior_question_had_explanation']
true_count = int(had_explanation.sum())
sizes = [len(had_explanation) - true_count, true_count]

plt.pie(sizes, labels=labels, shadow=True, startangle=90, autopct='%1.1f%%')
plt.show()

### 1.4 
### Create all possible features based on *'answered_correctly'*, grouping the values first by *'user_id'*, then by *'content_id'*, using the *DataFrame* method *groupby* and generating the features with *agg* functions.

### For convenience, write the resulting data to a file with the help of *to_csv*.

In [None]:
# Drop the rows whose 'answered_correctly' is -1 (presumably non-question
# rows, going by the alias name below) so that only real answers remain.
has_answer = train_df['answered_correctly'] != -1
train_df = train_df[has_answer]
train_questions_only_df = train_df

# Statistics taken over 'answered_correctly' within each group. The order
# must line up with the column names assigned to each result below.
answer_stats = ['mean', 'count', 'skew', 'std', 'var', 'sem', 'sum']

# Per-user statistics, one row per user_id.
grouped_by_user_df = train_questions_only_df.groupby('user_id')
user_answers_df = grouped_by_user_df['answered_correctly'].agg(answer_stats)
user_answers_df.columns = [
    'mean_user_accuracy',
    'questions_answered',
    'questions_skew',
    'questions_std',
    'questions_var',
    'questions_sem',
    'questions_sum',
]
user_answers_df.to_csv('user.csv')

# Per-content statistics, one row per content_id.
grouped_by_content_df = train_questions_only_df.groupby('content_id')
content_answers_df = grouped_by_content_df['answered_correctly'].agg(answer_stats)
content_answers_df.columns = [
    'content_mean',
    'question_asked',
    'content_skew',
    'content_std',
    'content_var',
    'content_sem',
    'content_sum',
]
content_answers_df.to_csv('content.csv')

### Repeat 1.2 to join the data from the previous point to the main data

In [None]:
# Attach the per-user and per-content statistics to every training row.
train_df = train_df.merge(user_answers_df, how='left', on='user_id')
train_df = train_df.merge(content_answers_df, how='left', on='content_id')

# Treat infinities as missing and fill the gaps with column means.
train_df = train_df.replace([-np.inf, np.inf], np.nan)
# The closing parenthesis was missing here, which made the cell a SyntaxError.
train_df = train_df.fillna(train_df.mean())

### 1.5
### Separating the training features from the target

In [None]:
# Columns that come straight from train.csv / questions.csv.
base_features = [
    'timestamp',
    'user_id',
    'content_id',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'part',
]

# Aggregated statistics grouped by user.
user_stat_features = [
    'mean_user_accuracy',
    'questions_skew',
    'questions_std',
    'questions_var',
    'questions_sem',
]

# Aggregated statistics grouped by content.
content_stat_features = [
    'content_mean',
    'content_skew',
    'content_std',
    'content_var',
    'content_sem',
]

# Full list of model inputs, in the order the model sees them.
features = base_features + user_stat_features + content_stat_features

# Column the model learns to predict.
target = 'answered_correctly'

## 2
## Features selection with RFECV.

### 2.1
### Let's create the simplest classifier for recursive selection of features using *sklearn.feature_selection.RFECV*. Train selector on our data.

In [None]:
# Feature selection runs on the first 20 million rows only.
selection_df = train_df.iloc[:20_000_000]

# Recursive feature elimination with 3-fold cross-validation around a default
# LightGBM classifier: one feature is removed per step and at least six
# features are kept.
select_model = LGBMClassifier()
selector = RFECV(
    estimator=select_model,
    step=1,
    min_features_to_select=6,
    cv=3,
    n_jobs=12,
    verbose=10,
)
selector.fit(selection_df[features], selection_df[target])

# Persist the fitted selector on disk so it can be reloaded later.
joblib.dump(selector, 'Selector.joblib')

### 2.2
### Using a logical mask (*selector.support_*), we select the most useful features. You can also find out the rank assigned to each feature (*selector.ranking_*).

In [None]:
# Keep only the features whose entry in the selector's boolean mask is set.
features = [features[idx] for idx, keep in enumerate(selector.support_) if keep]

## 3
## Parameters selection with optuna.

### 3.1 
### Create train and test data with *sklearn.model_selection.train_test_split*.

In [None]:
# Hyperparameter search runs on the first 20 million rows only.
optuna_df = train_df[:20_000_000]

# 70/30 shuffled split. The split is seeded so that every trial of the seeded
# Optuna study is scored on the same rows from run to run; this matches the
# seed used for the final train/test split.
Xt, Xv, Yt, Yv = train_test_split(
    optuna_df[features],
    optuna_df[target],
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# LightGBM Dataset wrappers for the two parts.
# NOTE(review): nothing in the visible code uses these two objects.
lgb_train = lgb.Dataset(Xt, Yt)
lgb_eval = lgb.Dataset(Xv, Yv)

### 3.2
### Let's define two functions: *create_model*, which returns a model built from a sampled set of LGBMClassifier parameters, and *objective*, which returns that model's ROC AUC score on the validation data.

In [None]:
def create_model(trial):
    """Build an LGBMClassifier with hyperparameters sampled from ``trial``.

    Args:
        trial: Optuna trial used to draw one value for every tuned
            hyperparameter.

    Returns:
        An unfitted ``LGBMClassifier`` configured with the sampled values plus
        the fixed settings (gbdt boosting, binary objective, AUC metric,
        ``random_state=42``).
    """
    # ``suggest_float`` replaces the deprecated ``suggest_uniform`` and
    # ``suggest_loguniform`` (``log=True`` gives the log-uniform sampling).
    # The order of the suggest calls is kept as it was.
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 32, 512),
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.5),
        'max_depth': trial.suggest_int('max_depth', 3, 18),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 8),
        'min_child_samples': trial.suggest_int('min_child_samples', 4, 80),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 200, 600),
        'random_state': 42,
    }

    return LGBMClassifier(**params)

def objective(trial):
    """Score one Optuna trial.

    Builds a model from the trial's sampled parameters, fits it on the
    training split (``Xt``, ``Yt``) and returns the ROC AUC of its
    positive-class probabilities on the validation split (``Xv``, ``Yv``).
    """
    candidate = create_model(trial)
    candidate.fit(Xt, Yt)
    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = candidate.predict_proba(Xv)[:, 1]
    return roc_auc_score(Yv, positive_proba)

### 3.3 
### Finally, let's create an Optuna study that iterates through parameter sets drawn from the ranges given above and selects the best one.

In [None]:
# Seeded TPE sampler; the study maximises the value returned by `objective`.
sampler = TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction='maximize')

# n_trials is the number of parameter sets (and therefore models) evaluated.
study.optimize(objective, n_trials=50)

# Best parameter set found among those trials.
params = study.best_params

# Persist the whole study on disk.
joblib.dump(study, 'Study_optuna.joblib')

### The parameters I got:

In [None]:
# Parameter set recorded from an earlier tuning run.
best_params = dict(
    num_leaves=392,
    learning_rate=0.14812766987568138,
    max_depth=11,
    min_child_weight=13,
    feature_fraction=0.9829084591151024,
    bagging_fraction=0.9793416187075863,
    bagging_freq=5,
    min_child_samples=22,
    reg_alpha=0.8989695252132637,
    reg_lambda=0.024084559071289695,
    n_estimators=397,
)

## 4
## LGBMClassifier with chosen parameters and features training.

### 4.1
### Repeat 3.1, but on the whole dataset.

In [None]:
# Seeded 80/20 split of the whole dataset for training the final model.
all_inputs = train_df[features]
all_targets = train_df[target]
X_train, X_test, y_train, y_test = train_test_split(
    all_inputs,
    all_targets,
    random_state=42,
    test_size=0.2,
)

### 4.2
### Prepare the model with the selected parameters and train it. 

In [None]:
# `params` (study.best_params) holds only the values Optuna sampled. The fixed
# settings used while tuning are added back here so the final model matches
# the configuration that was scored. Keys already in `params` take precedence.
final_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'random_state': 42,
    **params,
}

model = LGBMClassifier(**final_params)
model.fit(X_train, y_train)

# Persist the trained model on disk.
joblib.dump(model, 'Final_lgb.joblib')

## 5
## Submitting result.

In [None]:
# The competition module is not imported anywhere else in the notebook, so it
# is imported here; without it this cell fails with a NameError.
import riiideducation

# Create the competition environment and the iterator over its test batches.
env = riiideducation.make_env()
iter_test = env.iter_test()

### For speed, let's execute all the above code on the local computer and load the resulting models and data on kaggle.

In [None]:
# Question metadata: only the columns at positions 0 and 3 are read.
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv',
    usecols=[0, 3],
    dtype={'question_id': 'int16', 'part': 'int8'}
)

# joblib.load() needs the file to read; calling it without a path raises a
# TypeError. 'Final_lgb.joblib' is the name the training step saves the model
# under.
# NOTE(review): prefix it with the Kaggle input directory the file was
# uploaded to.
model = joblib.load('Final_lgb.joblib')
selector = joblib.load('../input/selector/Selector.joblib')

# read_csv lives in pandas; the bare name is undefined in this notebook.
user_answers_df = pd.read_csv('../input/preprocessingcontentuser/user.csv')
content_answers_df = pd.read_csv('../input/preprocessingcontentuser/content.csv')

In [None]:
# Apply the training-time preprocessing to each test batch, predict, and
# submit the batch back to the environment.
for (test_df, sample_prediction_df) in iter_test:

    # Attach question metadata and drop the duplicated join key.
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left')
    test_df.drop(columns=['question_id'], inplace=True)

    # Assign the filled column back: fillna(..., inplace=True) on a column
    # selection is deprecated and is a no-op under pandas Copy-on-Write.
    test_df['prior_question_had_explanation'] = (
        test_df['prior_question_had_explanation'].fillna(True)
    )
    test_df = test_df.replace([-np.inf, np.inf], np.nan)
    # numeric_only=True restricts the mean to numeric columns; pandas >= 2.0
    # raises if a non-numeric column is present.
    test_df = test_df.fillna(test_df.mean(numeric_only=True))

    # Drop rows with content_type_id == 1 (presumably lectures - confirm);
    # only the remaining rows are predicted.
    test_df = test_df[test_df['content_type_id'] != 1]

    # Attach the precomputed per-user and per-content statistics.
    test_df = test_df.merge(user_answers_df, how='left', on='user_id')
    test_df = test_df.merge(content_answers_df, how='left', on='content_id')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].astype(bool)
    test_df = test_df.replace([-np.inf, np.inf], np.nan)
    test_df = test_df.fillna(test_df.mean(numeric_only=True))

    # Column 1 of predict_proba is the probability of the positive class.
    test_df['answered_correctly'] = model.predict_proba(test_df[features])[:, 1]
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])


### That's all.

### The accuracy I got: 0.753

### Good luck to all!