In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, auc, roc_curve
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from  sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from  sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import neighbors
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
import lightgbm as lgb
from IPython.display import display
from sklearn.preprocessing import StandardScaler
import joblib
import gc
from tqdm import tqdm
from patsy import dmatrices
import statsmodels.api as sm
from scipy.signal import savgol_filter
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Directory holding the competition CSV files.
DATA_DIR = '../input/riiid-test-answer-prediction'

# Interaction log. Columns are selected by position; narrow dtypes keep the
# frame small in memory.
train = pd.read_csv(f'{DATA_DIR}/train.csv',
                    usecols=[0, 2, 3, 4, 5, 7, 8, 9],
                    dtype={
                        'row_id': 'int32',
                        'user_id': 'int32',
                        'content_id': 'int16',
                        'content_type_id': 'int8',
                        'task_container_id': 'int16',
                        'user_answer': 'int8',
                        'answered_correctly': 'int8',
                        'prior_question_elapsed_time': 'float32',
                        'prior_question_had_explanation': 'boolean',
                    },
                    )

# Lecture metadata, with `type_of` expanded into indicator columns.
lectures = pd.read_csv(f'{DATA_DIR}/lectures.csv',
                       usecols=[0, 3],
                       dtype={
                           'lecture_id': 'int16',
                           'type_of': 'object',
                       },
                       )
lectures = pd.get_dummies(lectures, columns=['type_of'])

# Question metadata.
# bundle_id is read as int16, the same width as question_id: int8 tops out at
# 127 and would truncate larger ids.
# NOTE(review): assumes bundle ids span the same range as question ids --
# confirm against questions.csv.
questions = pd.read_csv(f'{DATA_DIR}/questions.csv',
                        usecols=[0, 1, 2, 3, 4],
                        dtype={
                            'question_id': 'int16',
                            'part': 'int8',
                            'bundle_id': 'int16',
                            'correct_answer': 'int8',
                            'tags': 'object',
                        },
                        )

# Attach question metadata to every interaction row.
# NOTE(review): the merge deliberately stays ahead of the row filter below.
# The feature-engineering cell refers to dummy columns named
# 'correct_answer_0.0', i.e. it expects `correct_answer` to be float --
# presumably the result of NaNs introduced here for unmatched rows. Confirm
# before reordering.
train = pd.merge(train, questions, left_on='content_id', right_on='question_id', how='left')

# Keep answered question rows only (content_type_id == 0, label != -1), index
# by user, and drop any row that still has a missing value. Chaining produces
# a fresh frame at each step instead of mutating a filtered slice in place.
is_answered_question = (train.content_type_id == 0) & (train.answered_correctly != -1)
train = (
    train[is_answered_question]
    .drop(columns=['content_type_id'])
    .set_index('user_id')
    .dropna()
)

test = pd.read_csv(f'{DATA_DIR}/example_test.csv')

In [None]:
# Show the first two rows of the filtered, user-indexed training frame.
train.head(2)

In [None]:
# Columns carried over from `train` into the modelling frame.
columns = ['row_id','content_id','task_container_id','answered_correctly','question_id','bundle_id','part',
           'prior_question_had_explanation','correct_answer',
           ]


def _add_group_means(frame, key, value_cols, new_names):
    """Attach per-`key` means of `value_cols` to `frame`.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame holding `key` and every column in `value_cols`.
    key : str
        Column to group by.
    value_cols : list of str
        Columns to average within each group.
    new_names : list of str
        Names for the averaged columns, in the same order as `value_cols`.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        `frame` left-merged with the group means, and the group-mean table
        itself (indexed by `key`).
    """
    means = frame.groupby(key)[value_cols].mean()
    means.columns = new_names
    return pd.merge(frame, means, on=[key], how="left"), means


# Last 50 rows of every user, in the order they appear in `train`.
# reset_index() turns the user_id index into an ordinary column on a fresh
# frame, so later assignments do not write into a slice of `train`.
df = train[columns].groupby(level='user_id').tail(50).reset_index()

# Encode the explanation flag as False -> 0 / True -> 1. A plain cast always
# gives this mapping, whereas LabelEncoder numbers whichever classes happen to
# be present.
df['prior_question_had_explanation'] = df['prior_question_had_explanation'].astype('int8')

# One indicator column per correct-answer option. The explicit float cast pins
# the generated names to 'correct_answer_0.0' ... 'correct_answer_3.0', which
# the code below and the training cell refer to by name.
dummies = pd.get_dummies(df[['correct_answer']].astype('float64'), columns=['correct_answer'])
df = pd.concat([df, dummies], axis=1)

# NOTE(review): c_mean and u_mean are averages of the target taken over every
# row in `df`, including each row's own label and the rows that the training
# cell later assigns to the validation fold. Validation scores built on these
# features are therefore optimistic; consider computing them from
# training-fold rows only.

# content_id
df, results_c = _add_group_means(df, 'content_id', ['answered_correctly'], ['c_mean'])
df, results_ex_c = _add_group_means(df, 'content_id', ['prior_question_had_explanation'], ['c_ex_mean'])

c_df = pd.merge(results_c, results_ex_c, on='content_id', how='left')

# user_id
df, results_u = _add_group_means(df, 'user_id', ['answered_correctly'], ['u_mean'])
df, results_ex_u = _add_group_means(df, 'user_id', ['prior_question_had_explanation'], ['u_ex_mean'])

# Share of each correct-answer option among the user's rows.
answer_cols = ['correct_answer_0.0','correct_answer_2.0','correct_answer_3.0','correct_answer_1.0']
df, results_an_u = _add_group_means(df, 'user_id', answer_cols, ['u_' + name for name in answer_cols])

# Mean `part` value over the user's rows.
df, u_part = _add_group_means(df, 'user_id', ['part'], ['u_part'])

# Per-user summary table (kept because the following cell deletes it by name).
u_df = pd.merge(results_u, results_ex_u, on='user_id', how='left')
u_df = pd.merge(u_df, u_part, on='user_id', how='left')
u_df = pd.merge(u_df, results_an_u, on='user_id', how='left')

df = df.dropna()

In [None]:
# The per-content and per-user summary tables are not used again.
del c_df, u_df
gc.collect()

In [None]:
# Show the first two rows of the engineered feature frame.
df.head(2)

In [None]:
# Class balance of the target in the sampled frame.
label_counts = df.answered_correctly.value_counts()
print('df label distribution:\n', label_counts)

# Fraction of `train` rows that ended up in `df`.
row_share = len(df) / len(train)
print('df size\n', row_share)

# Coverage of distinct users, contents and task containers relative to `train`.
user_share = df.user_id.nunique() / train.index.nunique()
print('df:train user_id\n', user_share)

content_share = round(df.content_id.nunique() / train.content_id.nunique(), 2)
print('df:train content_id\n', content_share)

container_share = df.task_container_id.nunique() / train.task_container_id.nunique()
print('df:train task_container_id\n', container_share)

In [None]:
# Drop the reference to the full training frame and run a garbage collection.
# Cells above that read `train` cannot be re-run after this point without
# reloading the data.
del train
gc.collect()

In [None]:
%%time

kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Collect the fold labels in a standalone array and attach it by name, so the
# assignment does not depend on "fold" being the last column of `df`.
fold_labels = np.full(len(df), -1, dtype='int64')
for fold_id, (_, val_index) in enumerate(kf.split(df.row_id)):
    fold_labels[val_index] = fold_id
df["fold"] = fold_labels

# Fold held out for validation; the remaining folds are used for training.
use_fold = 2

is_val = df["fold"] == use_fold
Xt = df[~is_val]
Xv = df[is_val]
# del df

print("[fold {}] train: {}, val: {}".format(use_fold, len(Xt), len(Xv)))
# Targets are kept as one-column frames.
Yt = Xt[["answered_correctly"]]
Yv = Xv[["answered_correctly"]]

# Model inputs; one list shared by both splits so they cannot drift apart.
feature_cols = ['c_mean',
                'u_mean',
                'c_ex_mean',
                'u_ex_mean',
                'u_correct_answer_0.0','u_correct_answer_2.0','u_correct_answer_3.0','u_correct_answer_1.0',
                'u_part',
                ]
Xt = Xt[feature_cols]
Xv = Xv[feature_cols]

print('Columns:\n',Xt.columns)

print('scaling..')
scaler = StandardScaler(copy=False)
# Learn the mean/variance from the training split only, then apply the same
# transformation to the validation split. Refitting on the validation data
# would scale the two splits differently.
Xt = scaler.fit_transform(Xt)
Xv = scaler.transform(Xv)

# L2-regularised logistic regression. Only the settings that matter are spelled
# out; everything else is left at the library default, which is what the
# previous explicit values were. In particular `multi_class` is no longer
# passed, because recent scikit-learn releases deprecate it.
classifier = LogisticRegression(C=1.0,
                                solver='lbfgs',     #'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'
                                max_iter=1000,
                                random_state=1,
                                verbose=1,
                                n_jobs=-1)

# classifier = svm.SVC(C=1.0, 
#                      kernel='rbf', #'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
#                      degree=1, 
#                      gamma='auto', 
#                      coef0=1.0, 
#                      shrinking=True, 
#                      probability=True, 
#                      tol=0.001, 
#                      cache_size=200, 
#                      class_weight=None, 
#                      verbose=1, 
#                      max_iter=5, 
#                      decision_function_shape='ovr', 
#                      break_ties=False, 
#                      random_state=1)

# classifier = neighbors.KNeighborsClassifier(n_neighbors=1000,  
#                                             weights='uniform',
#                                             algorithm='auto',   
#                                             leaf_size=5, 
#                                             p=2,   
#                                             metric='minkowski', 
#                                             metric_params=None, 
#                                             n_jobs=-1, )


# classifier = MLPClassifier(hidden_layer_sizes=(10), 
#                            activation='relu', 
#                            solver='adam', 
#                            alpha=0.0001, 
#                            batch_size='auto', 
#                            learning_rate='constant', 
#                            learning_rate_init=0.001, 
#                            power_t=0.5, 
#                            max_iter=200, 
#                            shuffle=False, 
#                            random_state=42, 
#                            tol=0.0001, 
#                            verbose=False, 
#                            warm_start=False, 
#                            momentum=0.9, 
#                            nesterovs_momentum=True, 
#                            early_stopping=False, 
#                            validation_fraction=0.1, 
#                            beta_1=0.9, 
#                            beta_2=0.999, 
#                            epsilon=1e-08, 
#                            n_iter_no_change=10, 
#                            max_fun=15000)

# classifier = RandomForestClassifier(n_estimators=10, 
#                                     criterion='gini', 
#                                     max_depth=5, 
#                                     min_samples_split=2, 
#                                     min_samples_leaf=1, 
#                                     min_weight_fraction_leaf=0.0, 
#                                     max_features='auto', 
#                                     max_leaf_nodes=None, 
#                                     min_impurity_decrease=0.0, 
#                                     min_impurity_split=None, 
#                                     bootstrap=True, 
#                                     oob_score=True, 
#                                     n_jobs=-1, 
#                                     random_state=42, 
#                                     verbose=1, 
#                                     warm_start=False, 
#                                     class_weight=None, 
#                                     ccp_alpha=0.0, 
#                                     max_samples=None )

# params = {
#     'objective': 'binary', 
#     'max_bin': 1000,
#     'learning_rate': 0.1,
#     'num_leaves': 10
# }

# lgb_train = lgb.Dataset(Xt, Yt)
# lgb_eval = lgb.Dataset(Xv, Yv, reference=lgb_train)

# classifier = lgb.train(
#     params, lgb_train,
#     valid_sets=[lgb_train, lgb_eval],
#     verbose_eval=10,
#     num_boost_round=500,
#     early_stopping_rounds=10
# )

print('training..')
# Yt is a one-column frame; flatten it to 1-D so the estimator does not emit a
# column-vector DataConversionWarning.
classifier.fit(Xt, Yt.values.ravel())

In [None]:
# Predicted probability of the positive class for every validation row.
lr_probs = classifier.predict_proba(Xv)[:, 1]

# Area under the ROC curve on the validation fold.
lr_auc = roc_auc_score(Yv, lr_probs)
print('Logistic: ROC AUC=%.3f' % lr_auc)

# ROC curve points; the thresholds are not needed for the plot.
lr_fpr, lr_tpr, _ = roc_curve(Yv, lr_probs)

fig, ax = plt.subplots()
ax.plot(lr_fpr, lr_tpr, marker='.', label='Logistic', color='darkorange', lw=1)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()

In [None]:
# Hard class predictions feed the F1 score.
yhat = classifier.predict(Xv)
lr_f1 = f1_score(Yv, yhat)

# Precision-recall curve from the positive-class probabilities in `lr_probs`.
lr_precision, lr_recall, _ = precision_recall_curve(Yv, lr_probs)
# `lr_auc` is rebound here to the area under the precision-recall curve.
lr_auc = auc(lr_recall, lr_precision)

# Report both scores.
print('Logistic: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))

# Draw the precision-recall curve with labelled axes and a legend.
fig, ax = plt.subplots()
ax.plot(lr_recall, lr_precision, marker='.', label='Logistic', color='darkorange', lw=1)
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.legend()
plt.show()