In [None]:
!pip install --quiet category_encoders
!pip install --quiet pycaret[full]
!pip install --quiet optuna

[K     |████████████████████████████████| 81kB 7.7MB/s 
[K     |████████████████████████████████| 266kB 32.0MB/s 
[K     |████████████████████████████████| 112kB 48.9MB/s 
[K     |████████████████████████████████| 81kB 8.4MB/s 
[K     |████████████████████████████████| 6.8MB 50.4MB/s 
[K     |████████████████████████████████| 276kB 42.1MB/s 
[K     |████████████████████████████████| 1.4MB 23.5MB/s 
[K     |████████████████████████████████| 61kB 8.1MB/s 
[K     |████████████████████████████████| 1.7MB 32.3MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 256kB 40.0MB/s 
[K     |████████████████████████████████| 14.2MB 207kB/s 
[K     |████████████████████████████████| 2.0MB 20.0MB/s 
[K     |████████████████████████████████| 174kB 44.0MB/s 
[K     |███████

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline
import joblib

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, classification_report, roc_curve, roc_auc_score
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_val_predict, StratifiedKFold, learning_curve
import lightgbm as lgb
from lightgbm import LGBMClassifier
from category_encoders.cat_boost import CatBoostEncoder
import torch
import random
import os

  import pandas.util.testing as tm


In [None]:
# Fix every random number generator this notebook touches to one shared seed.
seed = 42

# Python-level sources of randomness.
# NOTE(review): PYTHONHASHSEED is assigned after the interpreter has started,
# so it presumably only reaches child processes -- confirm that is sufficient.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)

# PyTorch: seed the CPU generator and all CUDA generators, and switch cuDNN to
# deterministic mode with benchmarking turned off.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Read the training and test tables from the mounted Drive folder, in that
# order, then preview the first training rows.
train, test = map(
    pd.read_csv,
    ("/content/drive/MyDrive/train.csv", "/content/drive/MyDrive/test.csv"),
)
train.head()

Unnamed: 0,row_id,order_id,student_id,bundle_id,question_id,feature_1,feature_2,feature_3,feature_4,feature_5,correct
0,0,0,0,891,4664,18000,18000,0,0,0,1
1,1,0,1,60,5957,11000,11000,0,0,0,0
2,2,0,1,60,5951,26000,26000,0,5,0,1
3,3,0,1,60,5952,41000,41000,0,3,0,1
4,4,0,2,1783,6101,12000,12000,0,4,0,1


In [None]:
# Preprocess `train` and `test` in lock-step: drop the row identifier, run the
# CatBoost encoder, standardise the two numeric features, and mark
# feature_3/4/5 as pandas categoricals. Every transformer is fitted on `train`
# only and then applied to `test`.

# Rebind instead of dropping in place so the frames produced by the load cell
# are not mutated behind the reader's back.
train = train.drop(columns=['row_id'])
test = test.drop(columns=['row_id'])
dataset = [train, test]  # kept so the name stays available to any later cell

ce = CatBoostEncoder()
cat_feats = ['order_id', 'student_id', 'bundle_id', 'question_id', 'feature_3', 'feature_4', 'feature_5']
# NOTE(review): the preview produced by this cell shows these columns with
# their original integer values, so the encoder appears to leave integer
# columns untouched when `cols=` is not passed. Confirm, then either pass
# `cols=` for the columns that should really be target-encoded or remove this
# step; feature_3/4/5 are cast to `category` below, which would conflict with
# encoding them.
train[cat_feats] = ce.fit_transform(train[cat_feats], train['correct'])
test[cat_feats] = ce.transform(test[cat_feats])

# Standardise by column name. The previous positional `iloc[:, 4:6]` assignment
# depended on column order and wrote float values into integer columns in
# place, which recent pandas versions deprecate.
sc = StandardScaler()
num_feats = ['feature_1', 'feature_2']
train[num_feats] = sc.fit_transform(train[num_feats])
test[num_feats] = sc.transform(test[num_feats])

for feat in ['feature_3', 'feature_4', 'feature_5']:
  train[feat] = train[feat].astype('category')
  test[feat] = test[feat].astype('category')
train.head()

Unnamed: 0,order_id,student_id,bundle_id,question_id,feature_1,feature_2,feature_3,feature_4,feature_5,correct
0,0,0,891,4664,-0.023827,-0.029006,0,0,0,1
1,0,1,60,5957,-0.028121,-0.033224,0,0,0,0
2,0,1,60,5951,-0.018918,-0.024186,0,5,0,1
3,0,1,60,5952,-0.009715,-0.015147,0,3,0,1
4,0,2,1783,6101,-0.027508,-0.032622,0,4,0,1


In [None]:
# PyCaret's classification API is star-imported because the cells below call
# its functions (compare_models, tune_model, blend_models, ...) unqualified.
from pycaret.classification import *

# Initialise the experiment on `train` with `correct` as the target and
# train_size=0.9. `session_id` is tied to the notebook-wide `seed`; without it
# PyCaret picks a random session id on every run, so the split and the
# cross-validation results would not be reproducible.
clf = setup(data = train, target = 'correct', train_size=0.9, session_id=seed)

Unnamed: 0,Description,Value
0,session_id,1974
1,Target,correct
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(590079, 10)"
5,Missing Values,False
6,Numeric Features,6
7,Categorical Features,3
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
# lg_params = {"random_state": [42],
#              "objective": ["binary"],
#              "metric": ["auc"],
#              "categorical_feature": ['feature_3', 'feature_4', 'feature_5'],
#              'n_estimators': np.random.randint(400, 1000),
#              'learning_rate': [0.05, 0.1],
#              'num_leaves': np.random.randint(450, 1024),
#              'max_depth': np.random.randint(-1, 32),
#              'reg_alpha': np.random.uniform(1E-16, 25),
#              'reg_lambda': np.random.uniform(1E-16, 25),
#              'colsample_bytree': np.random.uniform(0.5, 1.0),
#              'subsample': np.random.uniform(0.4, 1.0)}

In [None]:
# lgbm = create_model('lightgbm')
# tune_lgbm = tune_model(lgbm, n_iter=100, optimize='AUC', custom_grid=lg_params, choose_better=True)

In [None]:
# Rank the three candidate model families by AUC and keep all three
# (n_select=3), then run a 50-iteration AUC-optimised tune on each of them.
top3 = compare_models(sort = 'AUC', n_select = 3, include = ['lightgbm','rf','catboost'])
tuned_top3 = [tune_model(i, n_iter=50, optimize = 'AUC') for i in top3]

# The blend cell further down uses lgbm1/lgbm2/lgbm3. They previously existed
# only as commented-out code, so a fresh kernel hit a NameError there. They are
# restored here with the hyper-parameters that were recorded in those comments.
# NOTE(review): `categorical_feature=[6, 7, 8]` is positional; presumably it is
# meant to address feature_3/4/5 -- confirm the indices still line up with the
# data PyCaret hands to the estimator.
lgbm_shared = dict(
    random_state=42,
    objective="binary",
    metric="auc",
    categorical_feature=[6, 7, 8],
)
lgbm1 = create_model(
    'lightgbm',
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=794,
    max_depth=32,
    reg_alpha=0.08357518142850195,
    reg_lambda=0.24641636383948007,
    colsample_bytree=0.5777255100396145,
    subsample=0.45805579547952935,
    cat_smooth=46.717143511098826,
    **lgbm_shared,
)
lgbm2 = create_model(
    'lightgbm',
    n_estimators=5000,
    learning_rate=0.05,
    num_leaves=683,
    max_depth=-1,
    reg_alpha=7.343826329039283,
    reg_lambda=6.097790338135608,
    colsample_bytree=0.6648831061301637,
    subsample=0.5866576887013851,
    cat_smooth=29.739488628848175,
    **lgbm_shared,
)
lgbm3 = create_model(
    'lightgbm',
    n_estimators=5000,
    learning_rate=0.1,
    num_leaves=812,
    max_depth=11,
    reg_alpha=12.276001743272404,
    reg_lambda=9.39290007481496,
    colsample_bytree=0.5675131603273436,
    subsample=0.5909281388417449,
    cat_smooth=32.38414121118627,
    **lgbm_shared,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8834,0.9598,0.9503,0.8614,0.9037,0.7569,0.7628
1,0.8862,0.9609,0.9549,0.8621,0.9061,0.7625,0.769
2,0.8836,0.9595,0.9532,0.8598,0.9041,0.7571,0.7637
3,0.8818,0.9595,0.9541,0.8568,0.9028,0.7532,0.7603
4,0.8813,0.958,0.9527,0.857,0.9023,0.7522,0.7591
5,0.8828,0.9594,0.9518,0.8596,0.9033,0.7555,0.7618
6,0.8828,0.9592,0.9536,0.8584,0.9035,0.7553,0.7621
7,0.8825,0.9588,0.9527,0.8585,0.9032,0.7547,0.7613
8,0.8806,0.9579,0.9519,0.8566,0.9017,0.7507,0.7575
9,0.8855,0.961,0.9559,0.8605,0.9057,0.7609,0.7677


IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 19.5min


KeyboardInterrupt: ignored

In [None]:
# Blend the three LightGBM models into one ensemble (method='soft', 5 folds).
lgbm_members = [lgbm1, lgbm2, lgbm3]
ens_model = blend_models(
    estimator_list=lgbm_members,
    fold=5,
    method='soft',
)

In [None]:
# Calibrate the blended ensemble, then score the test frame with the
# calibrated model. `predictions1` is the frame exported further down.
calibrated_blended = calibrate_model(estimator=ens_model)
predictions1 = predict_model(
    estimator=calibrated_blended,
    data=test,
)

In [None]:
# Score the uncalibrated blend without passing any data (PyCaret's default
# evaluation set), finalize it, and predict on the test frame.
pred_holdout = predict_model(estimator=ens_model)
final_model = finalize_model(estimator=ens_model)
predictions2 = predict_model(
    estimator=final_model,
    data=test,
)

In [None]:
# Write the calibrated-blend predictions to Drive.
# Alternative export of the finalized (uncalibrated) blend, currently disabled:
# predictions2.to_csv('/content/drive/MyDrive/lgbmoptuna3blends2.csv')
predictions1.to_csv(
    path_or_buf='/content/drive/MyDrive/lgbmoptuna3blends.csv',
)

In [None]:
# Learning-curve diagnostic for the finalized blend.
plot_model(
    estimator=final_model,
    plot='learning',
)

In [None]:
# Feature-importance diagnostic ('feature_all') for the finalized blend.
# NOTE(review): the blend is an ensemble wrapper; confirm that it exposes the
# importances/coefficients this plot type needs.
plot_model(
    estimator=final_model,
    plot='feature_all',
)

In [None]:
# Threshold diagnostic for the finalized blend.
plot_model(
    estimator=final_model,
    plot='threshold',
)

In [None]:
# AUC diagnostic for the finalized blend.
plot_model(
    estimator=final_model,
    plot='auc',
)