In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline
import joblib

In [None]:
!pip install --quiet category_encoders
!pip install --quiet optuna

[K     |████████████████████████████████| 81kB 3.4MB/s 
[K     |████████████████████████████████| 307kB 4.3MB/s 
[K     |████████████████████████████████| 81kB 8.2MB/s 
[K     |████████████████████████████████| 174kB 33.3MB/s 
[K     |████████████████████████████████| 51kB 6.1MB/s 
[K     |████████████████████████████████| 143kB 46.0MB/s 
[K     |████████████████████████████████| 112kB 54.6MB/s 
[K     |████████████████████████████████| 81kB 8.4MB/s 
[?25h  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone


In [None]:
# !git clone --recursive https://github.com/Microsoft/LightGBM
# %cd /content/LightGBM
# !mkdir build
# !cmake -DUSE_GPU=1
# !make -j$(nproc)
# !sudo apt-get -y install python-pip
# !sudo -H pip install setuptools pandas numpy scipy scikit-learn -U
# %cd /content/LightGBM/python-package/
# !sudo python setup.py install --precompile

In [None]:
import optuna
from lightgbm import LGBMRegressor, LGBMClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import QuantileTransformer, StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from category_encoders.cat_boost import CatBoostEncoder

import torch
import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
import tqdm
import random
import os
import glob


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



In [None]:
# Use one seed for every random number generator touched by this notebook.
seed = 42

# Python-level generators and the hash seed environment variable.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)

# PyTorch generators (CPU and every CUDA device).
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# cuDNN settings: disable benchmarking and request deterministic behaviour.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Read the train and test splits from the mounted Drive folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/drive/MyDrive/train.csv",
                     "/content/drive/MyDrive/test.csv")
)

In [None]:
# Remove the 'row_id' column from both frames. The drop happens in place, so
# running this cell a second time on the same frames raises a KeyError.
dataset = [train, test]
for frame in dataset:
  frame.drop(columns='row_id', inplace=True)

# Columns that are target-encoded below and later handed to LightGBM as
# categorical features.
cat_feats = ['order_id', 'student_id', 'bundle_id', 'question_id', 'feature_3', 'feature_4', 'feature_5']

# Fit the CatBoost encoder on train (target: 'correct') and reuse the fitted
# encoder for test.
ce = CatBoostEncoder()
train[cat_feats] = ce.fit_transform(train[cat_feats], train['correct'])
test[cat_feats] = ce.transform(test[cat_feats])

# Standardise the two columns at positions 4 and 5; the scaler is fitted on
# train only and then applied to test.
# NOTE(review): the columns are picked by position, not by name - presumably
# these are feature_1 and feature_2; confirm against the CSV layout.
sc = StandardScaler()
train.iloc[:, 4:6] = sc.fit_transform(train.iloc[:, 4:6])
test.iloc[:, 4:6] = sc.transform(test.iloc[:, 4:6])

# Cast the (already encoded) categorical columns to pandas 'category' dtype.
# NOTE(review): after encoding these columns hold floats, so every distinct
# encoded value becomes its own category, and train/test are cast
# independently. It looks like test values will rarely match a category seen
# in train - confirm that this cast is intended together with the encoding.
for frame in (train, test):
  for feat in cat_feats:
    frame[feat] = frame[feat].astype('category')

In [None]:
class Objective:
    """Optuna objective: mean 10-fold validation AUC of a LightGBM binary model.

    The instance is callable (``objective(trial)``) and also provides
    ``callback`` for ``study.optimize(..., callbacks=[objective.callback])``,
    which remembers the booster that belongs to the best trial so far.

    The module-level names ``train``, ``test`` and ``cat_feats`` are read when
    the objective is called; ``train`` must contain the target column
    ``'correct'``.

    Attributes:
        best_booster: booster recorded for the best trial (``None`` until a
            trial has completed and been reported to ``callback``).
        save_threshold: mean AUC above which test predictions and the model
            are written to disk.
    """

    def __init__(self, save_threshold=0.965):
        self.best_booster = None
        # Booster trained on the most recent fold of the most recent trial.
        self._booster = None
        self.save_threshold = save_threshold

    def __call__(self, trial):
        """Run one trial and return the mean validation AUC over the folds.

        Args:
            trial: the ``optuna`` trial used to sample hyper-parameters and to
                report intermediate values for pruning.

        Returns:
            The average of the per-fold ROC AUC scores.
        """
        param = {
            "random_state": 42,
            "objective": "binary",
            "metric": "auc",
            "categorical_feature": cat_feats,
            "verbosity": -1,
            # 'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
            "n_estimators": trial.suggest_int('n_estimators', 400, 1000),
            "learning_rate": trial.suggest_categorical('learning_rate', [0.05, 0.1]),
            'num_leaves': trial.suggest_int('num_leaves', 450, 1024),
            'max_depth': trial.suggest_int('max_depth', -1, 32),
            'reg_alpha': trial.suggest_float('reg_alpha', 1E-16, 25),
            'reg_lambda': trial.suggest_float('reg_lambda', 1E-16, 25),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            # Row subsampling is only applied by LightGBM when the bagging
            # frequency is non-zero; without it the tuned value is ignored.
            'subsample_freq': 1,
            # The Optuna parameter name used to carry a trailing space
            # ('subsample '), which leaked into study.best_params.
            'subsample': trial.suggest_float('subsample', 0.4, 1.0),
            # 'device': 'gpu'
            # 'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 50.0)
        }

        # Select features by name so that train and test always use exactly
        # the same columns, wherever the target column sits.
        target = 'correct'
        feature_cols = [col for col in train.columns if col != target]
        X, y = train[feature_cols], train[target]

        aucs = []
        cv = StratifiedKFold(10, shuffle=True, random_state=42)

        pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'auc', valid_name='valid_1')

        for train_idx, val_idx in tqdm.tqdm(cv.split(X, y), total=cv.get_n_splits()):
            # The splitter yields positional indices, hence .iloc.
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

            d_train = lgb.Dataset(X_train, label=y_train)
            d_valid = lgb.Dataset(X_valid, label=y_valid)
            # The training set comes first, so the validation set is the one
            # named 'valid_1' that the pruning callback watches.
            watchlist = [d_train, d_valid]

            model = lgb.train(param,
                              train_set=d_train,
                              valid_sets=watchlist,
                              verbose_eval=0,
                              early_stopping_rounds=100,
                              callbacks=[pruning_callback])

            # Only the booster of the latest fold is kept.
            self._booster = model

            preds = model.predict(X_valid)
            aucs.append(roc_auc_score(y_valid, preds))

        mean_auc = np.average(aucs)

        if mean_auc > self.save_threshold:
            output_dir = '/content/drive/MyDrive/pycaret/output'
            model_dir = '/content/drive/MyDrive/pycaret/model'
            # Make sure a finished trial is not lost to a missing folder.
            os.makedirs(output_dir, exist_ok=True)
            os.makedirs(model_dir, exist_ok=True)

            # Test predictions come from the booster of the last fold only.
            pred = model.predict(test[feature_cols])
            lgbmbest = pd.DataFrame(pred)
            lgbmbest.to_csv(f'/content/drive/MyDrive/pycaret/output/{mean_auc}.csv')
            joblib.dump(model, f'/content/drive/MyDrive/pycaret/model/{mean_auc}.pkl')
        return mean_auc

    def callback(self, study, trial):
        """Remember the latest booster when ``trial`` is the study's best trial.

        Args:
            study: the study being optimised.
            trial: the trial that has just finished.
        """
        try:
            best_number = study.best_trial.number
        except ValueError:
            # No trial has completed yet (e.g. the first one was pruned or
            # failed), so there is no best trial to compare against.
            return
        if best_number == trial.number:
            self.best_booster = self._booster

In [None]:
# Run the hyper-parameter search and predict the test set with the booster
# that was recorded for the best trial.
objective = Objective()

study = optuna.create_study(study_name = 'lgbm_parameter_optuna', direction="maximize", pruner=optuna.pruners.MedianPruner(n_warmup_steps=10))

study.optimize(objective, n_trials=100, callbacks=[objective.callback])

print("Best trial:")
trial = study.best_trial

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

best_model = objective.best_booster
if best_model is None:
    raise RuntimeError("No booster was recorded for the best trial; nothing to predict with.")

# Use the training feature names (everything except the target) so the test
# matrix has exactly the columns the booster was trained on.
feature_cols = [col for col in train.columns if col != 'correct']
pred = best_model.predict(test[feature_cols])
lgbmbest = pd.DataFrame(pred)
lgbmbest.to_csv('/content/drive/MyDrive/pycaret/lgbmoptuna.csv')

[32m[I 2021-06-24 02:40:25,992][0m A new study created in memory with name: lgbm_parameter_optuna[0m
0it [00:00, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Collect every prediction file found in the output folder.
files = glob.glob('/content/drive/MyDrive/pycaret/output/*.csv')

# Load each file, renaming the unnamed index column to 'row_id' and the
# column called '0' to 'correct'.
column_names = {'Unnamed: 0': 'row_id', '0': 'correct'}
dfs = []
for csv_file in files:
  dfs.append(pd.read_csv(csv_file).rename(columns=column_names))
files

In [None]:
# Average the 'correct' column over all loaded prediction frames; 'row_id'
# is taken from the first frame.
summed = dfs[0]['correct']
for frame in dfs[1:]:
  summed = summed + frame['correct']

output = pd.DataFrame(columns=['row_id', 'correct'])
output['row_id'] = dfs[0]['row_id']
output['correct'] = summed / len(dfs)
output.head()

Unnamed: 0,row_id,correct
0,0,0.768669
1,1,0.413607
2,2,0.995534
3,3,0.99937
4,4,0.99506


In [None]:
# Rules taken from the author's EDA notes:
#   * feature_5 != 0  =>  correct == 0
#   * feature_1 < feature_2: only 8 out of 152990 such rows have correct == 1,
#     so these rows are treated as correct == 0 as well.
#
# The rules refer to the raw feature values, so they are evaluated on a fresh
# copy of the test file: `test` has been encoded, scaled and cast to category
# by the preprocessing cell and no longer holds the raw values.
test_original = pd.read_csv("/content/drive/MyDrive/test.csv")
cond1eda = test_original[test_original['feature_5'] != 0].index
cond2eda = test_original[test_original['feature_1'] < test_original['feature_2']].index
cond2eda

Int64Index([    7,    10,    13,    32,    36,    42,    46,    61,    62,
               64,
            ...
            12768, 12780, 12797, 12814, 12829, 12836, 12838, 12839, 12845,
            12850],
           dtype='int64', length=2243)

In [None]:
# Force the prediction to 0 for every row selected by either EDA condition.
for rule_rows in (cond1eda, cond2eda):
  output.loc[rule_rows, 'correct'] = 0
output.head(20)

Unnamed: 0,row_id,correct
0,0,0.768669
1,1,0.413607
2,2,0.995534
3,3,0.99937
4,4,0.99506
5,5,0.0
6,6,0.0
7,7,0.0
8,8,0.700819
9,9,0.388238


In [None]:
# Write the final predictions without the DataFrame index.
output.to_csv(
    path_or_buf='/content/drive/MyDrive/pycaret/output.csv',
    index=False,
)