In [None]:
import collections
import numpy as np
import pandas as pd
import nltk
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from collections import Counter
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm
from scipy.sparse import csr_matrix
import gensim
import os
os.listdir("../input/ykc-cup-2nd/")

## read data 

In [None]:
# Load the competition files from the Kaggle input directory.
train = pd.read_csv("../input/ykc-cup-2nd/train.csv")
test = pd.read_csv("../input/ykc-cup-2nd/test.csv")
sub = pd.read_csv("../input/ykc-cup-2nd/sample_submission.csv")
# Show the (rows, columns) of each frame as a quick sanity check.
train.shape, test.shape, sub.shape

In [None]:
train.head()

In [None]:
test.head()

In [None]:
sub.head()

In [None]:
## Concatenate train and test so the features can be built for both in one pass.
## Train rows come first, so after reset_index they occupy positions 0..len(train)-1.
df = pd.concat([train, test])
df = df.reset_index(drop=True)
df.shape

## EDA

In [None]:
train[train["department_id"] == 3].head()
## Vegetables or fruit?

In [None]:
train[train["department_id"] == 12].head()
## Seasonings / condiments?

In [None]:
train[train["department_id"] == 16].head()
## Laundry supplies and the like

## feature engineering

In [None]:
# Lower-case each product name, drop commas and ampersands, then split on single spaces.
# Splitting on " " (not on arbitrary whitespace) means consecutive spaces yield empty-string tokens.
# NOTE: this overwrites the column with lists, so the cell cannot be run twice (lists have no .lower()).
df["product_name"] = df["product_name"].apply(lambda words : words.lower().replace(",", "").replace("&", "").split(" "))
df.head()

In [None]:
# stopword & normalization
# stopword settings
# English stop-word list from NLTK; tokens found in it are dropped from the product names
# further down in this cell.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# normalizing setting
# https://yukinoi.hatenablog.com/entry/2018/05/29/120000
import re

# Replacement table used by get_shortended_word(): each key found in a token
# (after digit runs have been collapsed to "0") is replaced by its value.
# NOTE(review): keys that end in punctuation ("flamin'", 'grands!', 'ahoy!', 'whips!')
# are wrapped in \b...\b below, and \b after a non-word character only matches when a
# word character follows, so these entries do not fire when the punctuation ends the token.
shortened = {
    '\'m': ' am',
    '\'re': ' are',
    'don\'t': 'do not',
    'doesn\'t': 'does not',
    'didn\'t': 'did not',
    'won\'t': 'will not',
    'wanna': 'want to',
    'gonna': 'going to',
    'gotta': 'got to',
    'hafta': 'have to',
    'needa': 'need to',
    'outta': 'out of',
    'kinda': 'kind of',
    'sorta': 'sort of',
    'lotta': 'lot of',
    'lemme': 'let me',
    'gimme': 'give me',
    'getcha': 'get you',
    'gotcha': 'got you',
    'letcha': 'let you',
    'betcha': 'bet you',
    'shoulda': 'should have',
    'coulda': 'could have',
    'woulda': 'would have',
    'musta': 'must have',
    'mighta': 'might have',
    'dunno': 'do not know',
    # replacements derived from the actual data
    'softgels': "soft gels supplement",
    "almondmilk": "almond milk",
    "lunchables": "lunch",
    "febreze": "deodorant",
    'steamfresh': 'steam fresh',
    "lil": 'little',
    'volumizing': 'volume',
    'rigate': 'penne',
    'anticavity': 'cavity protection',
    'keurig': 'coffee',  # not sure
    'eggo': 'waffle',  # needs checking
    'pantiliners': 'panty liner',
    'nutri': 'nutrition',  # needs checking
    'havarti': 'cheese',
    '0.0oz': 'deodorant',
    '0oz': 'deodorant',
    'velveeta': 'cheese',
    'organix': 'organic',  # shampoo or dog food?
    'muenster': 'cheese',
    'smartblend': 'smart blend dog pet',  # dog foods?
    'snickerdoodle': 'cookie',
    '0ct': 'diamond ring',  # carat
    'grands!': 'cat pet',  # cat foods?
    'umcka': 'supplement',
    'marzano': 'pizza tomato',
    'butterfinger': 'candy chocolate peanut',  # Butterfinger
    'modena': 'italy',
    'unstopables': 'deodorant',
    'yokids': 'sandal',
    "flamin'": 'gay',  # ?
    'beneful': 'dog pet',
    'swaddlers': 'diapers',
    'compleats': 'meal preserved',  # preserved food?
    'sambucus': 'drink berry',
    'lindor': 'chocolate gift',
    'macrobar': 'chocolate peanut',
    'honeycrisp': 'apple',  #?
    'ahoy!': 'europe',  # European greeting?
    'whips!': "whip", # whip?
    'arrabbiata': 'tomato sauce',
    'craisins': 'dry berry fruit',
    'nyquil': 'medicine',
    'actionpacs': 'detergent',
    'sproutofu': 'organic teriyaki tofu', # ??
    'chewables': 'medicine',
    'gurt': 'pig',
    'krunch': 'chocolate',
    'doubleshot': 'coffee',
    'activia': 'yogurt',
    'fillo': 'pillow',
    'snax': 'snack',  # not sure
    'snackimals': 'snack organic',
    'oxiclean': 'bleach detergent',
    'chex': 'cheese', # ??
    'tahitian': 'tahiti',
    'montebello': 'oil', # ??
    'vegenaise': 'seasoning spice',
    'noticeables': 'deodorant',
    'scoopable': 'cat pet', # ??
    'wetjet': 'clean',
    'pantene': 'shampoo',
    'shirataki': 'noodle',  # is "noodle" right for shirataki?
    'triscuit': 'snack',
    'dophilus': 'capsule supplement',
    'danimals': 'sweet', # brand that seems to sell sweet foods
    'purina': 'cat dog pet',  # pet food
    'creamline': 'hamburger',
    'funfetti': 'cookie',  # not-very-tasty-looking cookie?
    'friskies': 'cat pet',
    'krinkle': 'biscuit',
    'antigingivitis': 'mouse wash tooth',
    'nesquik': 'chocolate drink milk',
    'sleepytime': 'herb tea',
    'gillette': 'shaving shaver shave',
    'antiplaque': 'tooth mouse wash',
    'detangler': 'treatment shampoo',
    'wintermint': 'gum',
    'perspirant': 'deodorant',
    'clorox': 'deodorant',
    'multimineral': 'multi mineral',
    'hommus': 'bean',
    'steamables': 'potato vegetable',
    'dentastix': 'dog pet',
    'nutrish': 'dog pet',
    '0st': '0',
    '0nd': '0',
    '0rd': '0',
}

# One alternation over all keys, each wrapped in word boundaries.
# Keys are passed through re.escape so that regex metacharacters in them are matched
# literally. Without it the "." in '0.0oz' is a wildcard: a token such as "0-0oz" would
# match, and the lookup shortened["0-0oz"] in get_shortended_word would raise KeyError.
shortened_re = re.compile(
    '(?:' + '|'.join('\\b' + re.escape(key) + '\\b' for key in shortened) + ')'
)

def get_shortended_word(word: str) -> list:
    """
    Normalise a single token and return the resulting list of sub-tokens.

    Steps, in order:
      1. every run of digits becomes "0";
      2. every `shortened` key found in the token is replaced by its value;
      3. "'s" and "'n" are removed;
      4. the characters % \\ ! ( ) # . ™ " ' ® are removed;
      5. + - / : are turned into spaces;
      6. the result is split on single spaces.

    Because the split is on " " (not on arbitrary whitespace), the returned list always has
    at least one element and may contain empty strings (e.g. for "" or "-").

    :param word: one lower-cased token of a product name
    :return: list of normalised sub-tokens
    """
    shortened_word = re.sub(r"[0-9]+", "0", word)  # digits
    # every match is exactly one of the escaped keys, so the dict lookup cannot miss
    shortened_word = shortened_re.sub(lambda w: shortened[w.group(0)], shortened_word)  # replacement table
    shortened_word = re.sub(r"'(s|n)", "", shortened_word)  # drop 's and 'n
    shortened_word = re.sub(r"(%|\\|!|\(|\)|#|\.|™|\"|\'|®)", "", shortened_word)  # drop assorted symbols
    shortened_word = re.sub(r"(\+|\-|\/|:)", " ", shortened_word)  # +, -, /, : become spaces
    return shortened_word.split(" ")

def flatten(l):
    """
    Lazily flatten an arbitrarily nested iterable into a flat stream of items.

    [[1,2], 3, [4,5]] => 1, 2, 3, 4, 5

    Strings and bytes are treated as atoms and are yielded whole, not character by character.

    :param l: iterable whose elements may themselves be iterables
    :return: generator over the leaf elements, in order
    """
    # Import the ABC explicitly: `import collections` alone does not guarantee that the
    # `collections.abc` submodule is loaded, so `collections.abc.Iterable` could fail.
    from collections.abc import Iterable

    for el in l:
        if isinstance(el, Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el

# Normalise every token; each token expands to a list of sub-tokens (list of lists per product).
df["product_name"] = df["product_name"].apply(lambda words : [get_shortended_word(w) for w in words])

# Flatten, drop stop words and de-duplicate.
# - The stop words are put in a set so each membership test is O(1) instead of a list scan.
# - sorted() gives a deterministic token order; list(set(...)) depends on the per-process
#   string hash seed, which made the order (and the float summation order of the averaged
#   word vectors) vary between runs.
stop_word_set = set(stop_words)
df["product_name"] = df["product_name"].apply(
    lambda words: sorted({w for word in words for w in word if w not in stop_word_set})
)

df.head()

In [None]:
# Miscellaneous features
df["name_num"] = df["product_name"].apply(len)  # number of tokens in the product name
df["in_num"] = df["product_name"].apply(lambda words: any(c.isdigit() for w in words for c in w))  # does any token contain a digit?
# Share of training rows having each order_dow_mode value
dow_counter = collections.Counter(train['order_dow_mode'].tolist())
dow_rate = {k: v / len(train) for k, v in dow_counter.items()}
# .map() + fillna instead of dow_rate[x]: a value that occurs only in the test rows gets a
# rate of 0.0 rather than raising KeyError.
df["dow_rate"] = df["order_dow_mode"].map(dow_rate).fillna(0.0)
# Share of training rows having each order_hour_of_day_mode value
day_counter = collections.Counter(train['order_hour_of_day_mode'].tolist())
day_rate = {k: v / len(train) for k, v in day_counter.items()}
df["day_rate"] = df["order_hour_of_day_mode"].map(day_rate).fillna(0.0)
# order_rate の値の大きさ
def get_order_rate_basis(order_rate: float):
    """
    Map an order rate to a coarse magnitude bucket.

    The bucket is the position of the first lower bound that `order_rate` strictly exceeds:
    0 for rates above 5e-4, 1 for rates above 1e-4, ... 7 for rates above 1e-7.
    Anything else (including values equal to or below 1e-7, and NaN) falls in bucket 8.

    :param order_rate: the order_rate value of one product
    :return: integer bucket in 0..8 (smaller bucket = larger rate)
    """
    lower_bounds = (5e-4, 1e-4, 5e-5, 1e-5, 5e-6, 1e-6, 5e-7, 1e-7)
    for bucket, bound in enumerate(lower_bounds):
        if order_rate > bound:
            return bucket
    return len(lower_bounds)
df["order_rate_basis"] = df["order_rate"].apply(get_order_rate_basis)

# Hand-picked keywords per department_id (key = department id, value = keywords).
# Misspelled keywords that could never match a real token were corrected:
# mashrooms, callfornia, ponot, spacy, sparking, lemmon, suger.
# NOTE(review): 'cheeze' (0) and 'humms' / 'hum' (19) are left as written because the
# intended spelling is not certain (brand spelling? 'hummus' / 'ham'?) - please confirm.
important_words = {
    0: ['cream', 'ice', 'chicken', 'pizza', 'frozen', 'cheeze', 'chocolate', 'vanilla', 'gluten'],
    1: ['sleep', 'liquid', 'melatonin', 'baby', 'mix', 'tablets', 'flavor', 'natural', 'hand'],
    2: ['bread', 'whole', 'grain', 'tortillas', 'buns', 'gluten', 'rolls', 'chocolate'],
    3: ['baby', 'red', 'salad', 'bag', 'potato', 'potatoes', 'lettuce', 'sweet', 'green', 'apple', 'mushrooms'],
    4: ['wine', 'beer', 'ale', 'sauvignon', 'california', 'cabernet', 'lager', 'chardonnay', 'red', 'whiskey', 'pinot'],
    5: ['sauce', 'rice', 'noodles', 'noodle', 'thai', 'soup', 'miso', 'curry', 'spicy', 'sesame', 'medium'],
    6: ['tea', 'juice', 'water', 'coffee', 'drink', 'green', 'sparkling','soda', 'orange', 'lemon', 'ginger'],
    7: ['cat', 'dog', 'chicken', 'beef', 'treat', 'treats', 'turkey', 'adult', 'flavor', 'dry', 'tuna', 'salmon'],
    8: ['pasta', 'rice', 'sauce', 'cheese', 'whole', 'grain', 'spaghetti', 'macaroni', 'chicken', 'garlic', 'brown', 'tomato'],
    9: ['rice', 'bean', 'beans', 'granola', 'brown', 'super', 'cranberry', 'mung', 'rolled', 'oats', 'pesto', 'sauce', 'berry'],
    10: ['body', 'shampoo', 'oil', 'conditioner', 'wash', 'deodorant', 'soap', 'vitamin', 'hand', 'with','tablets', 'mint'],
    11: ['chicken', 'sausage', 'smoked', 'beef', 'turkey', 'bacon', 'boneless', 'pork', 'breast', 'franks', 'ground', 'uncured'],
    12: ['dressing', 'mix', 'sauce', 'butter', 'oil', 'seasoning', 'sugar', 'honey', 'salsa', 'chocolate', 'ground', 'salt'],
    13: ['cereal', 'granola', 'oatmeal', 'gluten', 'honey', 'mix', 'pancake', 'cinnamon', 'chocolate', 'grain', 'instant'],
    14: ['soup', 'beans', 'bean', 'tomatoes', 'tomato', 'vegetable', 'tuna', 'water', 'whole', 'white', 'sauce'],
    15: ['cheese', 'yogurt', 'milk', 'fat', 'greek', 'vanilla', 'cheddar', 'lowfat', 'strawberry', 'cream', 'berry', 'original'],
    16: ['scent', 'cleaner', 'detergent', 'laundry', 'liquid', 'fresh', 'paper', 'bags', 'ultra', 'dish', 'fabric', 'lavender'],
    17: ['baby', 'food', 'stage', 'diapers', 'apple', 'banana', 'size', 'foods', 'food', 'yogurt', 'fruit', 'wipes'],
    18: ['chocolate', 'bar', 'chips', 'chip', 'dark', 'cookies', 'cookie', 'cracker', 'crackers', 'salt', 'butter', 'sea'],
    19: ['humms', 'hum', 'turkey', 'chicken', 'salad', 'roasted', 'roast', 'breast', 'deli', 'tofu', 'salami', 'dip'],
    20: ['yogurt', 'chocolate', 'cheese', 'strawberry', 'apple', 'chicken', 'fruit', 'vanilla', 'original', 'cream', 'potato']
}

# One feature per department: the fraction of a product's tokens that are keywords of that
# department. Column names are collected so they can be added to the model's feature list.
importance_feature_names = []
for k, v in important_words.items():
    rate_col = "important_" + str(k) + "_rate"
    keywords = set(v)  # O(1) membership tests
    importance_feature_names.append(rate_col)
    # `keywords` is bound as a default argument so the lambda does not depend on the loop variable.
    # A product whose token list is empty gets 0.0 instead of raising ZeroDivisionError.
    df[rate_col] = df['product_name'].apply(
        lambda words, keywords=keywords: sum(w in keywords for w in words) / len(words) if words else 0.0
    )

In [None]:
## Load pretrained word vectors, convert the words of each product_name to vectors and average them to get one feature vector per product_id

## Loading the .vec file with gensim is slow, so a pickle saved from another notebook is used instead
## NOTE(review): unpickling executes arbitrary code - only load this file from a trusted source.
model = pd.read_pickle("../input/ykc-cup-2nd-save-fasttext/fasttext_gensim_model.pkl") 

## Loading from the .vec file with gensim instead (takes about 5 minutes)
# model = gensim.models.KeyedVectors.load_word2vec_format('../input/ykc-2nd/wiki-news-300d-1M.vec/wiki-news-300d-1M.vec')

from collections import defaultdict
# word -> number of times no vector could be found for it (filled in by to_vec)
unused_words = defaultdict(int)
lemmatizer = nltk.WordNetLemmatizer() # lemmatization
def to_vec(x, model):
    """
    Average the word vectors of the tokens in `x` (SWEM average pooling) and L2-normalise.

    For each token the vector is looked up in `model`; if the token is not in the vocabulary
    its lemmatised form is tried. Tokens for which neither lookup succeeds are skipped and
    counted in the module-level `unused_words`.

    :param x: iterable of tokens
    :param model: word-vector model exposing `vector_size` and `model[word]` lookup
    :return: 1-D numpy array of length `model.vector_size`; all zeros when no token has a vector
    """
    cnt = 0  # number of tokens that contributed a vector
    v = np.zeros(model.vector_size)

    for w in x:
        # Lemmatising is slow, so it is only attempted for words missing from the vocabulary.
        try:
            v += model[w]  # the word itself is in the pretrained vocabulary
        except KeyError:
            try:
                # otherwise check whether the lemmatised form is in the vocabulary
                v += model[lemmatizer.lemmatize(w)]
            except LookupError:
                # KeyError (still out of vocabulary) is a LookupError, as is the error NLTK
                # raises when its WordNet data is unavailable; either way the word is skipped.
                unused_words[w] += 1  # remember the words that had no vector
                continue
        cnt += 1

    v /= cnt if cnt > 0 else 1  # SWEM average-pooling
    v = v / (np.sqrt(np.sum(v ** 2)) + 1e-16)  # normalise to unit length (epsilon avoids 0/0)
    return v

# One pooled word vector per row, stacked into a 2-D array (one row per product).
vecs = df["product_name"].apply(lambda x : to_vec(x, model))
vecs = np.vstack(vecs)
# One column name per vector dimension; reused later when the feature list is assembled.
fasttext_pretrain_cols = [f"fasttext_pretrain_vec{k}" for k in range(vecs.shape[1])]
vec_df = pd.DataFrame(vecs, columns=fasttext_pretrain_cols)
# Column-wise concat aligns on the index; vec_df has a default RangeIndex.
# NOTE: re-running this cell appends the same columns to df a second time.
df = pd.concat([df, vec_df], axis = 1)
df.head()

In [None]:
sorted(unused_words.items(), key=lambda x: x[1], reverse = True)[:100]

## Neural Network

In [None]:
# config
import torch
import torch.nn as nn
!pip install skorch
import skorch
from skorch import NeuralNetClassifier
from skorch.callbacks import Callback, Checkpoint, EarlyStopping
torch.manual_seed(42)

class MLPModel(nn.Module):
    """
    Feed-forward classifier with two hidden layers that outputs class probabilities.

    Layout: Dropout -> Linear(num_features, n_hid) -> ReLU -> BatchNorm
            -> Dropout -> Linear(n_hid, n_hid // 4) -> ReLU -> BatchNorm
            -> Dropout -> Linear(n_hid // 4, 21) -> Softmax.

    :param num_features: size of the input feature vector
    :param dropout: dropout probability used before every linear layer
    :param n_hid: width of the first hidden layer (the second is n_hid // 4)
    """

    def __init__(self, num_features, dropout=0.25, n_hid=128):
        super().__init__()
        n_hid_small = n_hid // 4
        layers = [
            nn.Dropout(dropout),
            nn.Linear(num_features, n_hid),
            nn.ReLU(),
            nn.BatchNorm1d(n_hid),
            nn.Dropout(dropout),
            nn.Linear(n_hid, n_hid_small),
            nn.ReLU(),
            nn.BatchNorm1d(n_hid_small),
            nn.Dropout(dropout),
            nn.Linear(n_hid_small, 21),  # 21 class
        ]
        self.model = nn.Sequential(*layers)
        self.softmax = nn.Softmax(dim=-1)

        # Kaiming-normal weights and zero biases for every linear layer.
        for layer in self.model:
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.kaiming_normal_(layer.weight)
            nn.init.constant_(layer.bias, 0)

    def forward(self, input_tensor):
        """Return the softmax over the 21 output units for each row of `input_tensor`."""
        logits = self.model(input_tensor)
        return self.softmax(logits)


import torch.nn.functional as F


# https://github.com/kefirski/pytorch_Highway/blob/master/highway/highway.py
class Highway(nn.Module):
    """
    Stack of gated layers: each layer mixes a non-linear and a linear transform of its
    input, weighted element-wise by a sigmoid gate.
    """

    def __init__(self, size: int, num_layers: int, f: torch.nn.functional):
        """
        :param size: input and output width of every linear layer
        :param num_layers: number of gated layers
        :param f: activation applied to the non-linear branch (any callable on tensors)
        """

        super().__init__()
        self.num_layers = num_layers

        def linear_stack():
            return nn.ModuleList(nn.Linear(size, size) for _ in range(num_layers))

        # Created in this order so parameter initialisation consumes the RNG as before.
        self.nonlinear = linear_stack()
        self.linear = linear_stack()
        self.gate = linear_stack()
        self.f = f

    def forward(self, x):
        """
        :param x: tensor with shape of (batch_size, size)
        :return: tensor with shape of (batch_size, size)
        """

        for gate_fc, nonlinear_fc, linear_fc in zip(self.gate, self.nonlinear, self.linear):
            t = torch.sigmoid(gate_fc(x))
            transformed = self.f(nonlinear_fc(x))
            carried = linear_fc(x)
            x = t * transformed + (1 - t) * carried

        return x


class CNNModel(nn.Module):
    """
    Multi-width convolutional classifier: parallel convolutions, max-over-time pooling,
    a highway network, dropout and a linear layer producing log-probabilities for 21 classes.
    """

    def __init__(self, num_features: int,
                 cnn_filter_sizes: list=[1,3,5,10], cnn_num_filters: list=[100,200,300,400],
                 highway_layers_num: int = 1, dropout: float = 0.5):
        """
        :param num_features: width of each input row (the embedding dimension)
        :param cnn_filter_sizes: filter heights of the CNNs (number of rows each filter spans)
        :param cnn_num_filters: number of filters for each filter size (same length as cnn_filter_sizes)
        :param highway_layers_num: number of highway network layers
        :param dropout: dropout rate applied before the output layer
        :raises ValueError: if the two filter lists differ in length
        """

        super().__init__()
        if len(cnn_filter_sizes) != len(cnn_num_filters):
            # zip() below would silently drop the extras while sum(cnn_num_filters) would not,
            # giving a shape mismatch only at forward time.
            raise ValueError("cnn_filter_sizes and cnn_num_filters must have the same length")
        # Copies, so the instance never aliases the shared default lists.
        self.cnn_filter_sizes = list(cnn_filter_sizes)
        self.cnn_num_filters = list(cnn_num_filters)

        self.emb_dim = num_features

        # Each kernel covers the full embedding width, so the last output dimension is 1.
        self.convs = nn.ModuleList([
            nn.Conv2d(1, n, (f, self.emb_dim))  # nn.Conv1d(n, f, emb_dim)
            for (n, f) in zip(self.cnn_num_filters, self.cnn_filter_sizes)
        ])
        self.highway = Highway(sum(self.cnn_num_filters),
                               highway_layers_num, nn.ReLU())
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(sum(self.cnn_num_filters), 21)  # 21 = class num
        # Normalise over the class dimension. dim=0 normalised across the batch, which made
        # each sample's output depend on the other samples in the batch.
        self.softmax = nn.LogSoftmax(dim=1)
        self._init_parameters()

    def forward(self, x) -> torch.Tensor:
        """
        Forward propagation algorithm.
        :param x: embeddings; the Conv2d layers take one input channel and a kernel as wide
                  as the embedding, i.e. (batch_size, 1, seq_len, emb_dim)
        :return: log-probabilities of shape (batch_size, 21)
        """

        convs = [F.relu(conv(x).squeeze(3))
                 for conv in self.convs]
        # max over the remaining sequence dimension -> (batch_size, num_filters)
        pools = [F.max_pool1d(conv, conv.size(2)).squeeze(2)
                 for conv in convs]
        pred = torch.cat(pools, 1)
        pred = self.highway(pred)
        return self.softmax(self.linear(self.dropout(pred)))

    def _init_parameters(self):
        """Re-initialise every parameter uniformly in [-0.05, 0.05]."""
        for param in self.parameters():
            param.data.uniform_(-0.05, 0.05)


## train

In [None]:
## Names of the features used for prediction
features = fasttext_pretrain_cols + [
    "order_rate", "order_dow_mode", "order_hour_of_day_mode",  # features provided in the original data
    'name_num', 'in_num',  # features about the product name
    'dow_rate', 'day_rate','order_rate_basis'  # features derived from the other columns
] + importance_feature_names  # per-department rate of important keywords in the product name
target = "department_id" ## prediction target
n_split = 7 ## number of cross-validation folds

In [None]:
## Split the combined frame back into train and test (test rows have no target).
# .copy() gives independent frames, so later column assignments (e.g. scaling) do not act on
# a slice of df and do not raise SettingWithCopyWarning.
# train's index is reset so that row labels equal row positions, which is what the
# positional indices produced by KFold.split assume when used with .loc.
train = df[~df[target].isna()].copy().reset_index(drop=True)
test = df[df[target].isna()].copy()

In [None]:
from sklearn import preprocessing
# Standardise every feature column: the scaler is fitted on the training rows only and the
# same transform is then applied to the test rows.
scaler = preprocessing.StandardScaler()
# NOTE: if train / test are slices of another frame, pandas may emit SettingWithCopyWarning here.
train[features] = scaler.fit_transform(train[features])
test[features] = scaler.transform(test[features])

In [None]:
## cross validation
# For every fold an LGBM model and an MLP are trained on the same split. Both models'
# validation scores / predictions and test-set probabilities are appended to the same lists
# (LGBM first, then MLP), so downstream cells see 2 * n_split entries.
from sklearn.ensemble import VotingClassifier  # unused here; import kept from the original cell

preds_test = []  # test-set class probabilities, one matrix per model per fold
scores = []      # {"logloss", "f1_micro"} per model per fold
preds = []       # predicted validation labels per model per fold
anss = []        # matching ground-truth validation labels
kfold = KFold(n_splits=n_split, shuffle = True, random_state=42)

# Use the GPU when one is available, otherwise fall back to the CPU instead of failing on a
# hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Loop-invariant LGBM hyper-parameters.
lgbm_params = {
    'n_estimators': 700,
    'objective': 'multiclass',
    "boosting_type": "gbdt",
    "importance_type": "split",
    "random_state": 42,
    'num_leaves': 225,
    'learning_rate': 0.04689606818793407,
    'class_weight': None,
    'min_child_samples': 98,
    'subsample': 0.44354721466773056,
    'subsample_freq': 6,
    'colsample_bytree': 0.7377347290625655,
    'reg_alpha': 1.0160018949956453,
    'reg_lambda': 1.6781461339752908,
    'n_jobs': 2
}

# The test matrix does not change between folds.
x_test = test[features].to_numpy().astype(np.float32)

for i_fold, (train_idx, valid_idx) in enumerate(kfold.split(train)):
    print(f"--------fold {i_fold}-------")

    # KFold.split yields row positions, so rows are selected by position (.iloc) rather than
    # by label; this stays correct even if train's index is not 0..n-1.
    ## train data
    x_tr = train.iloc[train_idx][features]
    y_tr = train.iloc[train_idx][target]

    ## valid data
    x_va = train.iloc[valid_idx][features]
    y_va = train.iloc[valid_idx][target]
    x_va_np = x_va.to_numpy().astype(np.float32)

    ## train LGBM model
    lgbm_model = LGBMClassifier(**lgbm_params)
    lgbm_model.fit(x_tr, y_tr, eval_set=(x_va, y_va), early_stopping_rounds=10)
    ## predict on valid
    pred_val = lgbm_model.predict_proba(x_va_np)

    ## evaluate
    score = {
        "logloss"  : log_loss(y_va, pred_val),
        "f1_micro" : f1_score(y_va, np.argmax(pred_val, axis = 1), average = "micro")
    }
    print(score)
    scores.append(score)
    preds.append(np.argmax(pred_val, axis = 1))
    anss.append(y_va)

    ## predict on test
    pred_test = lgbm_model.predict_proba(x_test)
    preds_test.append(pred_test)

    ## train the MLP on the same split
    # NOTE(review): Checkpoint() saves the best epoch to disk but it is never reloaded, so the
    # predictions below come from the last epoch before early stopping - confirm this is intended.
    neural_model = NeuralNetClassifier(
                    MLPModel,
                    max_epochs=200,
                    lr=0.005,
                    warm_start=True,
                    optimizer=torch.optim.Adam,
                    iterator_train__shuffle=True,
                    callbacks=[Checkpoint(), EarlyStopping(patience=10)],
                    module__num_features=train[features].shape[1],
                    # module__n_hid=512,
                    module__dropout=0.25,
                    iterator_valid__batch_size=256,
                    device=device
                )
    neural_model.fit(x_tr.to_numpy().astype(np.float32), y_tr.to_numpy().astype(np.int64))
    ## predict on valid
    pred_val = neural_model.predict_proba(x_va_np)

    ## evaluate
    score = {
        "logloss"  : log_loss(y_va, pred_val),
        "f1_micro" : f1_score(y_va, np.argmax(pred_val, axis = 1), average = "micro")
    }
    print(score)
    scores.append(score)
    preds.append(np.argmax(pred_val, axis = 1))
    anss.append(y_va)

    ## predict on test
    pred_test = neural_model.predict_proba(x_test)
    preds_test.append(pred_test)

In [None]:
## evaluate for each class
print(classification_report(np.concatenate(anss), np.concatenate(preds)))

In [None]:
score_df = pd.DataFrame(scores)
score_df

In [None]:
score_df.mean()

## submission

In [None]:
## Average the test-set probabilities collected in preds_test across all entries and use that as the final prediction
pred_test_final = np.array(preds_test).mean(axis = 0)
# argmax gives the column index of the most probable class
# (assumes column index == department_id - TODO confirm against the models' class order)
pred_test_final = np.argmax(pred_test_final, axis = 1)

In [None]:
sub["department_id"] = pred_test_final
sub.to_csv("submission.csv", index = False)
sub.head()

In [None]:
# Split-count feature importances of the LGBM model fitted in the last CV fold.
# `model` in this notebook is the pretrained word-vector model, which has no
# `feature_importances_`; the LGBM estimator from the CV cell is `lgbm_model`.
importance = pd.DataFrame(
    lgbm_model.feature_importances_,
    index=features,
    columns=['importance']
)

# Plot the 50 most important features, most important first.
importance = importance.sort_values('importance', ascending=False)
importance.head(50).plot.bar()

## Hyper Parameter Tuning

In [None]:
# configs
import optuna
import json
import datetime as dt

n_trials = 50

In [None]:
def param_grids_to_params(trial: "optuna.Trial", param_grids: dict):
    """
    Turn a search-space description into concrete parameters for one Optuna trial.

    Every value in `param_grids` that is not a list is passed through unchanged. A list
    value is sampled from the trial:

    - exactly two bools   -> categorical over their string forms, converted back to bool
    - exactly two values starting with an int   -> integer in [v[0], v[1]]
    - exactly two values starting with a float  -> float in [v[0], v[1]]
    - anything else (other two-element lists, or any other length) -> categorical over the list

    :param trial: the Optuna trial (only its suggest_* methods are used)
    :param param_grids: parameter name -> fixed value, or list describing the search space
    :return: dict of parameter name -> concrete value
    :raises ValueError: if a search-space list is empty
    """
    params = {}
    for k, v in param_grids.items():
        # set static parameters
        if not isinstance(v, list):
            params[k] = v
            continue

        # set optimizing target parameters
        if not v:
            raise ValueError("empty search space for parameter {!r}".format(k))
        if len(v) != 2:
            # Only a two-element list can be a (low, high) range; single-element lists used
            # to fall through to v[1] and raise IndexError.
            params[k] = trial.suggest_categorical(k, v)
        elif all(isinstance(s, bool) for s in v):
            # Sampled as "True"/"False" strings (as before) and mapped back to bool here;
            # this replaces a call to strtobool, which was never imported.
            params[k] = trial.suggest_categorical(k, [str(p) for p in v]) == "True"
        elif type(v[0]) == int:
            params[k] = trial.suggest_int(k, v[0], v[1])
        elif type(v[0]) == float:
            # suggest_uniform is deprecated in recent Optuna releases; prefer suggest_float
            # and only fall back for versions that do not have it.
            suggest_float = getattr(trial, "suggest_float", None) or trial.suggest_uniform
            params[k] = suggest_float(k, v[0], v[1])
        else:
            params[k] = trial.suggest_categorical(k, v)
    return params


def objective(trial: optuna.Trial):
    """
    Optuna objective for LGBM: micro-F1 on a single validation fold.

    Hyper-parameters are sampled from `param_grids` via param_grids_to_params; the model is
    trained on the first split of a shuffled KFold over `train` and scored on the held-out part.
    Only the first fold is evaluated (the original loop broke out after one iteration).

    :param trial: the current Optuna trial
    :return: micro-averaged F1 on the validation fold (to be maximised)
    """
    param_grids = {
            "boosting_type": "gbdt",
            "num_leaves": [2, 256],
            "max_depth": -1,
            "learning_rate": [0.005, 0.1],
            "n_estimators": 500,
            "subsample_for_bin": 200000,
            "objective": "multiclass",
            "class_weight": ["balanced", None],
            "min_split_gain": 0.0,
            "min_child_weight": 0.001,
            "min_child_samples": [5, 100],
            "subsample": [0.4, 1.0],
            "subsample_freq": [1, 7],
            "colsample_bytree": [0.65, 1.0],
            "reg_alpha": [1e-8, 10.0],
            "reg_lambda": [1e-8, 10.0],
            "random_state": 0,
            "n_jobs": 2,
            "silent": True,
            "importance_type": "split",
        }
    params = param_grids_to_params(trial, param_grids)
    kfold = KFold(n_splits=n_split, shuffle = True, random_state=42)

    # First split only; the fixed random_state makes it the same split for every trial.
    train_idx, valid_idx = next(kfold.split(train))

    # KFold yields row positions, hence .iloc rather than label-based .loc.
    ## train data
    x_tr = train.iloc[train_idx][features]
    y_tr = train.iloc[train_idx][target]

    ## valid data
    x_va = train.iloc[valid_idx][features]
    y_va = train.iloc[valid_idx][target]

    model = LGBMClassifier(**params)
    model.fit(x_tr, y_tr, eval_set=(x_va, y_va), early_stopping_rounds=10)

    ## predict on valid
    pred_val = model.predict_proba(x_va)
    return f1_score(y_va, np.argmax(pred_val, axis = 1), average = "micro")

In [None]:
# Run the hyper-parameter search, maximising the value returned by objective().
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=n_trials)
# Only the sampled (tuned) parameters are returned here, not the static ones.
params = study.best_trial.params
best_score = study.best_value
# Save the best parameters to a timestamped JSON file in the working directory.
now = dt.datetime.now()
json_name = "tuned_{0:%Y%m%d%H%M%S}_{1}.json".format(now, "lgbm")
with open(json_name, "w") as f:
    json.dump(params, f)

In [None]:
from sklearn.model_selection import GridSearchCV
import json

# Grid search over the MLP hidden size (7-fold CV, up to 100 epochs per fit).
neural_model = NeuralNetClassifier(
                    MLPModel,
                    max_epochs=100,
                    lr=0.01,
                    warm_start=True,
                    optimizer=torch.optim.Adam,
                    iterator_train__shuffle=True,
                    module__num_features=train[features].shape[1],
                    # use the GPU when available instead of failing on a hard-coded 'cuda'
                    device='cuda' if torch.cuda.is_available() else 'cpu'
                )
# deactivate skorch-internal train-valid split and verbose logging
neural_model.set_params(train_split=False, verbose=0)
params = {
    'lr': [0.01],
    'module__dropout': [0.25],
    'module__n_hid': [128, 256, 512],
}
gs = GridSearchCV(neural_model, params, cv=7, scoring='f1_micro')

gs.fit(train[features].to_numpy().astype(np.float32), train[target].to_numpy().astype(np.int64))
print("best score: {:.3f}, best params: {}".format(gs.best_score_, gs.best_params_))
# Persist the winning parameter combination.
with open('neural_tuned_params.json', 'w') as f:
    json.dump(gs.best_params_, f)

In [None]:
from sklearn.model_selection import GridSearchCV
import json

# Quick variant of the MLP grid search (3-fold CV, 10 epochs per fit).
# NOTE(review): this writes to the same 'neural_tuned_params.json' as the 7-fold search, so
# running both cells leaves only this cell's result on disk - confirm which one should win.
neural_model = NeuralNetClassifier(
                    MLPModel,
                    max_epochs=10,
                    lr=0.01,
                    warm_start=True,
                    optimizer=torch.optim.Adam,
                    iterator_train__shuffle=True,
                    module__num_features=train[features].shape[1],
                    # use the GPU when available instead of failing on a hard-coded 'cuda'
                    device='cuda' if torch.cuda.is_available() else 'cpu'
                )
# deactivate skorch-internal train-valid split and verbose logging
neural_model.set_params(train_split=False, verbose=0)
params = {
    'lr': [0.01],
    'module__dropout': [0.25],
    'module__n_hid': [128, 256, 512],
}
gs = GridSearchCV(neural_model, params, cv=3, scoring='f1_micro')

gs.fit(train[features].to_numpy().astype(np.float32), train[target].to_numpy().astype(np.int64))
print("best score: {:.3f}, best params: {}".format(gs.best_score_, gs.best_params_))
# Persist the winning parameter combination.
with open('neural_tuned_params.json', 'w') as f:
    json.dump(gs.best_params_, f)