In [14]:
# lightgbmを試す。

import pandas as pd
import numpy as np
import re
from glob import glob
from tqdm import tqdm
import optuna

import optuna.integration.lightgbm as lgb

from lightgbm import early_stopping
from lightgbm import log_evaluation

from sklearn.model_selection import train_test_split
from wandb.lightgbm import wandb_callback
import wandb
from sklearn.model_selection import RepeatedKFold

wandb.init(project="narou", entity="ryotoitoi")

### ファイル読み込み・データ確認

df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')
sub_df = pd.read_csv('../data/sample_submission.csv')

df_train_num = df_train.select_dtypes("int")
df_test_num = df_test.select_dtypes("int")

In [2]:
import numpy as np
import pandas as pd
train_title = np.load("../npy/train_title_roberta.npy")
train_story = np.load("../npy/train_story_roberta.npy")
train_key = np.load("../npy/train_keyword_roberta.npy")
test_title = np.load("../npy/test_title_roberta.npy")
test_story = np.load("../npy/test_story_roberta.npy")
test_key = np.load("../npy/test_keyword_roberta.npy")

print(train_title.shape)
print(train_story.shape)
print(train_key.shape)
print(test_title.shape)
print(test_story.shape)
print(test_key.shape)

(40000, 768)
(40000, 768)
(40000, 768)
(8522, 768)
(8522, 768)
(8522, 768)


In [21]:
key = np.concatenate([train_key, test_key])
title = np.concatenate([train_title, test_title])
story = np.concatenate([train_story, test_story])
print(key.shape)
print(title.shape)
print(story.shape)

(48522, 768)
(48522, 768)
(48522, 768)


In [23]:
import umap

um = umap.UMAP(random_state=42)
um.fit(key)
train_key_emb = um.fit_transform(train_key)
test_key_emb = um.fit_transform(test_key)

um = umap.UMAP(random_state=42)
um.fit(title)
train_title_emb = um.fit_transform(train_title)
test_title_emb = um.fit_transform(test_title)

um = umap.UMAP(random_state=42)
um.fit(story)
train_story_emb = um.fit_transform(train_story)
test_story_emb = um.fit_transform(test_story)

In [24]:
train_key_df = pd.DataFrame(train_key_emb).rename(columns={0:"key_0", 1:"key_1"})
train_title_df = pd.DataFrame(train_title_emb).rename(columns={0:"title_0", 1:"title_1"})
train_story_df = pd.DataFrame(train_story_emb).rename(columns={0:"story_0", 1:"story_1"})
test_key_df = pd.DataFrame(test_key_emb).rename(columns={0:"key_0", 1:"key_1"})
test_title_df = pd.DataFrame(test_title_emb).rename(columns={0:"title_0", 1:"title_1"})
test_story_df = pd.DataFrame(test_story_emb).rename(columns={0:"story_0", 1:"story_1"})

df_train = pd.concat([df_train_num, train_key_df, train_title_df, train_story_df], axis=1)
df_test = pd.concat([df_test_num, test_key_df, test_title_df, test_story_df], axis=1)

In [25]:
print(df_train.shape)
print(df_test.shape)
display(df_train.head(2))
display(df_test.head(2))

(40000, 20)
(8522, 19)


Unnamed: 0,userid,biggenre,genre,novel_type,end,isstop,isr15,isbl,isgl,iszankoku,istensei,istenni,pc_or_k,fav_novel_cnt_bin,key_0,key_1,title_0,title_1,story_0,story_1
0,9904,4,402,1,0,0,0,0,0,0,0,0,0,1,10.485666,5.483404,8.597751,5.569159,12.719436,13.707171
1,6527,4,401,1,0,0,0,0,0,0,0,0,3,2,18.593077,8.966592,6.469113,4.609012,5.0419,5.404982


Unnamed: 0,userid,biggenre,genre,novel_type,end,isstop,isr15,isbl,isgl,iszankoku,istensei,istenni,pc_or_k,key_0,key_1,title_0,title_1,story_0,story_1
0,952106,99,9903,2,0,0,0,0,0,0,0,0,2,12.606698,4.662037,10.84304,1.529229,-0.544979,12.597836
1,2197141,1,101,2,0,0,1,0,0,0,0,0,2,5.135323,14.957435,11.33322,1.492492,-2.443586,1.2758


In [26]:
train_x = df_train.drop(columns="fav_novel_cnt_bin")
train_y = df_train[["fav_novel_cnt_bin"]]

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size = 0.2)

In [27]:
params = {
    'objective': 'multiclass',
    'num_classes': 5,
    "verbosity": -1,
    'metric': 'multi_logloss',
    "seed": 42
}

train_data = lgb.Dataset(train_x, label=train_y)
val_data = lgb.Dataset(val_x, label=val_y)

cat_cols = ['userid', 'biggenre', 'genre', 'novel_type', 'end', 'isstop', 'isr15', 'isbl', 'isgl', 'iszankoku', 'istensei', 'istenni', 'pc_or_k']


model = lgb.train(
    params,
    train_data, 
    categorical_feature = cat_cols,
    valid_names = ['train', 'valid'],
    valid_sets =[train_data, val_data], 
    verbose_eval = 50,
    callbacks=[wandb_callback(), early_stopping(50), log_evaluation(50)], 
)

val_pred = model.predict(val_x, num_iteration=model.best_iteration)

pred_df = pd.DataFrame(sorted(zip(val_x.index, val_pred, val_y)), columns=['index', 'predict', 'actual'])

feature_imp = pd.DataFrame(sorted(zip(model.feature_importance(), train_x.columns)), columns=['importance', 'feature'])

test_pred = model.predict(df_test, num_iteration=model.best_iteration)
sub_df.iloc[:, 1:] = test_pred
sub_df.to_csv('../output/lgb_emb_test_submission.csv', index=False)

[32m[I 2021-11-12 16:24:29,546][0m A new study created in memory with name: no-name-bfe4ce3f-c78b-4da4-ba65-3192a04e2a7f[0m
New categorical_feature is ['biggenre', 'end', 'genre', 'isbl', 'isgl', 'isr15', 'isstop', 'istenni', 'istensei', 'iszankoku', 'novel_type', 'pc_or_k', 'userid']
[33m[W 2021-11-12 16:24:30,064][0m Trial 0 failed because of the following error: Error('You must call wandb.init() before wandb.log()')
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/usr/local/lib/python3.8/site-packages/optuna/integration/_lightgbm_tuner/optimize.py", line 249, in __call__
    booster = lgb.train(self.lgbm_params, train_set, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/lightgbm/engine.py", line 302, in train
    cb(callback.CallbackEnv(model=booster,
  File "/usr/local/lib/python3.8/site-packages/wandb/integration/lightgbm/__init__.py", line 

Error: You must call wandb.init() before wandb.log()