## Import

In [1]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

## Settings

In [2]:
# Directory that holds the competition CSV files. The default is the location
# the notebook was originally written against; set the ATMA10_DATA_DIR
# environment variable to run it on another machine without editing the code.
DATA_DIR = os.environ.get("ATMA10_DATA_DIR", "/home/knikaido/work/atma10/data")

# Input tables and the sample-submission template, all resolved under DATA_DIR.
TRAIN_PATH = f"{DATA_DIR}/train.csv"
TEST_PATH = f"{DATA_DIR}/test.csv"
SUB_PATH = f"{DATA_DIR}/atmacup10__sample_submission.csv"

In [3]:
# Reproducibility seed and cross-validation layout.
SEED = 42
N_SPLITS = 5
SHUFFLE = True

# Booster hyper-parameters; passed to lgb.train as its params mapping.
LGBM_PARAMS = dict(
    num_leaves=32,
    min_data_in_leaf=64,
    objective='regression',
    max_depth=-1,
    learning_rate=0.05,
    boosting="gbdt",
    bagging_freq=1,
    bagging_fraction=0.8,
    bagging_seed=SEED,
    verbosity=-1,
    reg_alpha=0.1,
    reg_lambda=0.3,
    colsample_bytree=0.7,
    metric="rmse",
    num_threads=6,
)

# Extra keyword arguments that are unpacked into the lgb.train call.
LGBM_FIT_PARAMS = dict(
    num_boost_round=10000,
    early_stopping_rounds=200,
    verbose_eval=200,
)

# File name the final submission CSV is written to.
SAVE_TEST_SUB_PATH = "sub01.csv"

## Utility

In [4]:
def seed_everything(seed: int):
    """Seed the random sources this notebook touches.

    Sets the PYTHONHASHSEED environment variable and seeds both Python's
    ``random`` module and NumPy's legacy global generator with ``seed``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
# Seed the RNGs once, using the notebook-wide SEED constant, before any data is loaded.
seed_everything(SEED)

## Feature Engineering

In [5]:
# Read the raw train and test tables from the configured CSV paths.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Remember how many rows belong to train so the combined frame can be split again later.
len_train = train.shape[0]

# Regression target on the log1p scale (undone with expm1 when the submission is built).
y = train["likes"].pipe(np.log1p)
train

Unnamed: 0,object_id,art_series_id,title,description,long_title,principal_maker,principal_or_first_maker,sub_title,copyright_holder,more_title,acquisition_method,acquisition_date,acquisition_credit_line,dating_presenting_date,dating_sorting_date,dating_period,dating_year_early,dating_year_late,likes
0,0011d6be41612ec9eae3,93c092ba70beab248f31,The Avenue of Birches,,"The Avenue of Birches, Jan Hackaert, 1660 - 1685",Jan Hackaert,Jan Hackaert,h 66.5cm × w 53.7cm × t 2.5cm × d 4.7cm,,The Avenue of Birches,purchase,1808-01-01T00:00:00,,1660 - 1685,1660.0,17,1660.0,1685.0,48
1,0012765f7a97ccc3e9e9,95c14fb11c54281ad7e0,Struik in bloei,,"Struik in bloei, Richard Tepe (attributed to),...",Richard Tepe,Richard Tepe,h 165mm × w 223mm,erven Richard Tepe,Struik in bloei,purchase,2000-01-01T00:00:00,,c. 1900 - c. 1930,1900.0,19,1900.0,1930.0,2
2,0017be8caa87206532cb,4c406890d208fe01f8fb,Portret van een vrouw,"Portret van eenvrouw, zittend naast een tafel.","Portret van een vrouw, Tresling & Comp., 1860 ...",Tresling & Comp.,Tresling & Comp.,h 87mm × w 56mm,,Portret van een vrouw,gift,2007-01-01T00:00:00,"Gift of M.M. Boom, Leiden",1860 - 1880,1860.0,19,1860.0,1880.0,5
3,00181d86ff1a7b95864e,fa0891535500a4973db2,A St Bernard Dog,"Een sint-bernardshond, staand in een landschap...","A St Bernard Dog, Bernard te Gempt, c. 1850 - ...",Bernard te Gempt,Bernard te Gempt,h 179cm × w 248cm × t 4cm,,A St Bernard Dog,bequest,1881-01-01T00:00:00,"J. Hilman Bequest, Amsterdam",c. 1850 - c. 1879,1850.0,19,1850.0,1879.0,100
4,001c52ae28ec106d9cd5,8065ed5063c5887e677d,Woelige zee met zeilschepen,Woelige zee met zeilschepen.,"Woelige zee met zeilschepen, anonymous, 1825 -...",anonymous,anonymous,h 13cm × w 17.5cm × d 0.7cm,,Woelige zee met zeilschepen,unknown,1971-01-01T00:00:00,,1825 - 1874,1825.0,19,1825.0,1874.0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12021,ffedf8af4fd5b3873164,2e4695e7f3260d52b3e6,De schilder H.W. Mesdag voor een doek,,"De schilder H.W. Mesdag voor een doek, Delboy ...",Delboy & Baer,Delboy & Baer,h 90mm × w 140mm,,De schilder H.W. Mesdag voor een doek H W Mesdag,transfer,1994-01-01T00:00:00,,1900 - 1920,1900.0,19,1900.0,1920.0,4
12022,ffee34705ea44e1a0f79,7aa656a9ef243d93d009,"Kaart van de streek tussen Mannheim en Landau,...","Kaart van de streek tussen Mannheim, Speyer en...","Kaart van de streek tussen Mannheim en Landau,...",Abraham Allard,Abraham Allard,h 245mm × w 278mm,,"Kaart van de streek tussen Mannheim en Landau,...",purchase,1881-01-01T00:00:00,,1701 - 1714,1701.0,18,1701.0,1714.0,0
12023,ffefbe1faf771aa4f790,e79c2e74ed17533a7e56,Storm op het IJ aan het Blauwhoofd van Amsterd...,Zeilschepen in de problemen op het IJ ter hoog...,Storm op het IJ aan het Blauwhoofd van Amsterd...,Noach van der Meer (II),Noach van der Meer (II),h 217mm × w 306mm,,Storm op het IJ aan het Blauwhoofd van Amsterd...,transfer,1887-01-01T00:00:00,,1778,1778.0,18,1778.0,1778.0,0
12024,fff08e76cbb969eaddc7,510358b74c1104edbbbd,"Het rad van fortuin, ca. 1689","Het rad van fortuin, ca. 1689. Spotprent op de...","Het rad van fortuin, ca. 1689, Romeyn de Hoogh...",Romeyn de Hooghe,Romeyn de Hooghe,h 560mm × w 405mm,,"Het rad van fortuin, ca. 1689 'T Hedendaags Ra...",purchase,1881-01-01T00:00:00,,1689 - 1690,1689.0,17,1689.0,1690.0,14


In [6]:
# Columns that are label-encoded before modelling.
cat_cols = [
    'principal_maker',
    'principal_or_first_maker',
    'copyright_holder',
    'acquisition_method',
    'acquisition_credit_line',
]

# Full feature set: the categorical columns followed by the dating columns.
cols = cat_cols + [
    'dating_period',
    'dating_year_early',
    'dating_year_late',
]

In [10]:
# Stack the selected train columns on top of the test columns so both parts
# are encoded together; the index is rebuilt as 0..n-1.
whole_df = pd.concat(
    [frame[cols] for frame in (train, test)]
).reset_index(drop=True)
whole_df

Unnamed: 0,principal_maker,principal_or_first_maker,copyright_holder,acquisition_method,acquisition_credit_line,dating_period,dating_year_early,dating_year_late
0,Jan Hackaert,Jan Hackaert,,purchase,,17,1660.0,1685.0
1,Richard Tepe,Richard Tepe,erven Richard Tepe,purchase,,19,1900.0,1930.0
2,Tresling & Comp.,Tresling & Comp.,,gift,"Gift of M.M. Boom, Leiden",19,1860.0,1880.0
3,Bernard te Gempt,Bernard te Gempt,,bequest,"J. Hilman Bequest, Amsterdam",19,1850.0,1879.0
4,anonymous,anonymous,,unknown,,19,1825.0,1874.0
...,...,...,...,...,...,...,...,...
24029,Henry W. Taunt,Henry W. Taunt,,transfer,,19,1871.0,1871.0
24030,John Jabez Edwin Mayall,John Jabez Edwin Mayall,,transfer,,19,1851.0,1885.0
24031,Francis Frith,Francis Frith,,transfer,,19,1856.0,1859.0
24032,Henry W. Taunt,Henry W. Taunt,,transfer,,19,1871.0,1871.0


In [11]:
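# NOTE: disabled experiment, kept for reference. As written it would be a
# no-op: every value of a column is contained in that column's own unique().
# for c in cat_cols:
#     whole_df.loc[~whole_df[c].isin(whole_df[c].unique()),c] = np.nan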

In [12]:
# Replace each categorical column with integer codes. Values are cast to
# str first, so missing values become the literal string "nan" and get
# their own code.
for c in cat_cols:
    le = LabelEncoder()
    as_text = whole_df[c].astype(str)
    whole_df[c] = le.fit_transform(as_text)

In [18]:
# Cut the encoded frame back into its train part (first len_train rows) and
# test part (the remainder), each with a fresh 0-based index.
train, test = (
    part.reset_index(drop=True)
    for part in (whole_df.iloc[:len_train], whole_df.iloc[len_train:])
)
train

Unnamed: 0,principal_maker,principal_or_first_maker,copyright_holder,acquisition_method,acquisition_credit_line,dating_period,dating_year_early,dating_year_late
0,1782,1793,42,6,644,17,1660.0,1685.0
1,2880,2899,39,6,644,19,1900.0,1930.0
2,3063,3082,42,2,328,19,1860.0,1880.0
3,404,406,42,0,457,19,1850.0,1879.0
4,3246,3267,42,8,644,19,1825.0,1874.0
...,...,...,...,...,...,...,...,...
12021,736,739,42,7,644,19,1900.0,1920.0
12022,45,45,42,6,644,18,1701.0,1714.0
12023,2574,2591,42,7,644,18,1778.0,1778.0
12024,2905,2924,42,6,644,17,1689.0,1690.0


In [14]:
# Columns to exclude from the model input (currently none).
drop_cols = []
# Every remaining train column is used as a feature.
features = [name for name in train.columns if name not in drop_cols]
X = train[features]

## Train / Test

In [15]:
def calc_loss(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [16]:

# K-fold cross-validation: train one LightGBM model per fold, collect
# out-of-fold predictions for scoring, and predict the test set with each model.
kf = KFold(n_splits=N_SPLITS,random_state=SEED, shuffle=SHUFFLE)

# One row of test predictions per fold (averaged when the submission is built).
y_test = np.zeros([N_SPLITS, len(test)])
# Out-of-fold prediction slot for every training row. The builtin `float` is
# used because the `np.float` alias was deprecated in NumPy 1.20 and removed
# in NumPy 1.24 (where it raises AttributeError).
oof_pred = np.zeros_like(y, dtype=float)

# Split with K-fold
for i, (train_index, valid_index) in enumerate(kf.split(X, y)):

    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_valid = X.iloc[valid_index]
    y_valid = y.iloc[valid_index]

    # Build the LightGBM datasets
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid)

    # Train
    # NOTE(review): LGBM_FIT_PARAMS passes `early_stopping_rounds` and
    # `verbose_eval` as keyword arguments; these were reportedly removed from
    # lgb.train in LightGBM 4.0 in favour of callbacks — confirm the installed
    # version before upgrading.
    model = lgb.train(LGBM_PARAMS,
                        lgb_train,
                        valid_sets=[lgb_eval, lgb_train],
                        **LGBM_FIT_PARAMS)

    # Predict with the best iteration found by early stopping.
    predict_train = model.predict(X_train, num_iteration=model.best_iteration)
    predict_eval = model.predict(X_valid, num_iteration=model.best_iteration)
    oof_pred[valid_index] = predict_eval
    
    rmse_train = calc_loss(y_train, predict_train)
    rmse_eval = calc_loss(y_valid, predict_eval)
    print(f'epoch: {i} RMSE_train: {rmse_train}, RMSE_eval: {rmse_eval}')
    
    # Store this fold's test-set predictions in its own row.
    y_test[i] = model.predict(test, num_iteration=model.best_iteration)
    
# Overall score over all out-of-fold predictions.
oof_score = calc_loss(y, oof_pred)
print(f'RMSE_oof: {oof_score}')

Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 1.20413	valid_0's rmse: 1.29858
[400]	training's rmse: 1.12531	valid_0's rmse: 1.26962
[600]	training's rmse: 1.07302	valid_0's rmse: 1.26008
[800]	training's rmse: 1.03328	valid_0's rmse: 1.25341
[1000]	training's rmse: 1.00117	valid_0's rmse: 1.25098
[1200]	training's rmse: 0.974419	valid_0's rmse: 1.2494
[1400]	training's rmse: 0.951647	valid_0's rmse: 1.25007
Early stopping, best iteration is:
[1293]	training's rmse: 0.963241	valid_0's rmse: 1.24747
epoch: 0 RMSE_train: 0.963241178714731, RMSE_eval: 1.2474695677575762
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 1.2007	valid_0's rmse: 1.31569
[400]	training's rmse: 1.12103	valid_0's rmse: 1.29401
[600]	training's rmse: 1.06975	valid_0's rmse: 1.2831
[800]	training's rmse: 1.03027	valid_0's rmse: 1.27845
[1000]	training's rmse: 0.998841	valid_0's rmse: 1.2738
[1200]	training's rmse: 0.972891	valid_0's rmse: 1.

In [17]:
# Average the per-fold test predictions, then invert the log1p target transform.
y_test_sub = np.expm1(y_test.mean(axis=0))

# Fill the sample-submission template with the predictions.
sub = pd.read_csv(SUB_PATH)
sub["likes"] = y_test_sub

# Non-positive predictions are set to 0 before the file is written.
non_positive = sub["likes"] <= 0
sub.loc[non_positive, "likes"] = 0
sub.to_csv(SAVE_TEST_SUB_PATH, index=False)