In [None]:
# os,pandas,numpy
import os
import pandas as pd
import numpy as np

In [None]:
# Fix the random seeds (Python's `random` and NumPy's global RNG) so that
# repeated runs of the notebook draw the same random numbers.
import random

SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Load the competition CSV files.
# The input directory is kept in a single constant so that the notebook can be
# pointed at another location by editing one line only.
INPUT_DIR = "../input/petfinder-pawpularity-score"

train_df = pd.read_csv(f"{INPUT_DIR}/train.csv")
test_df = pd.read_csv(f"{INPUT_DIR}/test.csv")
submission = pd.read_csv(f"{INPUT_DIR}/sample_submission.csv")

# Preview the first rows of the training data
train_df.head()

In [None]:
# Stack the training and test rows into one frame with a fresh 0..n-1 index
all_df = pd.concat([train_df, test_df], sort=False, ignore_index=True)

# Show the target column of the combined frame
all_df["Pawpularity"]

In [None]:
# lightgbm
import lightgbm as lgb

In [None]:
# Load the cross-validation splitter and configure it for 3 folds.
from sklearn.model_selection import KFold

folds = 3
# shuffle=False is KFold's default: the folds are consecutive, unshuffled
# blocks of rows.
kf = KFold(n_splits=folds, shuffle=False)

In [None]:
# LightGBM hyper-parameters: a regression objective with a fixed seed
lgbm_params = dict(
    objective="regression",
    random_seed=1234,
)

In [None]:
# Split the combined frame back apart: rows with a non-null "Pawpularity" are
# treated as training rows, rows without one as test rows.
# .copy() makes both frames independent of all_df, so that adding columns to
# them later does not raise pandas' SettingWithCopyWarning.
has_target = all_df["Pawpularity"].notna()
train_df_le = all_df[has_target].copy()
test_df_le = all_df[~has_target].copy()

# Explanatory variables: every column except the target and "Id"
train_X = train_df_le.drop(["Pawpularity", "Id"], axis=1)
# Target variable
train_Y = train_df_le["Pawpularity"]

train_df,train_X,train_Y

In [None]:
# 平均二乗誤差を出すライブラリをインポート
from sklearn.metrics import mean_squared_error

In [None]:
# Run K-fold cross-validation on the raw target. For every fold the trained
# model, its validation score and its out-of-fold (OOF) predictions are kept.
models = []
rmses = []
oof = np.zeros(len(train_X))

for train_index, val_index in kf.split(train_X):
    X_train, X_valid = train_X.iloc[train_index], train_X.iloc[val_index]
    y_train, y_valid = train_Y.iloc[train_index], train_Y.iloc[val_index]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments of
    # lgb.train() were deprecated in LightGBM 3.3 and removed in 4.0, where
    # they raise a TypeError. (lgb.log_evaluation requires LightGBM >= 3.3.)
    model_lgb = lgb.train(
        lgbm_params,
        lgb_train,
        valid_sets=[lgb_eval],
        num_boost_round=100,
        callbacks=[
            lgb.early_stopping(stopping_rounds=20),  # stop after 20 rounds without improvement
            lgb.log_evaluation(period=10),           # log the validation metric every 10 rounds
        ],
    )

    # Predict the held-out fold with the best iteration found by early stopping
    y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
    # The score is the RMSE between the log of the actual values and the log
    # of the predictions (i.e. it is measured in log space, not on raw values).
    tmp_rmse = np.sqrt(mean_squared_error(np.log(y_valid), np.log(y_pred)))
    print(tmp_rmse)

    models.append(model_lgb)
    rmses.append(tmp_rmse)
    # Store the predictions at the positions of the validation rows
    oof[val_index] = y_pred

In [None]:
# Average the per-fold RMSE values
mean_rmse = sum(rmses) / len(rmses)
mean_rmse

In [None]:
# Put the actual target values next to the out-of-fold predictions and plot
# both series to see how far the current predictions are from reality.
actual_pred_df = train_Y.to_frame(name="actual").assign(pred=oof)

actual_pred_df.plot(figsize=(12, 5))

In [None]:
# For each fold's model, plot the gain-based feature importance, limited to
# the 15 most important features.
for model in models:
    lgb.plot_importance(
        model,
        max_num_features=15,
        importance_type="gain",
    )

# データ分布の確認

In [None]:
# Summary statistics of the Pawpularity distribution in the training data
pawpularity = train_df["Pawpularity"]
pawpularity.describe()

In [None]:
# Histogram of Pawpularity with 20 bins
train_df["Pawpularity"].plot(kind="hist", bins=20)

# 目的変数の対数化による予測精度の向上を確認する

In [None]:
# Natural logarithm of Pawpularity
train_df["Pawpularity"].pipe(np.log)

In [None]:
# Histogram (20 bins) of the log-transformed Pawpularity
train_df["Pawpularity"].pipe(np.log).plot(kind="hist", bins=20)

In [None]:
# Check whether training on the log-transformed target improves the score.
# .assign() returns a new frame instead of writing a column into the filtered
# slice in place; this avoids pandas' SettingWithCopyWarning and keeps the
# cell safe to re-run.
train_df_le = train_df_le.assign(Pawpularity_log=np.log(train_df_le["Pawpularity"]))

# Explanatory variables: drop both target columns and "Id"
train_X = train_df_le.drop(["Pawpularity", "Pawpularity_log", "Id"], axis=1)
# Target variable: the log-transformed Pawpularity
train_Y = train_df_le["Pawpularity_log"]

models = []
rmses = []
oof = np.zeros(len(train_X))

for train_index, val_index in kf.split(train_X):
    X_train, X_valid = train_X.iloc[train_index], train_X.iloc[val_index]
    y_train, y_valid = train_Y.iloc[train_index], train_Y.iloc[val_index]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments of
    # lgb.train() were deprecated in LightGBM 3.3 and removed in 4.0, where
    # they raise a TypeError. (lgb.log_evaluation requires LightGBM >= 3.3.)
    model_lgb = lgb.train(
        lgbm_params,
        lgb_train,
        valid_sets=[lgb_eval],
        num_boost_round=100,
        callbacks=[
            lgb.early_stopping(stopping_rounds=20),  # stop after 20 rounds without improvement
            lgb.log_evaluation(period=10),           # log the validation metric every 10 rounds
        ],
    )

    # Predict the held-out fold with the best iteration found by early stopping
    y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
    # Target and predictions are already in log space here, so a plain RMSE
    # on them is an RMSE of log values.
    tmp_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    print(tmp_rmse)

    models.append(model_lgb)
    rmses.append(tmp_rmse)
    # Store the predictions at the positions of the validation rows
    oof[val_index] = y_pred

In [None]:
# Average the per-fold RMSE values of the log-target models
mean_rmse_log = sum(rmses) / len(rmses)
mean_rmse_log