In [1]:
# Check how LightGBM behaves when NaN appears at prediction time even though the training data contained no NaN

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import lightgbm as lgb
from sklearn.model_selection import train_test_split
pd.set_option("display.max_rows", 101)
pd.set_option("display.max_columns", 101)
%matplotlib inline

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
# Load the Boston housing data as a feature DataFrame `X` and a target array `y`.
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where the
# import below raises ImportError. In that case the same arrays are rebuilt from
# the original StatLib file, following the scikit-learn deprecation notice.
try:
    from sklearn.datasets import load_boston
except ImportError:
    boston_url = "http://lib.stat.cmu.edu/datasets/boston"
    boston_feature_names = [
        "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
        "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
    ]
    # The file starts with a 22-line header; after that each record is wrapped
    # over two physical rows, so even rows and odd rows are stitched together.
    raw_df = pd.read_csv(boston_url, sep=r"\s+", skiprows=22, header=None)
    X = pd.DataFrame(
        np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        columns=boston_feature_names,
    )
    # The third value of every second row is the target.
    y = raw_df.values[1::2, 2]
else:
    boston = load_boston()
    X = pd.DataFrame(boston.data, columns=boston.feature_names)
    y = boston.target

In [4]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [5]:
# Missing-value check: count the NaN entries in each feature column.
X.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
dtype: int64

In [6]:
# Check the dtype of each feature column
X.dtypes

CRIM       float64
ZN         float64
INDUS      float64
CHAS       float64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD        float64
TAX        float64
PTRATIO    float64
B          float64
LSTAT      float64
dtype: object

In [7]:
# Hold out 33% of the rows for validation; the seed is fixed so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [8]:
# Train a LightGBM regressor on the training split, monitoring RMSE on both
# the training and the validation split.
params = {
    "objective": "regression",
    "random_state": 1,
    "metric": "rmse",
}

dtrain = lgb.Dataset(X_train, label=y_train)
dvalid = lgb.Dataset(X_valid, label=y_valid)

# The `early_stopping_rounds` / `verbose_eval` keyword arguments of `lgb.train`
# were deprecated in LightGBM 3.3 and removed in 4.0 (TypeError there), so the
# same behaviour is requested through callbacks instead. Releases before 3.3
# only expose the logging callback under its older name `print_evaluation`.
log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation
bst = lgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    valid_sets=[dtrain, dvalid],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),  # stop after 50 rounds without improvement
        log_evaluation(period=100),  # print the metrics every 100 rounds
    ],
)

Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 1.55562	valid_1's rmse: 3.4141
[200]	training's rmse: 1.0165	valid_1's rmse: 3.35222
[300]	training's rmse: 0.735403	valid_1's rmse: 3.34575
Early stopping, best iteration is:
[254]	training's rmse: 0.848352	valid_1's rmse: 3.33877


In [9]:
# Copy of the validation features with every AGE value replaced by 0.
X_valid_zero = X_valid.assign(AGE=0)

In [10]:
# Copy of the validation features with every AGE value replaced by NaN.
X_valid_nan = X_valid.assign(AGE=np.nan)

In [11]:
# Side-by-side predictions for the untouched, zero-filled and NaN-filled AGE column.
checkdf = pd.DataFrame(
    {
        "normal": bst.predict(X_valid),
        "zero": bst.predict(X_valid_zero),
        "nan": bst.predict(X_valid_nan),
    }
)

In [12]:
# Preview the first rows of the three prediction columns.
checkdf.head()

Unnamed: 0,normal,zero,nan
0,25.264243,27.296083,27.296083
1,36.374307,35.791168,35.791168
2,14.952075,18.925241,18.925241
3,22.851986,22.851986,22.851986
4,19.335784,20.7203,20.7203


In [13]:
# Total absolute difference between the zero-filled and the NaN-filled predictions.
(checkdf["zero"] - checkdf["nan"]).abs().sum()

0.0

In [14]:
# To be safe, repeat the zero-vs-NaN comparison for every feature column.
# For each column, predict once with the column set to 0 and once with it set
# to NaN, then print the column name and the summed absolute difference.
# Loop-local names are used so the AGE-specific `X_valid_zero`, `X_valid_nan`
# and `checkdf` built in the earlier cells are not overwritten.
for col in X.columns:
    pred_zero = bst.predict(X_valid.assign(**{col: 0}))
    pred_nan = bst.predict(X_valid.assign(**{col: np.nan}))

    # Summed through a pandas Series, as in the single-column check.
    print(col, pd.Series(pred_zero - pred_nan).abs().sum())

CRIM 0.0
ZN 0.0
INDUS 0.0
CHAS 0.0
NOX 0.0
RM 0.0
AGE 0.0
DIS 0.0
RAD 0.0
TAX 0.0
PTRATIO 0.0
B 0.0
LSTAT 0.0
