# アンサンブルモデル
LightGBM、Random Forest、MLP、ロジスティック回帰とSVMの5つのモデルを用いて予測を行い、その結果の多数決をとり、最終的な予測を決定する

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import os
for dirname, _, filenames in os.walk('../data/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../data/ozon_4_.ipynb
../data/sample_submit.csv
../data/submission_ensemble.csv
../data/submission_lightBGM.csv
../data/submission_lightBGM2.csv
../data/submission_lightBGM2_2.csv
../data/submission_lightBGM3.csv
../data/submission_Logistic.csv
../data/submission_Logistic2.csv
../data/submission_Logistic3.csv
../data/test.tsv
../data/train.tsv
../data/.ipynb_checkpoints\ozon_4_-checkpoint.ipynb
../data/.ipynb_checkpoints\sample_submit-checkpoint.csv
../data/.ipynb_checkpoints\submission_ensemble-checkpoint.csv
../data/.ipynb_checkpoints\submission_lightBGM-checkpoint.csv
../data/.ipynb_checkpoints\submission_lightBGM3-checkpoint.csv
../data/.ipynb_checkpoints\submission_Logistic-checkpoint.csv
../data/.ipynb_checkpoints\submission_Logistic2-checkpoint.csv
../data/.ipynb_checkpoints\submission_Logistic3-checkpoint.csv
../data/.ipynb_checkpoints\test-checkpoint.tsv
../data/.ipynb_checkpoints\train-checkpoint.tsv


In [2]:
train_df = pd.read_table('../data/train.tsv', index_col='Date', parse_dates=True)
test_df = pd.read_table('../data/test.tsv', index_col='Date', parse_dates=True)
sample_sub = pd.read_csv('../data/sample_submit.csv')
print(sample_sub)

# set type label
train_df['type'] = 'train'
test_df['type'] = 'test'

# all data
all_df = pd.concat([train_df, test_df], axis=0)

      1267  0.0
0     1268  0.0
1     1269  0.0
2     1270  1.0
3     1271  1.0
4     1272  1.0
...    ...  ...
1261  2529  1.0
1262  2530  1.0
1263  2531  0.0
1264  2532  1.0
1265  2533  0.0

[1266 rows x 2 columns]


## 特徴量前処理

In [3]:
import re

def eda(all_df):
    # データの追加,気温・風速の標準偏差
    #1時間ごとの気温・風速を取得
    T_data = all_df[['T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'T19', 'T20', 'T21', 'T22', 'T23']]
    WSR_data = all_df[['WSR0', 'WSR1', 'WSR2', 'WSR3', 'WSR4', 'WSR5', 'WSR6', 'WSR7', 'WSR8', 'WSR9', 'WSR10', 'WSR11', 'WSR12', 'WSR13', 'WSR14', 'WSR15', 'WSR16', 'WSR17', 'WSR18', 'WSR19', 'WSR20', 'WSR21', 'WSR22', 'WSR23']]
    # 行ごとの標準偏差を追加
    all_df['T_SD'] = T_data.std(axis=1)
    all_df['WSR_SD'] = WSR_data.std(axis=1)
    # データの削除, T0~T23
    all_df = all_df.drop(columns=['T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'T19', 'T20', 'T21', 'T22', 'T23'])
    # データの削除, WSR0~WSR23
    all_df = all_df.drop(columns=['WSR0', 'WSR1', 'WSR2', 'WSR3', 'WSR4', 'WSR5', 'WSR6', 'WSR7', 'WSR8', 'WSR9', 'WSR10', 'WSR11', 'WSR12', 'WSR13', 'WSR14', 'WSR15', 'WSR16', 'WSR17', 'WSR18', 'WSR19', 'WSR20', 'WSR21', 'WSR22', 'WSR23'])
    # データの削除, SLP_
    all_df = all_df.drop(columns=['SLP_'])
    return all_df

# データ標準化(rightGBMのときはいらない)
def standardscaler(train_df, test_df):
    scaler = StandardScaler()
    train_df_standard = pd.DataFrame(scaler.fit_transform(train_df), index = train_df.index, columns = train_df.columns)
    test_df_standard = pd.DataFrame(scaler.transform(test_df), index = test_df.index, columns = test_df.columns)
    return train_df_standard, test_df_standard

# 特徴量の削除/追加
all_df = eda(all_df)

# trainとtestに分けなおす
train_df = all_df[all_df['type'] == 'train']
test_df = all_df[all_df['type'] == 'test']
# train正解ラベル
y = train_df['OZONE']

# 学習に不要な特徴量を削除
train_df = train_df.drop(columns=['id', 'OZONE', 'type'])
test_df = test_df.drop(columns=['id', 'OZONE', 'type'])

# 欠損値を平均値で補完
train_df = train_df.fillna(train_df.mean())
test_df = test_df.fillna(test_df.mean())

print(f'train_df : \n{train_df}\n')
print(f'y : \n{y}\n')

train_df : 
            WSR_PK  WSR_AV  T_PK  T_AV        T85      RH85       U85  \
Date                                                                    
1998-01-01     5.5     3.1  19.1  12.5   6.700000  0.110000  3.830000   
1998-01-02     5.5     3.4  22.4  17.8   9.000000  0.250000 -0.410000   
1998-01-03     5.6     3.5  22.2  18.7   9.000000  0.560000  0.890000   
1998-01-04     4.7     3.2  19.6  18.7   9.900000  0.890000 -0.340000   
1998-01-05     3.7     2.3  26.0  21.1  13.539776  0.556758  2.243384   
...            ...     ...   ...   ...        ...       ...       ...   
2001-06-29     3.3     1.7  30.6  25.7  17.400000  0.520000  0.840000   
2001-06-30     3.3     2.0  31.7  26.9  16.000000  0.840000  0.010000   
2001-07-01     4.1     2.0  27.9  24.8  16.500000  0.790000  1.900000   
2001-07-02     3.1     1.4  30.8  24.8  17.100000  0.640000 -2.110000   
2001-07-03     2.6     1.4  30.9  26.2  16.800000  0.760000 -3.780000   

                 V85         HT85     

### lightGBM
#### +ハイパーパラメータチューニング(深さ、葉数、学習率)

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import tensorflow as tf
import random
import scipy.stats as stats
import os
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

"""
# tensorflowの代わり
def to_categorical(y, num_classes=np.amax(y)+1):
    return np.eye(num_classes, dtype='uint8')[y]
"""
# 乱数を固定
tf.random.set_seed(0)
np.random.seed(0)
random.seed(0)
os.environ["PYTHONHASHSEED"] = "0"

# KFold で学習させる
cv = KFold(n_splits=5, random_state=0, shuffle=True)

train_acc_list = []
val_acc_list = []

# ハイパーパラメータを定義　
lgb_params = {
    "objective":"binary",
    "metric": "binary_error",
    "force_row_wise" : True,
    "seed" : 0,
    'learning_rate': 0.16394902178188858,
    'min_data_in_leaf': 5,
    'num_leaves': 25,
    'max_depth': 13,
    }

# indexをDateから普通のindexに直す(kholdが使えないため)、日付は消す
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# アンサンブルなモデルたち
for i ,(trn_index, val_index) in enumerate(cv.split(train_df, y)):
    
    print(f'Fold : {i}')
    X_train ,X_val = train_df.loc[trn_index], train_df.loc[val_index]
    y_train ,y_val = y[trn_index],y[val_index]
    
    # *** LigthGBM Part ***
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_val, y_val)
    
    model_lgb = lgb.train(
        params = lgb_params, 
        train_set = lgb_train,
        valid_sets = [lgb_train, lgb_valid], 
        callbacks = [lgb.log_evaluation(period=0),lgb.early_stopping(10)],
       )
    
    # *** RandomForest Part ***
    print('-' *10 +' Start_rf ' +'-' *10)
    model_rf = RandomForestClassifier(
        random_state=0,max_depth=15,
        min_samples_leaf=5,min_samples_split=5
        )
    model_rf.fit(
        X_train, y_train
        )

    # 標準化
    train_df, test_df = standardscaler(train_df, test_df)
    X_train ,X_val = train_df.loc[trn_index], train_df.loc[val_index]
    y_train ,y_val = y[trn_index],y[val_index]
    
    # *** MLP Part ***
    print('-' *10 +' Start_mlp ' +'-' *10)
    
    # MLP用にLabel-EncodingをOne-Hot Encodingに変換
    X_train_mlp ,X_val_mlp = train_df.loc[trn_index],train_df.loc[val_index]
    y_train_mlp ,y_val_mlp = y[trn_index], y[val_index]
    
    
    model_mlp = tf.keras.models.Sequential([
        tf.keras.layers.Input(X_train_mlp.shape[1]),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(2, activation='softmax')
    ])
    """
    # torchで書き換え、途中
    model_mlp = nn.Sequential(
        nn.Input(shape=X_train_mlp.shape[1]), 
        nn.Linear(X_train_mlp.shape[1], 32), 
        nn.ReLU(), 
        nn.Dropout(0.5),
        nn.Linear(32, 16), 
        nn.ReLU(), 
        nn.Dropout(0.5),
        nn.Linear(16, 16), 
        nn.ReLU(), 
        nn.Linear(16, 2),
        nn.Softmax(dim=1)
    )
        loss_fn = model_mlp.CrossEntropyLoss()
    optim = torch.optim.Adam(model_mlp.parameters(), lr=learning_rate)
  
    """
    early_stopping =  EarlyStopping(
                            monitor='val_loss',
                            patience=10,
                            mode='auto'
                        )

    model_mlp.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    model_mlp.fit(
        X_train_mlp, to_categorical(y_train_mlp),validation_data = (X_val_mlp,to_categorical(y_val_mlp)),
        batch_size=256, epochs=300, verbose=False,callbacks=[early_stopping]
    )


    # *** LogisticRegression Part ***
    print('-' *10 +' Start_rogi ' +'-' *10)
    model_rogi = LogisticRegression()
    model_rogi.fit(
        X_train_mlp, y_train
        )
    
    # *** SVM Part ***
    print('-' *10 +' Start_SVM ' +'-' *10)
    model_svm = SVC(random_state=0)
    model_svm.fit(
        X_train_mlp, y_train
        )
    
    # それぞれのモデルで予測し、正答率を算出
    train_pred = np.zeros((len(y_train_mlp), 5))
    
    train_pred[:,0] = np.where(model_lgb.predict(X_train)>=0.5, 1, 0)
    train_pred[:,1] = model_rf.predict(X_train)
    train_pred[:,2] = np.argmax(model_mlp.predict(X_train_mlp),axis=1)
    train_pred[:,3] = model_rogi.predict(X_train_mlp)
    train_pred[:,4] = model_svm.predict(X_train_mlp)

    train_acc = accuracy_score(
        y_train, stats.mode(train_pred,axis=1)[0]
        )
    train_acc_list.append(train_acc)
    
    val_pred = np.zeros((len(y_val_mlp), 5))
    
    val_pred[:,0] = np.where(model_lgb.predict(X_val)>=0.5, 1, 0)
    val_pred[:,1] = model_rf.predict(X_val)
    val_pred[:,2] = np.argmax(model_mlp.predict(X_val_mlp),axis=1)
    val_pred[:,3] = model_rogi.predict(X_val_mlp)
    val_pred[:,4] = model_svm.predict(X_val_mlp)

    val_acc = accuracy_score(
        y_val, stats.mode(val_pred,axis=1)[0]
        )
    val_acc_list.append(val_acc)
    
    
print('-'*10 + 'Result' +'-'*10)
print(f'Train_acc : {train_acc_list} , Ave : {np.mean(train_acc_list)}')
print(f'Valid_acc : {val_acc_list} , Ave : {np.mean(val_acc_list)}')

Fold : 0
[LightGBM] [Info] Number of positive: 95, number of negative: 918
[LightGBM] [Info] Total Bins 4561
[LightGBM] [Info] Number of data points in the train set: 1013, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.093781 -> initscore=-2.268320
[LightGBM] [Info] Start training from score -2.268320
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[4]	training's binary_error: 0.0454097	valid_1's binary_error: 0.0472441
---------- Start_rf ----------


  y_train ,y_val = y[trn_index],y[val_index]
  y_train ,y_val = y[trn_index],y[val_index]
  y_train_mlp ,y_val_mlp = y[trn_index], y[val_index]


---------- Start_mlp ----------
---------- Start_rogi ----------
---------- Start_SVM ----------
Fold : 1
[LightGBM] [Info] Number of positive: 83, number of negative: 930
[LightGBM] [Info] Total Bins 4490
[LightGBM] [Info] Number of data points in the train set: 1013, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.081935 -> initscore=-2.416344
[LightGBM] [Info] Start training from score -2.416344
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1]	training's binary_error: 0.0819348	valid_1's binary_error: 0.110236
---------- Start_rf ----------


  y_train ,y_val = y[trn_index],y[val_index]
  y_train ,y_val = y[trn_index],y[val_index]
  y_train_mlp ,y_val_mlp = y[trn_index], y[val_index]


---------- Start_mlp ----------
---------- Start_rogi ----------
---------- Start_SVM ----------
Fold : 2
[LightGBM] [Info] Number of positive: 86, number of negative: 928
[LightGBM] [Info] Total Bins 4451
[LightGBM] [Info] Number of data points in the train set: 1014, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.084813 -> initscore=-2.378684
[LightGBM] [Info] Start training from score -2.378684
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[20]	training's binary_error: 0	valid_1's binary_error: 0.0711462
---------- Start_rf ----------


  y_train ,y_val = y[trn_index],y[val_index]
  y_train ,y_val = y[trn_index],y[val_index]
  y_train_mlp ,y_val_mlp = y[trn_index], y[val_index]


---------- Start_mlp ----------
---------- Start_rogi ----------
---------- Start_SVM ----------
Fold : 3
[LightGBM] [Info] Number of positive: 91, number of negative: 923
[LightGBM] [Info] Total Bins 4445
[LightGBM] [Info] Number of data points in the train set: 1014, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.089744 -> initscore=-2.316770
[LightGBM] [Info] Start training from score -2.316770
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[17]	training's binary_error: 0	valid_1's binary_error: 0.0592885
---------- Start_rf ----------


  y_train ,y_val = y[trn_index],y[val_index]
  y_train ,y_val = y[trn_index],y[val_index]
  y_train_mlp ,y_val_mlp = y[trn_index], y[val_index]


---------- Start_mlp ----------
---------- Start_rogi ----------
---------- Start_SVM ----------
Fold : 4
[LightGBM] [Info] Number of positive: 89, number of negative: 925
[LightGBM] [Info] Total Bins 4469
[LightGBM] [Info] Number of data points in the train set: 1014, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.087771 -> initscore=-2.341157
[LightGBM] [Info] Start training from score -2.341157
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[11]	training's binary_error: 0.00197239	valid_1's binary_error: 0.0592885
---------- Start_rf ----------


  y_train ,y_val = y[trn_index],y[val_index]
  y_train ,y_val = y[trn_index],y[val_index]
  y_train_mlp ,y_val_mlp = y[trn_index], y[val_index]


---------- Start_mlp ----------
---------- Start_rogi ----------
---------- Start_SVM ----------
----------Result----------
Train_acc : [0.9062191510365252, 0.926949654491609, 0.9418145956607495, 0.9378698224852071, 0.9408284023668639] , Ave : 0.9307363252081909
Valid_acc : [0.937007874015748, 0.8937007874015748, 0.9051383399209486, 0.924901185770751, 0.9209486166007905] , Ave : 0.9163393607419625


In [11]:
# 予測結果をサブミットするファイル形式に変更
test_pred = np.zeros((len(test_df), 5))

test_pred[:,0] = np.where(model_lgb.predict(test_df)>=0.5, 1, 0)
test_pred[:,1] = model_rf.predict(test_df)
test_pred[:,2] = np.argmax(model_mlp.predict(test_df),axis=1)
test_pred[:,3] = model_rogi.predict(test_df)
test_pred[:,4] = model_svm.predict(test_df)


# 提出ファイルを出力 
test_pred = pd.DataFrame(test_pred)
print(test_pred.sum())
test_pred = test_pred.mode(axis=1).values

for index, row in sample_sub.iterrows():
    sample_sub.iloc[index,1] = test_pred[index]


print(f'test_pred.sum : {test_pred.sum()}')
# 結果を保存
sample_sub.to_csv("../data/submission_ensemble.csv", index=False)

0    25.0
1     4.0
2     0.0
3    17.0
4     0.0
dtype: float64
test_pred.sum : 4.0
