## カラムの日本語訳

In [1]:
# MonsoonIntensity - モンスーンの強度
# TopographyDrainage - 地形排水
# RiverManagement - 河川管理
# Deforestation - 森林破壊
# Urbanization - 都市化
# ClimateChange - 気候変動
# DamsQuality - ダムの品質
# Siltation - 堆積
# AgriculturalPractices - 農業の慣行
# Encroachments - 侵害
# IneffectiveDisasterPreparedness - 効果のない災害対策
# DrainageSystems - 排水システム
# CoastalVulnerability - 沿岸の脆弱性
# Landslides - 地滑り
# Watersheds - 流域
# DeterioratingInfrastructure - 低下するインフラ
# PopulationScore - 人口スコア
# WetlandLoss - 湿地の喪失
# InadequatePlanning - 不十分な計画
# PoliticalFactors - 政治的要因
# FloodProbability - 洪水確率

## import

In [44]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')

## データの読み込み

In [3]:
train = pd.read_csv("inputs/train.csv")
test = pd.read_csv("inputs/test.csv")

In [4]:
x_train = train.drop(columns=["FloodProbability", "id"], axis=1)
y_train = train["FloodProbability"]
x_test = test.drop(columns=["id"], axis=1)

## 特徴量エンジニアリング

In [5]:
# 統計データの追加
def cleaning(dataset):
    features = dataset.columns.tolist()
    dataset['total'] = dataset[features].sum(axis=1)
    dataset['mean_features'] = 0.1*dataset[features].mean(axis=1)
    dataset['std_features'] = dataset[features].std(axis=1)
    dataset['max_features'] = dataset[features].max(axis=1)
    dataset['min_features'] = dataset[features].min(axis=1)
    dataset['median_features'] = 0.1*dataset[features].median(axis=1)
    dataset['ptp'] = dataset[features].values.ptp(axis=1)
    dataset['q25'] = dataset[features].quantile(0.25, axis=1)
    dataset['q75'] = dataset[features].quantile(0.75, axis=1)

cleaning(x_train)
cleaning(x_test)

In [6]:
# 特徴量の追加
def add_features(df):
    df['ClimateImpact'] = df['MonsoonIntensity'] + df['ClimateChange']
    df['AnthropogenicPressure'] = df['Deforestation'] + df['Urbanization'] + df['AgriculturalPractices'] + df['Encroachments']
    df['InfrastructureQuality'] = df['DamsQuality'] + df['DrainageSystems'] + df['DeterioratingInfrastructure']
    df['CoastalVulnerabilityTotal'] = df['CoastalVulnerability'] + df['Landslides']
    df['PreventiveMeasuresEfficiency'] = df['RiverManagement'] + df['IneffectiveDisasterPreparedness'] + df['InadequatePlanning']
    df['EcosystemImpact'] = df['WetlandLoss'] + df['Watersheds']
    df['SocioPoliticalContext'] = df['PopulationScore'] * df['PoliticalFactors']

add_features(x_train)
add_features(x_test)

## trainとtestの分布を確認

In [9]:
# def plot_distribution_pairs(train, test, feature, hue="set", palette=None):
#     data_df = train.copy()
#     data_df['set'] = 'train'
#     data_df = pd.concat([data_df, test.copy()]).fillna('test')
#     data_df.replace([np.inf, -np.inf], np.nan, inplace=True)

#     f, axes = plt.subplots(1, 2, figsize=(14, 6))
#     for i, s in enumerate(data_df[hue].unique()):
#         selection = data_df.loc[data_df[hue]==s, feature]
#         # Filter 'selection' to include only the central 95% of the data
#         q_025, q_975 = np.percentile(selection, [2.5, 97.5])
#         selection_filtered = selection[(selection >= q_025) & (selection <= q_975)]
#         with warnings.catch_warnings():
#             warnings.simplefilter("ignore", category=FutureWarning)
#             sns.histplot(selection_filtered, color=palette[i], ax=axes[0], label=s)
#             sns.boxplot(x=hue, y=feature, data=data_df, palette=palette, ax=axes[1])
#     axes[0].set_title(f"Paired train/test distributions of {feature}")
#     axes[1].set_title(f"Paired train/test boxplots of {feature}")
#     axes[0].legend()
#     axes[1].legend()
#     plt.show()
# color_list = ["#A5D7E8", "#576CBC", "#19376D", "#0B2447"]
# for feature in x_train.columns:
#   plot_distribution_pairs(x_train, x_test, feature, palette=color_list)

## stackingによるアンサンブル

In [8]:
def predict_cv(model, x_train, y_train, x_test):
    preds = list()
    preds_test = list()
    va_idxes = list()
    
    kf = KFold(n_splits=5, shuffle=True, random_state=71)
    iterator = tqdm(enumerate(kf.split(x_train)), total=kf.get_n_splits(), desc='CV Progress')  # tqdmを使ってプログレスバーを表示
    
    for _, (tr_idx, va_idx) in iterator:
        tr_x, va_x = x_train.iloc[tr_idx], x_train.iloc[va_idx]
        tr_y, va_y = y_train.iloc[tr_idx], y_train.iloc[va_idx]
        
        model.fit(tr_x, tr_y)
        
        pred = model.predict(va_x)
        preds.append(pred)
        
        pred_test = model.predict(x_test)
        preds_test.append(pred_test)
        
        va_idxes.append(va_idx)

    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]
    preds_test = np.mean(preds_test, axis=0)
    
    return pred_train, preds_test

In [40]:
lgb_model_1 = lgb.LGBMRegressor(max_depth=5, n_estimators=100, random_state=42, device="gpu")
lgb_model_2 = lgb.LGBMRegressor(max_depth=7, n_estimators=100, random_state=42, device="gpu")
lgb_model_3 = lgb.LGBMRegressor(max_depth=10, n_estimators=100, random_state=42, device="gpu")

rf_model_1 = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=42)
rf_model_2 = RandomForestRegressor(max_depth=10, n_estimators=100, random_state=42)

mlp_1_layer = MLPRegressor(hidden_layer_sizes=(50,), max_iter=500, random_state=42)
mlp_2_layers = MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=500, random_state=42)

linear_model = LinearRegression()

In [41]:
pred_train_lgb_2, pred_test_lgb_2 = predict_cv(lgb_model_2, x_train, y_train, x_test)
pred_train_rf_1, pred_test_rf_1 = predict_cv(rf_model_1, x_train, y_train, x_test)
pred_train_lgb_1, pred_test_lgb_1 = predict_cv(mlp_1_layer, x_train, y_train, x_test)

print(f"LGBM: {r2_score(y_train, pred_train_lgb_2)}")
print(f"Random Forest: {r2_score(y_train, pred_train_rf_1)}")
print(f"MLP: {r2_score(y_train, pred_train_lgb_1)}")

CV Progress:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1102
[LightGBM] [Info] Number of data points in the train set: 894365, number of used features: 36
[LightGBM] [Info] Using GPU Device: NVIDIA TITAN RTX, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 36 dense feature groups (30.71 MB) transferred to GPU in 0.041061 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.504459


CV Progress:  20%|██        | 1/5 [00:03<00:12,  3.07s/it]

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 894365, number of used features: 36
[LightGBM] [Info] Using GPU Device: NVIDIA TITAN RTX, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 36 dense feature groups (30.71 MB) transferred to GPU in 0.038766 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.504457


CV Progress:  40%|████      | 2/5 [00:06<00:09,  3.06s/it]

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 894366, number of used features: 36
[LightGBM] [Info] Using GPU Device: NVIDIA TITAN RTX, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 36 dense feature groups (30.71 MB) transferred to GPU in 0.039165 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.504501


CV Progress:  60%|██████    | 3/5 [00:09<00:06,  3.06s/it]

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1101
[LightGBM] [Info] Number of data points in the train set: 894366, number of used features: 36
[LightGBM] [Info] Using GPU Device: NVIDIA TITAN RTX, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 36 dense feature groups (30.71 MB) transferred to GPU in 0.039253 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.504490


CV Progress:  80%|████████  | 4/5 [00:12<00:03,  3.05s/it]

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1098
[LightGBM] [Info] Number of data points in the train set: 894366, number of used features: 36
[LightGBM] [Info] Using GPU Device: NVIDIA TITAN RTX, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 36 dense feature groups (30.71 MB) transferred to GPU in 0.039218 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.504495


CV Progress: 100%|██████████| 5/5 [00:15<00:00,  3.05s/it]
CV Progress: 100%|██████████| 5/5 [16:19<00:00, 195.89s/it]
CV Progress: 100%|██████████| 5/5 [03:50<00:00, 46.14s/it]

LGBM: 0.8687478072941155
Random Forest: 0.8642661416782262
MLP: 0.5438519181996369





In [10]:
pred_train_lgb_3, pred_test_lgb_3 = predict_cv(lgb_model_3, x_train, y_train, x_test)
pred_train_mlp_2, pred_test_mlp_2 = predict_cv(mlp_2_layers, x_train, y_train, x_test)
pred_train_rf_2, pred_test_rf_2 = predict_cv(rf_model_2, x_train, y_train, x_test)

print(f"lgb_1: {r2_score(y_train, pred_train_lgb_3)}")
print(f"lgb_2: {r2_score(y_train, pred_train_rf_2)}")
print(f"lgb_3: {r2_score(y_train, pred_train_lgb_3)}")

CV Progress:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1102
[LightGBM] [Info] Number of data points in the train set: 894365, number of used features: 36
[LightGBM] [Info] Using GPU Device: NVIDIA TITAN RTX, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 36 dense feature groups (30.71 MB) transferred to GPU in 0.062817 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.504459


CV Progress:  20%|██        | 1/5 [00:33<02:14, 33.66s/it]

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 894365, number of used features: 36
[LightGBM] [Info] Using GPU Device: NVIDIA TITAN RTX, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 36 dense feature groups (30.71 MB) transferred to GPU in 0.062789 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.504457


CV Progress:  40%|████      | 2/5 [01:07<01:40, 33.56s/it]

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 894366, number of used features: 36
[LightGBM] [Info] Using GPU Device: NVIDIA TITAN RTX, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 36 dense feature groups (30.71 MB) transferred to GPU in 0.062838 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.504501


CV Progress:  60%|██████    | 3/5 [01:40<01:07, 33.56s/it]

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1101
[LightGBM] [Info] Number of data points in the train set: 894366, number of used features: 36
[LightGBM] [Info] Using GPU Device: NVIDIA TITAN RTX, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 36 dense feature groups (30.71 MB) transferred to GPU in 0.062969 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.504490


CV Progress:  80%|████████  | 4/5 [02:14<00:33, 33.57s/it]

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1098
[LightGBM] [Info] Number of data points in the train set: 894366, number of used features: 36
[LightGBM] [Info] Using GPU Device: NVIDIA TITAN RTX, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 36 dense feature groups (30.71 MB) transferred to GPU in 0.064211 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.504495


CV Progress: 100%|██████████| 5/5 [02:47<00:00, 33.55s/it]
CV Progress: 100%|██████████| 5/5 [26:32<00:00, 318.46s/it]
CV Progress: 100%|██████████| 5/5 [30:03<00:00, 360.67s/it]

lgb_1: 0.8687668574621917
lgb_2: 0.8689157473042654
lgb_3: 0.8687668574621917





In [42]:
x2_train = pd.DataFrame({'lgb_3': pred_train_lgb_3, 'lgb_2': pred_train_lgb_2, 'rf_1': pred_train_rf_1, 'rf_2': pred_train_rf_2, 'mlp_1': pred_train_lgb_1, 'mlp_2': pred_train_mlp_2})
x2_test = pd.DataFrame({'lgb_3': pred_test_lgb_3, 'lgb_2': pred_test_lgb_2, 'rf_1': pred_test_rf_1, 'rf_2': pred_test_rf_2, 'mlp_1': pred_test_lgb_1, 'mlp_2': pred_test_mlp_2})
print(x2_train.shape, x2_test.shape)

(1117957, 6) (745305, 6)


In [52]:
linear_model = LinearRegression()
ridge_model = Ridge()
pred_train_linear, pred_test_linear = predict_cv(linear_model, x2_train, y_train, x2_test)
pred_train_ridge, pred_test_ridge = predict_cv(ridge_model, x2_train, y_train, x2_test)

print(f"Linear: {r2_score(y_train, pred_train_linear)}")
print(f"Ridge: {r2_score(y_train, pred_train_ridge)}")

CV Progress: 100%|██████████| 5/5 [00:00<00:00,  8.71it/s]
CV Progress: 100%|██████████| 5/5 [00:00<00:00, 14.72it/s]


Linear: 0.869016602673978
Ridge: 0.8690034185266192


In [53]:
x3_train = pd.DataFrame({'linear': pred_train_linear, 'ridge': pred_train_ridge})
x3_test = pd.DataFrame({'linear': pred_test_linear, 'ridge': pred_test_ridge})

In [58]:
last_model = LinearRegression()
# last_model = Ridge()
pred_train_last, pred_test_last = predict_cv(last_model, x3_train, y_train, x3_test)
print(f"last: {r2_score(y_train, pred_train_last)}")

CV Progress: 100%|██████████| 5/5 [00:00<00:00, 21.04it/s]

last: 0.8690154564476442





## 重要ではない特徴量を取り除いて再訓練

In [None]:

# min = 2000
# feature_idx = list()
# for idx, item in enumerate(model.feature_importances_):
#     if item <= min:
#         feature_idx.append(idx)
# feature_name = x_train.columns

# x_train = x_train.drop(columns=list(feature_name[feature_idx]), axis=1)
# x_test = x_test.drop(columns=list(feature_name[feature_idx]), axis=1)
# model = LGBMRegressor(**study.best_params, objective='regression', random_state=0, device='gpu', verbosity=-1)
# cv = KFold(5, shuffle=True, random_state=0)
# cv_splits = cv.split(x_train, y_train)
# scores = []
# for train_idx, val_idx in cv_splits:
#     x_train_fold, x_val_fold = x_train.iloc[train_idx], x_train.iloc[val_idx]
#     y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
#     model.fit(x_train_fold, y_train_fold)
#     y_pred = model.predict(x_val_fold)
#     r2 = r2_score(y_val_fold, y_pred)
#     print(f'score: {r2}')
#     scores.append(r2)

# print(f"Mean Score ＝ {np.mean(scores):.5f}") 

## 提出用ファイルの作成

In [47]:
submit = pd.read_csv("inputs/sample_submission.csv")
submit["FloodProbability"] = pred_test_last
submit.to_csv("outputs/submission_stacking.csv", index=False)
submit.head()

Unnamed: 0,id,FloodProbability
0,1117957,0.577747
1,1117958,0.455502
2,1117959,0.449118
3,1117960,0.4664
4,1117961,0.466489
