## カラムの日本語訳

In [None]:
# MonsoonIntensity - モンスーンの強度
# TopographyDrainage - 地形排水
# RiverManagement - 河川管理
# Deforestation - 森林破壊
# Urbanization - 都市化
# ClimateChange - 気候変動
# DamsQuality - ダムの品質
# Siltation - 堆積
# AgriculturalPractices - 農業の慣行
# Encroachments - 侵害
# IneffectiveDisasterPreparedness - 効果のない災害対策
# DrainageSystems - 排水システム
# CoastalVulnerability - 沿岸の脆弱性
# Landslides - 地滑り
# Watersheds - 流域
# DeterioratingInfrastructure - 低下するインフラ
# PopulationScore - 人口スコア
# WetlandLoss - 湿地の喪失
# InadequatePlanning - 不十分な計画
# PoliticalFactors - 政治的要因
# FloodProbability - 洪水確率

## import

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import optuna
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from rgf.sklearn import RGFRegressor

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## データの読み込み

In [3]:
train = pd.read_csv("inputs/train.csv")
test = pd.read_csv("inputs/test.csv")

In [4]:
x_train = train.drop(columns=["FloodProbability", "id"], axis=1)
y_train = train["FloodProbability"]
x_test = test.drop(columns=["id"], axis=1)

## 特徴量エンジニアリング

In [5]:
# 統計データの追加
def cleaning(dataset):
    features = dataset.columns.tolist()
    dataset['total'] = dataset[features].sum(axis=1)
    dataset['mean_features'] = 0.1*dataset[features].mean(axis=1)
    dataset['std_features'] = dataset[features].std(axis=1)
    dataset['max_features'] = dataset[features].max(axis=1)
    dataset['min_features'] = dataset[features].min(axis=1)
    dataset['range_features'] = dataset['max_features'] - dataset['min_features']
    dataset['median_features'] = 0.1*dataset[features].median(axis=1)
    dataset['skewness_features'] = dataset[features].skew(axis=1)
    dataset['ptp'] = dataset[features].values.ptp(axis=1)
    dataset['q25'] = dataset[features].quantile(0.25, axis=1)
    dataset['q75'] = dataset[features].quantile(0.75, axis=1)

cleaning(x_train)
cleaning(x_test)

In [6]:
# 特徴量の追加
def add_features(df):
    df['ClimateImpact'] = df['MonsoonIntensity'] + df['ClimateChange']
    df['AnthropogenicPressure'] = df['Deforestation'] + df['Urbanization'] + df['AgriculturalPractices'] + df['Encroachments']
    df['InfrastructureQuality'] = df['DamsQuality'] + df['DrainageSystems'] + df['DeterioratingInfrastructure']
    df['CoastalVulnerabilityTotal'] = df['CoastalVulnerability'] + df['Landslides']
    df['PreventiveMeasuresEfficiency'] = df['RiverManagement'] + df['IneffectiveDisasterPreparedness'] + df['InadequatePlanning']
    df['EcosystemImpact'] = df['WetlandLoss'] + df['Watersheds']
    df['SocioPoliticalContext'] = df['PopulationScore'] * df['PoliticalFactors']

add_features(x_train)
add_features(x_test)

## stackingによるアンサンブル

In [7]:
def objective(trial, model, x_train_op, y_train_op, x_test_op, y_test_op):
    base_params = model.get_params()
    
    if isinstance(model, xgb.XGBRegressor):
        new_params = {
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
            'n_estimators': trial.suggest_int('n_estimators', 300, 1200),
            'max_depth': trial.suggest_int('max_depth', 1, 15),
            'subsample': trial.suggest_uniform('subsample', 0.25, 1.0),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.4, 1.0),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-9, 10.0),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-9, 10.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10)
        }
    elif isinstance(model, lgb.LGBMRegressor):
        new_params = {
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
            'n_estimators': trial.suggest_int('n_estimators', 300, 1200),
            'max_depth': trial.suggest_int('max_depth', 1, 15),
            'subsample': trial.suggest_uniform('subsample', 0.25, 1.0),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.4, 1.0),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-9, 10.0),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-9, 10.0),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 20)
        }
    elif isinstance(model, CatBoostRegressor):
        new_params = {
            'iterations': trial.suggest_int('iterations', 100, 1000),
            'depth': trial.suggest_int('depth', 1, 10),
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
            'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10),
            'border_count': trial.suggest_int('border_count', 1, 255),
            'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10),
            'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 1e-3, 10),
            'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
            'od_wait': trial.suggest_int('od_wait', 10, 50)
        }
    else:
        raise ValueError("Unsupported model type!")

    base_params.update(new_params)
    model = model.__class__(**base_params)
    model.fit(x_train_op, y_train_op)
    y_pred = model.predict(x_test_op)
    r2 = r2_score(y_test_op, y_pred)

    return r2

def predict_cv(model, x_train, y_train, x_test):
    preds = list()
    preds_test = list()
    va_idxes = list()
    
    kf = KFold(n_splits=5, shuffle=True, random_state=71)
    iterator = tqdm(enumerate(kf.split(x_train)), total=kf.get_n_splits(), desc='CV Progress')  # tqdmを使ってプログレスバーを表示
    
    for _, (tr_idx, va_idx) in iterator:
        tr_x, va_x = x_train.iloc[tr_idx], x_train.iloc[va_idx]
        tr_y, va_y = y_train.iloc[tr_idx], y_train.iloc[va_idx]
        
        model.fit(tr_x, tr_y)
        
        pred = model.predict(va_x)
        preds.append(pred)
        
        pred_test = model.predict(x_test)
        preds_test.append(pred_test)
        
        va_idxes.append(va_idx)

    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]
    preds_test = np.mean(preds_test, axis=0)
    
    return pred_train, preds_test

## Model

In [None]:
x_train_op, x_test_op, y_train_op, y_test_op = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
initial_model = lgb.LGBMRegressor(objective='regression', random_state=0, device='cpu', verbosity=-1)
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, initial_model, x_train_op, y_train_op, x_test_op, y_test_op), n_trials=100)

In [None]:
x_train_op, x_test_op, y_train_op, y_test_op = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
initial_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=0)
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, initial_model, x_train_op, y_train_op, x_test_op, y_test_op), n_trials=100)

In [8]:
x_train_op, x_test_op, y_train_op, y_test_op = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
initial_model = CatBoostRegressor(objective='RMSE', random_seed=0,task_type='GPU', verbose=0)
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, initial_model, x_train_op, y_train_op, x_test_op, y_test_op), n_trials=100)

[I 2024-05-20 19:02:28,628] A new study created in memory with name: no-name-c436e4a9-f66a-47ad-af76-89eefad0edb8
[I 2024-05-20 19:03:39,558] Trial 0 finished with value: 0.8543625387905273 and parameters: {'iterations': 632, 'depth': 8, 'learning_rate': 0.003671722512334447, 'l2_leaf_reg': 0.007870950332850655, 'border_count': 233, 'random_strength': 2.739388351537881, 'bagging_temperature': 0.010291397602041733, 'od_type': 'Iter', 'od_wait': 16}. Best is trial 0 with value: 0.8543625387905273.
[I 2024-05-20 19:03:49,813] Trial 1 finished with value: 0.2808282087219014 and parameters: {'iterations': 183, 'depth': 3, 'learning_rate': 0.0012009669599053622, 'l2_leaf_reg': 2.4338048469538056, 'border_count': 126, 'random_strength': 0.19756297555750732, 'bagging_temperature': 0.0016586431316978764, 'od_type': 'Iter', 'od_wait': 36}. Best is trial 0 with value: 0.8543625387905273.
[I 2024-05-20 19:05:00,049] Trial 2 finished with value: 0.8173212777296317 and parameters: {'iterations': 662

In [10]:
lgb_best_params = {'learning_rate': 0.03448065557991946, 'n_estimators': 1139, 'max_depth': 14, 'subsample': 0.7117013506265009, 'colsample_bytree': 0.8453899699874292, 'reg_alpha': 8.707740542701372e-06, 'reg_lambda': 1.3473848421538016e-07, 'min_child_samples': 19}
xgb_best_params = {'learning_rate': 0.011266220048489109, 'n_estimators': 673, 'max_depth': 9, 'subsample': 0.8719725913762578, 'colsample_bytree': 0.8055042169347792, 'reg_alpha': 0.00018948835810392984, 'reg_lambda': 3.8477359926778826e-09, 'min_child_weight': 10}
cat_best_params = {'iterations': 974, 'depth': 7, 'learning_rate': 0.06074053195699405, 'l2_leaf_reg': 6.255562336483921, 'border_count': 159, 'random_strength': 0.006997658088887226, 'bagging_temperature': 0.015062515205916959, 'od_type': 'IncToDec', 'od_wait': 43}
lgb_model = lgb.LGBMRegressor(**lgb_best_params, objective='regression', random_state=0, device='cpu', verbosity=-1)
xgb_model = xgb.XGBRegressor(**xgb_best_params, objective='reg:squarederror', random_state=0)
cat_model = CatBoostRegressor(**cat_best_params, objective='RMSE', random_seed=0)

In [11]:
pred_train_lgb, pred_test_lgb = predict_cv(lgb_model, x_train, y_train, x_test)
pred_train_xgb, pred_test_xgb = predict_cv(xgb_model, x_train, y_train, x_test)
pred_train_cat, pred_test_cat = predict_cv(cat_model, x_train, y_train, x_test)

CV Progress: 100%|██████████| 5/5 [01:30<00:00, 18.20s/it]
CV Progress: 100%|██████████| 5/5 [02:21<00:00, 28.37s/it]
CV Progress:   0%|          | 0/5 [00:00<?, ?it/s]

0:	learn: 0.0484383	total: 37.9ms	remaining: 36.8s
1:	learn: 0.0460160	total: 76ms	remaining: 36.9s
2:	learn: 0.0437675	total: 114ms	remaining: 36.8s
3:	learn: 0.0416726	total: 149ms	remaining: 36.2s
4:	learn: 0.0397330	total: 184ms	remaining: 35.6s
5:	learn: 0.0379302	total: 217ms	remaining: 35s
6:	learn: 0.0362626	total: 252ms	remaining: 34.8s
7:	learn: 0.0347218	total: 294ms	remaining: 35.5s
8:	learn: 0.0332974	total: 332ms	remaining: 35.6s
9:	learn: 0.0319868	total: 365ms	remaining: 35.1s
10:	learn: 0.0307713	total: 396ms	remaining: 34.7s
11:	learn: 0.0296576	total: 429ms	remaining: 34.4s
12:	learn: 0.0286373	total: 464ms	remaining: 34.3s
13:	learn: 0.0276997	total: 498ms	remaining: 34.2s
14:	learn: 0.0268440	total: 533ms	remaining: 34.1s
15:	learn: 0.0260594	total: 568ms	remaining: 34s
16:	learn: 0.0253465	total: 599ms	remaining: 33.7s
17:	learn: 0.0246976	total: 632ms	remaining: 33.6s
18:	learn: 0.0241073	total: 665ms	remaining: 33.4s
19:	learn: 0.0235727	total: 701ms	remaining: 

CV Progress:  20%|██        | 1/5 [00:28<01:52, 28.08s/it]

0:	learn: 0.0484586	total: 32.5ms	remaining: 31.6s
1:	learn: 0.0460388	total: 66.2ms	remaining: 32.2s
2:	learn: 0.0437896	total: 98.8ms	remaining: 32s
3:	learn: 0.0416987	total: 133ms	remaining: 32.2s
4:	learn: 0.0397562	total: 162ms	remaining: 31.5s
5:	learn: 0.0379570	total: 196ms	remaining: 31.7s
6:	learn: 0.0362880	total: 230ms	remaining: 31.8s
7:	learn: 0.0347440	total: 265ms	remaining: 32s
8:	learn: 0.0333193	total: 300ms	remaining: 32.2s
9:	learn: 0.0320050	total: 332ms	remaining: 32.1s
10:	learn: 0.0307955	total: 365ms	remaining: 32s
11:	learn: 0.0296834	total: 396ms	remaining: 31.8s
12:	learn: 0.0286634	total: 428ms	remaining: 31.6s
13:	learn: 0.0277251	total: 460ms	remaining: 31.6s
14:	learn: 0.0268700	total: 495ms	remaining: 31.7s
15:	learn: 0.0260858	total: 530ms	remaining: 31.7s
16:	learn: 0.0253729	total: 563ms	remaining: 31.7s
17:	learn: 0.0247218	total: 597ms	remaining: 31.7s
18:	learn: 0.0241319	total: 628ms	remaining: 31.5s
19:	learn: 0.0235962	total: 659ms	remaining:

CV Progress:  40%|████      | 2/5 [00:56<01:24, 28.06s/it]

0:	learn: 0.0484357	total: 33.2ms	remaining: 32.3s
1:	learn: 0.0460139	total: 66.4ms	remaining: 32.3s
2:	learn: 0.0437639	total: 98.9ms	remaining: 32s
3:	learn: 0.0416704	total: 133ms	remaining: 32.3s
4:	learn: 0.0397282	total: 169ms	remaining: 32.8s
5:	learn: 0.0379276	total: 205ms	remaining: 33.1s
6:	learn: 0.0362606	total: 238ms	remaining: 32.9s
7:	learn: 0.0347179	total: 276ms	remaining: 33.3s
8:	learn: 0.0332947	total: 313ms	remaining: 33.6s
9:	learn: 0.0319784	total: 345ms	remaining: 33.3s
10:	learn: 0.0307691	total: 378ms	remaining: 33.1s
11:	learn: 0.0296547	total: 413ms	remaining: 33.1s
12:	learn: 0.0286350	total: 445ms	remaining: 32.9s
13:	learn: 0.0276993	total: 480ms	remaining: 32.9s
14:	learn: 0.0268443	total: 515ms	remaining: 32.9s
15:	learn: 0.0260597	total: 547ms	remaining: 32.7s
16:	learn: 0.0253464	total: 578ms	remaining: 32.5s
17:	learn: 0.0246973	total: 611ms	remaining: 32.5s
18:	learn: 0.0241068	total: 643ms	remaining: 32.3s
19:	learn: 0.0235717	total: 678ms	remain

CV Progress:  60%|██████    | 3/5 [01:24<00:56, 28.15s/it]

0:	learn: 0.0484455	total: 32.6ms	remaining: 31.7s
1:	learn: 0.0460232	total: 66.6ms	remaining: 32.4s
2:	learn: 0.0437746	total: 99.1ms	remaining: 32.1s
3:	learn: 0.0416798	total: 136ms	remaining: 32.9s
4:	learn: 0.0397379	total: 168ms	remaining: 32.6s
5:	learn: 0.0379387	total: 203ms	remaining: 32.8s
6:	learn: 0.0362700	total: 237ms	remaining: 32.7s
7:	learn: 0.0347287	total: 272ms	remaining: 32.9s
8:	learn: 0.0333049	total: 308ms	remaining: 33s
9:	learn: 0.0319887	total: 340ms	remaining: 32.8s
10:	learn: 0.0307789	total: 373ms	remaining: 32.7s
11:	learn: 0.0296643	total: 405ms	remaining: 32.4s
12:	learn: 0.0286440	total: 438ms	remaining: 32.4s
13:	learn: 0.0277081	total: 474ms	remaining: 32.5s
14:	learn: 0.0268526	total: 511ms	remaining: 32.7s
15:	learn: 0.0260677	total: 547ms	remaining: 32.7s
16:	learn: 0.0253552	total: 580ms	remaining: 32.7s
17:	learn: 0.0247066	total: 611ms	remaining: 32.4s
18:	learn: 0.0241161	total: 645ms	remaining: 32.4s
19:	learn: 0.0235814	total: 680ms	remain

CV Progress:  80%|████████  | 4/5 [01:52<00:27, 27.98s/it]

973:	learn: 0.0183721	total: 27.3s	remaining: 0us
0:	learn: 0.0484353	total: 33.1ms	remaining: 32.2s
1:	learn: 0.0460160	total: 66.4ms	remaining: 32.3s
2:	learn: 0.0437661	total: 103ms	remaining: 33.3s
3:	learn: 0.0416741	total: 136ms	remaining: 33.1s
4:	learn: 0.0397330	total: 169ms	remaining: 32.7s
5:	learn: 0.0379332	total: 201ms	remaining: 32.5s
6:	learn: 0.0362657	total: 236ms	remaining: 32.7s
7:	learn: 0.0347205	total: 274ms	remaining: 33.1s
8:	learn: 0.0332964	total: 310ms	remaining: 33.3s
9:	learn: 0.0319819	total: 346ms	remaining: 33.4s
10:	learn: 0.0307727	total: 381ms	remaining: 33.4s
11:	learn: 0.0296576	total: 413ms	remaining: 33.1s
12:	learn: 0.0286380	total: 451ms	remaining: 33.4s
13:	learn: 0.0277023	total: 486ms	remaining: 33.4s
14:	learn: 0.0268479	total: 519ms	remaining: 33.2s
15:	learn: 0.0260622	total: 556ms	remaining: 33.3s
16:	learn: 0.0253479	total: 587ms	remaining: 33.1s
17:	learn: 0.0246985	total: 621ms	remaining: 33s
18:	learn: 0.0241099	total: 653ms	remainin

CV Progress: 100%|██████████| 5/5 [02:20<00:00, 28.03s/it]

972:	learn: 0.0183643	total: 27.6s	remaining: 28.4ms
973:	learn: 0.0183642	total: 27.7s	remaining: 0us





In [23]:
x2_train = pd.DataFrame({'lgb': pred_train_lgb, 'xgb': pred_train_xgb, 'cat': pred_train_cat})
x2_test = pd.DataFrame({'lgb': pred_test_lgb, 'xgb': pred_test_xgb, 'cat': pred_test_cat})

In [27]:
liner_model = LinearRegression()

In [36]:
cv = KFold(5, shuffle=True, random_state=0)
cv_splits = tqdm(cv.split(x2_train, y_train), total=cv.get_n_splits(), desc='CV Progress')

scores = []
for train_idx, val_idx in cv_splits:
    x_train_fold, x_val_fold = x2_train.iloc[train_idx], x2_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
    liner_model.fit(x_train_fold, y_train_fold)
    y_pred = liner_model.predict(x_val_fold)
    r2 = r2_score(y_val_fold, y_pred)
    scores.append(r2)
    
    print(f'score: {r2}')

print(f"Mean Score ＝ {np.mean(scores):.5f}") 

CV Progress:  80%|████████  | 4/5 [00:00<00:00, 15.58it/s]

score: 0.8683306236987806
score: 0.8691986975717285
score: 0.8696091484942055
score: 0.8694933004081825


CV Progress: 100%|██████████| 5/5 [00:00<00:00, 15.20it/s]

score: 0.8691689959336155
Mean Score ＝ 0.86916





In [38]:
y_pred = liner_model.predict(x2_test)

## 提出用ファイルの作成

In [39]:
submit = pd.read_csv("inputs/sample_submission.csv")
submit["FloodProbability"] = y_pred
submit.to_csv("outputs/submission_stacking.csv", index=False)
submit.head()

Unnamed: 0,id,FloodProbability
0,1117957,0.577717
1,1117958,0.456253
2,1117959,0.448419
3,1117960,0.46689
4,1117961,0.467345
