## カラムの日本語訳

In [None]:
# MonsoonIntensity - モンスーンの強度
# TopographyDrainage - 地形排水
# RiverManagement - 河川管理
# Deforestation - 森林破壊
# Urbanization - 都市化
# ClimateChange - 気候変動
# DamsQuality - ダムの品質
# Siltation - 土砂堆積
# AgriculturalPractices - 農業の慣行
# Encroachments - 不法占拠（河川・氾濫原への侵入）
# IneffectiveDisasterPreparedness - 効果のない災害対策
# DrainageSystems - 排水システム
# CoastalVulnerability - 沿岸の脆弱性
# Landslides - 地滑り
# Watersheds - 流域
# DeterioratingInfrastructure - インフラの老朽化
# PopulationScore - 人口スコア
# WetlandLoss - 湿地の喪失
# InadequatePlanning - 不十分な計画
# PoliticalFactors - 政治的要因
# FloodProbability - 洪水確率

## import

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import optuna
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from rgf.sklearn import RGFRegressor

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## データの読み込み

In [3]:
# Load the training and test tables from the relative inputs/ directory.
train = pd.read_csv("inputs/train.csv")
test = pd.read_csv("inputs/test.csv")

In [4]:
# Separate the regression target from the predictors; the "id" column is
# excluded from both feature frames.
y_train = train["FloodProbability"]
x_train = train.drop(columns=["FloodProbability", "id"])
x_test = test.drop(columns=["id"])

## 特徴量エンジニアリング

In [5]:
# Add row-wise summary statistics
def cleaning(dataset):
    """Append row-wise summary statistics of the existing columns, in place.

    Every statistic is computed across the columns that ``dataset`` has at
    call time, so the newly added columns never feed into each other.
    Calling the function a second time on the same frame WOULD include the
    previously added statistic columns in the computation.

    Args:
        dataset: DataFrame of numeric columns; modified in place.

    Returns:
        None. Eleven columns are added to ``dataset``: ``total``,
        ``mean_features``, ``std_features``, ``max_features``,
        ``min_features``, ``range_features``, ``median_features``,
        ``skewness_features``, ``ptp``, ``q25`` and ``q75``.
    """
    features = dataset.columns.tolist()
    # Select the input columns once instead of re-slicing for every statistic.
    base = dataset[features]

    dataset['total'] = base.sum(axis=1)
    # Mean and median are stored scaled by 0.1.
    dataset['mean_features'] = 0.1 * base.mean(axis=1)
    dataset['std_features'] = base.std(axis=1)
    dataset['max_features'] = base.max(axis=1)
    dataset['min_features'] = base.min(axis=1)
    dataset['range_features'] = dataset['max_features'] - dataset['min_features']
    dataset['median_features'] = 0.1 * base.median(axis=1)
    dataset['skewness_features'] = base.skew(axis=1)
    # np.ptp() rather than ndarray.ptp(): the array method was removed in
    # NumPy 2.0, while the function form works on both 1.x and 2.x.
    dataset['ptp'] = np.ptp(base.values, axis=1)
    dataset['q25'] = base.quantile(0.25, axis=1)
    dataset['q75'] = base.quantile(0.75, axis=1)

# Add the row-wise statistics to both feature frames; cleaning() modifies
# its argument in place, so nothing is reassigned here.
for frame in (x_train, x_test):
    cleaning(frame)

In [6]:
# Add composite features
def add_features(df):
    """Add seven composite columns to ``df`` in place.

    Six columns are plain sums of related source columns; the seventh,
    ``SocioPoliticalContext``, is the product of ``PopulationScore`` and
    ``PoliticalFactors``.

    Args:
        df: DataFrame containing the source columns named below; modified
            in place.

    Returns:
        None.
    """
    # New column -> source columns that are added together, in creation order.
    summed_groups = {
        'ClimateImpact': ['MonsoonIntensity', 'ClimateChange'],
        'AnthropogenicPressure': ['Deforestation', 'Urbanization', 'AgriculturalPractices', 'Encroachments'],
        'InfrastructureQuality': ['DamsQuality', 'DrainageSystems', 'DeterioratingInfrastructure'],
        'CoastalVulnerabilityTotal': ['CoastalVulnerability', 'Landslides'],
        'PreventiveMeasuresEfficiency': ['RiverManagement', 'IneffectiveDisasterPreparedness', 'InadequatePlanning'],
        'EcosystemImpact': ['WetlandLoss', 'Watersheds'],
    }
    for new_col, parts in summed_groups.items():
        running = df[parts[0]]
        for part in parts[1:]:
            running = running + df[part]
        df[new_col] = running

    # The only multiplicative feature.
    df['SocioPoliticalContext'] = df['PopulationScore'].mul(df['PoliticalFactors'])

# Add the composite features to both feature frames; add_features()
# modifies its argument in place.
for frame in (x_train, x_test):
    add_features(frame)

## stackingによるアンサンブル

In [7]:
def objective(trial, model, x_train_op, y_train_op, x_test_op, y_test_op):
    """Optuna objective: sample hyper-parameters for ``model``'s class and score them.

    A fresh estimator of the same class as ``model`` is built from
    ``model.get_params()`` overlaid with the sampled values, fitted on
    ``(x_train_op, y_train_op)`` and evaluated on ``(x_test_op, y_test_op)``.

    Args:
        trial: Optuna trial used to draw the hyper-parameters.
        model: Template estimator; must be an ``xgb.XGBRegressor``,
            ``lgb.LGBMRegressor`` or ``CatBoostRegressor``. It is not fitted
            itself; only its class and current parameters are used.
        x_train_op: Features used to fit the candidate estimator.
        y_train_op: Targets used to fit the candidate estimator.
        x_test_op: Hold-out features used for scoring.
        y_test_op: Hold-out targets used for scoring.

    Returns:
        The R^2 score of the candidate estimator on the hold-out split.

    Raises:
        ValueError: If ``model`` is not one of the supported estimator types.
    """
    base_params = model.get_params()

    # suggest_float(..., log=True) / suggest_float(...) replace the deprecated
    # suggest_loguniform / suggest_uniform; ranges and names are unchanged.
    if isinstance(model, (xgb.XGBRegressor, lgb.LGBMRegressor)):
        # XGBoost and LightGBM share the same search space except for the
        # leaf-size regulariser, which has a different name and range.
        new_params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 300, 1200),
            'max_depth': trial.suggest_int('max_depth', 1, 15),
            'subsample': trial.suggest_float('subsample', 0.25, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),
        }
        if isinstance(model, xgb.XGBRegressor):
            new_params['min_child_weight'] = trial.suggest_int('min_child_weight', 1, 10)
        else:
            new_params['min_child_samples'] = trial.suggest_int('min_child_samples', 1, 20)
    elif isinstance(model, CatBoostRegressor):
        new_params = {
            'iterations': trial.suggest_int('iterations', 100, 1000),
            'depth': trial.suggest_int('depth', 1, 10),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10, log=True),
            'border_count': trial.suggest_int('border_count', 1, 255),
            'random_strength': trial.suggest_float('random_strength', 1e-3, 10, log=True),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 1e-3, 10, log=True),
            'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
            'od_wait': trial.suggest_int('od_wait', 10, 50)
        }
    else:
        raise ValueError("Unsupported model type!")

    # Rebuild the estimator so every trial starts from an unfitted instance.
    base_params.update(new_params)
    model = model.__class__(**base_params)
    model.fit(x_train_op, y_train_op)
    y_pred = model.predict(x_test_op)
    r2 = r2_score(y_test_op, y_pred)

    return r2

def predict_cv(model, x_train, y_train, x_test):
    """Produce out-of-fold train predictions and fold-averaged test predictions.

    ``model`` is refitted on each of 5 shuffled folds (``random_state=71``).
    For every fold it predicts the held-out rows of ``x_train`` and the whole
    of ``x_test``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; refitted per fold.
        x_train: Training features, indexed positionally via ``.iloc``.
        y_train: Training targets, indexed positionally via ``.iloc``.
        x_test: Test features predicted by every fold's model.

    Returns:
        ``(pred_train, preds_test)``: the out-of-fold predictions arranged in
        the row order of ``x_train``, and the mean of the per-fold test
        predictions.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=71)
    oof_chunks, test_chunks, val_positions = [], [], []

    # tqdm renders a progress bar over the folds.
    progress = tqdm(folds.split(x_train), total=folds.get_n_splits(), desc='CV Progress')
    for fit_pos, val_pos in progress:
        model.fit(x_train.iloc[fit_pos], y_train.iloc[fit_pos])
        oof_chunks.append(model.predict(x_train.iloc[val_pos]))
        test_chunks.append(model.predict(x_test))
        val_positions.append(val_pos)

    # The chunks are in fold order; scatter them back to the original row order.
    stacked = np.concatenate(oof_chunks, axis=0)
    pred_train = np.empty_like(stacked)
    pred_train[np.concatenate(val_positions)] = stacked

    preds_test = np.mean(test_chunks, axis=0)

    return pred_train, preds_test

## Model

In [None]:
# Tune LightGBM on a fixed 80/20 hold-out split, maximising the hold-out R^2
# returned by objective().
x_train_op, x_test_op, y_train_op, y_test_op = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
initial_model = lgb.LGBMRegressor(objective='regression', random_state=0, device='cpu', verbosity=-1)
# Seed the sampler so the search can be repeated; without a seed every run
# explores a different sequence of trials.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(lambda trial: objective(trial, initial_model, x_train_op, y_train_op, x_test_op, y_test_op), n_trials=100)
# Display the winning configuration as the cell output.
study.best_params

In [None]:
# Tune XGBoost on a fixed 80/20 hold-out split, maximising the hold-out R^2
# returned by objective().
x_train_op, x_test_op, y_train_op, y_test_op = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
initial_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=0)
# Seed the sampler so the search can be repeated; without a seed every run
# explores a different sequence of trials.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(lambda trial: objective(trial, initial_model, x_train_op, y_train_op, x_test_op, y_test_op), n_trials=100)
# Display the winning configuration as the cell output.
study.best_params

In [None]:
# Tune CatBoost on a fixed 80/20 hold-out split, maximising the hold-out R^2
# returned by objective(). task_type='GPU' is requested for the search.
x_train_op, x_test_op, y_train_op, y_test_op = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
initial_model = CatBoostRegressor(objective='RMSE', random_seed=0,task_type='GPU', verbose=0)
# Seed the sampler so the sequence of suggested trials can be repeated;
# without a seed every run explores a different sequence.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(lambda trial: objective(trial, initial_model, x_train_op, y_train_op, x_test_op, y_test_op), n_trials=100)
# Display the winning configuration as the cell output.
study.best_params

In [10]:
# Hyper-parameters hard-coded for each base model; the keys match the search
# spaces defined in objective().
lgb_best_params = {'learning_rate': 0.03448065557991946, 'n_estimators': 1139, 'max_depth': 14, 'subsample': 0.7117013506265009, 'colsample_bytree': 0.8453899699874292, 'reg_alpha': 8.707740542701372e-06, 'reg_lambda': 1.3473848421538016e-07, 'min_child_samples': 19}
xgb_best_params = {'learning_rate': 0.011266220048489109, 'n_estimators': 673, 'max_depth': 9, 'subsample': 0.8719725913762578, 'colsample_bytree': 0.8055042169347792, 'reg_alpha': 0.00018948835810392984, 'reg_lambda': 3.8477359926778826e-09, 'min_child_weight': 10}
cat_best_params = {'iterations': 974, 'depth': 7, 'learning_rate': 0.06074053195699405, 'l2_leaf_reg': 6.255562336483921, 'border_count': 159, 'random_strength': 0.006997658088887226, 'bagging_temperature': 0.015062515205916959, 'od_type': 'IncToDec', 'od_wait': 43}

# Base models for the stacking ensemble.
lgb_model = lgb.LGBMRegressor(**lgb_best_params, objective='regression', random_state=0, device='cpu', verbosity=-1)
xgb_model = xgb.XGBRegressor(**xgb_best_params, objective='reg:squarederror', random_state=0)
# verbose=0 matches the CatBoost tuning cell and keeps the 5-fold refits from
# printing a log line per boosting iteration.
# NOTE(review): the tuning cell requested task_type='GPU' while this model
# uses the default task type; confirm that the difference is intended.
cat_model = CatBoostRegressor(**cat_best_params, objective='RMSE', random_seed=0, verbose=0)

In [None]:
# Out-of-fold train predictions and fold-averaged test predictions for each
# base model, computed in the order LightGBM, XGBoost, CatBoost.
(
    (pred_train_lgb, pred_test_lgb),
    (pred_train_xgb, pred_test_xgb),
    (pred_train_cat, pred_test_cat),
) = (
    predict_cv(base_model, x_train, y_train, x_test)
    for base_model in (lgb_model, xgb_model, cat_model)
)

In [23]:
# Second-level inputs for the stacking model: one column per base model.
x2_train = pd.DataFrame(dict(lgb=pred_train_lgb, xgb=pred_train_xgb, cat=pred_train_cat))
x2_test = pd.DataFrame(dict(lgb=pred_test_lgb, xgb=pred_test_xgb, cat=pred_test_cat))

In [27]:
# Second-level (stacking) model: a linear regression that is fitted on the
# base models' predictions in the cells below.
liner_model = LinearRegression()

In [36]:
# 5-fold cross-validation of the linear stacking model on the base models'
# out-of-fold predictions, reporting R^2 per fold and the mean.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
cv_splits = tqdm(cv.split(x2_train, y_train), total=cv.get_n_splits(), desc='CV Progress')

scores = []
for train_idx, val_idx in cv_splits:
    # liner_model is refitted in place, so after the loop it holds the fit
    # from the last fold's training rows only.
    liner_model.fit(x2_train.iloc[train_idx], y_train.iloc[train_idx])
    r2 = r2_score(y_train.iloc[val_idx], liner_model.predict(x2_train.iloc[val_idx]))
    scores.append(r2)

    print(f'score: {r2}')

print(f"Mean Score ＝ {np.mean(scores):.5f}")

CV Progress:  80%|████████  | 4/5 [00:00<00:00, 15.58it/s]

score: 0.8683306236987806
score: 0.8691986975717285
score: 0.8696091484942055
score: 0.8694933004081825


CV Progress: 100%|██████████| 5/5 [00:00<00:00, 15.20it/s]

score: 0.8691689959336155
Mean Score ＝ 0.86916





In [38]:
# Refit the stacking model on every row of the level-2 training frame before
# predicting the test set. Without this, liner_model would still hold the fit
# left over from the last cross-validation fold, which saw only 4/5 of the rows.
liner_model.fit(x2_train, y_train)
y_pred = liner_model.predict(x2_test)

## 提出用ファイルの作成

In [39]:
# Overwrite the sample submission's FloodProbability column with the stacked
# predictions and save the result.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv; confirm before submitting.
submit = pd.read_csv("inputs/sample_submission.csv").assign(FloodProbability=y_pred)
submit.to_csv("outputs/submission_stacking.csv", index=False)
submit.head()

Unnamed: 0,id,FloodProbability
0,1117957,0.577717
1,1117958,0.456253
2,1117959,0.448419
3,1117960,0.46689
4,1117961,0.467345
