## カラムの日本語訳

In [1]:
# MonsoonIntensity - モンスーンの強度
# TopographyDrainage - 地形排水
# RiverManagement - 河川管理
# Deforestation - 森林破壊
# Urbanization - 都市化
# ClimateChange - 気候変動
# DamsQuality - ダムの品質
# Siltation - 堆積
# AgriculturalPractices - 農業の慣行
# Encroachments - 侵害
# IneffectiveDisasterPreparedness - 効果のない災害対策
# DrainageSystems - 排水システム
# CoastalVulnerability - 沿岸の脆弱性
# Landslides - 地滑り
# Watersheds - 流域
# DeterioratingInfrastructure - 低下するインフラ
# PopulationScore - 人口スコア
# WetlandLoss - 湿地の喪失
# InadequatePlanning - 不十分な計画
# PoliticalFactors - 政治的要因
# FloodProbability - 洪水確率

## import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import r2_score
import optuna

  from .autonotebook import tqdm as notebook_tqdm


## データの読み込み

In [2]:
train = pd.read_csv("inputs/train.csv")
test = pd.read_csv("inputs/test.csv")

In [3]:
x_train = train.drop(columns=["FloodProbability", "id"], axis=1)
y_train = train["FloodProbability"]
x_test = test.drop(columns=["id"], axis=1)

## 特徴量エンジニアリング

In [4]:
def cleaning(dataset):
    features = dataset.columns.tolist()
    dataset['total'] = dataset[features].sum(axis=1)
    dataset['mean_features'] = 0.1*dataset[features].mean(axis=1)
    dataset['std_features'] = dataset[features].std(axis=1)
    dataset['max_features'] = dataset[features].max(axis=1)
    dataset['min_features'] = dataset[features].min(axis=1)
    dataset['median_features'] = 0.1*dataset[features].median(axis=1)
    dataset['ptp'] = dataset[features].values.ptp(axis=1)
    dataset['q25'] = dataset[features].quantile(0.25, axis=1)
    dataset['q75'] = dataset[features].quantile(0.75, axis=1)

cleaning(x_train)
cleaning(x_test)

In [5]:
def add_features(df):
    df['ClimateImpact'] = df['MonsoonIntensity'] + df['ClimateChange']
    df['AnthropogenicPressure'] = df['Deforestation'] + df['Urbanization'] + df['AgriculturalPractices'] + df['Encroachments']
    df['InfrastructureQuality'] = df['DamsQuality'] + df['DrainageSystems'] + df['DeterioratingInfrastructure']
    df['CoastalVulnerabilityTotal'] = df['CoastalVulnerability'] + df['Landslides']
    df['PreventiveMeasuresEfficiency'] = df['RiverManagement'] + df['IneffectiveDisasterPreparedness'] + df['InadequatePlanning']
    df['EcosystemImpact'] = df['WetlandLoss'] + df['Watersheds']
    df['SocioPoliticalContext'] = df['PopulationScore'] * df['PoliticalFactors']

add_features(x_train)
add_features(x_test)

## trainとtestの分布を確認

In [9]:
# def plot_distribution_pairs(train, test, feature, hue="set", palette=None):
#     data_df = train.copy()
#     data_df['set'] = 'train'
#     data_df = pd.concat([data_df, test.copy()]).fillna('test')
#     data_df.replace([np.inf, -np.inf], np.nan, inplace=True)

#     f, axes = plt.subplots(1, 2, figsize=(14, 6))
#     for i, s in enumerate(data_df[hue].unique()):
#         selection = data_df.loc[data_df[hue]==s, feature]
#         # Filter 'selection' to include only the central 95% of the data
#         q_025, q_975 = np.percentile(selection, [2.5, 97.5])
#         selection_filtered = selection[(selection >= q_025) & (selection <= q_975)]
#         with warnings.catch_warnings():
#             warnings.simplefilter("ignore", category=FutureWarning)
#             sns.histplot(selection_filtered, color=palette[i], ax=axes[0], label=s)
#             sns.boxplot(x=hue, y=feature, data=data_df, palette=palette, ax=axes[1])
#     axes[0].set_title(f"Paired train/test distributions of {feature}")
#     axes[1].set_title(f"Paired train/test boxplots of {feature}")
#     axes[0].legend()
#     axes[1].legend()
#     plt.show()
# color_list = ["#A5D7E8", "#576CBC", "#19376D", "#0B2447"]
# for feature in x_train.columns:
#   plot_distribution_pairs(x_train, x_test, feature, palette=color_list)

## optunaによるチューニング

In [6]:
x_train_op, x_test_op, y_train_op, y_test_op = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

def objective(trial):
    params = {
            'num_leaves': trial.suggest_int('num_leaves', 100, 500),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 300, 1200),
            'subsample_for_bin': trial.suggest_int('subsample_for_bin', 20000, 300000),
            'min_child_samples': trial.suggest_int('min_child_samples', 20, 500),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
            'subsample': trial.suggest_float('subsample', 0.25, 1.0),
            'max_depth': trial.suggest_int('max_depth', 1, 15)
            }
    
    model = LGBMRegressor(**params, objective='regression', random_state=0, device='gpu', verbosity=-1)
    model.fit(x_train_op, y_train_op)
    y_pred = model.predict(x_test_op)
    r2 = r2_score(y_test_op, y_pred)

    return r2

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)

[I 2024-05-16 15:27:30,897] A new study created in memory with name: no-name-d7f6faf2-65c2-485b-a2e7-601ea4cd4cf5
[I 2024-05-16 15:27:34,372] Trial 0 finished with value: 0.8677104921370952 and parameters: {'num_leaves': 212, 'learning_rate': 0.016865920320135744, 'n_estimators': 374, 'subsample_for_bin': 162242, 'min_child_samples': 457, 'reg_alpha': 3.100979412504212e-06, 'reg_lambda': 0.0002336157853237338, 'colsample_bytree': 0.7183178283828382, 'subsample': 0.49828590989000765, 'max_depth': 5}. Best is trial 0 with value: 0.8677104921370952.
[I 2024-05-16 15:27:36,641] Trial 1 finished with value: 0.8670324984552997 and parameters: {'num_leaves': 462, 'learning_rate': 0.7404743499848364, 'n_estimators': 350, 'subsample_for_bin': 54608, 'min_child_samples': 340, 'reg_alpha': 5.820940488643845, 'reg_lambda': 2.5172728661626707e-07, 'colsample_bytree': 0.5219269691388674, 'subsample': 0.8297468937552126, 'max_depth': 15}. Best is trial 0 with value: 0.8677104921370952.
[I 2024-05-16 

KeyboardInterrupt: 

## optunaによるチューニングパラメータを利用してモデルを作成

In [21]:
model = LGBMRegressor(**study.best_params, objective='regression', random_state=0, device='gpu', verbosity=-1)

## クロスバリデーション

In [26]:
cv = KFold(4, shuffle=True, random_state=0)
cv_splits = cv.split(x_train, y_train)
scores = []
for train_idx, val_idx in cv_splits:
    x_train_fold, x_val_fold = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
    model.fit(x_train_fold, y_train_fold)
    y_pred = model.predict(x_val_fold)
    r2 = r2_score(y_val_fold, y_pred)
    print(f'score: {r2}')
    scores.append(r2)

print(f"Mean Score ＝ {np.mean(scores):.5f}") 

score: 0.8681206562366337
score: 0.8690832475984227
score: 0.869744857717879
score: 0.8687887480329332
Mean Score ＝ 0.86893


## 特徴量の重要度を確認

In [23]:
from sklearn import datasets

feature_name = x_train.columns
importance = pd.DataFrame(model.feature_importances_, columns=['importance'], index=feature_name)
display(importance)

Unnamed: 0,importance
MonsoonIntensity,1908
TopographyDrainage,1349
RiverManagement,1509
Deforestation,1183
Urbanization,1395
ClimateChange,1527
DamsQuality,1149
Siltation,1445
AgriculturalPractices,1220
Encroachments,1255


## 重要ではない特徴量を取り除いて再訓練

In [None]:

# min = 2000
# feature_idx = list()
# for idx, item in enumerate(model.feature_importances_):
#     if item <= min:
#         feature_idx.append(idx)
# feature_name = x_train.columns

# x_train = x_train.drop(columns=list(feature_name[feature_idx]), axis=1)
# x_test = x_test.drop(columns=list(feature_name[feature_idx]), axis=1)
# model = LGBMRegressor(**study.best_params, objective='regression', random_state=0, device='gpu', verbosity=-1)
# cv = KFold(5, shuffle=True, random_state=0)
# cv_splits = cv.split(x_train, y_train)
# scores = []
# for train_idx, val_idx in cv_splits:
#     x_train_fold, x_val_fold = x_train.iloc[train_idx], x_train.iloc[val_idx]
#     y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
#     model.fit(x_train_fold, y_train_fold)
#     y_pred = model.predict(x_val_fold)
#     r2 = r2_score(y_val_fold, y_pred)
#     print(f'score: {r2}')
#     scores.append(r2)

# print(f"Mean Score ＝ {np.mean(scores):.5f}") 

## 提出用ファイルの作成

In [28]:
submit = pd.read_csv("inputs/sample_submission.csv")
y_pred = model.predict(x_test)
submit["FloodProbability"] = y_pred
submit.to_csv("outputs/submission.csv", index=False)
submit.head()

Unnamed: 0,id,FloodProbability
0,1117957,0.577679
1,1117958,0.454073
2,1117959,0.450004
3,1117960,0.467619
4,1117961,0.467846
