In [2]:
import time
import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import featuretools as ft

from itertools import combinations
from sklearn.metrics import r2_score
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb 

SEED = 2024

In [3]:
# Detect whether an NVIDIA GPU is usable by trying to run `nvidia-smi`.
import subprocess

try:
    subprocess.check_output('nvidia-smi')
except Exception:
    # Any failure (missing binary, non-zero exit status, ...) means CPU fallback.
    DEVICE = 'cpu'
else:
    DEVICE = 'cuda'

print(f'Available device: {DEVICE}')

Available device: cuda


In [4]:
# Location of the competition CSV files.
DATA_DIR = '/kaggle/input/playground-series-s4e5'

# Read the training and test tables (train first, then test).
train, test = map(pd.read_csv, (f'{DATA_DIR}/train.csv', f'{DATA_DIR}/test.csv'))

In [5]:
def _smallest_fitting_dtype(candidates, info, lo, hi):
    """Return the first dtype in ``candidates`` whose range contains [lo, hi], else None."""
    for candidate in candidates:
        limits = info(candidate)
        # Inclusive bounds: a value equal to the dtype's min/max still fits.
        # Comparisons with NaN (empty / all-NaN columns) are False, so such
        # columns never match and are left untouched.
        if limits.min <= lo and hi <= limits.max:
            return candidate
    return None


def reduce_memory_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and report the saving.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    use_float16 : bool, default True
        When True, float columns whose range fits are stored as float16.
        float16 keeps only about three significant decimal digits, so pass
        False for columns (e.g. a regression target) that need more precision;
        float32 is then the smallest float type used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    * Signed and unsigned integers are narrowed within their own family.
    * Object / string columns become ``category``.
    * Booleans, categoricals, nullable extension dtypes, datetimes and other
      non-numeric dtypes are left unchanged.
    * A column is never widened: it is only cast when the chosen dtype is
      strictly smaller than the current one.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")

    for col in df.columns:
        col_type = df[col].dtype

        if pd.api.types.is_object_dtype(col_type) or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable Int64, ...) and booleans have no
        # plain NumPy numeric downcast; min()/astype could fail or change meaning.
        if pd.api.types.is_extension_array_dtype(col_type) or pd.api.types.is_bool_dtype(col_type):
            continue

        if pd.api.types.is_integer_dtype(col_type):
            if pd.api.types.is_unsigned_integer_dtype(col_type):
                candidates = unsigned_types
            else:
                candidates = signed_types
            info = np.iinfo
        elif pd.api.types.is_float_dtype(col_type):
            candidates = float_types
            info = np.finfo
        else:
            # datetime, timedelta, complex, ...: nothing sensible to downcast to.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        new_type = _smallest_fitting_dtype(candidates, info, c_min, c_max)
        if new_type is not None and np.dtype(new_type).itemsize < col_type.itemsize:
            df[col] = df[col].astype(new_type)

    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    # Guard against a zero-byte frame so the report itself cannot raise.
    reduction = 100 * (start_memory - end_memory) / start_memory if start_memory else 0.0
    print(f"Reduced by {reduction} % ")
    return df

In [6]:
# Shrink both raw tables (train first, then test) before feature generation.
train, test = (reduce_memory_usage(frame) for frame in (train, test))

Memory usage of dataframe is 187.64549255371094 MB
Memory usage of dataframe after reduction 27.720460891723633 MB
Reduced by 85.22721728378899 % 
Memory usage of dataframe is 119.4108657836914 MB
Memory usage of dataframe after reduction 17.058799743652344 MB
Reduced by 85.71419809102318 % 


In [7]:
# Name of the column to predict.
TARGET = 'FloodProbability'
# Column names of the raw test table, captured before any feature generation.
og_features = test.columns.tolist()

In [8]:
# Detach the target from `train`, then stack train on top of test so the
# feature generation that follows sees both tables as one frame.
labels = train.pop(TARGET)
composite = pd.concat((train, test), axis='index')

In [9]:
# Register the combined frame as the only dataframe of an EntitySet, keyed on `id`.
es = ft.EntitySet(id='data').add_dataframe(
    dataframe_name='composite',
    dataframe=composite,
    index='id',
)

# Run Deep Feature Synthesis on that dataframe.
# NOTE(review): the preview and shape printed below show 210 columns
# (the originals plus the "A + B" pairwise sums), with no SUM/STD aggregate
# columns, so the agg primitives appear to contribute nothing here — confirm.
feature_matrix, feature_defs = ft.dfs(
    target_dataframe_name='composite',
    entityset=es,
    trans_primitives=['add_numeric'],
    agg_primitives=['sum', 'std'],
)

In [10]:
# Preview the first rows of the generated feature matrix.
feature_matrix.head()

Unnamed: 0_level_0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,AgriculturalPractices + ClimateChange,AgriculturalPractices + CoastalVulnerability,AgriculturalPractices + DamsQuality,AgriculturalPractices + Deforestation,AgriculturalPractices + DeterioratingInfrastructure,AgriculturalPractices + DrainageSystems,AgriculturalPractices + Encroachments,AgriculturalPractices + InadequatePlanning,AgriculturalPractices + IneffectiveDisasterPreparedness,AgriculturalPractices + Landslides,AgriculturalPractices + MonsoonIntensity,AgriculturalPractices + PoliticalFactors,AgriculturalPractices + PopulationScore,AgriculturalPractices + RiverManagement,AgriculturalPractices + Siltation,AgriculturalPractices + TopographyDrainage,AgriculturalPractices + Urbanization,AgriculturalPractices + Watersheds,AgriculturalPractices + WetlandLoss,ClimateChange + CoastalVulnerability,ClimateChange + DamsQuality,ClimateChange + Deforestation,ClimateChange + DeterioratingInfrastructure,ClimateChange + DrainageSystems,ClimateChange + Encroachments,ClimateChange + InadequatePlanning,ClimateChange + IneffectiveDisasterPreparedness,ClimateChange + Landslides,ClimateChange + MonsoonIntensity,ClimateChange + PoliticalFactors,ClimateChange + PopulationScore,ClimateChange + RiverManagement,ClimateChange + Siltation,ClimateChange + TopographyDrainage,ClimateChange + Urbanization,ClimateChange + Watersheds,ClimateChange + WetlandLoss,CoastalVulnerability + DamsQuality,CoastalVulnerability + Deforestation,CoastalVulnerability + DeterioratingInfrastructure,CoastalVulnerability + DrainageSystems,CoastalVulnerability + Encroachments,CoastalVulnerability + InadequatePlanning,CoastalVulnerability + 
IneffectiveDisasterPreparedness,CoastalVulnerability + Landslides,CoastalVulnerability + MonsoonIntensity,CoastalVulnerability + PoliticalFactors,CoastalVulnerability + PopulationScore,CoastalVulnerability + RiverManagement,CoastalVulnerability + Siltation,CoastalVulnerability + TopographyDrainage,CoastalVulnerability + Urbanization,CoastalVulnerability + Watersheds,CoastalVulnerability + WetlandLoss,DamsQuality + Deforestation,DamsQuality + DeterioratingInfrastructure,DamsQuality + DrainageSystems,DamsQuality + Encroachments,DamsQuality + InadequatePlanning,DamsQuality + IneffectiveDisasterPreparedness,DamsQuality + Landslides,DamsQuality + MonsoonIntensity,DamsQuality + PoliticalFactors,DamsQuality + PopulationScore,DamsQuality + RiverManagement,DamsQuality + Siltation,DamsQuality + TopographyDrainage,DamsQuality + Urbanization,DamsQuality + Watersheds,DamsQuality + WetlandLoss,Deforestation + DeterioratingInfrastructure,Deforestation + DrainageSystems,Deforestation + Encroachments,Deforestation + InadequatePlanning,Deforestation + IneffectiveDisasterPreparedness,Deforestation + Landslides,Deforestation + MonsoonIntensity,Deforestation + PoliticalFactors,Deforestation + PopulationScore,Deforestation + RiverManagement,Deforestation + Siltation,Deforestation + TopographyDrainage,Deforestation + Urbanization,Deforestation + Watersheds,Deforestation + WetlandLoss,DeterioratingInfrastructure + DrainageSystems,DeterioratingInfrastructure + Encroachments,DeterioratingInfrastructure + InadequatePlanning,DeterioratingInfrastructure + IneffectiveDisasterPreparedness,DeterioratingInfrastructure + Landslides,DeterioratingInfrastructure + MonsoonIntensity,DeterioratingInfrastructure + PoliticalFactors,DeterioratingInfrastructure + PopulationScore,DeterioratingInfrastructure + RiverManagement,DeterioratingInfrastructure + Siltation,DeterioratingInfrastructure + TopographyDrainage,DeterioratingInfrastructure + Urbanization,DeterioratingInfrastructure + 
Watersheds,DeterioratingInfrastructure + WetlandLoss,DrainageSystems + Encroachments,DrainageSystems + InadequatePlanning,DrainageSystems + IneffectiveDisasterPreparedness,DrainageSystems + Landslides,DrainageSystems + MonsoonIntensity,DrainageSystems + PoliticalFactors,DrainageSystems + PopulationScore,DrainageSystems + RiverManagement,DrainageSystems + Siltation,DrainageSystems + TopographyDrainage,DrainageSystems + Urbanization,DrainageSystems + Watersheds,DrainageSystems + WetlandLoss,Encroachments + InadequatePlanning,Encroachments + IneffectiveDisasterPreparedness,Encroachments + Landslides,Encroachments + MonsoonIntensity,Encroachments + PoliticalFactors,Encroachments + PopulationScore,Encroachments + RiverManagement,Encroachments + Siltation,Encroachments + TopographyDrainage,Encroachments + Urbanization,Encroachments + Watersheds,Encroachments + WetlandLoss,InadequatePlanning + IneffectiveDisasterPreparedness,InadequatePlanning + Landslides,InadequatePlanning + MonsoonIntensity,InadequatePlanning + PoliticalFactors,InadequatePlanning + PopulationScore,InadequatePlanning + RiverManagement,InadequatePlanning + Siltation,InadequatePlanning + TopographyDrainage,InadequatePlanning + Urbanization,InadequatePlanning + Watersheds,InadequatePlanning + WetlandLoss,IneffectiveDisasterPreparedness + Landslides,IneffectiveDisasterPreparedness + MonsoonIntensity,IneffectiveDisasterPreparedness + PoliticalFactors,IneffectiveDisasterPreparedness + PopulationScore,IneffectiveDisasterPreparedness + RiverManagement,IneffectiveDisasterPreparedness + Siltation,IneffectiveDisasterPreparedness + TopographyDrainage,IneffectiveDisasterPreparedness + Urbanization,IneffectiveDisasterPreparedness + Watersheds,IneffectiveDisasterPreparedness + WetlandLoss,Landslides + MonsoonIntensity,Landslides + PoliticalFactors,Landslides + PopulationScore,Landslides + RiverManagement,Landslides + Siltation,Landslides + TopographyDrainage,Landslides + Urbanization,Landslides + Watersheds,Landslides 
+ WetlandLoss,MonsoonIntensity + PoliticalFactors,MonsoonIntensity + PopulationScore,MonsoonIntensity + RiverManagement,MonsoonIntensity + Siltation,MonsoonIntensity + TopographyDrainage,MonsoonIntensity + Urbanization,MonsoonIntensity + Watersheds,MonsoonIntensity + WetlandLoss,PoliticalFactors + PopulationScore,PoliticalFactors + RiverManagement,PoliticalFactors + Siltation,PoliticalFactors + TopographyDrainage,PoliticalFactors + Urbanization,PoliticalFactors + Watersheds,PoliticalFactors + WetlandLoss,PopulationScore + RiverManagement,PopulationScore + Siltation,PopulationScore + TopographyDrainage,PopulationScore + Urbanization,PopulationScore + Watersheds,PopulationScore + WetlandLoss,RiverManagement + Siltation,RiverManagement + TopographyDrainage,RiverManagement + Urbanization,RiverManagement + Watersheds,RiverManagement + WetlandLoss,Siltation + TopographyDrainage,Siltation + Urbanization,Siltation + Watersheds,Siltation + WetlandLoss,TopographyDrainage + Urbanization,TopographyDrainage + Watersheds,TopographyDrainage + WetlandLoss,Urbanization + Watersheds,Urbanization + WetlandLoss,Watersheds + WetlandLoss
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 
195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1
0,5,8,5,8,6,4,4,3,3,4,2,5,3,3,5,4,7,5,7,3,7.0,6.0,7.0,11.0,7.0,8.0,7.0,10.0,5.0,6.0,8.0,6.0,10.0,8.0,6.0,11.0,9.0,8.0,8.0,7.0,8.0,12.0,8.0,9.0,8.0,11.0,6.0,7.0,9.0,7.0,11.0,9.0,7.0,12.0,10.0,9.0,9.0,7.0,11.0,7.0,8.0,7.0,10.0,5.0,6.0,8.0,6.0,10.0,8.0,6.0,11.0,9.0,8.0,8.0,12.0,8.0,9.0,8.0,11.0,6.0,7.0,9.0,7.0,11.0,9.0,7.0,12.0,10.0,9.0,9.0,12.0,13.0,12.0,15.0,10.0,11.0,13.0,11.0,15.0,13.0,11.0,16.0,14.0,13.0,13.0,9.0,8.0,11.0,6.0,7.0,9.0,7.0,11.0,9.0,7.0,12.0,10.0,9.0,9.0,9.0,12.0,7.0,8.0,10.0,8.0,12.0,10.0,8.0,13.0,11.0,10.0,10.0,11.0,6.0,7.0,9.0,7.0,11.0,9.0,7.0,12.0,10.0,9.0,9.0,9.0,10.0,12.0,10.0,14.0,12.0,10.0,15.0,13.0,12.0,12.0,5.0,7.0,5.0,9.0,7.0,5.0,10.0,8.0,7.0,7.0,8.0,6.0,10.0,8.0,6.0,11.0,9.0,8.0,8.0,8.0,12.0,10.0,8.0,13.0,11.0,10.0,10.0,10.0,8.0,6.0,11.0,9.0,8.0,8.0,12.0,10.0,15.0,13.0,12.0,12.0,8.0,13.0,11.0,10.0,10.0,11.0,9.0,8.0,8.0,14.0,13.0,13.0,11.0,11.0,10.0
1,6,7,4,4,8,8,3,5,4,6,9,7,2,0,3,5,3,3,4,3,12.0,6.0,7.0,8.0,9.0,11.0,10.0,8.0,13.0,4.0,10.0,7.0,7.0,8.0,9.0,11.0,12.0,7.0,7.0,10.0,11.0,12.0,13.0,15.0,14.0,12.0,17.0,8.0,14.0,11.0,11.0,12.0,13.0,15.0,16.0,11.0,11.0,5.0,6.0,7.0,9.0,8.0,6.0,11.0,2.0,8.0,5.0,5.0,6.0,7.0,9.0,10.0,5.0,5.0,7.0,8.0,10.0,9.0,7.0,12.0,3.0,9.0,6.0,6.0,7.0,8.0,10.0,11.0,6.0,6.0,9.0,11.0,10.0,8.0,13.0,4.0,10.0,7.0,7.0,8.0,9.0,11.0,12.0,7.0,7.0,12.0,11.0,9.0,14.0,5.0,11.0,8.0,8.0,9.0,10.0,12.0,13.0,8.0,8.0,13.0,11.0,16.0,7.0,13.0,10.0,10.0,11.0,12.0,14.0,15.0,10.0,10.0,10.0,15.0,6.0,12.0,9.0,9.0,10.0,11.0,13.0,14.0,9.0,9.0,13.0,4.0,10.0,7.0,7.0,8.0,9.0,11.0,12.0,7.0,7.0,9.0,15.0,12.0,12.0,13.0,14.0,16.0,17.0,12.0,12.0,6.0,3.0,3.0,4.0,5.0,7.0,8.0,3.0,3.0,9.0,9.0,10.0,11.0,13.0,14.0,9.0,9.0,6.0,7.0,8.0,10.0,11.0,6.0,6.0,7.0,8.0,10.0,11.0,6.0,6.0,9.0,11.0,12.0,7.0,7.0,12.0,13.0,8.0,8.0,15.0,10.0,10.0,11.0,11.0,6.0
2,6,5,6,7,3,7,1,5,4,5,6,7,3,7,5,6,8,2,3,3,11.0,7.0,5.0,11.0,10.0,11.0,9.0,7.0,10.0,11.0,10.0,7.0,12.0,10.0,9.0,9.0,7.0,9.0,6.0,10.0,8.0,14.0,13.0,14.0,12.0,10.0,13.0,14.0,13.0,10.0,15.0,13.0,12.0,12.0,10.0,12.0,9.0,4.0,10.0,9.0,10.0,8.0,6.0,9.0,10.0,9.0,6.0,11.0,9.0,8.0,8.0,6.0,8.0,5.0,8.0,7.0,8.0,6.0,4.0,7.0,8.0,7.0,4.0,9.0,7.0,6.0,6.0,4.0,6.0,3.0,13.0,14.0,12.0,10.0,13.0,14.0,13.0,10.0,15.0,13.0,12.0,12.0,10.0,12.0,9.0,13.0,11.0,9.0,12.0,13.0,12.0,9.0,14.0,12.0,11.0,11.0,9.0,11.0,8.0,12.0,10.0,13.0,14.0,13.0,10.0,15.0,13.0,12.0,12.0,10.0,12.0,9.0,8.0,11.0,12.0,11.0,8.0,13.0,11.0,10.0,10.0,8.0,10.0,7.0,9.0,10.0,9.0,6.0,11.0,9.0,8.0,8.0,6.0,8.0,5.0,13.0,12.0,9.0,14.0,12.0,11.0,11.0,9.0,11.0,8.0,13.0,10.0,15.0,13.0,12.0,12.0,10.0,12.0,9.0,9.0,14.0,12.0,11.0,11.0,9.0,11.0,8.0,11.0,9.0,8.0,8.0,6.0,8.0,5.0,14.0,13.0,13.0,11.0,13.0,10.0,11.0,11.0,9.0,11.0,8.0,10.0,8.0,10.0,7.0,8.0,10.0,7.0,8.0,5.0,7.0
3,3,4,6,5,4,8,4,7,6,8,5,2,4,7,4,4,6,5,7,5,14.0,10.0,10.0,11.0,10.0,8.0,14.0,13.0,11.0,13.0,9.0,11.0,12.0,12.0,13.0,10.0,10.0,10.0,11.0,12.0,12.0,13.0,12.0,10.0,16.0,15.0,13.0,15.0,11.0,13.0,14.0,14.0,15.0,12.0,12.0,12.0,13.0,8.0,9.0,8.0,6.0,12.0,11.0,9.0,11.0,7.0,9.0,10.0,10.0,11.0,8.0,8.0,8.0,9.0,9.0,8.0,6.0,12.0,11.0,9.0,11.0,7.0,9.0,10.0,10.0,11.0,8.0,8.0,8.0,9.0,9.0,7.0,13.0,12.0,10.0,12.0,8.0,10.0,11.0,11.0,12.0,9.0,9.0,9.0,10.0,6.0,12.0,11.0,9.0,11.0,7.0,9.0,10.0,10.0,11.0,8.0,8.0,8.0,9.0,10.0,9.0,7.0,9.0,5.0,7.0,8.0,8.0,9.0,6.0,6.0,6.0,7.0,15.0,13.0,15.0,11.0,13.0,14.0,14.0,15.0,12.0,12.0,12.0,13.0,12.0,14.0,10.0,12.0,13.0,13.0,14.0,11.0,11.0,11.0,12.0,12.0,8.0,10.0,11.0,11.0,12.0,9.0,9.0,9.0,10.0,10.0,12.0,13.0,13.0,14.0,11.0,11.0,11.0,12.0,8.0,9.0,9.0,10.0,7.0,7.0,7.0,8.0,11.0,11.0,12.0,9.0,9.0,9.0,10.0,12.0,13.0,10.0,10.0,10.0,11.0,13.0,10.0,10.0,10.0,11.0,11.0,11.0,11.0,12.0,8.0,8.0,9.0,8.0,9.0,9.0
4,5,3,2,6,4,4,3,3,3,3,5,2,2,6,6,4,1,2,3,5,7.0,5.0,6.0,9.0,7.0,5.0,6.0,6.0,8.0,9.0,8.0,8.0,4.0,5.0,6.0,6.0,7.0,9.0,5.0,6.0,7.0,10.0,8.0,6.0,7.0,7.0,9.0,10.0,9.0,9.0,5.0,6.0,7.0,7.0,8.0,10.0,6.0,5.0,8.0,6.0,4.0,5.0,5.0,7.0,8.0,7.0,7.0,3.0,4.0,5.0,5.0,6.0,8.0,4.0,9.0,7.0,5.0,6.0,6.0,8.0,9.0,8.0,8.0,4.0,5.0,6.0,6.0,7.0,9.0,5.0,10.0,8.0,9.0,9.0,11.0,12.0,11.0,11.0,7.0,8.0,9.0,9.0,10.0,12.0,8.0,6.0,7.0,7.0,9.0,10.0,9.0,9.0,5.0,6.0,7.0,7.0,8.0,10.0,6.0,5.0,5.0,7.0,8.0,7.0,7.0,3.0,4.0,5.0,5.0,6.0,8.0,4.0,6.0,8.0,9.0,8.0,8.0,4.0,5.0,6.0,6.0,7.0,9.0,5.0,8.0,9.0,8.0,8.0,4.0,5.0,6.0,6.0,7.0,9.0,5.0,11.0,10.0,10.0,6.0,7.0,8.0,8.0,9.0,11.0,7.0,11.0,11.0,7.0,8.0,9.0,9.0,10.0,12.0,8.0,10.0,6.0,7.0,8.0,8.0,9.0,11.0,7.0,6.0,7.0,8.0,8.0,9.0,11.0,7.0,3.0,4.0,4.0,5.0,7.0,3.0,5.0,5.0,6.0,8.0,4.0,6.0,7.0,9.0,5.0,7.0,9.0,5.0,10.0,6.0,8.0


In [11]:
# (rows, columns) of the generated feature matrix.
feature_matrix.shape

(1863262, 210)

In [12]:
# Number of training rows; the first `n_train_rows` rows of the feature matrix
# are taken as train and the remainder as test (positional split).
n_train_rows = train.shape[0]
assert len(labels) == n_train_rows, 'labels and training rows are out of sync'

# Take explicit copies: assigning a column into a bare `.iloc` slice is chained
# assignment, which is only silent here because warnings are globally ignored.
train = feature_matrix.iloc[:n_train_rows, :].copy()
# Attach the target by position, matching the positional split above, instead
# of relying on the `id` index happening to equal the labels' original index.
train[TARGET] = labels.to_numpy()

test = feature_matrix.iloc[n_train_rows:, :].copy()

# Release the intermediates that are no longer needed.
del composite
del labels
_ = gc.collect()

In [13]:
# Shrink the generated feature tables (train first, then test), then reclaim
# whatever the downcast released.
train, test = (reduce_memory_usage(frame) for frame in (train, test))

_ = gc.collect()

Memory usage of dataframe is 1801.8220233917236 MB
Memory usage of dataframe after reduction 437.12841987609863 MB
Reduced by 75.7396449704142 % 
Memory usage of dataframe is 1199.7936630249023 MB
Memory usage of dataframe after reduction 289.99752044677734 MB
Reduced by 75.82938388625593 % 


In [14]:
# Name of the binary label column derived from the continuous target.
CAT_TARGET = 'FloodLabel'
# Target values at or above this cut-off are labelled 1, the rest 0.
FLOOD_THRESHOLD = 0.5

# Use the shared column-name constants so this cell cannot drift from them.
train[CAT_TARGET] = (train[TARGET] >= FLOOD_THRESHOLD).astype('int8')

In [15]:
# Every column of the generated test table is used as a model feature.
gen_features = test.columns.tolist()

In [16]:
def comp_metric(y_true, y_pred):
    """Score predictions against the ground truth with the R^2 coefficient."""
    score = r2_score(y_true, y_pred)
    return score

In [17]:
def custom_cv(task, feature_set, estimator, folds=10, seed=SEED, verbose=True):
    """Cross-validate ``estimator`` on the global ``train`` frame and predict ``test``.

    Folds are stratified on ``train[CAT_TARGET]`` for both tasks. Each fold
    fits a fresh clone of ``estimator`` with the validation split passed as
    ``eval_set``, and is scored with ``comp_metric`` against ``train[TARGET]``.

    Parameters
    ----------
    task : {'clf', 'reg'}
        'clf' fits on ``CAT_TARGET`` and uses the positive-class column of
        ``predict_proba``; 'reg' fits on ``TARGET`` and uses ``predict``.
    feature_set : list
        Column names used as model inputs (must exist in ``train`` and ``test``).
    estimator : estimator object
        Unfitted model; its ``fit`` must accept ``eval_set`` and ``verbose``.
    folds : int, default 10
        Number of stratified folds.
    seed : int
        Random state for the fold shuffling.
    verbose : bool, default True
        Print the per-fold score (and best iteration when the model has one).

    Returns
    -------
    oof_preds : pandas.Series
        Out-of-fold predictions indexed by row position in ``train``.
    test_preds : pandas.DataFrame
        One column per fold (``fold0``, ``fold1``, ...) plus their row-wise ``mean``.

    Raises
    ------
    ValueError
        If ``task`` is neither 'clf' nor 'reg'.
    """
    target_by_task = {'clf': CAT_TARGET, 'reg': TARGET}
    if task not in target_by_task:
        # Fail loudly: callers unpack two return values, so returning None
        # would only surface later as an unrelated unpacking error.
        raise ValueError(f"Invalid task {task!r}; expected 'clf' or 'reg'.")

    def _predict(model, X):
        # Classifiers contribute the probability of the positive class.
        if task == 'clf':
            return model.predict_proba(X)[:, 1]
        return model.predict(X)

    # Column selections do not depend on the fold, so build them once.
    X_all = train[feature_set]
    X_test = test[feature_set]
    y_fit = train[target_by_task[task]]
    y_true = train[TARGET]

    oof_parts, test_preds, scores = [], {}, []

    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(cv.split(X_all, train[CAT_TARGET])):
        X_train, y_train = X_all.iloc[train_ids], y_fit.iloc[train_ids]
        # Validation is always scored against the continuous target.
        X_val, y_val = X_all.iloc[val_ids], y_true.iloc[val_ids]

        model = clone(estimator)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0)

        val_preds = _predict(model, X_val)
        oof_parts.append(pd.Series(val_preds, index=val_ids))
        test_preds[f'fold{fold}'] = _predict(model, X_test)

        score = comp_metric(y_val, val_preds)
        scores.append(score)
        if verbose:
            # `best_iteration` is not available on every fitted model.
            best_iteration = getattr(model, 'best_iteration', None)
            if best_iteration is None:
                print(f'Fold #{fold:>2}: {score:.5f}')
            else:
                print(f'Fold #{fold:>2}: {score:.5f} ({best_iteration:>5} rounds)')
        _ = gc.collect()

    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1)  # mean of fold-wise predictions

    # Put the fold pieces back into original row order.
    oof_preds = pd.concat(oof_parts).sort_index()
    print(f'\nAvg score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'OOF score: {comp_metric(y_true, oof_preds):.5f}\n')

    return oof_preds, test_preds

In [18]:
# Per-configuration stores for the two values returned by `custom_cv`:
# `op` holds the out-of-fold predictions, `tp` the test predictions.
op = {}
tp = {}

In [19]:
# Keyword arguments shared by every XGBoost model configured below.
BASE_PARAMS = dict(
    base_score=0.5,
    booster='gbtree',
    tree_method='hist',
    n_estimators=25000,          # upper bound on boosting rounds
    early_stopping_rounds=200,
    device=DEVICE,
    enable_categorical=True,
    verbosity=0,
    n_jobs=-1,
    seed=SEED,
)

In [20]:
# Configuration 1: squared-error regression with alpha=10.
reg_model1 = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    learning_rate=0.1,
    alpha=10,
    **BASE_PARAMS,
)

# Cross-validate on the generated features and store the predictions under 'cfg1'.
op['cfg1'], tp['cfg1'] = custom_cv('reg', gen_features, reg_model1)

Fold # 0: 0.83528 ( 1744 rounds)
Fold # 1: 0.83721 ( 1857 rounds)
Fold # 2: 0.83576 ( 1831 rounds)
Fold # 3: 0.83657 ( 1885 rounds)
Fold # 4: 0.83872 ( 1866 rounds)
Fold # 5: 0.83709 ( 1800 rounds)
Fold # 6: 0.83658 ( 1837 rounds)
Fold # 7: 0.83717 ( 1777 rounds)
Fold # 8: 0.83606 ( 1847 rounds)
Fold # 9: 0.83567 ( 1869 rounds)

Avg score: 0.83661 +/- 0.00095
OOF score: 0.83661



In [21]:
# Configuration 2: Tweedie regression (variance power 1.5) with a smaller learning rate.
reg_model2 = xgb.XGBRegressor(
    objective='reg:tweedie',
    tweedie_variance_power=1.5,
    eval_metric='rmse',
    learning_rate=0.01,
    **BASE_PARAMS,
)

# Cross-validate on the generated features and store the predictions under 'cfg2'.
op['cfg2'], tp['cfg2'] = custom_cv('reg', gen_features, reg_model2)

Fold # 0: 0.84492 (17316 rounds)
Fold # 1: 0.84669 (16359 rounds)
Fold # 2: 0.84562 (17077 rounds)
Fold # 3: 0.84633 (16306 rounds)
Fold # 4: 0.84810 (16564 rounds)
Fold # 5: 0.84691 (15142 rounds)
Fold # 6: 0.84674 (16638 rounds)
Fold # 7: 0.84686 (16154 rounds)
Fold # 8: 0.84577 (17084 rounds)
Fold # 9: 0.84582 (18406 rounds)

Avg score: 0.84638 +/- 0.00084
OOF score: 0.84638

