In [1]:
!pip install -q autogluon

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
keras-cv 0.8.2 requires keras-core, which is not installed.
keras-nlp 0.9.3 requires keras-core, which is not installed.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.
aiobotocore 2.12.3 requires botocore<1.34.70,>=1.34.41, but you have botocore 1.29.165 which is incompatible.
albumentations 1.4.0 requires scikit-image>=0.21.0, but you have scikit-image 0.20.0 which is incompatible.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 15.0.2 which is incompatible.
beatrix-jupyterlab 2023.128.151533 requires jupyterlab~=3.6.0, but you have jupyterlab 4.1.6 w

In [2]:
from autogluon.tabular import TabularPredictor,TabularDataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings
import shutil

warnings.filterwarnings('ignore')
seed = 27

# Loading the Dataset

In [3]:
train = pd.read_csv('/kaggle/input/playground-series-s4e5/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s4e5/test.csv', index_col='id')

In [4]:
train.head(10).T

id,0,1,2,3,4,5,6,7,8,9
MonsoonIntensity,5.0,6.0,6.0,3.0,5.0,5.0,8.0,6.0,5.0,4.0
TopographyDrainage,8.0,7.0,5.0,4.0,3.0,4.0,3.0,6.0,2.0,2.0
RiverManagement,5.0,4.0,6.0,6.0,2.0,1.0,1.0,5.0,8.0,3.0
Deforestation,8.0,4.0,7.0,5.0,6.0,4.0,2.0,7.0,5.0,5.0
Urbanization,6.0,8.0,3.0,4.0,4.0,2.0,3.0,5.0,4.0,8.0
ClimateChange,4.0,8.0,7.0,8.0,4.0,4.0,7.0,5.0,5.0,6.0
DamsQuality,4.0,3.0,1.0,4.0,3.0,6.0,3.0,3.0,2.0,5.0
Siltation,3.0,5.0,5.0,7.0,3.0,6.0,4.0,5.0,4.0,5.0
AgriculturalPractices,3.0,4.0,4.0,6.0,3.0,7.0,6.0,5.0,5.0,7.0
Encroachments,4.0,6.0,5.0,8.0,3.0,5.0,7.0,5.0,5.0,6.0


In [5]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MonsoonIntensity,1117957.0,4.92145,2.056387,0.0,3.0,5.0,6.0,16.0
TopographyDrainage,1117957.0,4.926671,2.093879,0.0,3.0,5.0,6.0,18.0
RiverManagement,1117957.0,4.955322,2.072186,0.0,4.0,5.0,6.0,16.0
Deforestation,1117957.0,4.94224,2.051689,0.0,4.0,5.0,6.0,17.0
Urbanization,1117957.0,4.942517,2.083391,0.0,3.0,5.0,6.0,17.0
ClimateChange,1117957.0,4.934093,2.057742,0.0,3.0,5.0,6.0,17.0
DamsQuality,1117957.0,4.955878,2.083063,0.0,4.0,5.0,6.0,16.0
Siltation,1117957.0,4.927791,2.065992,0.0,3.0,5.0,6.0,16.0
AgriculturalPractices,1117957.0,4.942619,2.068545,0.0,3.0,5.0,6.0,16.0
Encroachments,1117957.0,4.94923,2.083324,0.0,4.0,5.0,6.0,18.0


# Preprocessing
- I'm incorporating a new feature which was suggested [here](https://www.kaggle.com/competitions/playground-series-s4e5/discussion/499274).
- I've also added a few other statistical features which have been suggested on the forums by various people.

In [6]:
def add_stat_features(dataframe, cols):
    dataframe['_sum'] = dataframe[cols].sum(1)
    dataframe['_var'] = dataframe[cols].var(1)
    dataframe['_prod'] = dataframe[cols].prod(1)
    dataframe['_mean'] = dataframe[cols].mean(1)
    dataframe['_skew'] = dataframe[cols].skew(1)
    dataframe['_median'] = dataframe[cols].median(1)
    dataframe['_kurtosis'] = dataframe[cols].kurtosis(1)
    dataframe['_abs_energy'] = dataframe[cols].abs().sum(1)
    dataframe['_gmean'] = dataframe[cols].apply(lambda x: np.exp(np.log(x).mean()), axis=1)
    dataframe['_hmean'] = dataframe[cols].apply(lambda x: len(x) / np.sum(1.0/x), axis=1)
    
    sorted_features = [f"sort_{i}" for i in np.arange(len(cols))]
    dataframe[sorted_features] = np.sort(dataframe[cols], axis=1)
    
    return dataframe

In [7]:
feature_cols = list(test.columns)

train = add_stat_features(train, feature_cols)
train = train.drop(feature_cols, axis=1)

test = add_stat_features(test, feature_cols) 
test = test.drop(feature_cols, axis=1)

In [8]:
_train, _val = train_test_split(
    train, 
    test_size=0.1, 
    random_state=seed
)

In [9]:
_train = TabularDataset(_train)
_val = TabularDataset(_val)
test = TabularDataset(test)

# Training

In [10]:
predictor = TabularPredictor(
    label='FloodProbability',
    eval_metric='r2',
    problem_type="regression",
    verbosity=2
).fit(
    train_data=_train,
    time_limit=3600 * 10,
    presets="best_quality",
    excluded_model_types = ["NN_TORCH", "NN"],
#     keep_only_best=True
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240506_202019"
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 36000 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels/ag-20240506_202019/ds_sub_fit/sub_fit_ho.
2024-05-06 20:20:19,652	INFO util.py:124 -- Outdated packages:
  ipywidgets==

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: d8de7b78f73f7d8784d38b4e81e6b98a9380b91a01000000 Worker ID: a65d34a453cfbf58596fac6c21bafddd5c3936e8c3a405e03038a338 Node ID: ac487de3527035d01eb3a808a97a7d8624efb15490213e3e16642467 Worker IP address: 172.19.2.2 Worker port: 44573 Worker PID: 10139 Worker exit type: SYSTEM_ERROR Worker exit detail: The leased worker has unrecoverable failure. Worker is requested to be destroyed when it is returned. RPC Error message: recvmsg:Connection reset by peer; RPC Error details: 
[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: 9332830fe9c2e63483785873d98151b8b521dcee01000000 Worker ID: 6d33018e34bfb873d6d6133e86a94198a881627503dd9da2670ea524 Node ID: ac487de3527035d01eb3a808a97a7d8624efb1549021

	0.8692	 = Validation score   (r2)
	1931.06s	 = Training   runtime
	16.09s	 = Validation runtime
Fitting model: LightGBM_r96_BAG_L1 ... Training model for up to 9434.5s of the 9434.48s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (4 workers, per: cpus=1, gpus=0, memory=5.10%)
	0.8646	 = Validation score   (r2)
	3780.49s	 = Training   runtime
	765.01s	 = Validation runtime
Fitting model: XGBoost_r33_BAG_L1 ... Training model for up to 5564.44s of the 5564.43s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (4 workers, per: cpus=1, gpus=0, memory=7.01%)
	0.869	 = Validation score   (r2)
	615.58s	 = Training   runtime
	216.75s	 = Validation runtime
Fitting model: ExtraTrees_r42_BAG_L1 ... Training model for up to 4913.13s of the 4913.12s of remaining time.
	0.8658	 = Validation score   (r2)
	560.81s	 = Training   runtime
	45.72s	 = Validation runtime
Fitting model: CatBoost

# Visualizing the Results

In [11]:
score = predictor.evaluate(_val, silent=True)['r2']

In [12]:
print(f"Validation R2 Score: {score}")

Validation R2 Score: 0.8693950767494774


In [13]:
fi = predictor.feature_importance(data=_val, silent=True)
lb = predictor.leaderboard(_val, silent=True)

These features in provided data are not utilized by the predictor and will be ignored: ['_abs_energy']
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.predict: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.predict: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU


In [14]:
lb.style.background_gradient(subset=['score_val', 'score_test'], cmap='RdYlGn')

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.869395,0.869255,r2,55.62643,296.866134,7847.844091,0.008823,0.020598,7.424823,2,True,20
1,CatBoost_BAG_L1,0.869315,0.869151,r2,1.293416,2.404151,2529.428668,1.293416,2.404151,2529.428668,1,True,6
2,CatBoost_r177_BAG_L1,0.869314,0.869147,r2,1.009218,2.114818,1894.591559,1.009218,2.114818,1894.591559,1,True,11
3,CatBoost_r9_BAG_L1,0.869313,0.869174,r2,10.854967,16.086246,1931.05678,10.854967,16.086246,1931.05678,1,True,13
4,XGBoost_r33_BAG_L1,0.86927,0.869004,r2,31.655933,216.74825,615.580102,31.655933,216.74825,615.580102,1,True,15
5,CatBoost_r137_BAG_L1,0.869245,0.869085,r2,2.093149,7.737953,3451.184211,2.093149,7.737953,3451.184211,1,True,17
6,XGBoost_BAG_L1,0.869174,0.86908,r2,6.926582,16.742749,218.472362,6.926582,16.742749,218.472362,1,True,9
7,ExtraTreesMSE_BAG_L1,0.866411,0.866073,r2,3.877491,42.74932,651.289796,3.877491,42.74932,651.289796,1,True,7
8,LightGBMXT_BAG_L1,0.86633,0.865678,r2,24.91412,175.008928,1435.403529,24.91412,175.008928,1435.403529,1,True,3
9,ExtraTrees_r42_BAG_L1,0.866085,0.865766,r2,7.033136,45.722073,560.81343,7.033136,45.722073,560.81343,1,True,16


In [15]:
fi.style.background_gradient(cmap='RdYlGn')

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
_sum,0.560372,0.004021,0.0,5,0.568653,0.552092
_mean,0.260002,0.001459,0.0,5,0.263007,0.256998
sort_19,0.002065,0.000297,5e-05,5,0.002676,0.001453
_prod,0.001515,0.000548,0.001738,5,0.002643,0.000387
_gmean,0.000966,0.000355,0.001846,5,0.001697,0.000235
_var,0.000706,0.000137,0.000159,5,0.000988,0.000425
_skew,0.000596,0.000187,0.001019,5,0.000981,0.000212
_kurtosis,0.000552,0.000149,0.00058,5,0.000859,0.000245
sort_18,0.000312,0.000119,0.002133,5,0.000558,6.6e-05
_hmean,0.000201,0.000169,0.028224,5,0.000549,-0.000147


# Making Predictions and Creating a Submission File

In [16]:
preds = predictor.predict(test)

In [17]:
submission = pd.DataFrame({'id': test.index, 'FloodProbability ': preds})
submission.to_csv(f'autogluon-{score:.6f}.csv', index=False)
submission.head()

Unnamed: 0_level_0,id,FloodProbability
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1117957,1117957,0.578191
1117958,1117958,0.455773
1117959,1117959,0.447621
1117960,1117960,0.467861
1117961,1117961,0.466842


In [18]:
shutil.rmtree("AutogluonModels")