In [1]:
!pip install -q autogluon

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
keras-cv 0.8.2 requires keras-core, which is not installed.
keras-nlp 0.9.3 requires keras-core, which is not installed.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.
aiobotocore 2.12.3 requires botocore<1.34.70,>=1.34.41, but you have botocore 1.29.165 which is incompatible.
albumentations 1.4.0 requires scikit-image>=0.21.0, but you have scikit-image 0.20.0 which is incompatible.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 15.0.2 which is incompatible.
beatrix-jupyterlab 2023.128.151533 requires jupyterlab~=3.6.0, but you have jupyterlab 4.1.6 w

In [2]:
from autogluon.tabular import TabularPredictor,TabularDataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings
import shutil

warnings.filterwarnings('ignore')
seed = 27

# Loading the Dataset

In [3]:
train = pd.read_csv('/kaggle/input/playground-series-s4e5/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s4e5/test.csv', index_col='id')

In [4]:
train.head(10).T

id,0,1,2,3,4,5,6,7,8,9
MonsoonIntensity,5.0,6.0,6.0,3.0,5.0,5.0,8.0,6.0,5.0,4.0
TopographyDrainage,8.0,7.0,5.0,4.0,3.0,4.0,3.0,6.0,2.0,2.0
RiverManagement,5.0,4.0,6.0,6.0,2.0,1.0,1.0,5.0,8.0,3.0
Deforestation,8.0,4.0,7.0,5.0,6.0,4.0,2.0,7.0,5.0,5.0
Urbanization,6.0,8.0,3.0,4.0,4.0,2.0,3.0,5.0,4.0,8.0
ClimateChange,4.0,8.0,7.0,8.0,4.0,4.0,7.0,5.0,5.0,6.0
DamsQuality,4.0,3.0,1.0,4.0,3.0,6.0,3.0,3.0,2.0,5.0
Siltation,3.0,5.0,5.0,7.0,3.0,6.0,4.0,5.0,4.0,5.0
AgriculturalPractices,3.0,4.0,4.0,6.0,3.0,7.0,6.0,5.0,5.0,7.0
Encroachments,4.0,6.0,5.0,8.0,3.0,5.0,7.0,5.0,5.0,6.0


In [5]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MonsoonIntensity,1117957.0,4.92145,2.056387,0.0,3.0,5.0,6.0,16.0
TopographyDrainage,1117957.0,4.926671,2.093879,0.0,3.0,5.0,6.0,18.0
RiverManagement,1117957.0,4.955322,2.072186,0.0,4.0,5.0,6.0,16.0
Deforestation,1117957.0,4.94224,2.051689,0.0,4.0,5.0,6.0,17.0
Urbanization,1117957.0,4.942517,2.083391,0.0,3.0,5.0,6.0,17.0
ClimateChange,1117957.0,4.934093,2.057742,0.0,3.0,5.0,6.0,17.0
DamsQuality,1117957.0,4.955878,2.083063,0.0,4.0,5.0,6.0,16.0
Siltation,1117957.0,4.927791,2.065992,0.0,3.0,5.0,6.0,16.0
AgriculturalPractices,1117957.0,4.942619,2.068545,0.0,3.0,5.0,6.0,16.0
Encroachments,1117957.0,4.94923,2.083324,0.0,4.0,5.0,6.0,18.0


# Preprocessing
- I'm incorporating a new feature which was suggested [here](https://www.kaggle.com/competitions/playground-series-s4e5/discussion/499274).
- I've also added a few other statistical features which have been suggested on the forums by various people.

In [6]:
def add_stat_features(dataframe, cols):
    dataframe['_sum'] = dataframe[cols].sum(1)
    dataframe['_var'] = dataframe[cols].var(1)
    dataframe['_prod'] = dataframe[cols].prod(1)
    dataframe['_mean'] = dataframe[cols].mean(1)
    dataframe['_skew'] = dataframe[cols].skew(1)
    dataframe['_median'] = dataframe[cols].median(1)
    dataframe['_kurtosis'] = dataframe[cols].kurtosis(1)
    dataframe['_abs_energy'] = dataframe[cols].abs().sum(1)
    dataframe['_gmean'] = dataframe[cols].apply(lambda x: np.exp(np.log(x).mean()), axis=1)
    dataframe['_hmean'] = dataframe[cols].apply(lambda x: len(x) / np.sum(1.0/x), axis=1)
    return dataframe

In [7]:
feature_cols = list(test.columns)

train = add_stat_features(train, feature_cols)
train = train.drop(feature_cols, axis=1)

test = add_stat_features(test, feature_cols) 

In [8]:
_train, _val = train_test_split(
    train, 
    test_size=0.1, 
    random_state=seed, 
    stratify=train['FloodProbability']
)

In [9]:
_train = TabularDataset(_train)
_val = TabularDataset(_val)
test = TabularDataset(test)

# Training

In [10]:
predictor = TabularPredictor(
    label='FloodProbability',
    eval_metric='r2',
    problem_type="regression",
    verbosity=2
).fit(
    train_data=_train,
    time_limit=3600 * 6,
    presets="best_quality",
    excluded_model_types = ["NN_TORCH", "FASTAI", "NN"],
    keep_only_best=True
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240502_192911"
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 21600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels/ag-20240502_192911/ds_sub_fit/sub_fit_ho.
2024-05-02 19:29:12,270	INFO util.py:124 -- Outdated packages:
  ipywidgets==

# Visualizing the Results

In [11]:
score = predictor.evaluate(_val, silent=True)['r2']

In [12]:
print(f"Validation R2 Score: {score}")

Validation R2 Score: 0.8681484535683687


In [13]:
fi = predictor.feature_importance(data=_val, silent=True)
lb = predictor.leaderboard(_val, silent=True)

These features in provided data are not utilized by the predictor and will be ignored: ['_abs_energy']


In [14]:
lb.style.background_gradient(subset=['score_val', 'score_test'], cmap='RdYlGn')

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.868148,0.869001,r2,51.335355,234.228744,5187.852298,0.008835,0.020697,7.349454,2,True,6
1,CatBoost_r9_BAG_L1,0.868128,0.868938,r2,8.899496,13.556859,1267.594913,8.899496,13.556859,1267.594913,1,True,4
2,CatBoost_r177_BAG_L1,0.868127,0.868921,r2,0.964839,1.667261,1351.651048,0.964839,1.667261,1351.651048,1,True,3
3,CatBoost_BAG_L1,0.868116,0.868925,r2,1.297021,2.422087,1908.585755,1.297021,2.422087,1908.585755,1,True,1
4,XGBoost_r33_BAG_L1,0.868001,0.868731,r2,36.628678,182.547242,396.331979,36.628678,182.547242,396.331979,1,True,5
5,ExtraTreesMSE_BAG_L1,0.865672,0.866794,r2,3.536486,34.014598,256.339149,3.536486,34.014598,256.339149,1,True,2


In [15]:
fi.style.background_gradient(cmap='RdYlGn')

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
_sum,0.43408,0.006855,0.0,5,0.448194,0.419966
_mean,0.397441,0.007911,0.0,5,0.413729,0.381152
_prod,0.003107,0.000574,0.000134,5,0.004288,0.001925
_kurtosis,0.002057,0.000571,0.000644,5,0.003233,0.000882
_var,0.002046,0.000707,0.001472,5,0.003502,0.000589
_gmean,0.001976,0.000384,0.000163,5,0.002767,0.001186
_skew,0.001062,0.00031,0.000776,5,0.0017,0.000425
_hmean,0.001041,0.00026,0.000432,5,0.001577,0.000505
_median,0.000339,0.000152,0.003723,5,0.000651,2.7e-05


# Making Predictions and Creating a Submission File

In [16]:
preds = predictor.predict(test)

In [17]:
submission = pd.DataFrame({'id': test.index, 'FloodProbability ': preds})
submission.to_csv(f'autogluon-{score:.6f}.csv', index=False)
submission.head()

Unnamed: 0_level_0,id,FloodProbability
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1117957,1117957,0.578311
1117958,1117958,0.455109
1117959,1117959,0.448276
1117960,1117960,0.466795
1117961,1117961,0.465864


In [18]:
shutil.rmtree("AutogluonModels")