In [5]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import lightgbm as lgb

from sklearn.model_selection import KFold, cross_val_score
from sklearn.multioutput import MultiOutputRegressor

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
BASE_DIR = os.path.join(os.getcwd() , 'data')


In [7]:
df = pd.read_csv(BASE_DIR  +  '/train.csv')

In [10]:
mean_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
sd_columns = [col for col in df.columns if col.endswith('_mean')]
train = df.drop(columns=sd_columns)

In [12]:
X_full = df.drop(columns=mean_columns + ['id'])
Y_full = df[mean_columns]

In [13]:
do_cv = True

# Initialize the LightGBM Regressor
lgb_regressor = lgb.LGBMRegressor(objective='regression', n_estimators=150, learning_rate=0.05, max_depth=10, num_leaves=512,
                                  subsample = 0.8, colsample_bytree = 0.8, n_jobs=-1, verbose=-1)

# Wrap LightGBM with MultiOutputRegressor to handle multi-output regression
multi_output_model = MultiOutputRegressor(lgb_regressor)

if do_cv:
    print("\nDoing cross-validation scoring...")
    scores = cross_val_score(multi_output_model, X_full, Y_full,
                             cv=KFold(n_splits=3, shuffle=True, random_state=42),
                             scoring='r2')  
    print(f"Average R^2 score across all targets: {np.mean(scores)}")

# Train model with all data
print("Training final model...")
multi_output_model.fit(X_full, Y_full)


Doing cross-validation scoring...
Average R^2 score across all targets: 0.1988760388099238
Training final model...


In [17]:
X_full.columns.values

array(['WORLDCLIM_BIO1_annual_mean_temperature',
       'WORLDCLIM_BIO12_annual_precipitation',
       'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month',
       'WORLDCLIM_BIO15_precipitation_seasonality',
       'WORLDCLIM_BIO4_temperature_seasonality',
       'WORLDCLIM_BIO7_temperature_annual_range',
       'SOIL_bdod_0.5cm_mean_0.01_deg',
       'SOIL_bdod_100.200cm_mean_0.01_deg',
       'SOIL_bdod_15.30cm_mean_0.01_deg',
       'SOIL_bdod_30.60cm_mean_0.01_deg',
       'SOIL_bdod_5.15cm_mean_0.01_deg',
       'SOIL_bdod_60.100cm_mean_0.01_deg', 'SOIL_cec_0.5cm_mean_0.01_deg',
       'SOIL_cec_100.200cm_mean_0.01_deg',
       'SOIL_cec_15.30cm_mean_0.01_deg', 'SOIL_cec_30.60cm_mean_0.01_deg',
       'SOIL_cec_5.15cm_mean_0.01_deg', 'SOIL_cec_60.100cm_mean_0.01_deg',
       'SOIL_cfvo_0.5cm_mean_0.01_deg',
       'SOIL_cfvo_100.200cm_mean_0.01_deg',
       'SOIL_cfvo_15.30cm_mean_0.01_deg',
       'SOIL_cfvo_30.60cm_mean_0.01_deg',
       'SOIL_cfvo_5.15cm_mea

In [21]:
FEATURE_COLS = X_full.columns.values

# Extract feature importances from each LightGBM estimator
feature_importances = np.zeros((len(FEATURE_COLS), len(multi_output_model.estimators_)))

for i, estimator in enumerate(multi_output_model.estimators_):
    feature_importances[:, i] = estimator.feature_importances_

# Aggregate the feature importances (mean or sum, depending on preference)
mean_feature_importances = np.mean(feature_importances, axis=1)

# Print feature importances
data = {}
for feature_name, importance in zip(FEATURE_COLS, mean_feature_importances):
    data[feature_name]  = importance

In [44]:
tf =pd.DataFrame(data,index = ["Value"]).T

In [67]:
for i,row in tf.sort_values(by=["Value"],ascending = False).iterrows():
    if row["Value"] >= 200:
        print(f"'{row.name}',")


'WORLDCLIM_BIO1_annual_mean_temperature',
'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month',
'WORLDCLIM_BIO15_precipitation_seasonality',
'WORLDCLIM_BIO4_temperature_seasonality',
'SOIL_ocd_100.200cm_mean_0.01_deg',
'WORLDCLIM_BIO12_annual_precipitation',
'WORLDCLIM_BIO7_temperature_annual_range',
'SOIL_cec_0.5cm_mean_0.01_deg',
'SOIL_nitrogen_0.5cm_mean_0.01_deg',
'SOIL_ocd_60.100cm_mean_0.01_deg',
'SOIL_ocd_30.60cm_mean_0.01_deg',
'SOIL_clay_100.200cm_mean_0.01_deg',
'SOIL_soc_100.200cm_mean_0.01_deg',
'SOIL_nitrogen_100.200cm_mean_0.01_deg',
'SOIL_cec_100.200cm_mean_0.01_deg',
'SOIL_clay_15.30cm_mean_0.01_deg',
'SOIL_soc_60.100cm_mean_0.01_deg',
'MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m10',
'SOIL_nitrogen_30.60cm_mean_0.01_deg',
'SOIL_clay_5.15cm_mean_0.01_deg',
'MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m4',
'MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m11',
'SOIL_ocd_0.5cm_mean_0.01_deg',
'

In [66]:
print(f"'{tf.iloc[0].name}")

'WORLDCLIM_BIO1_annual_mean_temperature
