In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score

In [10]:
# File name of the training dataset CSV (the 20-bin gridded dataset).
# Nothing is loaded here; the file is read with pd.read_csv in a later cell.
metadata_file = 'metadata19_hmineq0.0_tmin20050000_mean_grid_20.csv'

class CFG:
    """Notebook-wide settings: row filter threshold, feature lists, column names and the model."""

    # Rows whose measured thickness is below this value are dropped before training.
    min_thick_value_train = 1.0

    # Base feature set (everything except the velocity-derived columns).
    # The order is kept exactly as listed, since it defines the model's input column order.
    featuresSmall = [
        'RGI', 'Area', 'Zmin', 'Zmax', 'Zmed', 'Slope', 'Lmax', 'Form', 'TermType', 'Aspect',
        'elevation', 'elevation_from_zmin', 'dist_from_border_km_geom',
        'slope50', 'slope100', 'slope150', 'slope300', 'slope450', 'slopegfa',
        'curv_50', 'curv_300', 'curv_gfa',
        'aspect_50', 'aspect_300', 'aspect_gfa',
        'lat', 'dmdtda_hugo',
        'smb',
    ]

    # Full training feature set: the base features followed by the velocity columns.
    features_train = [
        *featuresSmall,
        'vx_gfa', 'vy_gfa',
        'v50', 'v100', 'v150', 'v300', 'v450', 'vgfa',
    ]

    # Name of the regression target column.
    target = 'THICKNESS'

    # Names of two further columns; presumably the ice-thickness estimates of
    # Millan et al. and Farinotti et al. used for comparison -- TODO confirm,
    # they are not referenced in the cells visible here.
    millan = 'ith_m'
    farinotti = 'ith_f'

    # One regressor instance with default hyper-parameters, created once when the
    # class body executes and therefore shared by every user of CFG.model.
    model = lgb.LGBMRegressor()

In [11]:
# Load the training dataset and drop every row that has a NaN in any column
glathida_rgis = pd.read_csv(metadata_file, low_memory=False).dropna()

# Keep only measurements at or above the minimum thickness threshold.
# The target column name comes from CFG so it stays in sync with the training cell.
# The explicit .copy() guarantees the column assignments below write to an
# independent frame rather than to a slice of the unfiltered one.
glathida_rgis = glathida_rgis.loc[glathida_rgis[CFG.target] >= CFG.min_thick_value_train].copy()

# Let's see how many measurements we have in each region
print(glathida_rgis['RGI'].value_counts())

# Add some features for training
glathida_rgis['lat'] = glathida_rgis['POINT_LAT']

# Derived column -> pair of component columns. Each derived column is the
# row-wise Euclidean norm sqrt(a**2 + b**2) of its two components.
# The dict order is the order in which the columns are appended to the frame.
_MAGNITUDE_FEATURES = {
    'v50': ('vx_gf50', 'vy_gf50'),
    'v100': ('vx_gf100', 'vy_gf100'),
    'v150': ('vx_gf150', 'vy_gf150'),
    'v300': ('vx_gf300', 'vy_gf300'),
    'v450': ('vx_gf450', 'vy_gf450'),
    'vgfa': ('vx_gfa', 'vy_gfa'),
    'dvx': ('dvx_dx', 'dvx_dy'),
    'slope50': ('slope_lon_gf50', 'slope_lat_gf50'),
    'slope100': ('slope_lon_gf100', 'slope_lat_gf100'),
    'slope150': ('slope_lon_gf150', 'slope_lat_gf150'),
    'slope300': ('slope_lon_gf300', 'slope_lat_gf300'),
    'slope450': ('slope_lon_gf450', 'slope_lat_gf450'),
    'slopegfa': ('slope_lon_gfa', 'slope_lat_gfa'),
}
for _name, (_a_col, _b_col) in _MAGNITUDE_FEATURES.items():
    glathida_rgis[_name] = np.sqrt(glathida_rgis[_a_col]**2 + glathida_rgis[_b_col]**2)

# Elevation of the point relative to the Zmin column
glathida_rgis['elevation_from_zmin'] = glathida_rgis['elevation'] - glathida_rgis['Zmin']



RGI
3.0     12877
5.0     10247
7.0      6926
11.0     5907
4.0      5363
19.0     4437
8.0      2101
1.0      1835
13.0      793
17.0      654
12.0      221
16.0      139
10.0      136
2.0       120
Name: count, dtype: int64


In [12]:
# Fixed seed so the train/test split is the same on every run; with
# random_state=None the split (and the metrics printed below) changed per execution.
SPLIT_SEED = 42

# Feature matrix and target vector, selected by the column names in CFG
X = glathida_rgis[CFG.features_train]
y = glathida_rgis[CFG.target]

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SPLIT_SEED)

# Note: this is the instance stored on CFG, so fitting it here also fits CFG.model
model = CFG.model
model.fit(X_train, y_train)
y_preds = model.predict(X_test)

# Report hold-out performance
mae = mean_absolute_error(y_test, y_preds)
R2 = r2_score(y_test, y_preds)
print(f"mae: {mae:.2f} meters")
print(f"R2: {R2:.2f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010670 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8389
[LightGBM] [Info] Number of data points in the train set: 41404, number of used features: 36
[LightGBM] [Info] Start training from score 180.809922
mae: 36.77 meters
R2: 0.90


In [13]:
# 5-fold cross-validation on the training split, scored by mean absolute error.
# The scorer returns negated MAE values, so the mean is sign-flipped for display.
n_folds = 5
cv_scores = cross_val_score(model, X_train, y_train, cv=n_folds, scoring='neg_mean_absolute_error')
cv_mae_mean = -cv_scores.mean()
cv_mae_std = cv_scores.std()
print("Cross-Validation: %.4f +- %.4f" % (cv_mae_mean, cv_mae_std))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010214 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8388
[LightGBM] [Info] Number of data points in the train set: 33123, number of used features: 36
[LightGBM] [Info] Start training from score 181.114183
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012299 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8386
[LightGBM] [Info] Number of data points in the train set: 33123, number of used features: 36
[LightGBM] [Info] Start training from score 180.352455
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002962 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8385
[LightGBM] [Info] Number of data points in the train