In [2]:
import pandas as pd
import numpy as np

Cleaning positive list

In [67]:
# Assemble the uncleaned positive-list table by placing three CSV sources side by side.
# The pieces are paired purely by row position (no key column is used), so every piece
# is given a fresh 0..n-1 index and its row count is checked before concatenating.

# --- Step 1: Load the base dataframe ---
df_pos = pd.read_csv("df_pos_calculatedarea.csv")

# Drop unwanted columns (a missing column raises KeyError, which is intentional here)
cols_to_drop = ["Lake_type", "id", "Lake_area_ha", "Unnamed: 0"]
df_pos = df_pos.drop(columns=cols_to_drop).reset_index(drop=True)

# --- Step 2: Load expansion rates ---
exp5 = pd.read_csv("pos_expansion5y.csv")
exp10 = pd.read_csv("pos_expansion10y.csv")

# Keep only 'expansion_ha_peryr' from each and rename so the two windows can coexist
exp5 = (
    exp5[["expansion_ha_peryr"]]
    .rename(columns={"expansion_ha_peryr": "5y_expansion_rate"})
    .reset_index(drop=True)
)
exp10 = (
    exp10[["expansion_ha_peryr"]]
    .rename(columns={"expansion_ha_peryr": "10y_expansion_rate"})
    .reset_index(drop=True)
)

# --- Step 3: Load glacier features ---
glac = pd.read_csv("glac_pos_list_correct_slope.csv")

# Keep only the required glacier columns
needed_glacier_cols = [
    "glacier_area_ha",
    "slope_glac_to_lake",
    "glacier_contact",
    "glacier_touch_count",
    "nearest_glacier_dist_m",
    "glacier_elev_m"
]
glac = glac[needed_glacier_cols].reset_index(drop=True)

# --- Step 4: Validate row counts, then concatenate horizontally ---
# pd.concat(axis=1) aligns on the index and outer-joins; a length mismatch would
# silently append NaN-padded rows instead of failing, so fail loudly here.
assert len(exp5) == len(df_pos), (
    f"pos_expansion5y.csv has {len(exp5)} rows, expected {len(df_pos)}"
)
assert len(exp10) == len(df_pos), (
    f"pos_expansion10y.csv has {len(exp10)} rows, expected {len(df_pos)}"
)
assert len(glac) == len(df_pos), (
    f"glac_pos_list_correct_slope.csv has {len(glac)} rows, expected {len(df_pos)}"
)

# Column order is unchanged: base columns, 5y rate, 10y rate, glacier features
df_pos = pd.concat([df_pos, exp5, exp10, glac], axis=1)

# --- Step 5: Rename final dataframe ---
# Note: this is an alias of df_pos (same object), not a copy.
uncleaned_ml_pos = df_pos

# --- Final check ---
print("Final Positive dataset shape:", uncleaned_ml_pos.shape)
uncleaned_ml_pos.head()


Final Positive dataset shape: (241, 16)


Unnamed: 0,Elevation_m,GLOF,Lake_type_simplified,Latitude,Longitude,Year_final,is_supraglacial,Lake_area_calculated_ha,5y_expansion_rate,10y_expansion_rate,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m
0,3466,1,other,42.522,74.609,1984,0,,,,536.125805,0.0,True,2,0.0,3985.0
1,3269,1,ice,42.207,79.846,1984,0,,,,396.118156,2.959854,False,0,446.305779,4590.0
2,4779,1,ice,35.677,77.193,1984,0,,,,9444.515664,0.71855,False,0,1036.810675,5524.0
3,4594,1,other,29.307,83.967,1985,0,,,,,,False,0,,
4,3301,1,other,36.039,74.633,1985,0,,,,6998.712273,0.0,True,3,0.0,4575.0


In [68]:

# Write the positive table to disk; index=False keeps the row index out of the file
uncleaned_ml_pos.to_csv(path_or_buf="uncleaned_ml_pos.csv", index=False)

Cleaning negative list

In [69]:
import pandas as pd

# Assemble the uncleaned negative-list table: base CSV plus the 5-year and 10-year
# expansion-rate columns, paired by row position.

# --- Step 1: Load the negative base list and discard columns that are not needed ---
# errors="ignore" means a column that is absent from the file is skipped silently.
neg = pd.read_csv("glac_neg_list_correct_slope.csv")
neg = neg.drop(
    columns=[
        "Unnamed: 0",
        "Lake_type",
        "Lake_area_ha",
        "area_t1",
        "area_t2",
        "expansion_ha_peryr",
    ],
    errors="ignore",
)

# --- Step 2: Load expansion rates, keeping only the rate column under a window-specific name ---
exp5 = pd.read_csv("neg_expansion5y.csv")
exp5 = exp5[["expansion_ha_peryr"]].rename(columns={"expansion_ha_peryr": "5y_expansion_rate"})

exp10 = pd.read_csv("neg_expansion10y.csv")
exp10 = exp10[["expansion_ha_peryr"]].rename(columns={"expansion_ha_peryr": "10y_expansion_rate"})

# --- Step 3: Concatenate expansions with base ---
# Each piece gets a fresh 0..n-1 index so the column-wise concat pairs rows by position.
uncleaned_ml_neg = pd.concat(
    [piece.reset_index(drop=True) for piece in (neg, exp5, exp10)],
    axis=1,
)

# --- Final check ---
print("Final NEGATIVE dataset shape:", uncleaned_ml_neg.shape)
uncleaned_ml_neg.head()


Final NEGATIVE dataset shape: (2411, 16)


Unnamed: 0,Longitude,Latitude,Year_final,Elevation_m,GLOF,Lake_type_simplified,is_supraglacial,Lake_area_calculated_ha,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m,5y_expansion_rate,10y_expansion_rate
0,72.901,36.412,2019,4256,0,moraine,0,19.373351,735.52788,0.0,True,4,0.0,4706.0,-20.935474,0.925453
1,73.048,36.263,2019,4618,0,other,0,18.463383,12.913846,0.386765,False,0,726.539168,4899.0,-0.014481,-2.158693
2,73.407,36.642,2019,2748,0,moraine,0,14.050228,4204.096644,0.0,True,3,0.0,4413.0,0.417908,-4.397514
3,73.384,36.131,2019,4503,0,other,0,8.558558,12.377344,0.291547,False,0,607.10722,4681.0,-0.116048,0.11587
4,73.423,36.116,2019,4437,0,other,0,11.896605,31.006463,0.456346,False,0,703.413916,4764.0,-0.551315,-1.320284


In [70]:
# Write the negative table to disk; index=False keeps the row index out of the file
uncleaned_ml_neg.to_csv(path_or_buf="uncleaned_ml_neg.csv", index=False)

In [71]:
# --- Shared column layout for the positive and negative tables ---
# Built from thematic groups; the label column (GLOF) is deliberately placed last.
final_order = (
    # location, year and lake descriptors
    [
        "Longitude",
        "Latitude",
        "Year_final",
        "Lake_area_calculated_ha",
        "Elevation_m",
        "Lake_type_simplified",
        "is_supraglacial",
    ]
    # glacier-related features
    + [
        "glacier_area_ha",
        "slope_glac_to_lake",
        "glacier_contact",
        "glacier_touch_count",
        "nearest_glacier_dist_m",
        "glacier_elev_m",
    ]
    # lake expansion rates
    + ["5y_expansion_rate", "10y_expansion_rate"]
    # label
    + ["GLOF"]
)



In [72]:
# Put both tables into the shared column layout.
# Selecting by label list raises KeyError if either frame lacks a listed column,
# and any column not named in final_order is dropped.

# --- Negative dataframe ---
uncleaned_ml_neg = uncleaned_ml_neg.loc[:, final_order]

# --- Positive dataframe ---
uncleaned_ml_pos = uncleaned_ml_pos.loc[:, final_order]

In [73]:
# Preview the first five rows of the reordered positive table
uncleaned_ml_pos.head(n=5)

Unnamed: 0,Longitude,Latitude,Year_final,Lake_area_calculated_ha,Elevation_m,Lake_type_simplified,is_supraglacial,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m,5y_expansion_rate,10y_expansion_rate,GLOF
0,74.609,42.522,1984,,3466,other,0,536.125805,0.0,True,2,0.0,3985.0,,,1
1,79.846,42.207,1984,,3269,ice,0,396.118156,2.959854,False,0,446.305779,4590.0,,,1
2,77.193,35.677,1984,,4779,ice,0,9444.515664,0.71855,False,0,1036.810675,5524.0,,,1
3,83.967,29.307,1985,,4594,other,0,,,False,0,,,,,1
4,74.633,36.039,1985,,3301,other,0,6998.712273,0.0,True,3,0.0,4575.0,,,1


In [74]:
# Preview the first five rows of the reordered negative table
uncleaned_ml_neg.head(n=5)

Unnamed: 0,Longitude,Latitude,Year_final,Lake_area_calculated_ha,Elevation_m,Lake_type_simplified,is_supraglacial,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m,5y_expansion_rate,10y_expansion_rate,GLOF
0,72.901,36.412,2019,19.373351,4256,moraine,0,735.52788,0.0,True,4,0.0,4706.0,-20.935474,0.925453,0
1,73.048,36.263,2019,18.463383,4618,other,0,12.913846,0.386765,False,0,726.539168,4899.0,-0.014481,-2.158693,0
2,73.407,36.642,2019,14.050228,2748,moraine,0,4204.096644,0.0,True,3,0.0,4413.0,0.417908,-4.397514,0
3,73.384,36.131,2019,8.558558,4503,other,0,12.377344,0.291547,False,0,607.10722,4681.0,-0.116048,0.11587,0
4,73.423,36.116,2019,11.896605,4437,other,0,31.006463,0.456346,False,0,703.413916,4764.0,-0.551315,-1.320284,0


In [75]:
# --- Step 2: Enforce correct dtypes ---
# Column -> dtype mapping applied to both tables. "Int64" (capital I) is pandas'
# nullable integer dtype, so integer columns can hold missing values.
dtype_map = {
    # numeric (floats)
    **dict.fromkeys(
        [
            "Longitude",
            "Latitude",
            "Lake_area_calculated_ha",
            "glacier_area_ha",
            "slope_glac_to_lake",
            "nearest_glacier_dist_m",
            "glacier_elev_m",
            "5y_expansion_rate",
            "10y_expansion_rate",
        ],
        "float",
    ),
    # integers (nullable), followed by the 0/1 flags
    **dict.fromkeys(
        [
            "Year_final",
            "Elevation_m",
            "glacier_touch_count",
            "is_supraglacial",
            "glacier_contact",
        ],
        "Int64",
    ),
    # categorical
    "Lake_type_simplified": "category",
    # label
    "GLOF": "Int64",
}

# Cast both tables with the same mapping so their schemas match;
# astype returns a new frame, so each name is rebound to the converted copy.
uncleaned_ml_pos = uncleaned_ml_pos.astype(dtype=dtype_map)
uncleaned_ml_neg = uncleaned_ml_neg.astype(dtype=dtype_map)




In [76]:
# Show the first ten rows of the positive table after the dtype cast
uncleaned_ml_pos.head(10)

Unnamed: 0,Longitude,Latitude,Year_final,Lake_area_calculated_ha,Elevation_m,Lake_type_simplified,is_supraglacial,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m,5y_expansion_rate,10y_expansion_rate,GLOF
0,74.609,42.522,1984,,3466,other,0,536.125805,0.0,1,2,0.0,3985.0,,,1
1,79.846,42.207,1984,,3269,ice,0,396.118156,2.959854,0,0,446.305779,4590.0,,,1
2,77.193,35.677,1984,,4779,ice,0,9444.515664,0.71855,0,0,1036.810675,5524.0,,,1
3,83.967,29.307,1985,,4594,other,0,,,0,0,,,,,1
4,74.633,36.039,1985,,3301,other,0,6998.712273,0.0,1,3,0.0,4575.0,,,1
5,76.867,41.975,1985,,3627,moraine,0,38.457305,0.4188,0,0,768.862677,3949.0,,,1
6,86.586,27.874,1985,,4368,moraine,0,652.480729,0.0,1,2,0.0,4977.0,,,1
7,79.846,42.207,1985,,3269,ice,0,396.118156,2.959854,0,0,446.305779,4590.0,,,1
8,74.879,36.414,1986,,2501,other,0,2339.676629,9.981328,0,0,198.570765,4483.0,,,1
9,88.027,27.586,1986,,4437,other,0,29403.154647,0.0,1,4,0.0,5350.0,,,1


In [77]:
# Show the first ten rows of the negative table after the dtype cast
uncleaned_ml_neg.head(10)

Unnamed: 0,Longitude,Latitude,Year_final,Lake_area_calculated_ha,Elevation_m,Lake_type_simplified,is_supraglacial,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m,5y_expansion_rate,10y_expansion_rate,GLOF
0,72.901,36.412,2019,19.373351,4256,moraine,0,735.52788,0.0,1,4,0.0,4706.0,-20.935474,0.925453,0
1,73.048,36.263,2019,18.463383,4618,other,0,12.913846,0.386765,0,0,726.539168,4899.0,-0.014481,-2.158693,0
2,73.407,36.642,2019,14.050228,2748,moraine,0,4204.096644,0.0,1,3,0.0,4413.0,0.417908,-4.397514,0
3,73.384,36.131,2019,8.558558,4503,other,0,12.377344,0.291547,0,0,607.10722,4681.0,-0.116048,0.11587,0
4,73.423,36.116,2019,11.896605,4437,other,0,31.006463,0.456346,0,0,703.413916,4764.0,-0.551315,-1.320284,0
5,73.449,36.112,2019,16.831355,4427,other,0,10.582141,0.355723,0,0,567.858127,4693.0,-1e-06,-0.333334,0
6,73.465,36.108,2019,16.831355,4488,other,0,10.582141,0.355723,0,0,567.858127,4693.0,-1e-06,0.769022,0
7,73.461,36.086,2019,16.831355,4577,other,0,10.582141,0.355723,0,0,567.858127,4693.0,-1e-06,0.769022,0
8,73.328,36.024,2019,8.715218,4559,other,0,11.846031,0.211689,0,0,760.549757,4714.0,0.058102,0.232406,0
9,73.312,36.0,2019,9.153732,4615,other,0,2.851505,0.196036,0,0,505.008072,4713.0,-0.377781,0.043584,0


In [78]:
# Print per-column non-null counts and dtypes for the positive table
uncleaned_ml_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   Longitude                241 non-null    float64 
 1   Latitude                 241 non-null    float64 
 2   Year_final               241 non-null    Int64   
 3   Lake_area_calculated_ha  181 non-null    float64 
 4   Elevation_m              241 non-null    Int64   
 5   Lake_type_simplified     241 non-null    category
 6   is_supraglacial          241 non-null    Int64   
 7   glacier_area_ha          239 non-null    float64 
 8   slope_glac_to_lake       239 non-null    float64 
 9   glacier_contact          241 non-null    Int64   
 10  glacier_touch_count      241 non-null    Int64   
 11  nearest_glacier_dist_m   239 non-null    float64 
 12  glacier_elev_m           239 non-null    float64 
 13  5y_expansion_rate        128 non-null    float64 
 14  10y_expans

In [79]:
# List the distinct values present in the lake-type column of the positive table
uncleaned_ml_pos["Lake_type_simplified"].unique()

['other', 'ice', 'moraine']
Categories (3, object): ['ice', 'moraine', 'other']

In [80]:
# Persist both tables. index=False keeps the row index out of the files, matching the
# other exports of these same paths in this notebook; without it, reloading the CSV
# yields an extra "Unnamed: 0" column.
uncleaned_ml_pos.to_csv("uncleaned_ml_pos.csv", index=False)
uncleaned_ml_neg.to_csv("uncleaned_ml_neg.csv", index=False)

In [81]:
# Report the row count of every CSV that feeds the positive table;
# equal counts are a precondition for pairing the files row by row.
print("Rows per source:")
for _label, _csv_path in [
    ("df_pos_calculatedarea.csv   :", "df_pos_calculatedarea.csv"),
    ("pos_expansion5y.csv         :", "pos_expansion5y.csv"),
    ("pos_expansion10y.csv        :", "pos_expansion10y.csv"),
    ("glac_pos_list_correct_slope :", "glac_pos_list_correct_slope.csv"),
]:
    print(_label, len(pd.read_csv(_csv_path)))


Rows per source:
df_pos_calculatedarea.csv   : 241
pos_expansion5y.csv         : 241
pos_expansion10y.csv        : 241
glac_pos_list_correct_slope : 241


In [82]:
# Report the row count of every CSV that feeds the negative table;
# equal counts are a precondition for pairing the files row by row.
print("Rows per NEGATIVE source:")
for _label, _csv_path in [
    ("glac_neg_list_correct_slope.csv :", "glac_neg_list_correct_slope.csv"),
    ("neg_expansion5y.csv             :", "neg_expansion5y.csv"),
    ("neg_expansion10y.csv            :", "neg_expansion10y.csv"),
]:
    print(_label, len(pd.read_csv(_csv_path)))



Rows per NEGATIVE source:
glac_neg_list_correct_slope.csv : 2411
neg_expansion5y.csv             : 2411
neg_expansion10y.csv            : 2411


Final Conversion to CSV File

In [83]:
# Final export of both tables; index=False keeps the row index out of the files
uncleaned_ml_pos.to_csv(path_or_buf="uncleaned_ml_pos.csv", index=False)
uncleaned_ml_neg.to_csv(path_or_buf="uncleaned_ml_neg.csv", index=False)

Combined CSV

In [86]:
# Re-read the exported tables from disk.
# Note: read_csv re-infers dtypes, so dtypes set in memory earlier are not carried over.
pos = pd.read_csv("uncleaned_ml_pos.csv")
neg = pd.read_csv("uncleaned_ml_neg.csv")

# Stack the rows, positives first and negatives after, with a fresh 0..n-1 index
uncleaned_ml_combined = pd.concat(objs=[pos, neg], ignore_index=True)

# Quick check: overall shape and the number of rows per GLOF label
print("Shape:", uncleaned_ml_combined.shape)
print(uncleaned_ml_combined["GLOF"].value_counts())
uncleaned_ml_combined.head()


Shape: (2652, 16)
GLOF
0    2411
1     241
Name: count, dtype: int64


Unnamed: 0,Longitude,Latitude,Year_final,Lake_area_calculated_ha,Elevation_m,Lake_type_simplified,is_supraglacial,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m,5y_expansion_rate,10y_expansion_rate,GLOF
0,74.609,42.522,1984,,3466,other,0,536.125805,0.0,1,2,0.0,3985.0,,,1
1,79.846,42.207,1984,,3269,ice,0,396.118156,2.959854,0,0,446.305779,4590.0,,,1
2,77.193,35.677,1984,,4779,ice,0,9444.515664,0.71855,0,0,1036.810675,5524.0,,,1
3,83.967,29.307,1985,,4594,other,0,,,0,0,,,,,1
4,74.633,36.039,1985,,3301,other,0,6998.712273,0.0,1,3,0.0,4575.0,,,1


In [89]:
# Write the stacked table to disk; index=False keeps the row index out of the file
uncleaned_ml_combined.to_csv(path_or_buf="uncleaned_ml_combined.csv", index=False)


## Add Lake Area from Database for Missing Calculated Area


In [97]:
# (rows, columns) of the positive table before the lake-area fill
uncleaned_ml_pos.shape

(241, 16)

In [113]:
# Fill missing calculated lake areas in the positive table with the 'Lake_area_ha'
# value stored in df_pos_calculatedarea.csv, and flag every row that was filled.

# Reload both tables; index_col=False stops pandas from using a column as the index
uncleaned_ml_pos = pd.read_csv("uncleaned_ml_pos.csv", index_col=False)
df_pos_calculatedarea = pd.read_csv("df_pos_calculatedarea.csv", index_col=False)

# Alignment guard: the fill pairs rows by index, so both files must carry the same
# Latitude / Longitude / Year_final on every row.
assert (
    uncleaned_ml_pos[['Latitude', 'Longitude', 'Year_final']].reset_index(drop=True)
    == df_pos_calculatedarea[['Latitude', 'Longitude', 'Year_final']].reset_index(drop=True)
).all().all()

# How many areas are missing before the fill
before_missing = uncleaned_ml_pos['Lake_area_calculated_ha'].isna().sum()

# Rows to fill: calculated area is missing AND the source file has an area for that row
mask = (
    uncleaned_ml_pos['Lake_area_calculated_ha'].isna()
    & df_pos_calculatedarea['Lake_area_ha'].notna()
)

# Work on a copy so the reloaded frame stays untouched
uncleaned_ml_pos_1 = uncleaned_ml_pos.copy()
uncleaned_ml_pos_1['lake_area_filled_from_db'] = False
uncleaned_ml_pos_1.loc[mask, 'Lake_area_calculated_ha'] = df_pos_calculatedarea.loc[mask, 'Lake_area_ha']
uncleaned_ml_pos_1.loc[mask, 'lake_area_filled_from_db'] = True

# Make sure the area column is numeric; values that cannot be parsed become NaN
uncleaned_ml_pos_1['Lake_area_calculated_ha'] = pd.to_numeric(
    uncleaned_ml_pos_1['Lake_area_calculated_ha'],
    errors='coerce',
)

# Report the effect of the fill
after_missing = uncleaned_ml_pos_1['Lake_area_calculated_ha'].isna().sum()
print(f"Rows total: {len(uncleaned_ml_pos_1)} (should still be 241)")
print(f"Missing before: {before_missing}  |  filled from DB: {mask.sum()}  |  missing after: {after_missing}")
print("Filled flag True count:", uncleaned_ml_pos_1['lake_area_filled_from_db'].sum())


uncleaned_ml_pos_1.head()

Rows total: 241 (should still be 241)
Missing before: 60  |  filled from DB: 15  |  missing after: 45
Filled flag True count: 15


Unnamed: 0,Longitude,Latitude,Year_final,Lake_area_calculated_ha,Elevation_m,Lake_type_simplified,is_supraglacial,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m,5y_expansion_rate,10y_expansion_rate,GLOF,lake_area_filled_from_db
0,74.609,42.522,1984,,3466,other,0,536.125805,0.0,1,2,0.0,3985.0,,,1,False
1,79.846,42.207,1984,,3269,ice,0,396.118156,2.959854,0,0,446.305779,4590.0,,,1,False
2,77.193,35.677,1984,,4779,ice,0,9444.515664,0.71855,0,0,1036.810675,5524.0,,,1,False
3,83.967,29.307,1985,,4594,other,0,,,0,0,,,,,1,False
4,74.633,36.039,1985,,3301,other,0,6998.712273,0.0,1,3,0.0,4575.0,,,1,False


In [106]:
# Percentage of missing values per column in the filled positive table
uncleaned_ml_pos_1.isna().mean().mul(100)

Longitude                    0.000000
Latitude                     0.000000
Year_final                   0.000000
Lake_area_calculated_ha     18.672199
Elevation_m                  0.000000
Lake_type_simplified         0.000000
is_supraglacial              0.000000
glacier_area_ha              0.829876
slope_glac_to_lake           0.829876
glacier_contact              0.000000
glacier_touch_count          0.000000
nearest_glacier_dist_m       0.829876
glacier_elev_m               0.829876
5y_expansion_rate           46.887967
10y_expansion_rate          60.165975
GLOF                         0.000000
lake_area_filled_from_db     0.000000
dtype: float64

In [107]:
# Save the filled positive table. The row index IS written (made explicit here),
# unlike the other exports in this notebook that pass index=False.
# NOTE(review): reloading this file yields an extra unnamed leading column unless the
# reader passes index_col=0 — confirm that downstream readers expect it.
uncleaned_ml_pos_1.to_csv("uncleaned_ml_pos_1.csv", index=True)

In [109]:
# Fill missing calculated lake areas in the negative table with the 'Lake_area_ha'
# value stored in glac_neg_list_correct_slope.csv, and flag every row that was filled.

# Reload both tables; index_col=False stops pandas from using a column as the index
uncleaned_ml_neg = pd.read_csv("uncleaned_ml_neg.csv", index_col=False)
glac_neg_list_correct_slope = pd.read_csv("glac_neg_list_correct_slope.csv", index_col=False)

# Alignment guard: the fill pairs rows by index, so both files must carry the same
# Latitude / Longitude / Year_final on every row.
assert (
    uncleaned_ml_neg[['Latitude', 'Longitude', 'Year_final']].reset_index(drop=True)
    == glac_neg_list_correct_slope[['Latitude', 'Longitude', 'Year_final']].reset_index(drop=True)
).all().all()

# How many areas are missing before the fill
before_missing = uncleaned_ml_neg['Lake_area_calculated_ha'].isna().sum()

# Rows to fill: calculated area is missing AND the source file has an area for that row
mask = (
    uncleaned_ml_neg['Lake_area_calculated_ha'].isna()
    & glac_neg_list_correct_slope['Lake_area_ha'].notna()
)

# Work on a copy so the reloaded frame stays untouched
uncleaned_ml_neg_1 = uncleaned_ml_neg.copy()
uncleaned_ml_neg_1['lake_area_filled_from_db'] = False
uncleaned_ml_neg_1.loc[mask, 'Lake_area_calculated_ha'] = glac_neg_list_correct_slope.loc[mask, 'Lake_area_ha']
uncleaned_ml_neg_1.loc[mask, 'lake_area_filled_from_db'] = True

# Make sure the area column is numeric; values that cannot be parsed become NaN
uncleaned_ml_neg_1['Lake_area_calculated_ha'] = pd.to_numeric(
    uncleaned_ml_neg_1['Lake_area_calculated_ha'],
    errors='coerce',
)

# Report the effect of the fill
after_missing = uncleaned_ml_neg_1['Lake_area_calculated_ha'].isna().sum()
print(f"Rows total: {len(uncleaned_ml_neg_1)} (should still be 2411)")
print(f"Missing before: {before_missing}  |  filled from DB: {mask.sum()}  |  missing after: {after_missing}")
print("Filled flag True count:", uncleaned_ml_neg_1['lake_area_filled_from_db'].sum())

Rows total: 2411 (should still be 2411)
Missing before: 2  |  filled from DB: 2  |  missing after: 0
Filled flag True count: 2


In [110]:
# Preview the first five rows of the filled negative table
uncleaned_ml_neg_1.head(n=5)

Unnamed: 0,Longitude,Latitude,Year_final,Lake_area_calculated_ha,Elevation_m,Lake_type_simplified,is_supraglacial,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m,5y_expansion_rate,10y_expansion_rate,GLOF,lake_area_filled_from_db
0,72.901,36.412,2019,19.373351,4256,moraine,0,735.52788,0.0,1,4,0.0,4706.0,-20.935474,0.925453,0,False
1,73.048,36.263,2019,18.463383,4618,other,0,12.913846,0.386765,0,0,726.539168,4899.0,-0.014481,-2.158693,0,False
2,73.407,36.642,2019,14.050228,2748,moraine,0,4204.096644,0.0,1,3,0.0,4413.0,0.417908,-4.397514,0,False
3,73.384,36.131,2019,8.558558,4503,other,0,12.377344,0.291547,0,0,607.10722,4681.0,-0.116048,0.11587,0,False
4,73.423,36.116,2019,11.896605,4437,other,0,31.006463,0.456346,0,0,703.413916,4764.0,-0.551315,-1.320284,0,False


In [112]:
# Percentage of missing values per column in the filled negative table
uncleaned_ml_neg_1.isna().mean().mul(100)

Longitude                    0.000000
Latitude                     0.000000
Year_final                   0.000000
Lake_area_calculated_ha      0.000000
Elevation_m                  0.000000
Lake_type_simplified         0.000000
is_supraglacial              0.000000
glacier_area_ha             24.346744
slope_glac_to_lake          24.346744
glacier_contact              0.000000
glacier_touch_count          0.000000
nearest_glacier_dist_m      24.346744
glacier_elev_m              24.346744
5y_expansion_rate            0.082953
10y_expansion_rate           1.202820
GLOF                         0.000000
lake_area_filled_from_db     0.000000
dtype: float64

In [114]:
# Save the filled negative table. The row index IS written (made explicit here),
# unlike the other exports in this notebook that pass index=False.
# NOTE(review): reloading this file yields an extra unnamed leading column unless the
# reader passes index_col=0 — confirm that downstream readers expect it.
uncleaned_ml_neg_1.to_csv("uncleaned_ml_neg_1.csv", index=True)