In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import geopandas as gpd
import pandas as pd
import os


In [2]:
merged_dir = "../../data/merged"
fire_file_path = os.path.join(merged_dir, "fire_land_elev_soil_climate.parquet")
fire_cropped = pd.read_parquet(fire_file_path)
fire_cropped.head()

Unnamed: 0,longitude,latitude,fire,LCCCode,elevation,COARSE,SAND,CLAY,TCARBON_EQ,PH_WATER,...,tmin_iqr_summer,tmin_iqr_winter,tmax_iqr_autumn,tmax_iqr_spring,tmax_iqr_summer,tmax_iqr_winter,tmax_max,prec_min,longest_dry_period,longest_hot_period
0,9.68184,33.53246,1,0011,183,9.0,45.333333,20.333333,9.433333,7.866667,...,1.5,0.75,5.0,4.0,1.625,1.5,40.0,0.0,1,2
1,9.27448,33.45376,1,6004,102,3.0,37.5,22.0,16.25,8.0,...,1.5,1.5,5.25,3.875,1.5,1.5,42.0,0.0,1,3
2,8.53867,33.36744,1,6004,21,6.0,90.0,5.0,0.0,6.7,...,1.5,1.25,5.0,4.0,2.0,2.0,44.0,0.0,2,3
3,8.35868,35.69833,1,0003 / 0004,865,19.666667,50.333333,21.333333,11.033333,7.633333,...,1.875,0.375,4.375,4.0,1.625,1.0,35.75,1.6,0,0
4,8.53032,34.9382,1,0011,752,3.0,55.0,18.0,2.0,7.5,...,1.625,0.625,4.75,4.625,1.625,1.25,37.25,1.075,0,0


# split

In [3]:
# data for modeling
# remove longitude and latitude
fire_prep = fire_cropped.drop(columns=['longitude', 'latitude'])
fire_prep.head()

Unnamed: 0,fire,LCCCode,elevation,COARSE,SAND,CLAY,TCARBON_EQ,PH_WATER,TOTAL_N,CN_RATIO,...,tmin_iqr_summer,tmin_iqr_winter,tmax_iqr_autumn,tmax_iqr_spring,tmax_iqr_summer,tmax_iqr_winter,tmax_max,prec_min,longest_dry_period,longest_hot_period
0,1,0011,183,9.0,45.333333,20.333333,9.433333,7.866667,0.763333,9.0,...,1.5,0.75,5.0,4.0,1.625,1.5,40.0,0.0,1,2
1,1,6004,102,3.0,37.5,22.0,16.25,8.0,0.695,9.0,...,1.5,1.5,5.25,3.875,1.5,1.5,42.0,0.0,1,3
2,1,6004,21,6.0,90.0,5.0,0.0,6.7,0.32,9.0,...,1.5,1.25,5.0,4.0,2.0,2.0,44.0,0.0,2,3
3,1,0003 / 0004,865,19.666667,50.333333,21.333333,11.033333,7.633333,1.0,10.0,...,1.875,0.375,4.375,4.0,1.625,1.0,35.75,1.6,0,0
4,1,0011,752,3.0,55.0,18.0,2.0,7.5,0.93,9.0,...,1.625,0.625,4.75,4.625,1.625,1.25,37.25,1.075,0,0


In [4]:
fire_prep.shape

(42291, 41)

In [5]:
# split data into train, test with 80% train and 20% test and stratify by 'fire' column
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(fire_cropped, test_size=0.2, stratify=fire_cropped['fire'], random_state=42)
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)
# distribution of fire in train and test data
print("Train data fire distribution:\n", train_data['fire'].value_counts(normalize=True))
print("Test data fire distribution:\n", test_data['fire'].value_counts(normalize=True))

Train data shape: (33832, 43)
Test data shape: (8459, 43)
Train data fire distribution:
 fire
0    0.895572
1    0.104428
Name: proportion, dtype: float64
Test data fire distribution:
 fire
0    0.895614
1    0.104386
Name: proportion, dtype: float64


# Encode LCCCode

In [6]:
# nunique lcccode in train and test data
print("Train data LCCCode unique values:", train_data['LCCCode'].nunique())
print("Test data LCCCode unique values:", test_data['LCCCode'].nunique())

Train data LCCCode unique values: 19
Test data LCCCode unique values: 18


In [7]:
from sklearn.preprocessing import OneHotEncoder

# Define encoder: rare/unknown categories go to 'infrequent' column
encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
)

# 1️⃣ Fit on train data
encoder.fit(train_data[['LCCCode']])

# 2️⃣ Transform train and test
train_lcc_encoded = encoder.transform(train_data[['LCCCode']])
test_lcc_encoded  = encoder.transform(test_data[['LCCCode']])

# 3️⃣ Convert to DataFrame with proper column names
train_lcc_df = pd.DataFrame(
    train_lcc_encoded, 
    columns=encoder.get_feature_names_out(['LCCCode']), 
    index=train_data.index
)
test_lcc_df = pd.DataFrame(
    test_lcc_encoded, 
    columns=encoder.get_feature_names_out(['LCCCode']), 
    index=test_data.index
)

# 4️⃣ Drop original column and concat encoded
train_data = pd.concat([train_data.drop(columns=['LCCCode']), train_lcc_df], axis=1)
test_data  = pd.concat([test_data.drop(columns=['LCCCode']), test_lcc_df], axis=1)

print("Train shape after encoding:", train_data.shape)
print("Test shape after encoding:", test_data.shape)


Train shape after encoding: (33832, 61)
Test shape after encoding: (8459, 61)


In [8]:
# nb lcc columns
lcc_columns = [col for col in train_data.columns if col.startswith('LCCCode_')]
print("LCCCode encoded columns:", lcc_columns)
print("Number of LCCCode encoded columns:", len(lcc_columns))

LCCCode encoded columns: ['LCCCode_0003 / 0004', 'LCCCode_0004 // 0003', 'LCCCode_0010', 'LCCCode_0011', 'LCCCode_11490 // 11494', 'LCCCode_11498', 'LCCCode_20049 // 20058', 'LCCCode_20058', 'LCCCode_21446 // 21450-121340 / 21454', 'LCCCode_21450', 'LCCCode_21454 // 21446 // 21450', 'LCCCode_21497-121340', 'LCCCode_21497-15045', 'LCCCode_21499-121340', 'LCCCode_21518', 'LCCCode_6001', 'LCCCode_6004', 'LCCCode_6020', 'LCCCode_7001 // 8001']
Number of LCCCode encoded columns: 19


# scale data

In [9]:
train_data.describe()

Unnamed: 0,longitude,latitude,fire,elevation,COARSE,SAND,CLAY,TCARBON_EQ,PH_WATER,TOTAL_N,...,LCCCode_21450,LCCCode_21454 // 21446 // 21450,LCCCode_21497-121340,LCCCode_21497-15045,LCCCode_21499-121340,LCCCode_21518,LCCCode_6001,LCCCode_6004,LCCCode_6020,LCCCode_7001 // 8001
count,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,...,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0
mean,5.053862,34.472887,0.104428,548.036918,11.441834,50.06192,19.919454,8.740652,7.674801,0.898144,...,0.019538,0.000414,0.007921,3e-05,0.00133,0.005025,0.176904,0.144153,8.9e-05,0.006473
std,3.488754,1.409184,0.305819,418.016667,4.329425,11.806982,7.052235,4.352778,0.536496,0.309429,...,0.138407,0.020338,0.088651,0.005437,0.036447,0.070709,0.381593,0.35125,0.009416,0.080196
min,-2.50425,32.13063,0.0,-31.0,2.0,13.0,4.0,0.0,5.233333,0.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.20864,33.21155,0.0,136.0,8.333333,41.0,15.0,5.45,7.433333,0.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.25061,34.49901,0.0,506.0,12.0,50.333333,17.5,9.833333,7.825,0.77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7.94981,35.67868,0.0,888.0,14.0,57.0,24.5,11.533333,8.05,1.106667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,11.5916,37.32346,1.0,2154.0,46.0,90.0,55.0,26.9,8.6,3.44,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
from sklearn.preprocessing import RobustScaler

# 1️⃣ Identify columns to scale
# Exclude target 'fire' and all one-hot encoded LCCCode columns
exclude_cols = ['fire'] + [col for col in train_data.columns if col.startswith('LCCCode')]
num_cols = [col for col in train_data.columns if col not in exclude_cols]

# 2️⃣ Initialize RobustScaler
scaler = RobustScaler()

# 3️⃣ Fit on train and transform train
train_data[num_cols] = scaler.fit_transform(train_data[num_cols])

# 4️⃣ Transform test using same scaler
test_data[num_cols] = scaler.transform(test_data[num_cols])

print("Train data after scaling:")
print(train_data.head())

print("\nTest data after scaling:")
print(test_data.head())


Train data after scaling:
       longitude  latitude  fire  elevation    COARSE      SAND      CLAY  \
22332   0.238805  0.000000     0  -0.672872 -0.176471  0.083333 -0.087719   
34703  -0.365673  0.606758     0   0.607713 -0.705882 -0.958333  1.631579   
9592   -0.514926 -0.623794     0   0.352394  0.529412  0.447917 -0.263158   
26983  -0.276120  0.215992     0   0.704787  0.176471  0.395833 -0.263158   
35564  -0.258994  0.759214     1   0.259309  0.264706 -0.583333  0.736842   

       TCARBON_EQ  PH_WATER   TOTAL_N  ...  LCCCode_21450  \
22332    0.109589  0.175676 -0.008197  ...            0.0   
34703   -1.616438 -2.472973  1.561476  ...            0.0   
9592     0.043836 -0.040540 -0.196721  ...            0.0   
26983    0.000000  0.175676 -0.122951  ...            0.0   
35564   -0.810959 -0.770270  1.327869  ...            0.0   

       LCCCode_21454 // 21446 // 21450  LCCCode_21497-121340  \
22332                              0.0                   0.0   
34703           

In [11]:
train_data.describe()

Unnamed: 0,longitude,latitude,fire,elevation,COARSE,SAND,CLAY,TCARBON_EQ,PH_WATER,TOTAL_N,...,LCCCode_21450,LCCCode_21454 // 21446 // 21450,LCCCode_21497-121340,LCCCode_21497-15045,LCCCode_21499-121340,LCCCode_21518,LCCCode_6001,LCCCode_6004,LCCCode_6020,LCCCode_7001 // 8001
count,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,...,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0,33832.0
mean,-0.03427,-0.010588,0.104428,0.0559,-0.0985,-0.016963,0.254679,-0.179619,-0.243566,0.315108,...,0.019538,0.000414,0.007921,3e-05,0.00133,0.005025,0.176904,0.144153,8.9e-05,0.006473
std,0.607673,0.571183,0.305819,0.555873,0.764016,0.737936,0.742341,0.715525,0.869994,0.760891,...,0.138407,0.020338,0.088651,0.005437,0.036447,0.070709,0.381593,0.35125,0.009416,0.080196
min,-1.350746,-0.959974,0.0,-0.714096,-1.764706,-2.333333,-1.421053,-1.616438,-4.202702,-1.254098,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.529852,-0.521845,0.0,-0.492021,-0.647059,-0.583333,-0.263158,-0.720548,-0.635135,-0.172131,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.470148,0.478155,0.0,0.507979,0.352941,0.416667,0.736842,0.279452,0.364865,0.827869,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.104477,1.144832,1.0,2.191489,6.0,2.479167,3.947368,2.805479,1.256757,6.565574,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# save

In [12]:
# save train and test data
train_data.to_parquet(os.path.join(merged_dir, "train_data.parquet"), index=False)
test_data.to_parquet(os.path.join(merged_dir, "test_data.parquet"), index=False)

# downsample

### NearMiss (downsample)

NearMiss Under Sampling: Selects majority class samples close to the minority class based on distance metrics. This helps keep samples near the decision boundary, improving classification

In [13]:
train_data['fire'].value_counts()

fire
0    30299
1     3533
Name: count, dtype: int64

In [14]:
from imblearn.under_sampling import NearMiss

# Prepare input features and label arrays
X = train_data.drop(columns=['fire']) 
y = train_data['fire']

# NearMiss version 1 
nm = NearMiss(version=1, n_neighbors=3)  
X_resampled, y_resampled = nm.fit_resample(X, y)

print("New class counts:", pd.Series(y_resampled).value_counts())

New class counts: fire
0    3533
1    3533
Name: count, dtype: int64


In [15]:
# save 
train_resampled = X_resampled.copy()
train_resampled['fire'] = y_resampled.values

train_resampled.to_parquet(os.path.join(merged_dir, "train_downsampled.parquet"), index=False)

### kmeansSmote (hybrid)

KMeansSMOTE improves on basic SMOTE by:

- Clustering: First applies KMeans clustering to the dataset, grouping similar samples together.
- Focused Oversampling: Applies SMOTE within the clusters, focusing on regions where the minority class is most at risk of being ignored (in "safe" clusters rather than noisy ones).
- Reducing Noise: By not generating synthetic samples in sparse/noisy regions, kmeansSMOTE reduces the risk of introducing overlapping or unrealistic samples.

In [16]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import KMeansSMOTE

# Features and labels
X = train_data.drop(columns=['fire'])
y = train_data['fire']

# Step 1: Downsample the majority class to 10,000
rus = RandomUnderSampler(sampling_strategy={0: 10000, 1: sum(y==1)}, random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

# Step 2: Apply KMeansSMOTE to upsample 'fire' to 10,000
kmeans_smote = KMeansSMOTE(sampling_strategy={1: 10000}, random_state=42, n_jobs=-1)
X_bal, y_bal = kmeans_smote.fit_resample(X_rus, y_rus)

# Combine back into a DataFrame
train_balanced = pd.DataFrame(X_bal, columns=X.columns)
train_balanced['fire'] = y_bal

print(train_balanced['fire'].value_counts())

fire
0    10000
1    10000
Name: count, dtype: int64


In [17]:
train_balanced.to_parquet(os.path.join(merged_dir, "train_kmeanssmote.parquet"), index=False)

## SmoteTomek

- SMOTE+Tomek Links generally yields better class separation and higher evaluation scores for the minority class than SMOTE alone, especially in highly imbalanced contexts
- SMOTE is applied, increasing the minority class to match majority.
- Tomek Links removes some majority (and possibly some minority) samples that are ambiguous at the class border.

In [18]:
from imblearn.combine import SMOTETomek

X = train_data.drop(columns=['fire'])
y = train_data['fire']

# Initialize SMOTETomek (by default, resamples minority to the majority class size)
smote_tomek = SMOTETomek(random_state=42, n_jobs=-1)

X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

# Combine back 
train_smtomek = pd.DataFrame(X_resampled, columns=X.columns)
train_smtomek['fire'] = y_resampled

print(train_smtomek['fire'].value_counts())

fire
0    30051
1    30051
Name: count, dtype: int64


In [19]:
train_smtomek.to_parquet(os.path.join(merged_dir, "train_smote_tomek.parquet"), index=False)