In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import geopandas as gpd
import pandas as pd
import os
from shapely import wkt

In [18]:
# Base directories, all relative to the notebook's working directory.
# (processed_data_dir used to be assigned twice; it is now defined once.)
data_dir = "../../data/loaded"
processed_data_dir = "../../data/processed"
merged_dir = "../../data/merged"
Images_dir = "../../images/processing_1"

# Input tables joined in the sections below.
soil_file_path = os.path.join(processed_data_dir, "soil_reduced_nondup.csv")
land_file_path = os.path.join(processed_data_dir, "landcover_processed.csv")
elev_file_path = os.path.join(data_dir, "elevation.csv")
fire_file_path = os.path.join(processed_data_dir, "fire_buffer1000.parquet")
clim_file_path = os.path.join(processed_data_dir, "processed_climate.csv")

# Join fire points with landcover

In [35]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
import os
from pathlib import Path
import numpy as np
from datetime import datetime

# -----------------------------
# Directories and files used by the landcover join
# -----------------------------
data_dir = "../../data/loaded"
processed_data_dir = "../../data/processed"
merged_dir = "../../data/merged"

# The output directory must exist before the joined table is written into it.
os.makedirs(merged_dir, exist_ok=True)

# Inputs come from the processed directory; the result goes to the merged directory.
fire_file_path = os.path.join(processed_data_dir, "fire_buffer1000.parquet")
land_file_path = os.path.join(processed_data_dir, "landcover_processed.csv")
output_file = os.path.join(merged_dir, "fire_with_landcover.parquet")

In [37]:
# Load the fire table and show the share of each `fire` class, in percent.
fire = pd.read_parquet(fire_file_path)
fire['fire'].value_counts(normalize=True).mul(100)

fire
0    89.558062
1    10.441938
Name: proportion, dtype: float64

In [None]:


# -----------------------------
# Configuration
# -----------------------------
LATITUDE_CHUNK_SIZE = 5   # height of one latitude band (same units as `latitude`)
OVERLAP = 0.05            # margin added below and above every band
LAT_MIN = 31.0            # hard-coded latitude range the bands are built from
LAT_MAX = 38.0

print("="*60)
print("SPATIAL JOIN: Fire Points â†’ Landcover Polygons")
print("="*60)

# -----------------------------
# Step 1: Load landcover
# -----------------------------
print("\n[1/4] Loading landcover data...")
# LCCCode is a label, not a quantity: read it as text so that codes such as
# "0011" keep their leading zeros regardless of pandas' dtype inference.
landcover_df = pd.read_csv(
    land_file_path,
    usecols=['geometry', 'LCCCode'],
    dtype={'LCCCode': str},
)
print(f"Landcover polygons: {len(landcover_df):,}")

# Convert WKT to shapely geometry
landcover_df['geometry'] = landcover_df['geometry'].apply(wkt.loads)
landcover_gdf = gpd.GeoDataFrame(landcover_df, geometry='geometry', crs='EPSG:4326')

# Access the spatial index once so it is built up front, before the band loop
landcover_gdf.sindex
del landcover_df

# -----------------------------
# Step 2: Latitude bands
# -----------------------------
print("\n[2/4] Creating latitude bands...")
lat_bands = []
actual_min = np.floor(LAT_MIN * 100)/100
actual_max = np.ceil(LAT_MAX * 100)/100
current_lat = actual_min
while current_lat < actual_max:
    # Every band is widened by OVERLAP on both sides; the last one is capped
    # at actual_max + OVERLAP. Points read twice are deduplicated in step 4.
    band_start = current_lat - OVERLAP
    band_end = min(current_lat + LATITUDE_CHUNK_SIZE + OVERLAP, actual_max + OVERLAP)
    lat_bands.append((band_start, band_end))
    current_lat += LATITUDE_CHUNK_SIZE

print(f"Latitude bands: {len(lat_bands)}")
print(f"Coverage: [{actual_min} - {actual_max}]")

# The bands come from hard-coded limits, so a fire point outside them is never
# read by the filtered loads below. Report that explicitly instead of letting
# such points disappear from the output without a trace.
fire_lat = pd.read_parquet(fire_file_path, columns=['latitude'])['latitude']
n_outside = int((~fire_lat.between(actual_min - OVERLAP, actual_max + OVERLAP)).sum())
if n_outside > 0:
    print(f"WARNING: {n_outside:,} fire points lie outside the latitude bands and will be dropped")
del fire_lat

# -----------------------------
# Step 3: Process each band
# -----------------------------
print("\n[3/4] Processing latitude bands...")
start_time = datetime.now()
results = []

for i, (band_start, band_end) in enumerate(lat_bands, 1):
    band_time = datetime.now()
    print(f"\nBand {i}/{len(lat_bands)}: Lat [{band_start:.2f}, {band_end:.2f}]", end='')

    # Load only the fire points whose latitude falls inside this band
    fire_chunk = pd.read_parquet(
        fire_file_path,
        filters=[('latitude', '>=', band_start), ('latitude', '<=', band_end)]
    )

    if len(fire_chunk) == 0:
        print(" â†’ SKIP (no points)")
        continue

    print(f" â†’ {len(fire_chunk):,} pts", end='')

    # Convert to GeoDataFrame
    fire_gdf = gpd.GeoDataFrame(
        fire_chunk,
        geometry=gpd.points_from_xy(fire_chunk.longitude, fire_chunk.latitude),
        crs='EPSG:4326'
    )

    # Restrict the polygons to the bounding box of this band's points
    bounds = fire_gdf.total_bounds
    landcover_filtered = landcover_gdf.cx[bounds[0]:bounds[2], bounds[1]:bounds[3]]
    print(f" | {len(landcover_filtered):,} polys", end='')

    if len(landcover_filtered) == 0:
        # Nothing to join against: keep the points with a missing code
        print(" â†’ WARNING: No polygons!")
        fire_chunk['LCCCode'] = pd.NA
        results.append(fire_chunk[['longitude','latitude','fire','LCCCode']])
        continue

    # Spatial join using intersects
    joined = gpd.sjoin(
        fire_gdf,
        landcover_filtered[['geometry','LCCCode']],
        how='left',
        predicate='intersects'
    )

    # A point intersecting several polygons yields several rows; keep the first
    result_chunk = joined[['longitude','latitude','fire','LCCCode']].drop_duplicates(subset=['longitude','latitude'])

    # -----------------------------
    # Nearest fallback for unmatched points
    # -----------------------------
    missing_mask = result_chunk['LCCCode'].isna()
    # Copy the slice so adding a geometry column never touches result_chunk
    unmatched = result_chunk[missing_mask].copy()
    if len(unmatched) > 0:
        fire_unmatched_gdf = gpd.GeoDataFrame(
            unmatched,
            geometry=gpd.points_from_xy(unmatched.longitude, unmatched.latitude),
            crs='EPSG:4326'
        )
        # Find nearest polygon for each unmatched point.
        # NOTE(review): distances are computed in the EPSG:4326 coordinate
        # units; reproject to a metric CRS first if the exact nearest polygon
        # matters.
        nearest_idx = fire_unmatched_gdf.geometry.apply(lambda pt: landcover_filtered.distance(pt).idxmin())
        result_chunk.loc[missing_mask, 'LCCCode'] = landcover_filtered.loc[nearest_idx, 'LCCCode'].values

    matched = result_chunk['LCCCode'].notna().sum()
    print(f" | âœ“ {matched:,} ({matched/len(result_chunk)*100:.1f}%)", end='')

    elapsed = (datetime.now() - band_time).total_seconds()
    print(f" | {elapsed:.1f}s")

    results.append(result_chunk)
    del fire_chunk, fire_gdf, landcover_filtered, joined, result_chunk

# -----------------------------
# Step 4: Combine all results
# -----------------------------
print("\n[4/4] Combining all bands...")
final_result = pd.concat(results, ignore_index=True)
print(f"Before dedup: {len(final_result):,} rows")
# Points inside an overlap margin were read by two bands; keep one row each
final_result = final_result.drop_duplicates(subset=['longitude','latitude'])
print(f"After dedup: {len(final_result):,} rows")

# Save to parquet
final_result.to_parquet(output_file, index=False, compression='snappy')

# -----------------------------
# Summary
# -----------------------------
total_time = (datetime.now() - start_time).total_seconds()
print("\n" + "="*60)
print("SUMMARY")
print("="*60)
print(f"Total fire points: {len(final_result):,}")
print(f"With LCCCode: {final_result['LCCCode'].notna().sum():,}")
print(f"Without LCCCode: {final_result['LCCCode'].isna().sum():,}")
print(f"Match rate: {final_result['LCCCode'].notna().sum()/len(final_result)*100:.2f}%")
print(f"Total time: {total_time/60:.1f} minutes")
print(f"Output saved to: {output_file}")
print("="*60)


SPATIAL JOIN: Fire Points â†’ Landcover Polygons

[1/4] Loading landcover data...
Landcover polygons: 438,513

[2/4] Creating latitude bands...
Latitude bands: 2
Coverage: [31.0 - 38.0]

[3/4] Processing latitude bands...

Band 1/2: Lat [30.95, 36.05] â†’ 35,116 pts | 140,981 polys | âœ“ 35,116 (100.0%) | 37.9s

Band 2/2: Lat [35.95, 38.05] â†’ 8,200 pts | 59,922 polys


  nearest_idx = fire_unmatched_gdf.geometry.apply(lambda pt: landcover_filtered.distance(pt).idxmin())

  nearest_idx = fire_unmatched_gdf.geometry.apply(lambda pt: landcover_filtered.distance(pt).idxmin())

  nearest_idx = fire_unmatched_gdf.geometry.apply(lambda pt: landcover_filtered.distance(pt).idxmin())

  nearest_idx = fire_unmatched_gdf.geometry.apply(lambda pt: landcover_filtered.distance(pt).idxmin())

  nearest_idx = fire_unmatched_gdf.geometry.apply(lambda pt: landcover_filtered.distance(pt).idxmin())

  nearest_idx = fire_unmatched_gdf.geometry.apply(lambda pt: landcover_filtered.distance(pt).idxmin())

  nearest_idx = fire_unmatched_gdf.geometry.apply(lambda pt: landcover_filtered.distance(pt).idxmin())


 | âœ“ 8,200 (100.0%) | 1.7s

[4/4] Combining all bands...
Before dedup: 43,316 rows
After dedup: 42,291 rows

SUMMARY
Total fire points: 42,291
With LCCCode: 42,291
Without LCCCode: 0
Match rate: 100.00%
Total time: 0.7 minutes
Output saved to: ../../data/merged\fire_with_landcover.parquet


In [38]:
# Share of each `fire` class after the landcover join, in percent.
final_result['fire'].value_counts(normalize=True).mul(100)

fire
0    89.558062
1    10.441938
Name: proportion, dtype: float64

In [39]:
import gc

# Run a full garbage-collection pass before the next join; as the last
# expression of the cell, the value returned by gc.collect() is displayed.
gc.collect()

33

# Join with elevation

In [40]:
import os
import pandas as pd
import numpy as np
from sklearn.neighbors import KDTree
from pathlib import Path
from datetime import datetime
import pyarrow.parquet as pq

# Locations of the inputs and of the file produced by this step
processed_data_dir = "../../data/processed"
merged_dir = "../../data/merged"

fire_land_file = os.path.join(merged_dir, "fire_with_landcover.parquet")
elev_file_path = os.path.join("../../data/loaded", "elevation.csv")
output_file = os.path.join(merged_dir, "fire_land_elev.parquet")

# Tunables
CHUNK_SIZE = 3_000_000       # rows requested per Parquet batch
DECIMALS = 5                 # decimals kept when rounding coordinates

print("=" * 60)
print("NEAREST-NEIGHBOR JOIN: Fire + Elevation")
print("=" * 60)

# ---------------------------------------------------
# 1. Elevation table -> KD-Tree
# ---------------------------------------------------
print("\n[1/3] Loading elevation data...")

elev = pd.read_csv(elev_file_path, usecols=["longitude", "latitude", "elevation"])
print(f"   Elevation rows: {len(elev):,}")

# Bring both coordinate columns to the shared precision in one assignment
elev[["longitude", "latitude"]] = elev[["longitude", "latitude"]].round(DECIMALS)

# The tree is indexed on (latitude, longitude) pairs, in that column order
print("   Building KD-Tree...")
coords = np.column_stack((elev["latitude"].to_numpy(), elev["longitude"].to_numpy()))
kdtree = KDTree(coords, leaf_size=40)

print("   KD-Tree ready.")

# ---------------------------------------------------
# 2. Stream the fire table and attach the nearest elevation
# ---------------------------------------------------
print("\n[2/3] Processing fire data in chunks...")

fire_pf = pq.ParquetFile(fire_land_file)
results = []
start_time = datetime.now()
chunk_idx = 0  # stays 0 when the file yields no batches

for chunk_idx, batch in enumerate(fire_pf.iter_batches(batch_size=CHUNK_SIZE), start=1):
    t0 = datetime.now()

    print(f"\n   Chunk {chunk_idx} â†’ Converting Arrow â†’ pandas...")
    fire_chunk = batch.to_pandas()

    print(f"      Rows: {len(fire_chunk):,}")

    # Same rounding as applied to the elevation coordinates
    fire_chunk[["longitude", "latitude"]] = fire_chunk[["longitude", "latitude"]].round(DECIMALS)

    # Query points use the same (latitude, longitude) order as the tree
    fire_coords = np.column_stack(
        (fire_chunk["latitude"].to_numpy(), fire_chunk["longitude"].to_numpy())
    )
    dists, idx = kdtree.query(fire_coords, k=1)

    # With k=1 the index array has a single column; flatten it to pick one
    # elevation value per fire point
    fire_chunk["elevation"] = elev["elevation"].to_numpy()[idx.ravel()]

    print(f"      Added elevation. Max distance = {dists.max():.6f}Â°")

    results.append(fire_chunk)

    dt = (datetime.now() - t0).total_seconds()
    print(f"      âœ“ Done in {dt:.1f} seconds")

# ---------------------------------------------------
# 3. Stitch the chunks together and write the result
# ---------------------------------------------------
print("\n[3/3] Concatenating results and saving...")

final_df = pd.concat(results, ignore_index=True)
final_df.to_parquet(output_file, index=False, compression="snappy")

total_time = (datetime.now() - start_time).total_seconds()

print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"Total points processed: {len(final_df):,}")
print(f"Saved to: {output_file}")
print(f"Total time: {total_time/60:.1f} minutes ({total_time/3600:.2f} hours)")
print("=" * 60)


NEAREST-NEIGHBOR JOIN: Fire + Elevation

[1/3] Loading elevation data...
   Elevation rows: 13,167,778
   Building KD-Tree...
   KD-Tree ready.

[2/3] Processing fire data in chunks...

   Chunk 1 â†’ Converting Arrow â†’ pandas...
      Rows: 42,291
      Added elevation. Max distance = 0.006047Â°
      âœ“ Done in 0.2 seconds

[3/3] Concatenating results and saving...

SUMMARY
Total points processed: 42,291
Saved to: ../../data/merged\fire_land_elev.parquet
Total time: 0.0 minutes (0.00 hours)


In [41]:
# Share of each `fire` class after the elevation join, in percent.
final_df['fire'].value_counts(normalize=True).mul(100)

fire
0    89.558062
1    10.441938
Name: proportion, dtype: float64

In [48]:
# (rows, columns) of the table after the elevation join
final_df.shape

(42291, 5)

# Soil

In [42]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
import os
from pathlib import Path
import gc

# ------------ MEMORY CLEANUP ----------------
def free_mem(*vars):
    """Run a full garbage-collection pass.

    Positional arguments are accepted so existing calls keep working, but
    they are not used: ``del`` applied to a parameter only unbinds the
    function-local name, so it can never release the caller's variables.
    To actually free an object, ``del`` its name in the calling scope and
    then call this function.

    Args:
        *vars: Ignored.

    Returns:
        None
    """
    gc.collect()

# ------------ PATHS --------------------------
data_dir = "../../data/loaded"
processed_data_dir = "../../data/processed"
merged_dir = "../../data/merged"
Path(merged_dir).mkdir(parents=True, exist_ok=True)

fire_file_path = os.path.join(merged_dir, "fire_land_elev.parquet")
soil_file_path = os.path.join(processed_data_dir, "soil_reduced_nondup.csv")
out_file = os.path.join(merged_dir, "fire_land_elev_soil.parquet")

# ------------ PARAMETERS ----------------------
LAT_CHUNK = 10      # height of one latitude band
OVERLAP = 0.05      # margin added below and above every band
LAT_MIN = 30.00000
LAT_MAX = 38.00000

soil_cols = [
    'longitude', 'latitude', 'COARSE', 'SAND', 'CLAY',
    'TCARBON_EQ', 'PH_WATER', 'TOTAL_N', 'CN_RATIO',
    'CEC_SOIL', 'ESP', 'GYPSUM'
]

# ------------ LOAD SOIL -----------------------
print("[1/3] Loading soil data...")
soil = pd.read_csv(soil_file_path, usecols=soil_cols)

# Rename soil longitude/latitude so they do not collide with the fire columns
soil = soil.rename(columns={'longitude':'soil_lon', 'latitude':'soil_lat'})

# Keep 5-decimal precision
soil['soil_lon'] = soil['soil_lon'].round(5)
soil['soil_lat'] = soil['soil_lat'].round(5)

# The tree is indexed on (soil_lon, soil_lat); queries must use the same order
soil_points = soil[['soil_lon','soil_lat']].to_numpy()
tree = cKDTree(soil_points)
print(f"   Soil rows: {len(soil):,}")

# ---------- LATITUDE BANDS --------------------
print("\n[2/3] Preparing latitude bands...")
bands = []
cur = np.floor(LAT_MIN * 100) / 100
lat_end = np.ceil(LAT_MAX * 100) / 100

while cur < lat_end:
    # Each band is widened by OVERLAP on both sides, capped at lat_end + OVERLAP
    bands.append((cur - OVERLAP,
                  min(cur + LAT_CHUNK + OVERLAP, lat_end + OVERLAP)))
    cur += LAT_CHUNK

print(f"   Total bands: {len(bands)}")

# ------------- PROCESS BANDS -------------------
results = []

print("\n[3/3] Processing...")
for idx, (b_start, b_end) in enumerate(bands,1):

    print(f"\n   Band {idx}/{len(bands)} â†’ [{b_start:.2f}, {b_end:.2f}]")

    # Read only the fire rows whose latitude falls inside this band
    fire_chunk = pd.read_parquet(
        fire_file_path,
        filters=[('latitude','>=',b_start),('latitude','<=',b_end)]
    )
    if len(fire_chunk)==0:
        print("      skip (no points)")
        continue

    print(f"      Fire pts: {len(fire_chunk):,}")

    # Round to 5 decimals, matching the soil coordinates
    fire_chunk['longitude'] = fire_chunk['longitude'].round(5)
    fire_chunk['latitude'] = fire_chunk['latitude'].round(5)

    fire_pts = fire_chunk[['longitude','latitude']].to_numpy()

    # Query KDTree for nearest soil point
    dist, idxs = tree.query(fire_pts, k=1)

    # One soil row per fire row, renumbered so the two frames align by position
    soil_match = soil.iloc[idxs].reset_index(drop=True)

    # Side-by-side concat of fire rows and their nearest soil rows
    merged = pd.concat([fire_chunk.reset_index(drop=True), soil_match], axis=1)

    # --- DROP DUPLICATES PER BAND ---
    merged = merged.drop_duplicates(subset=['longitude','latitude'])

    print(f"      âœ“ matched & dedup: {len(merged):,}")

    results.append(merged)

    # Unbind the per-band objects here, in the scope that owns the names;
    # passing them to a helper function cannot release them. `merged` stays
    # alive through `results`.
    del fire_chunk, fire_pts, soil_match, merged
    gc.collect()

# ------------- FINAL MERGE ---------------------
print("\nCombining all bands...")
final = pd.concat(results, ignore_index=True)

# Final deduplication: removes points read by two overlapping bands
final = final.drop_duplicates(subset=['longitude','latitude'])

print(f"\nFinal rows: {len(final):,}")
print(f"Columns: {final.columns.tolist()}")

final.to_parquet(out_file, index=False, compression='snappy')
print(f"Saved â†’ {out_file}")


[1/3] Loading soil data...
   Soil rows: 3,291,933

[2/3] Preparing latitude bands...
   Total bands: 1

[3/3] Processing...

   Band 1/1 â†’ [29.95, 38.05]
      Fire pts: 42,291
      âœ“ matched & dedup: 42,291

Combining all bands...

Final rows: 42,291
Columns: ['longitude', 'latitude', 'fire', 'LCCCode', 'elevation', 'soil_lon', 'soil_lat', 'COARSE', 'SAND', 'CLAY', 'TCARBON_EQ', 'PH_WATER', 'TOTAL_N', 'CN_RATIO', 'CEC_SOIL', 'ESP', 'GYPSUM']
Saved â†’ ../../data/merged\fire_land_elev_soil.parquet


In [43]:
# Share of each `fire` class after the soil join, in percent.
final['fire'].value_counts(normalize=True).mul(100)

fire
0    89.558062
1    10.441938
Name: proportion, dtype: float64

# Join with climate

In [44]:
import pyarrow.parquet as pq
import pandas as pd
import os

processed_data_dir = "../../data/processed"
merged_dir = "../../data/merged"

clim_file_path = os.path.join(processed_data_dir, "processed_climate.csv")
fire_file_path = os.path.join(merged_dir, "fire_land_elev_soil.parquet")

# Column names of the Parquet file come from its schema, so no rows are read
pf = pq.ParquetFile(fire_file_path)
fire_cols = pf.schema.names

# nrows=0 parses only the CSV header line
clim_cols = list(pd.read_csv(clim_file_path, nrows=0).columns)

print("ðŸ”¥ Fire columns:", fire_cols)
print("ðŸŒ¦ Climate columns:", clim_cols)

# Names present in both tables would collide in a side-by-side concat
overlap = set(fire_cols).intersection(clim_cols)
print("âš  Columns that would overlap:", overlap)


ðŸ”¥ Fire columns: ['longitude', 'latitude', 'fire', 'LCCCode', 'elevation', 'soil_lon', 'soil_lat', 'COARSE', 'SAND', 'CLAY', 'TCARBON_EQ', 'PH_WATER', 'TOTAL_N', 'CN_RATIO', 'CEC_SOIL', 'ESP', 'GYPSUM']
ðŸŒ¦ Climate columns: ['lon', 'lat', 'prec_median_autumn', 'prec_median_spring', 'prec_median_summer', 'prec_median_winter', 'tmin_median_autumn', 'tmin_median_spring', 'tmin_median_summer', 'tmin_median_winter', 'tmax_median_autumn', 'tmax_median_spring', 'tmax_median_summer', 'tmax_median_winter', 'prec_iqr_autumn', 'prec_iqr_spring', 'prec_iqr_summer', 'prec_iqr_winter', 'tmin_iqr_autumn', 'tmin_iqr_spring', 'tmin_iqr_summer', 'tmin_iqr_winter', 'tmax_iqr_autumn', 'tmax_iqr_spring', 'tmax_iqr_summer', 'tmax_iqr_winter', 'tmax_max', 'prec_min', 'longest_dry_period', 'longest_hot_period']
âš  Columns that would overlap: set()


In [67]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
import os
import gc

def free_mem(*vars):
    """Run a full garbage-collection pass.

    Positional arguments are accepted so existing calls keep working, but
    they are not used: ``del`` applied to a parameter only unbinds the
    function-local name, so it can never release the caller's variables.
    To actually free an object, ``del`` its name in the calling scope and
    then call this function.

    Args:
        *vars: Ignored.

    Returns:
        None
    """
    gc.collect()


# ------------------- FILE LOCATIONS -----------------------
processed_data_dir = "../../data/processed"
merged_dir = "../../data/merged"
os.makedirs(merged_dir, exist_ok=True)

fire_file_path = os.path.join(merged_dir, "fire_land_elev_soil.parquet")
clim_file_path = os.path.join(processed_data_dir, "processed_climate.csv")
out_file = os.path.join(merged_dir, "fire_land_elev_soil_climate.parquet")

# -------------------- READ BOTH TABLES -------------------
print("[1/3] Loading data...")

fire = pd.read_parquet(fire_file_path)
clim = pd.read_csv(clim_file_path)

print(" Fire shape before:", fire.shape)
print(" Climate shape:", clim.shape)

# Fire coordinates: keep 5 decimals
fire[["longitude", "latitude"]] = fire[["longitude", "latitude"]].round(5)

# Climate coordinates: give them their own names, then round the same way
clim = clim.rename(columns={'lon':'clim_lon','lat':'clim_lat'})
clim[["clim_lon", "clim_lat"]] = clim[["clim_lon", "clim_lat"]].round(5)

# Every climate column except the two coordinate columns
clim_cols_extra = [name for name in clim.columns if name not in ('clim_lon', 'clim_lat')]

# The tree is indexed on (clim_lon, clim_lat); the query below uses the same order
print("[2/3] Building KDTree for climate...")
clim_points = clim[['clim_lon','clim_lat']].to_numpy()
tree = cKDTree(clim_points)

print(" Querying nearest climate points...")
fire_pts = fire[['longitude','latitude']].to_numpy()

dist, idxs = tree.query(fire_pts, k=1)

# Spread of the match distances, in coordinate units
print(" Distance summary:")
print("   min:", float(dist.min()))
print("   max:", float(dist.max()))
print("   mean:", float(dist.mean()))

# One climate row per fire row, renumbered to align by position
clim_match = clim.iloc[idxs].reset_index(drop=True)

# Side-by-side concat of fire rows and their nearest climate rows
print("[3/3] Merging...")
merged = pd.concat([fire.reset_index(drop=True), clim_match], axis=1)

# Keep a single row per (longitude, latitude) pair and report the effect
before = len(merged)
merged = merged.drop_duplicates(subset=['longitude','latitude'])
after = len(merged)

print(f" Dedup: before={before:,} â†’ after={after:,}")

# Write the result
merged.to_parquet(out_file, index=False, compression='snappy')
print(f"\nSaved â†’ {out_file}")
print("Final shape:", merged.shape)


[1/3] Loading data...
 Fire shape before: (42291, 17)
 Climate shape: (32927, 30)
[2/3] Building KDTree for climate...
 Querying nearest climate points...
 Distance summary:
   min: 0.0009716995420378196
   max: 0.2818213121820252
   mean: 0.031619546642710813
[3/3] Merging...
 Dedup: before=42,291 â†’ after=42,291

Saved â†’ ../../data/merged\fire_land_elev_soil_climate.parquet
Final shape: (42291, 47)


In [69]:
# Share of each `fire` class after the climate join, in percent.
merged['fire'].value_counts(normalize=True).mul(100)

fire
0    89.558062
1    10.441938
Name: proportion, dtype: float64

In [70]:
# (rows, columns) of the table after the climate join
merged.shape

(42291, 47)

# Check the final merged file

In [71]:
# Re-read the saved output from disk so the checks below run on the persisted file
import os
import pandas as pd

merged_dir = "../../data/merged"
fire_file_path = os.path.join(merged_dir, "fire_land_elev_soil_climate.parquet")

fire_cropped = pd.read_parquet(fire_file_path)
fire_cropped.shape

(42291, 47)

In [72]:
# Shape of the in-memory table, for comparison with the shape of the file re-read from disk
merged.shape

(42291, 47)

In [58]:
# Count the rows whose (longitude, latitude) pair repeats an earlier row.
# NOTE(review): the recorded output of this cell (execution count 58) reports
# more duplicates than the file has rows, so it looks stale — re-run to confirm.
duplicates = fire_cropped.duplicated(['longitude', 'latitude'])
duplicates.sum()

np.int64(78136)

In [73]:
# Remove duplicated coordinates, keeping the first occurrence of each pair
fire_cropped = fire_cropped.drop_duplicates(
    subset=['longitude', 'latitude'],
    keep='first',
)
fire_cropped.shape

(42291, 47)

In [74]:
# List every column of the final table
fire_cropped.columns

Index(['longitude', 'latitude', 'fire', 'LCCCode', 'elevation', 'soil_lon',
       'soil_lat', 'COARSE', 'SAND', 'CLAY', 'TCARBON_EQ', 'PH_WATER',
       'TOTAL_N', 'CN_RATIO', 'CEC_SOIL', 'ESP', 'GYPSUM', 'clim_lon',
       'clim_lat', 'prec_median_autumn', 'prec_median_spring',
       'prec_median_summer', 'prec_median_winter', 'tmin_median_autumn',
       'tmin_median_spring', 'tmin_median_summer', 'tmin_median_winter',
       'tmax_median_autumn', 'tmax_median_spring', 'tmax_median_summer',
       'tmax_median_winter', 'prec_iqr_autumn', 'prec_iqr_spring',
       'prec_iqr_summer', 'prec_iqr_winter', 'tmin_iqr_autumn',
       'tmin_iqr_spring', 'tmin_iqr_summer', 'tmin_iqr_winter',
       'tmax_iqr_autumn', 'tmax_iqr_spring', 'tmax_iqr_summer',
       'tmax_iqr_winter', 'tmax_max', 'prec_min', 'longest_dry_period',
       'longest_hot_period'],
      dtype='object')

In [75]:
# Remove the coordinates of the matched soil / climate points
# (soil_lon, soil_lat, clim_lon, clim_lat).
# errors='ignore' keeps this cell re-runnable: the file is later saved back
# over itself without these columns, so a re-read table no longer has them and
# a plain drop would raise KeyError.
fire_cropped = fire_cropped.drop(
    columns=['soil_lon', 'soil_lat', 'clim_lon', 'clim_lat'],
    errors='ignore',
)

In [76]:
# Preview the first rows of the cleaned table
fire_cropped.head()

Unnamed: 0,longitude,latitude,fire,LCCCode,elevation,COARSE,SAND,CLAY,TCARBON_EQ,PH_WATER,...,tmin_iqr_summer,tmin_iqr_winter,tmax_iqr_autumn,tmax_iqr_spring,tmax_iqr_summer,tmax_iqr_winter,tmax_max,prec_min,longest_dry_period,longest_hot_period
0,9.68184,33.53246,1,0011,183,9.0,45.333333,20.333333,9.433333,7.866667,...,1.5,0.75,5.0,4.0,1.625,1.5,40.0,0.0,1,2
1,9.27448,33.45376,1,6004,102,3.0,37.5,22.0,16.25,8.0,...,1.5,1.5,5.25,3.875,1.5,1.5,42.0,0.0,1,3
2,8.53867,33.36744,1,6004,21,6.0,90.0,5.0,0.0,6.7,...,1.5,1.25,5.0,4.0,2.0,2.0,44.0,0.0,2,3
3,8.35868,35.69833,1,0003 / 0004,865,19.666667,50.333333,21.333333,11.033333,7.633333,...,1.875,0.375,4.375,4.0,1.625,1.0,35.75,1.6,0,0
4,8.53032,34.9382,1,0011,752,3.0,55.0,18.0,2.0,7.5,...,1.625,0.625,4.75,4.625,1.625,1.25,37.25,1.075,0,0


In [77]:
# Save the cleaned table back to fire_file_path, i.e. over the same file it was read from
fire_cropped.to_parquet(fire_file_path, index=False, compression='snappy')

In [78]:
# Share of each `fire` class in the saved table, in percent.
fire_cropped['fire'].value_counts(normalize=True).mul(100)

fire
0    89.558062
1    10.441938
Name: proportion, dtype: float64