In [4]:
import os
import numpy as np
import pandas as pd
import rasterio
import geopandas as gpd
from rasterio.features import shapes
from rasterio.warp import reproject, Resampling
from shapely.geometry import shape, Point
from scipy.spatial import cKDTree
from pyproj import CRS, Transformer

# ===================== 1. Load 200 River Points from File ===================== #
# Reads the pre-made sampling shapefile and produces:
#   river_sample_gdf - the samples as point geometries in EPSG:4326
#   num_samples      - number of features read from the file
#   river_coords     - array of [x, y] pairs, one row per sample
print("Step 1: Loading 200 river points from '200Sampling.shp'...")
sampling_file_path = "200Sampling.shp"
try:
    # Load the user's pre-created shapefile
    river_sample_gdf = gpd.read_file(sampling_file_path)
    num_samples = len(river_sample_gdf)

    # Ensure all geometries are points. Only non-point features are replaced,
    # and their centroid is taken in the file's native CRS *before*
    # reprojecting. Calling .centroid on every feature after the conversion to
    # EPSG:4326 raised a warning on each run (presumably geopandas'
    # geographic-CRS centroid warning).
    non_point = river_sample_gdf.geom_type != "Point"
    if non_point.any():
        river_sample_gdf.loc[non_point, "geometry"] = river_sample_gdf.geometry[non_point].centroid

    # Ensure the GeoDataFrame has the correct CRS for further processing
    if river_sample_gdf.crs != "EPSG:4326":
        river_sample_gdf = river_sample_gdf.to_crs("EPSG:4326")

except FileNotFoundError:
    print(f"❌ Error: The file '{sampling_file_path}' was not found. Please ensure it is in the correct directory.")
    exit()
except Exception as e:
    # Top-level boundary of the script: report the problem and stop.
    print(f"❌ Error: Could not read '{sampling_file_path}'. Please check the file's integrity and format. Error: {e}")
    exit()

river_coords = np.array([[p.x, p.y] for p in river_sample_gdf.geometry])
print(f"✅ Successfully loaded {num_samples} points from the shapefile.")
print("Step 1 Complete.\n")

# ===================== 2. Interpolate Initial Features ===================== #
# Loads the measured samples and selects the columns that will be interpolated
# onto the river points.
print("Step 2: Interpolating initial features...")
try:
    # Path is relative to the working directory (one level up, then data/).
    data = pd.read_csv("../data/RainySeason.csv")
except FileNotFoundError:
    print("❌ Error: RainySeason.csv not found. Please check the file path.")
    exit()

# Known sample locations as a two-column array in (Long, Lat) order, matching
# the [x, y] order used for river_coords.
coords = data[['Long', 'Lat']].values
# Columns of the CSV that are interpolated; each becomes a column of river_df.
features_to_interpolate = ['CrR', 'NiR', 'CuR', 'AsR', 'CdR', 'PbR', 'MR', 'SandR', 'SiltR', 'ClayR', 'FeR']
numeric_features = data[features_to_interpolate]

def idw_interpolation(known_coords, known_values, query_coords, power=2, k=4):
    """Estimate values at query points by inverse-distance weighting (IDW).

    Each query point receives the weighted mean of its ``k`` nearest known
    points, with weights proportional to ``1 / distance ** power``.

    Args:
        known_coords: Array of shape (n, d) with the locations of the samples.
        known_values: Array of length n with the value measured at each sample.
        query_coords: Array of shape (m, d) with the locations to estimate.
        power: Exponent applied to the distance; larger values make the
            estimate more local.
        k: Maximum number of neighbours to use. It is reduced automatically
            when fewer than ``k`` known points exist.

    Returns:
        1-D float array of length m with the interpolated values.

    Raises:
        ValueError: if there are no known points.
    """
    known_coords = np.asarray(known_coords, dtype=float)
    known_values = np.asarray(known_values, dtype=float)
    query_coords = np.asarray(query_coords, dtype=float)

    # Never ask for more neighbours than exist: cKDTree pads missing
    # neighbours with index n (out of bounds for known_values) and an infinite
    # distance, which previously caused an IndexError.
    k = min(k, len(known_coords))
    if k < 1:
        raise ValueError("idw_interpolation requires at least one known point")

    tree = cKDTree(known_coords)
    dists, idxs = tree.query(query_coords, k=k)
    # With k == 1 the tree returns 1-D arrays; normalise to (m, k).
    dists = dists.reshape(len(query_coords), -1)
    idxs = idxs.reshape(len(query_coords), -1)

    # A query that coincides with a sample would divide by zero; a tiny
    # distance makes that sample dominate the weighted mean instead.
    dists = np.where(dists == 0, 1e-10, dists)
    weights = 1.0 / (dists ** power)
    weights /= weights.sum(axis=1, keepdims=True)
    return np.sum(weights * known_values[idxs], axis=1)

# Interpolate every measured feature onto the river points: one output column
# per feature, one row per river point.
feature_names = numeric_features.columns
interpolated_features = np.zeros((len(river_coords), len(feature_names)))
for col_idx, feature in enumerate(feature_names):
    known = numeric_features[feature].values
    interpolated_features[:, col_idx] = idw_interpolation(coords, known, river_coords)

# Attach the point coordinates and a provenance tag to the interpolated table.
river_df = pd.DataFrame(interpolated_features, columns=feature_names).assign(
    Long=river_coords[:, 0],
    Lat=river_coords[:, 1],
    Source='River_Interpolated',
)
print("Step 2 Complete.\n")

# ===================== 3. Calculate Hydrological and LULC Features (Optimized) ===================== #
print("Step 3: Calculating hydrological and LULC features...")
# Input rasters and the output path for the NDWI raster resampled onto the DEM grid.
dem_path = "DEMF.tif"
ndwi_path = "CalIndices/ndwi.tif"
aligned_ndwi_path = "ndwi_aligned.tif"

print("  - Aligning rasters...")
def align_rasters(base_raster_path, match_raster_path, out_raster_path):
    """Resample one raster onto the grid of another and write the result.

    Band 1 of ``match_raster_path`` is reprojected with bilinear resampling
    onto the CRS, transform and dimensions of ``base_raster_path`` and written
    to ``out_raster_path`` as a single-band float32 raster.

    Args:
        base_raster_path: Raster that defines the target grid.
        match_raster_path: Raster whose first band is resampled.
        out_raster_path: Path of the raster to create.

    Returns:
        ``out_raster_path``.

    If either input cannot be opened, an error message is printed and the
    script stops via ``exit()``.
    """
    try:
        with rasterio.open(base_raster_path) as base, rasterio.open(match_raster_path) as match:
            out_meta = base.meta.copy()
            src_band = match.read(1)
            # Pass the source nodata value through so nodata pixels are not
            # blended into their neighbours by the bilinear resampling.
            src_nodata = match.nodata
            reprojected = np.empty((base.height, base.width), dtype=np.float32)
            reproject(
                source=src_band,
                destination=reprojected,
                src_transform=match.transform,
                src_crs=match.crs,
                src_nodata=src_nodata,
                dst_transform=base.transform,
                dst_crs=base.crs,
                dst_nodata=src_nodata,
                resampling=Resampling.bilinear,
            )
            # The output holds the resampled band, not the base raster's data:
            # describe it as one float32 band instead of inheriting the base
            # raster's dtype, band count and nodata value.
            out_meta.update(dtype="float32", count=1, nodata=src_nodata)
        with rasterio.open(out_raster_path, 'w', **out_meta) as dst:
            dst.write(reprojected, 1)
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error: Could not open raster files for alignment. Please check the file paths. Error: {e}")
        exit()
    return out_raster_path

# Write the NDWI raster resampled onto the DEM grid (result goes to aligned_ndwi_path).
align_rasters(dem_path, ndwi_path, aligned_ndwi_path)

print("  - Loading DEM and NDWI...")
# Only the DEM's georeferencing metadata is read here; no pixel data is loaded.
with rasterio.open(dem_path) as dem_src:
    dem_transform = dem_src.transform
    dem_crs = dem_src.crs
    # First component of the pixel size only.
    # NOTE(review): assumes square pixels - confirm against the DEM.
    dem_resolution = dem_src.res[0]

# Create samples_gdf from the loaded points
# (built in EPSG:4326 from Long/Lat, then moved into the DEM's CRS)
samples_gdf = gpd.GeoDataFrame(river_df, geometry=gpd.points_from_xy(river_df.Long, river_df.Lat), crs="EPSG:4326")
samples_gdf = samples_gdf.to_crs(dem_crs)

print("  - Reading vector files (brickfields and industries)...")
try:
    brickfields_path = "brick_field_point.shp"
    industries_path = "industry_point.shp"
    # Both layers are reprojected to the DEM's CRS so they share coordinates
    # with samples_gdf.
    brickfields = gpd.read_file(brickfields_path).to_crs(dem_crs)
    industries = gpd.read_file(industries_path).to_crs(dem_crs)
except Exception as e:
    print(f"❌ Error: Could not read shapefiles. Please check the file paths and ensure they are valid. Error: {e}")
    exit()

# Reduce every geometry to a single point so .x / .y can be read later.
samples_gdf["geometry"] = samples_gdf.geometry.centroid
brickfields["geometry"] = brickfields.geometry.centroid
industries["geometry"] = industries.geometry.centroid

def world_to_pixel(transform, x, y):
    """Convert map coordinates to the (row, col) index of the containing pixel.

    Args:
        transform: Pixel-to-map transform; it must support ``~transform`` and
            multiplication by an ``(x, y)`` tuple.
        x: Map x coordinate.
        y: Map y coordinate.

    Returns:
        Tuple ``(row, col)`` of Python ints.
    """
    col, row = ~transform * (x, y)
    # Floor rather than int(): int() truncates toward zero, so coordinates just
    # outside the top/left edge (fractional index between -1 and 0) would be
    # reported as row/col 0 instead of -1.
    return int(np.floor(row)), int(np.floor(col))

def compute_distances_euclidean(points_gdf, targets_gdf, transform, resolution):
    """Return, for each point, the straight-line distance to the nearest target.

    Both layers are converted to raster (row, col) indices with
    ``world_to_pixel``; nearest-neighbour distances are found in pixel space
    with a KD-tree and then scaled by ``resolution``.

    Args:
        points_gdf: Layer whose ``geometry.x`` / ``geometry.y`` give the query
            locations.
        targets_gdf: Layer of candidate targets, read the same way.
        transform: Raster transform handed to ``world_to_pixel``.
        resolution: Size of one pixel; the result is expressed in its unit.
            NOTE(review): a single value assumes square pixels - confirm.

    Returns:
        1-D float array with one distance per row of ``points_gdf`` (empty
        when there are no points).

    Raises:
        ValueError: if ``targets_gdf`` contains no geometries.
    """
    target_pixels = np.array(
        [world_to_pixel(transform, x, y) for x, y in zip(targets_gdf.geometry.x, targets_gdf.geometry.y)]
    )
    if target_pixels.size == 0:
        raise ValueError("targets_gdf contains no geometries; cannot compute nearest-target distances")

    point_pixels = np.array(
        [world_to_pixel(transform, x, y) for x, y in zip(points_gdf.geometry.x, points_gdf.geometry.y)]
    )
    if point_pixels.size == 0:
        return np.array([])

    # One batched query instead of one tree lookup per point.
    dist_pixels, _ = cKDTree(target_pixels).query(point_pixels)
    return dist_pixels * resolution

print("  - Calculating Euclidean-based flow-path distances...")
# Nearest-target distance from every sample to each pollution-source layer.
for dist_column, target_layer in (("hydro_dist_brick", brickfields), ("hydro_dist_ind", industries)):
    samples_gdf[dist_column] = compute_distances_euclidean(samples_gdf, target_layer, dem_transform, dem_resolution)

print("  - Starting LULC extraction loop...")
lulc_dir = "LULCMerged"
years = [2017, 2018, 2019, 2020, 2021, 2022]
# The sample coordinates are the same for every year, so build the list once.
# NOTE(review): coordinates are in the DEM's CRS - confirm the LULC rasters use the same CRS.
sample_xy = [(pt.x, pt.y) for pt in samples_gdf.geometry]
for y in years:
    lulc_path = os.path.join(lulc_dir, f"LULC{y}.tif")
    print(f"    - Processing LULC for year {y}...")
    try:
        with rasterio.open(lulc_path) as lulc_src:
            # sample() yields one array of band values per point; keep band 1.
            lulc_values = [band_values[0] for band_values in lulc_src.sample(sample_xy)]
            samples_gdf[f"LULC_{y}"] = lulc_values
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error processing {lulc_path}: {e}")
        # Keep the column present (all NaN) so later steps can still refer to it.
        samples_gdf[f"LULC_{y}"] = np.nan
print("  - LULC extraction loop complete.")

print("  - Calculating year-to-year LULC changes...")
# Difference between each pair of consecutive years, computed on float copies.
for y1, y2 in zip(years[:-1], years[1:]):
    later = samples_gdf[f"LULC_{y2}"].astype(float)
    earlier = samples_gdf[f"LULC_{y1}"].astype(float)
    samples_gdf[f"LULC_change_{y1}_{y2}"] = later - earlier

# Overall change between the first and last year of the series.
lulc_last = samples_gdf["LULC_2022"].astype(float)
lulc_first = samples_gdf["LULC_2017"].astype(float)
samples_gdf["LULC_change_17_22"] = lulc_last - lulc_first
print("Step 3 Complete.\n")

# ===================== 4. Save Final Output ===================== #
# Writes the sample table twice under data/: as a shapefile (with geometry) and
# as a CSV (attributes only). The file name carries the number of samples.
print("Step 4: Saving final output...")
os.makedirs("data", exist_ok=True)
output_name = f"Samples_{num_samples}"
# NOTE(review): the captured run output shows ogr_write warnings raised from
# this call - presumably column names exceeding the shapefile field-name limit
# and being truncated; confirm and shorten the names if the .shp is consumed.
samples_gdf.to_file(f"data/{output_name}.shp")
samples_gdf.drop(columns="geometry").to_csv(f"data/{output_name}.csv", index=False)

print(f"✅ Final dataset with {num_samples} sample(s) saved.")

# ===================== 5. Calculate and Print Mean LULC Changes ===================== #
print("\nStep 5: Calculating and printing mean LULC changes...")
# Every column whose name contains "LULC_change" (pairwise and overall).
lulc_change_columns = [col for col in samples_gdf.columns if "LULC_change" in col]
for col in lulc_change_columns:
    mean_change = samples_gdf[col].mean()
    print(f"  - Mean value for '{col}': {mean_change:.4f}")
print("Step 5 Complete.\n")


Step 1: Loading 200 river points from '200Sampling.shp'...
✅ Successfully loaded 200 points from the shapefile.
Step 1 Complete.

Step 2: Interpolating initial features...
Step 2 Complete.

Step 3: Calculating hydrological and LULC features...
  - Aligning rasters...



  river_sample_gdf["geometry"] = river_sample_gdf.geometry.centroid


  - Loading DEM and NDWI...
  - Reading vector files (brickfields and industries)...
  - Calculating Euclidean-based flow-path distances...
  - Starting LULC extraction loop...
    - Processing LULC for year 2017...
    - Processing LULC for year 2018...
    - Processing LULC for year 2019...
    - Processing LULC for year 2020...
    - Processing LULC for year 2021...
    - Processing LULC for year 2022...
  - LULC extraction loop complete.
  - Calculating year-to-year LULC changes...
Step 3 Complete.

Step 4: Saving final output...
✅ Final dataset with 200 sample(s) saved.

Step 5: Calculating and printing mean LULC changes...
  - Mean value for 'LULC_change_2017_2018': 0.2150
  - Mean value for 'LULC_change_2018_2019': 0.2050
  - Mean value for 'LULC_change_2019_2020': -0.3550
  - Mean value for 'LULC_change_2020_2021': 0.2650
  - Mean value for 'LULC_change_2021_2022': -0.1450
  - Mean value for 'LULC_change_17_22': 0.1850
Step 5 Complete.



  samples_gdf.to_file(f"data/{output_name}.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


In [7]:
import os
import numpy as np
import pandas as pd
import rasterio
import geopandas as gpd
from rasterio.features import shapes
from rasterio.warp import reproject, Resampling
from shapely.geometry import shape, Point
from scipy.spatial import cKDTree
from pyproj import CRS, Transformer

# ===================== 1. Load 200 River Points from File ===================== #
# Reads the pre-made sampling shapefile and produces:
#   river_sample_gdf - the samples as point geometries in EPSG:4326
#   num_samples      - number of features read from the file
#   river_coords     - array of [x, y] pairs, one row per sample
print("Step 1: Loading 200 river points from '200Sampling.shp'...")
sampling_file_path = "200Sampling.shp"
try:
    # Load the user's pre-created shapefile
    river_sample_gdf = gpd.read_file(sampling_file_path)
    num_samples = len(river_sample_gdf)

    # Ensure all geometries are points. Only non-point features are replaced,
    # and their centroid is taken in the file's native CRS *before*
    # reprojecting. Calling .centroid on every feature after the conversion to
    # EPSG:4326 raised a warning on each run (presumably geopandas'
    # geographic-CRS centroid warning).
    non_point = river_sample_gdf.geom_type != "Point"
    if non_point.any():
        river_sample_gdf.loc[non_point, "geometry"] = river_sample_gdf.geometry[non_point].centroid

    # Ensure the GeoDataFrame has the correct CRS for further processing
    if river_sample_gdf.crs != "EPSG:4326":
        river_sample_gdf = river_sample_gdf.to_crs("EPSG:4326")

except FileNotFoundError:
    print(f"❌ Error: The file '{sampling_file_path}' was not found. Please ensure it is in the correct directory.")
    exit()
except Exception as e:
    # Top-level boundary of the script: report the problem and stop.
    print(f"❌ Error: Could not read '{sampling_file_path}'. Please check the file's integrity and format. Error: {e}")
    exit()

river_coords = np.array([[p.x, p.y] for p in river_sample_gdf.geometry])
print(f"✅ Successfully loaded {num_samples} points from the shapefile.")
print("Step 1 Complete.\n")

# ===================== 2. Interpolate Initial Features ===================== #
# Loads the measured samples and selects the columns that will be interpolated
# onto the river points.
print("Step 2: Interpolating initial features...")
try:
    # Path is relative to the working directory (one level up, then data/).
    data = pd.read_csv("../data/RainySeason.csv")
except FileNotFoundError:
    print("❌ Error: RainySeason.csv not found. Please check the file path.")
    exit()

# Known sample locations as a two-column array in (Long, Lat) order, matching
# the [x, y] order used for river_coords.
coords = data[['Long', 'Lat']].values
# Columns of the CSV that are interpolated; each becomes a column of river_df.
features_to_interpolate = ['CrR', 'NiR', 'CuR', 'AsR', 'CdR', 'PbR', 'MR', 'SandR', 'SiltR', 'ClayR', 'FeR']
numeric_features = data[features_to_interpolate]

def idw_interpolation(known_coords, known_values, query_coords, power=2, k=4):
    """Estimate values at query points by inverse-distance weighting (IDW).

    Each query point receives the weighted mean of its ``k`` nearest known
    points, with weights proportional to ``1 / distance ** power``.

    Args:
        known_coords: Array of shape (n, d) with the locations of the samples.
        known_values: Array of length n with the value measured at each sample.
        query_coords: Array of shape (m, d) with the locations to estimate.
        power: Exponent applied to the distance; larger values make the
            estimate more local.
        k: Maximum number of neighbours to use. It is reduced automatically
            when fewer than ``k`` known points exist.

    Returns:
        1-D float array of length m with the interpolated values.

    Raises:
        ValueError: if there are no known points.
    """
    known_coords = np.asarray(known_coords, dtype=float)
    known_values = np.asarray(known_values, dtype=float)
    query_coords = np.asarray(query_coords, dtype=float)

    # Never ask for more neighbours than exist: cKDTree pads missing
    # neighbours with index n (out of bounds for known_values) and an infinite
    # distance, which previously caused an IndexError.
    k = min(k, len(known_coords))
    if k < 1:
        raise ValueError("idw_interpolation requires at least one known point")

    tree = cKDTree(known_coords)
    dists, idxs = tree.query(query_coords, k=k)
    # With k == 1 the tree returns 1-D arrays; normalise to (m, k).
    dists = dists.reshape(len(query_coords), -1)
    idxs = idxs.reshape(len(query_coords), -1)

    # A query that coincides with a sample would divide by zero; a tiny
    # distance makes that sample dominate the weighted mean instead.
    dists = np.where(dists == 0, 1e-10, dists)
    weights = 1.0 / (dists ** power)
    weights /= weights.sum(axis=1, keepdims=True)
    return np.sum(weights * known_values[idxs], axis=1)

# Interpolate every measured feature onto the river points: one output column
# per feature, one row per river point.
feature_names = numeric_features.columns
interpolated_features = np.zeros((len(river_coords), len(feature_names)))
for col_idx, feature in enumerate(feature_names):
    known = numeric_features[feature].values
    interpolated_features[:, col_idx] = idw_interpolation(coords, known, river_coords)

# Attach the point coordinates and a provenance tag to the interpolated table.
river_df = pd.DataFrame(interpolated_features, columns=feature_names).assign(
    Long=river_coords[:, 0],
    Lat=river_coords[:, 1],
    Source='River_Interpolated',
)
print("Step 2 Complete.\n")

# ===================== 3. Calculate Hydrological and LULC Features (Optimized) ===================== #
print("Step 3: Calculating hydrological and LULC features...")
# Input rasters and the output path for the NDWI raster resampled onto the DEM grid.
dem_path = "DEMF.tif"
ndwi_path = "CalIndices/ndwi.tif"
aligned_ndwi_path = "ndwi_aligned.tif"

print("  - Aligning rasters...")
def align_rasters(base_raster_path, match_raster_path, out_raster_path):
    """Resample one raster onto the grid of another and write the result.

    Band 1 of ``match_raster_path`` is reprojected with bilinear resampling
    onto the CRS, transform and dimensions of ``base_raster_path`` and written
    to ``out_raster_path`` as a single-band float32 raster.

    Args:
        base_raster_path: Raster that defines the target grid.
        match_raster_path: Raster whose first band is resampled.
        out_raster_path: Path of the raster to create.

    Returns:
        ``out_raster_path``.

    If either input cannot be opened, an error message is printed and the
    script stops via ``exit()``.
    """
    try:
        with rasterio.open(base_raster_path) as base, rasterio.open(match_raster_path) as match:
            out_meta = base.meta.copy()
            src_band = match.read(1)
            # Pass the source nodata value through so nodata pixels are not
            # blended into their neighbours by the bilinear resampling.
            src_nodata = match.nodata
            reprojected = np.empty((base.height, base.width), dtype=np.float32)
            reproject(
                source=src_band,
                destination=reprojected,
                src_transform=match.transform,
                src_crs=match.crs,
                src_nodata=src_nodata,
                dst_transform=base.transform,
                dst_crs=base.crs,
                dst_nodata=src_nodata,
                resampling=Resampling.bilinear,
            )
            # The output holds the resampled band, not the base raster's data:
            # describe it as one float32 band instead of inheriting the base
            # raster's dtype, band count and nodata value.
            out_meta.update(dtype="float32", count=1, nodata=src_nodata)
        with rasterio.open(out_raster_path, 'w', **out_meta) as dst:
            dst.write(reprojected, 1)
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error: Could not open raster files for alignment. Please check the file paths. Error: {e}")
        exit()
    return out_raster_path

# Write the NDWI raster resampled onto the DEM grid (result goes to aligned_ndwi_path).
align_rasters(dem_path, ndwi_path, aligned_ndwi_path)

print("  - Loading DEM and NDWI...")
# Only the DEM's georeferencing metadata is read here; no pixel data is loaded.
with rasterio.open(dem_path) as dem_src:
    dem_transform = dem_src.transform
    dem_crs = dem_src.crs
    # First component of the pixel size only.
    # NOTE(review): assumes square pixels - confirm against the DEM.
    dem_resolution = dem_src.res[0]

# Create samples_gdf from the loaded points
# (built in EPSG:4326 from Long/Lat, then moved into the DEM's CRS)
samples_gdf = gpd.GeoDataFrame(river_df, geometry=gpd.points_from_xy(river_df.Long, river_df.Lat), crs="EPSG:4326")
samples_gdf = samples_gdf.to_crs(dem_crs)

print("  - Reading vector files (brickfields and industries)...")
try:
    brickfields_path = "brick_field_point.shp"
    industries_path = "industry_point.shp"
    # Both layers are reprojected to the DEM's CRS so they share coordinates
    # with samples_gdf.
    brickfields = gpd.read_file(brickfields_path).to_crs(dem_crs)
    industries = gpd.read_file(industries_path).to_crs(dem_crs)
except Exception as e:
    print(f"❌ Error: Could not read shapefiles. Please check the file paths and ensure they are valid. Error: {e}")
    exit()

# Reduce every geometry to a single point so .x / .y can be read later.
samples_gdf["geometry"] = samples_gdf.geometry.centroid
brickfields["geometry"] = brickfields.geometry.centroid
industries["geometry"] = industries.geometry.centroid

# --- ADDED: Calculate number of brickfields and industries within a buffer ---
# For every sample, count the brickfields and industries that fall inside a
# circular buffer around it; results go to num_brick_field / num_industry.
print("  - Counting nearby brickfields and industries...")
# Buffer radius in units of the DEM CRS.
# NOTE(review): this is 1000 m only if that CRS is metre-based - confirm.
buffer_distance = 1000 # 1000 meters
samples_gdf['num_brick_field'] = 0
samples_gdf['num_industry'] = 0

samples_buffered = samples_gdf.copy()
samples_buffered['geometry'] = samples_buffered.geometry.buffer(buffer_distance)

# Spatial join to count brickfields.
# The join must be "inner": a left join keeps one placeholder row for every
# sample without a match, and size() counted that row, so samples with no
# brickfield nearby were reported as having one.
sjoin_brick = gpd.sjoin(samples_buffered, brickfields, how="inner", predicate="intersects")
brick_counts = sjoin_brick.groupby(sjoin_brick.index).size()
# Samples absent from the join have no match; they map to NaN and become 0.
samples_gdf['num_brick_field'] = samples_gdf.index.map(brick_counts).fillna(0).astype(int)

# Spatial join to count industries (same reasoning as above).
sjoin_ind = gpd.sjoin(samples_buffered, industries, how="inner", predicate="intersects")
ind_counts = sjoin_ind.groupby(sjoin_ind.index).size()
samples_gdf['num_industry'] = samples_gdf.index.map(ind_counts).fillna(0).astype(int)

print("  - Counting complete.")
# -----------------------------------------------------------------------------

def world_to_pixel(transform, x, y):
    """Convert map coordinates to the (row, col) index of the containing pixel.

    Args:
        transform: Pixel-to-map transform; it must support ``~transform`` and
            multiplication by an ``(x, y)`` tuple.
        x: Map x coordinate.
        y: Map y coordinate.

    Returns:
        Tuple ``(row, col)`` of Python ints.
    """
    col, row = ~transform * (x, y)
    # Floor rather than int(): int() truncates toward zero, so coordinates just
    # outside the top/left edge (fractional index between -1 and 0) would be
    # reported as row/col 0 instead of -1.
    return int(np.floor(row)), int(np.floor(col))

def compute_distances_euclidean(points_gdf, targets_gdf, transform, resolution):
    """Return, for each point, the straight-line distance to the nearest target.

    Both layers are converted to raster (row, col) indices with
    ``world_to_pixel``; nearest-neighbour distances are found in pixel space
    with a KD-tree and then scaled by ``resolution``.

    Args:
        points_gdf: Layer whose ``geometry.x`` / ``geometry.y`` give the query
            locations.
        targets_gdf: Layer of candidate targets, read the same way.
        transform: Raster transform handed to ``world_to_pixel``.
        resolution: Size of one pixel; the result is expressed in its unit.
            NOTE(review): a single value assumes square pixels - confirm.

    Returns:
        1-D float array with one distance per row of ``points_gdf`` (empty
        when there are no points).

    Raises:
        ValueError: if ``targets_gdf`` contains no geometries.
    """
    target_pixels = np.array(
        [world_to_pixel(transform, x, y) for x, y in zip(targets_gdf.geometry.x, targets_gdf.geometry.y)]
    )
    if target_pixels.size == 0:
        raise ValueError("targets_gdf contains no geometries; cannot compute nearest-target distances")

    point_pixels = np.array(
        [world_to_pixel(transform, x, y) for x, y in zip(points_gdf.geometry.x, points_gdf.geometry.y)]
    )
    if point_pixels.size == 0:
        return np.array([])

    # One batched query instead of one tree lookup per point.
    dist_pixels, _ = cKDTree(target_pixels).query(point_pixels)
    return dist_pixels * resolution

print("  - Calculating Euclidean-based flow-path distances...")
# Nearest-target distance from every sample to each pollution-source layer.
for dist_column, target_layer in (("hydro_dist_brick", brickfields), ("hydro_dist_ind", industries)):
    samples_gdf[dist_column] = compute_distances_euclidean(samples_gdf, target_layer, dem_transform, dem_resolution)

print("  - Starting LULC extraction loop...")
lulc_dir = "LULCMerged"
years = [2017, 2018, 2019, 2020, 2021, 2022]
# The sample coordinates are the same for every year, so build the list once.
# NOTE(review): coordinates are in the DEM's CRS - confirm the LULC rasters use the same CRS.
sample_xy = [(pt.x, pt.y) for pt in samples_gdf.geometry]
for y in years:
    lulc_path = os.path.join(lulc_dir, f"LULC{y}.tif")
    print(f"    - Processing LULC for year {y}...")
    try:
        with rasterio.open(lulc_path) as lulc_src:
            # sample() yields one array of band values per point; keep band 1.
            lulc_values = [band_values[0] for band_values in lulc_src.sample(sample_xy)]
            samples_gdf[f"LULC_{y}"] = lulc_values
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error processing {lulc_path}: {e}")
        # Keep the column present (all NaN) so later steps can still refer to it.
        samples_gdf[f"LULC_{y}"] = np.nan
print("  - LULC extraction loop complete.")

print("  - Calculating year-to-year LULC changes...")
# Difference between each pair of consecutive years, computed on float copies.
for y1, y2 in zip(years[:-1], years[1:]):
    later = samples_gdf[f"LULC_{y2}"].astype(float)
    earlier = samples_gdf[f"LULC_{y1}"].astype(float)
    samples_gdf[f"LULC_change_{y1}_{y2}"] = later - earlier

# Overall change between the first and last year of the series.
lulc_last = samples_gdf["LULC_2022"].astype(float)
lulc_first = samples_gdf["LULC_2017"].astype(float)
samples_gdf["LULC_change_17_22"] = lulc_last - lulc_first
print("Step 3 Complete.\n")

# ===================== 4. Save Final Output ===================== #
# Writes the sample table twice under data/: as a shapefile (with geometry) and
# as a CSV (attributes only). The file name carries the number of samples.
print("Step 4: Saving final output...")
os.makedirs("data", exist_ok=True)
output_name = f"Samples_{num_samples}"
# NOTE(review): the captured run output shows ogr_write warnings raised from
# this call - presumably column names exceeding the shapefile field-name limit
# and being truncated; confirm and shorten the names if the .shp is consumed.
samples_gdf.to_file(f"data/{output_name}.shp")
samples_gdf.drop(columns="geometry").to_csv(f"data/{output_name}.csv", index=False)

print(f"✅ Final dataset with {num_samples} sample(s) saved.")

# ===================== 5. Calculate and Print Mean LULC Changes ===================== #
print("\nStep 5: Calculating and printing mean LULC changes...")
# Every column whose name contains "LULC_change" (pairwise and overall).
lulc_change_columns = [col for col in samples_gdf.columns if "LULC_change" in col]
for col in lulc_change_columns:
    mean_change = samples_gdf[col].mean()
    print(f"  - Mean value for '{col}': {mean_change:.4f}")
print("Step 5 Complete.\n")

Step 1: Loading 200 river points from '200Sampling.shp'...
✅ Successfully loaded 200 points from the shapefile.
Step 1 Complete.

Step 2: Interpolating initial features...
Step 2 Complete.

Step 3: Calculating hydrological and LULC features...
  - Aligning rasters...



  river_sample_gdf["geometry"] = river_sample_gdf.geometry.centroid


  - Loading DEM and NDWI...
  - Reading vector files (brickfields and industries)...
  - Counting nearby brickfields and industries...
  - Counting complete.
  - Calculating Euclidean-based flow-path distances...
  - Starting LULC extraction loop...
    - Processing LULC for year 2017...
    - Processing LULC for year 2018...
    - Processing LULC for year 2019...
    - Processing LULC for year 2020...
    - Processing LULC for year 2021...
    - Processing LULC for year 2022...
  - LULC extraction loop complete.
  - Calculating year-to-year LULC changes...
Step 3 Complete.

Step 4: Saving final output...
✅ Final dataset with 200 sample(s) saved.

Step 5: Calculating and printing mean LULC changes...
  - Mean value for 'LULC_change_2017_2018': 0.2150
  - Mean value for 'LULC_change_2018_2019': 0.2050
  - Mean value for 'LULC_change_2019_2020': -0.3550
  - Mean value for 'LULC_change_2020_2021': 0.2650
  - Mean value for 'LULC_change_2021_2022': -0.1450
  - Mean value for 'LULC_change_1

  samples_gdf.to_file(f"data/{output_name}.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


In [8]:
import os
import numpy as np
import pandas as pd
import rasterio
import geopandas as gpd
from rasterio.features import shapes
from rasterio.warp import reproject, Resampling
from shapely.geometry import shape, Point
from scipy.spatial import cKDTree
from pyproj import CRS, Transformer

# ===================== 1. Load 200 River Points from File ===================== #
# Reads the pre-made sampling shapefile and produces:
#   river_sample_gdf - the samples as point geometries in EPSG:4326
#   num_samples      - number of features read from the file
#   river_coords     - array of [x, y] pairs, one row per sample
print("Step 1: Loading 200 river points from '200Sampling.shp'...")
sampling_file_path = "200Sampling.shp"
try:
    # Load the user's pre-created shapefile
    river_sample_gdf = gpd.read_file(sampling_file_path)
    num_samples = len(river_sample_gdf)

    # Ensure all geometries are points. Only non-point features are replaced,
    # and their centroid is taken in the file's native CRS *before*
    # reprojecting. Calling .centroid on every feature after the conversion to
    # EPSG:4326 raised a warning on each run (presumably geopandas'
    # geographic-CRS centroid warning).
    non_point = river_sample_gdf.geom_type != "Point"
    if non_point.any():
        river_sample_gdf.loc[non_point, "geometry"] = river_sample_gdf.geometry[non_point].centroid

    # Ensure the GeoDataFrame has the correct CRS for further processing
    if river_sample_gdf.crs != "EPSG:4326":
        river_sample_gdf = river_sample_gdf.to_crs("EPSG:4326")

except FileNotFoundError:
    print(f"❌ Error: The file '{sampling_file_path}' was not found. Please ensure it is in the correct directory.")
    exit()
except Exception as e:
    # Top-level boundary of the script: report the problem and stop.
    print(f"❌ Error: Could not read '{sampling_file_path}'. Please check the file's integrity and format. Error: {e}")
    exit()

river_coords = np.array([[p.x, p.y] for p in river_sample_gdf.geometry])
print(f"✅ Successfully loaded {num_samples} points from the shapefile.")
print("Step 1 Complete.\n")

# ===================== 2. Interpolate Initial Features ===================== #
# Loads the measured samples and selects the columns that will be interpolated
# onto the river points.
print("Step 2: Interpolating initial features...")
try:
    # Path is relative to the working directory (one level up, then data/).
    data = pd.read_csv("../data/RainySeason.csv")
except FileNotFoundError:
    print("❌ Error: RainySeason.csv not found. Please check the file path.")
    exit()

# Known sample locations as a two-column array in (Long, Lat) order, matching
# the [x, y] order used for river_coords.
coords = data[['Long', 'Lat']].values
# Columns of the CSV that are interpolated; each becomes a column of river_df.
features_to_interpolate = ['CrR', 'NiR', 'CuR', 'AsR', 'CdR', 'PbR', 'MR', 'SandR', 'SiltR', 'ClayR', 'FeR']
numeric_features = data[features_to_interpolate]

def idw_interpolation(known_coords, known_values, query_coords, power=2, k=4):
    """Estimate values at query points by inverse-distance weighting (IDW).

    Each query point receives the weighted mean of its ``k`` nearest known
    points, with weights proportional to ``1 / distance ** power``.

    Args:
        known_coords: Array of shape (n, d) with the locations of the samples.
        known_values: Array of length n with the value measured at each sample.
        query_coords: Array of shape (m, d) with the locations to estimate.
        power: Exponent applied to the distance; larger values make the
            estimate more local.
        k: Maximum number of neighbours to use. It is reduced automatically
            when fewer than ``k`` known points exist.

    Returns:
        1-D float array of length m with the interpolated values.

    Raises:
        ValueError: if there are no known points.
    """
    known_coords = np.asarray(known_coords, dtype=float)
    known_values = np.asarray(known_values, dtype=float)
    query_coords = np.asarray(query_coords, dtype=float)

    # Never ask for more neighbours than exist: cKDTree pads missing
    # neighbours with index n (out of bounds for known_values) and an infinite
    # distance, which previously caused an IndexError.
    k = min(k, len(known_coords))
    if k < 1:
        raise ValueError("idw_interpolation requires at least one known point")

    tree = cKDTree(known_coords)
    dists, idxs = tree.query(query_coords, k=k)
    # With k == 1 the tree returns 1-D arrays; normalise to (m, k).
    dists = dists.reshape(len(query_coords), -1)
    idxs = idxs.reshape(len(query_coords), -1)

    # A query that coincides with a sample would divide by zero; a tiny
    # distance makes that sample dominate the weighted mean instead.
    dists = np.where(dists == 0, 1e-10, dists)
    weights = 1.0 / (dists ** power)
    weights /= weights.sum(axis=1, keepdims=True)
    return np.sum(weights * known_values[idxs], axis=1)

# Interpolate every measured feature onto the river points: one output column
# per feature, one row per river point.
feature_names = numeric_features.columns
interpolated_features = np.zeros((len(river_coords), len(feature_names)))
for col_idx, feature in enumerate(feature_names):
    known = numeric_features[feature].values
    interpolated_features[:, col_idx] = idw_interpolation(coords, known, river_coords)

# Attach the point coordinates and a provenance tag to the interpolated table.
river_df = pd.DataFrame(interpolated_features, columns=feature_names).assign(
    Long=river_coords[:, 0],
    Lat=river_coords[:, 1],
    Source='River_Interpolated',
)
print("Step 2 Complete.\n")

# ===================== 3. Calculate Hydrological and LULC Features (Optimized) ===================== #
print("Step 3: Calculating hydrological and LULC features...")
# Input rasters and the output path for the NDWI raster resampled onto the DEM grid.
dem_path = "DEMF.tif"
ndwi_path = "CalIndices/ndwi.tif"
aligned_ndwi_path = "ndwi_aligned.tif"

print("  - Aligning rasters...")
def align_rasters(base_raster_path, match_raster_path, out_raster_path):
    """Resample one raster onto the grid of another and write the result.

    Band 1 of ``match_raster_path`` is reprojected with bilinear resampling
    onto the CRS, transform and dimensions of ``base_raster_path`` and written
    to ``out_raster_path`` as a single-band float32 raster.

    Args:
        base_raster_path: Raster that defines the target grid.
        match_raster_path: Raster whose first band is resampled.
        out_raster_path: Path of the raster to create.

    Returns:
        ``out_raster_path``.

    If either input cannot be opened, an error message is printed and the
    script stops via ``exit()``.
    """
    try:
        with rasterio.open(base_raster_path) as base, rasterio.open(match_raster_path) as match:
            out_meta = base.meta.copy()
            src_band = match.read(1)
            # Pass the source nodata value through so nodata pixels are not
            # blended into their neighbours by the bilinear resampling.
            src_nodata = match.nodata
            reprojected = np.empty((base.height, base.width), dtype=np.float32)
            reproject(
                source=src_band,
                destination=reprojected,
                src_transform=match.transform,
                src_crs=match.crs,
                src_nodata=src_nodata,
                dst_transform=base.transform,
                dst_crs=base.crs,
                dst_nodata=src_nodata,
                resampling=Resampling.bilinear,
            )
            # The output holds the resampled band, not the base raster's data:
            # describe it as one float32 band instead of inheriting the base
            # raster's dtype, band count and nodata value.
            out_meta.update(dtype="float32", count=1, nodata=src_nodata)
        with rasterio.open(out_raster_path, 'w', **out_meta) as dst:
            dst.write(reprojected, 1)
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error: Could not open raster files for alignment. Please check the file paths. Error: {e}")
        exit()
    return out_raster_path

# Write the NDWI raster resampled onto the DEM grid (result goes to aligned_ndwi_path).
align_rasters(dem_path, ndwi_path, aligned_ndwi_path)

print("  - Loading DEM and NDWI...")
# Only the DEM's georeferencing metadata is read here; no pixel data is loaded.
with rasterio.open(dem_path) as dem_src:
    dem_transform = dem_src.transform
    dem_crs = dem_src.crs
    # First component of the pixel size only.
    # NOTE(review): assumes square pixels - confirm against the DEM.
    dem_resolution = dem_src.res[0]

# Create samples_gdf from the loaded points
# (built in EPSG:4326 from Long/Lat, then moved into the DEM's CRS)
samples_gdf = gpd.GeoDataFrame(river_df, geometry=gpd.points_from_xy(river_df.Long, river_df.Lat), crs="EPSG:4326")
samples_gdf = samples_gdf.to_crs(dem_crs)

print("  - Reading vector files (brickfields and industries)...")
try:
    brickfields_path = "brick_field_point.shp"
    industries_path = "industry_point.shp"
    # Both layers are reprojected to the DEM's CRS so they share coordinates
    # with samples_gdf.
    brickfields = gpd.read_file(brickfields_path).to_crs(dem_crs)
    industries = gpd.read_file(industries_path).to_crs(dem_crs)
except Exception as e:
    print(f"❌ Error: Could not read shapefiles. Please check the file paths and ensure they are valid. Error: {e}")
    exit()

# Reduce every geometry to a single point so .x / .y can be read later.
samples_gdf["geometry"] = samples_gdf.geometry.centroid
brickfields["geometry"] = brickfields.geometry.centroid
industries["geometry"] = industries.geometry.centroid

# --- ADDED: Calculate number of brickfields and industries within a buffer ---
# For every sample, count the brickfields and industries that fall inside a
# circular buffer around it; results go to num_brick_field / num_industry.
print("  - Counting nearby brickfields and industries...")
# Buffer radius in units of the DEM CRS.
# NOTE(review): this is 1000 m only if that CRS is metre-based - confirm.
buffer_distance = 1000 # 1000 meters
samples_gdf['num_brick_field'] = 0
samples_gdf['num_industry'] = 0

samples_buffered = samples_gdf.copy()
samples_buffered['geometry'] = samples_buffered.geometry.buffer(buffer_distance)

# Spatial join to count brickfields.
# The join must be "inner": a left join keeps one placeholder row for every
# sample without a match, and size() counted that row, so samples with no
# brickfield nearby were reported as having one.
sjoin_brick = gpd.sjoin(samples_buffered, brickfields, how="inner", predicate="intersects")
brick_counts = sjoin_brick.groupby(sjoin_brick.index).size()
# Samples absent from the join have no match; they map to NaN and become 0.
samples_gdf['num_brick_field'] = samples_gdf.index.map(brick_counts).fillna(0).astype(int)

# Spatial join to count industries (same reasoning as above).
sjoin_ind = gpd.sjoin(samples_buffered, industries, how="inner", predicate="intersects")
ind_counts = sjoin_ind.groupby(sjoin_ind.index).size()
samples_gdf['num_industry'] = samples_gdf.index.map(ind_counts).fillna(0).astype(int)

print("  - Counting complete.")
# -----------------------------------------------------------------------------

def world_to_pixel(transform, x, y):
    """Convert map coordinates to the (row, col) index of the containing pixel.

    Args:
        transform: Pixel-to-map transform; it must support ``~transform`` and
            multiplication by an ``(x, y)`` tuple.
        x: Map x coordinate.
        y: Map y coordinate.

    Returns:
        Tuple ``(row, col)`` of Python ints.
    """
    col, row = ~transform * (x, y)
    # Floor rather than int(): int() truncates toward zero, so coordinates just
    # outside the top/left edge (fractional index between -1 and 0) would be
    # reported as row/col 0 instead of -1.
    return int(np.floor(row)), int(np.floor(col))

def compute_distances_euclidean(points_gdf, targets_gdf, transform, resolution):
    """Return, for each point, the straight-line distance to the nearest target.

    Both layers are converted to raster (row, col) indices with
    ``world_to_pixel``; nearest-neighbour distances are found in pixel space
    with a KD-tree and then scaled by ``resolution``.

    Args:
        points_gdf: Layer whose ``geometry.x`` / ``geometry.y`` give the query
            locations.
        targets_gdf: Layer of candidate targets, read the same way.
        transform: Raster transform handed to ``world_to_pixel``.
        resolution: Size of one pixel; the result is expressed in its unit.
            NOTE(review): a single value assumes square pixels - confirm.

    Returns:
        1-D float array with one distance per row of ``points_gdf`` (empty
        when there are no points).

    Raises:
        ValueError: if ``targets_gdf`` contains no geometries.
    """
    target_pixels = np.array(
        [world_to_pixel(transform, x, y) for x, y in zip(targets_gdf.geometry.x, targets_gdf.geometry.y)]
    )
    if target_pixels.size == 0:
        raise ValueError("targets_gdf contains no geometries; cannot compute nearest-target distances")

    point_pixels = np.array(
        [world_to_pixel(transform, x, y) for x, y in zip(points_gdf.geometry.x, points_gdf.geometry.y)]
    )
    if point_pixels.size == 0:
        return np.array([])

    # One batched query instead of one tree lookup per point.
    dist_pixels, _ = cKDTree(target_pixels).query(point_pixels)
    return dist_pixels * resolution

print("  - Calculating Euclidean-based flow-path distances...")
# Nearest-target distance from every sample to each pollution-source layer.
for dist_column, target_layer in (("hydro_dist_brick", brickfields), ("hydro_dist_ind", industries)):
    samples_gdf[dist_column] = compute_distances_euclidean(samples_gdf, target_layer, dem_transform, dem_resolution)

print("  - Starting LULC extraction loop...")
lulc_dir = "LULCMerged"
years = [2017, 2018, 2019, 2020, 2021, 2022]
# The sample coordinates are the same for every year, so build the list once.
# NOTE(review): coordinates are in the DEM's CRS - confirm the LULC rasters use the same CRS.
sample_xy = [(pt.x, pt.y) for pt in samples_gdf.geometry]
for y in years:
    lulc_path = os.path.join(lulc_dir, f"LULC{y}.tif")
    print(f"    - Processing LULC for year {y}...")
    try:
        with rasterio.open(lulc_path) as lulc_src:
            # sample() yields one array of band values per point; keep band 1.
            lulc_values = [band_values[0] for band_values in lulc_src.sample(sample_xy)]
            samples_gdf[f"LULC_{y}"] = lulc_values
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error processing {lulc_path}: {e}")
        # Keep the column present (all NaN) so later steps can still refer to it.
        samples_gdf[f"LULC_{y}"] = np.nan
print("  - LULC extraction loop complete.")

# --- UPDATED: Calculate LULC Change using Change Vector Analysis (CVA) ---
# Stores the signed 2017 -> 2022 difference as CVA_Direction and its absolute
# value as CVA_Magnitude.
print("  - Calculating LULC changes using Change Vector Analysis (CVA)...")
if {"LULC_2017", "LULC_2022"}.issubset(samples_gdf.columns):
    lulc_end = samples_gdf["LULC_2022"].astype(float)
    lulc_start = samples_gdf["LULC_2017"].astype(float)
    lulc_change_vector = lulc_end - lulc_start
    samples_gdf["CVA_Magnitude"] = lulc_change_vector.abs()
    samples_gdf["CVA_Direction"] = lulc_change_vector
else:
    print("❌ Error: LULC_2017 and LULC_2022 columns not found. Cannot perform CVA.")
print("Step 3 Complete.\n")

# ===================== 4. Save Final Output ===================== #
# Writes the sample table twice under data/: as a shapefile (with geometry) and
# as a CSV (attributes only). The file name carries the number of samples.
print("Step 4: Saving final output...")
os.makedirs("data", exist_ok=True)
output_name = f"Samples_{num_samples}"
# NOTE(review): the captured run output shows ogr_write warnings raised from
# this call - presumably column names exceeding the shapefile field-name limit
# and being truncated; confirm and shorten the names if the .shp is consumed.
samples_gdf.to_file(f"data/{output_name}.shp")
samples_gdf.drop(columns="geometry").to_csv(f"data/{output_name}.csv", index=False)

print(f"✅ Final dataset with {num_samples} sample(s) saved.")

# ===================== 5. Calculate and Print Mean LULC Changes ===================== #
print("\nStep 5: Calculating and printing mean LULC changes...")
# Only the two CVA columns are summarised; a column is skipped if the CVA step
# did not create it.
cva_columns = ["CVA_Magnitude", "CVA_Direction"]
for col in cva_columns:
    if col in samples_gdf.columns:
        mean_change = samples_gdf[col].mean()
        print(f"  - Mean value for '{col}': {mean_change:.4f}")
print("Step 5 Complete.\n")


Step 1: Loading 200 river points from '200Sampling.shp'...
✅ Successfully loaded 200 points from the shapefile.
Step 1 Complete.

Step 2: Interpolating initial features...
Step 2 Complete.

Step 3: Calculating hydrological and LULC features...
  - Aligning rasters...



  river_sample_gdf["geometry"] = river_sample_gdf.geometry.centroid


  - Loading DEM and NDWI...
  - Reading vector files (brickfields and industries)...
  - Counting nearby brickfields and industries...
  - Counting complete.
  - Calculating Euclidean-based flow-path distances...
  - Starting LULC extraction loop...
    - Processing LULC for year 2017...
    - Processing LULC for year 2018...
    - Processing LULC for year 2019...
    - Processing LULC for year 2020...
    - Processing LULC for year 2021...
    - Processing LULC for year 2022...
  - LULC extraction loop complete.
  - Calculating LULC changes using Change Vector Analysis (CVA)...
Step 3 Complete.

Step 4: Saving final output...
✅ Final dataset with 200 sample(s) saved.

Step 5: Calculating and printing mean LULC changes...
  - Mean value for 'CVA_Magnitude': 0.5950
  - Mean value for 'CVA_Direction': 0.1850
Step 5 Complete.



  samples_gdf.to_file(f"data/{output_name}.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


In [9]:
import os
import numpy as np
import pandas as pd
import rasterio
import geopandas as gpd
from rasterio.features import shapes
from rasterio.warp import reproject, Resampling
from shapely.geometry import shape, Point
from scipy.spatial import cKDTree
from pyproj import CRS, Transformer

# ===================== 1. Load 200 River Points from File ===================== #
# Reads the pre-made sampling shapefile and produces:
#   river_sample_gdf - the samples as point geometries in EPSG:4326
#   num_samples      - number of features read from the file
#   river_coords     - array of [x, y] pairs, one row per sample
print("Step 1: Loading 200 river points from '200Sampling.shp'...")
sampling_file_path = "200Sampling.shp"
try:
    # Load the user's pre-created shapefile
    river_sample_gdf = gpd.read_file(sampling_file_path)
    num_samples = len(river_sample_gdf)

    # Ensure all geometries are points. Only non-point features are replaced,
    # and their centroid is taken in the file's native CRS *before*
    # reprojecting. Calling .centroid on every feature after the conversion to
    # EPSG:4326 raised a warning on each run (presumably geopandas'
    # geographic-CRS centroid warning).
    non_point = river_sample_gdf.geom_type != "Point"
    if non_point.any():
        river_sample_gdf.loc[non_point, "geometry"] = river_sample_gdf.geometry[non_point].centroid

    # Ensure the GeoDataFrame has the correct CRS for further processing
    if river_sample_gdf.crs != "EPSG:4326":
        river_sample_gdf = river_sample_gdf.to_crs("EPSG:4326")

except FileNotFoundError:
    print(f"❌ Error: The file '{sampling_file_path}' was not found. Please ensure it is in the correct directory.")
    exit()
except Exception as e:
    # Top-level boundary of the script: report the problem and stop.
    print(f"❌ Error: Could not read '{sampling_file_path}'. Please check the file's integrity and format. Error: {e}")
    exit()

river_coords = np.array([[p.x, p.y] for p in river_sample_gdf.geometry])
print(f"✅ Successfully loaded {num_samples} points from the shapefile.")
print("Step 1 Complete.\n")

# ===================== 2. Interpolate Initial Features ===================== #
# Loads the measured samples and selects the columns that will be interpolated
# onto the river points.
print("Step 2: Interpolating initial features...")
try:
    # Path is relative to the working directory (one level up, then data/).
    data = pd.read_csv("../data/RainySeason.csv")
except FileNotFoundError:
    print("❌ Error: RainySeason.csv not found. Please check the file path.")
    exit()

# Known sample locations as a two-column array in (Long, Lat) order, matching
# the [x, y] order used for river_coords.
coords = data[['Long', 'Lat']].values
# Columns of the CSV that are interpolated; each becomes a column of river_df.
features_to_interpolate = ['CrR', 'NiR', 'CuR', 'AsR', 'CdR', 'PbR', 'MR', 'SandR', 'SiltR', 'ClayR', 'FeR']
numeric_features = data[features_to_interpolate]

def idw_interpolation(known_coords, known_values, query_coords, power=2, k=4):
    """Inverse-distance-weighted interpolation from the nearest known points.

    Args:
        known_coords: (n, d) array of locations with measured values.
        known_values: length-n array of values measured at ``known_coords``.
        query_coords: (m, d) array of locations to estimate.
        power: exponent applied to the distance in the weight ``1 / d**power``.
        k: number of nearest neighbours to blend; clamped to ``n`` so that
            small datasets do not fail.

    Returns:
        Length-m array of interpolated values.

    Raises:
        ValueError: if ``known_coords`` is empty.
    """
    known_coords = np.asarray(known_coords, dtype=float)
    known_values = np.asarray(known_values, dtype=float)
    query_coords = np.asarray(query_coords, dtype=float)
    if len(known_coords) == 0:
        raise ValueError("idw_interpolation requires at least one known point")

    # Asking cKDTree for more neighbours than exist pads the result with
    # infinite distances and an out-of-range index, which breaks the lookup.
    k = max(1, min(int(k), len(known_coords)))
    tree = cKDTree(known_coords)
    dists, idxs = tree.query(query_coords, k=k)
    # query() drops the neighbour axis when k == 1; restore it
    dists = dists.reshape(-1, k)
    idxs = idxs.reshape(-1, k)

    # Avoid division by zero when a query point coincides with a known point
    dists[dists == 0] = 1e-10
    weights = 1 / (dists ** power)
    weights /= weights.sum(axis=1)[:, None]
    return np.sum(weights * known_values[idxs], axis=1)

# One IDW pass per measured column, stacked side by side in column order
interpolated_features = np.column_stack([
    idw_interpolation(coords, numeric_features[name].values, river_coords)
    for name in numeric_features.columns
])

# Attach the query coordinates and a provenance tag to the interpolated values
river_df = pd.DataFrame(interpolated_features, columns=numeric_features.columns)
river_df = river_df.assign(
    Long=river_coords[:, 0],
    Lat=river_coords[:, 1],
    Source='River_Interpolated',
)
print("Step 2 Complete.\n")

# ===================== 3. Calculate Hydrological and LULC Features (Optimized) ===================== #
print("Step 3: Calculating hydrological and LULC features...")
# Input rasters and the output path for the DEM-aligned NDWI copy
dem_path, ndwi_path, aligned_ndwi_path = "DEMF.tif", "CalIndices/ndwi.tif", "ndwi_aligned.tif"

print("  - Aligning rasters...")
def align_rasters(base_raster_path, match_raster_path, out_raster_path):
    """Resample band 1 of one raster onto another raster's grid and save it.

    Args:
        base_raster_path: raster whose CRS, transform and size define the
            output grid.
        match_raster_path: raster whose first band is resampled (bilinear).
        out_raster_path: path of the single-band float32 raster to write.

    Returns:
        ``out_raster_path``.

    On a ``RasterioIOError`` an error message is printed and ``exit()`` is
    called.
    """
    try:
        with rasterio.open(base_raster_path) as base, rasterio.open(match_raster_path) as match:
            out_meta = base.meta.copy()
            # The output is one float32 band; reusing the base raster's own
            # dtype/count/nodata would mis-describe (or fail to store) it.
            out_meta.update(dtype="float32", count=1, nodata=np.nan)
            # Start from NaN so cells the source does not cover are well defined
            reprojected = np.full((base.height, base.width), np.nan, dtype=np.float32)
            reproject(
                source=match.read(1),
                destination=reprojected,
                src_transform=match.transform,
                src_crs=match.crs,
                src_nodata=match.nodata,
                dst_transform=base.transform,
                dst_crs=base.crs,
                dst_nodata=np.nan,
                resampling=Resampling.bilinear,
            )
        with rasterio.open(out_raster_path, 'w', **out_meta) as dst:
            dst.write(reprojected, 1)
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error: Could not open raster files for alignment. Please check the file paths. Error: {e}")
        exit()
    return out_raster_path

# Write a copy of the NDWI raster resampled onto the DEM grid
align_rasters(dem_path, ndwi_path, aligned_ndwi_path)

print("  - Loading DEM and NDWI...")
# Only the DEM's georeferencing is read here
with rasterio.open(dem_path) as dem_src:
    dem_transform = dem_src.transform
    dem_crs = dem_src.crs
    dem_resolution = dem_src.res[0]

# Create samples_gdf from the loaded points
samples_gdf = gpd.GeoDataFrame(river_df, geometry=gpd.points_from_xy(river_df.Long, river_df.Lat), crs="EPSG:4326")
samples_gdf = samples_gdf.to_crs(dem_crs)

print("  - Reading vector files (brickfields and industries)...")
try:
    brickfields_path = "brick_field_point.shp"
    industries_path = "industry_point.shp"
    brickfields = gpd.read_file(brickfields_path).to_crs(dem_crs)
    industries = gpd.read_file(industries_path).to_crs(dem_crs)
except Exception as e:
    print(f"❌ Error: Could not read shapefiles. Please check the file paths and ensure they are valid. Error: {e}")
    exit()

# Reduce every layer to single points
samples_gdf["geometry"] = samples_gdf.geometry.centroid
brickfields["geometry"] = brickfields.geometry.centroid
industries["geometry"] = industries.geometry.centroid

# --- ADDED: Calculate number of brickfields and industries within a buffer ---
print("  - Counting nearby brickfields and industries...")
# NOTE(review): the buffer is in dem_crs units; this is metres only if the
# DEM uses a metre-based projected CRS — confirm.
buffer_distance = 2000 # 2000 meters
samples_gdf['num_brick_field'] = 0
samples_gdf['num_industry'] = 0

samples_buffered = samples_gdf.copy()
samples_buffered['geometry'] = samples_buffered.geometry.buffer(buffer_distance)

# Spatial join to count brickfields.
# An inner join keeps only real (buffer, brickfield) pairs. A left join also
# keeps one placeholder row for each buffer with no match, so size() would
# report 1 instead of 0 for samples with nothing nearby.
sjoin_brick = gpd.sjoin(samples_buffered, brickfields, how="inner", predicate="intersects")
brick_counts = sjoin_brick.groupby(sjoin_brick.index).size()
samples_gdf['num_brick_field'] = samples_gdf.index.map(brick_counts).fillna(0).astype(int)

# Spatial join to count industries (same inner-join reasoning as above)
sjoin_ind = gpd.sjoin(samples_buffered, industries, how="inner", predicate="intersects")
ind_counts = sjoin_ind.groupby(sjoin_ind.index).size()
samples_gdf['num_industry'] = samples_gdf.index.map(ind_counts).fillna(0).astype(int)

print("  - Counting complete.")
# -----------------------------------------------------------------------------

def world_to_pixel(transform, x, y):
    """Convert a world coordinate to the (row, col) of the pixel containing it.

    Args:
        transform: pixel-to-world transform supporting ``~transform * (x, y)``.
        x, y: world coordinates in the transform's CRS.

    Returns:
        ``(row, col)`` as integers; negative when the point lies above or to
        the left of the raster origin.
    """
    col, row = ~transform * (x, y)
    # Floor instead of int(): int() rounds toward zero, which would fold
    # fractional indices in (-1, 0) into row/column 0.
    return int(np.floor(row)), int(np.floor(col))

def compute_distances_euclidean(points_gdf, targets_gdf, transform, resolution):
    """Distance from each point to its nearest target, measured on the pixel grid.

    Both layers are converted to (row, col) pixel indices with
    ``world_to_pixel``; the nearest-neighbour distance in pixels is then
    multiplied by ``resolution``.

    Args:
        points_gdf: layer whose ``geometry.x`` / ``geometry.y`` give the query points.
        targets_gdf: layer whose ``geometry.x`` / ``geometry.y`` give the targets.
        transform: raster transform used for the world-to-pixel conversion.
        resolution: size of one pixel, used to scale pixel distances.

    Returns:
        1-D array with one distance per point; all NaN when there are no
        targets, empty when there are no points.
    """
    point_xs = points_gdf.geometry.x
    point_ys = points_gdf.geometry.y
    target_pixels = np.array([world_to_pixel(transform, x, y) for x, y in zip(targets_gdf.geometry.x, targets_gdf.geometry.y)])
    # A KD-tree cannot be built from an empty layer; report "no distance"
    if target_pixels.size == 0:
        return np.full(len(point_xs), np.nan)

    point_pixels = np.array([world_to_pixel(transform, px, py) for px, py in zip(point_xs, point_ys)])
    if point_pixels.size == 0:
        return np.array([])

    # One vectorised nearest-neighbour query for all points
    dist_pixels, _ = cKDTree(target_pixels).query(point_pixels)
    return dist_pixels * resolution

print("  - Calculating Euclidean-based flow-path distances...")
# Distance from each sample to its nearest brickfield / industry
samples_gdf["hydro_dist_brick"] = compute_distances_euclidean(samples_gdf, brickfields, dem_transform, dem_resolution)
samples_gdf["hydro_dist_ind"] = compute_distances_euclidean(samples_gdf, industries, dem_transform, dem_resolution)

print("  - Starting LULC extraction loop...")
lulc_dir = "LULCMerged"
years = [2017, 2018, 2019, 2020, 2021, 2022]
for y in years:
    lulc_path = os.path.join(lulc_dir, f"LULC{y}.tif")
    print(f"    - Processing LULC for year {y}...")
    try:
        with rasterio.open(lulc_path) as lulc_src:
            # sample() reads coordinates in the raster's own CRS, while
            # samples_gdf is in the DEM CRS. Reproject the points first so the
            # lookup stays correct if the LULC raster uses a different CRS
            # (a no-op when the two already match).
            sample_points = samples_gdf.geometry
            if lulc_src.crs:
                sample_points = sample_points.to_crs(lulc_src.crs)
            # sample() yields one array per point; keep the first band's value
            lulc_values = [x[0] for x in lulc_src.sample([(p.x, p.y) for p in sample_points])]
            samples_gdf[f"LULC_{y}"] = lulc_values
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error processing {lulc_path}: {e}")
        # Add a placeholder column with NaNs if the file can't be read
        samples_gdf[f"LULC_{y}"] = np.nan
        continue
print("  - LULC extraction loop complete.")

# --- UPDATED: Calculate LULC Change using Change Vector Analysis (CVA) ---
print("  - Calculating LULC changes using Change Vector Analysis (CVA)...")
# NOTE(review): LULC values are presumably categorical class codes, so their
# arithmetic difference may not be a meaningful magnitude — confirm.
if "LULC_2017" in samples_gdf.columns and "LULC_2022" in samples_gdf.columns:
    lulc_change_vector = samples_gdf["LULC_2022"].astype(float) - samples_gdf["LULC_2017"].astype(float)
    samples_gdf["CVA_Magnitude"] = np.abs(lulc_change_vector)
    samples_gdf["CVA_Direction"] = lulc_change_vector
else:
    print("❌ Error: LULC_2017 and LULC_2022 columns not found. Cannot perform CVA.")
print("Step 3 Complete.\n")

# ===================== 4. Save Final Output ===================== #
print("Step 4: Saving final output...")
os.makedirs("data", exist_ok=True)
output_name = f"Samples_{num_samples}"
# Shapefile field names are limited to 10 characters, so longer column names
# are truncated in the .shp; the CSV keeps the full names.
samples_gdf.to_file(f"data/{output_name}.shp")
samples_gdf.drop(columns="geometry").to_csv(f"data/{output_name}.csv", index=False)

print(f"✅ Final dataset with {num_samples} sample(s) saved.")

# ===================== 5. Calculate and Print Mean LULC Changes ===================== #
print("\nStep 5: Calculating and printing mean LULC changes...")
cva_columns = ["CVA_Magnitude", "CVA_Direction"]
for col in cva_columns:
    if col in samples_gdf.columns:
        mean_change = samples_gdf[col].mean()
        print(f"  - Mean value for '{col}': {mean_change:.4f}")
print("Step 5 Complete.\n")


Step 1: Loading 200 river points from '200Sampling.shp'...
✅ Successfully loaded 200 points from the shapefile.
Step 1 Complete.

Step 2: Interpolating initial features...
Step 2 Complete.

Step 3: Calculating hydrological and LULC features...
  - Aligning rasters...



  river_sample_gdf["geometry"] = river_sample_gdf.geometry.centroid


  - Loading DEM and NDWI...
  - Reading vector files (brickfields and industries)...
  - Counting nearby brickfields and industries...
  - Counting complete.
  - Calculating Euclidean-based flow-path distances...
  - Starting LULC extraction loop...
    - Processing LULC for year 2017...
    - Processing LULC for year 2018...
    - Processing LULC for year 2019...
    - Processing LULC for year 2020...
    - Processing LULC for year 2021...
    - Processing LULC for year 2022...
  - LULC extraction loop complete.
  - Calculating LULC changes using Change Vector Analysis (CVA)...
Step 3 Complete.

Step 4: Saving final output...
✅ Final dataset with 200 sample(s) saved.

Step 5: Calculating and printing mean LULC changes...
  - Mean value for 'CVA_Magnitude': 0.5950
  - Mean value for 'CVA_Direction': 0.1850
Step 5 Complete.



  samples_gdf.to_file(f"data/{output_name}.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


In [1]:
import os
import numpy as np
import pandas as pd
import rasterio
import geopandas as gpd
from rasterio.features import shapes
from rasterio.warp import reproject, Resampling
from shapely.geometry import shape, Point
from scipy.spatial import cKDTree
from pyproj import CRS, Transformer

# ===================== 1. Load 200 River Points from File ===================== #
print("Step 1: Loading 200 river points from '200Sampling.shp'...")
sampling_file_path = "200Sampling.shp"
try:
    # Load the user's pre-created shapefile
    river_sample_gdf = gpd.read_file(sampling_file_path)
    num_samples = len(river_sample_gdf)

    # Collapse non-point geometries to centroids while still in the file's
    # native CRS. Taking centroids after the switch to EPSG:4326 makes
    # geopandas warn that centroids in a geographic CRS are likely incorrect;
    # for plain points the centroid is the point itself, so it is skipped.
    if not (river_sample_gdf.geom_type == "Point").all():
        river_sample_gdf["geometry"] = river_sample_gdf.geometry.centroid

    # The interpolation below works on Long/Lat columns, so use EPSG:4326
    if river_sample_gdf.crs != "EPSG:4326":
        river_sample_gdf = river_sample_gdf.to_crs("EPSG:4326")

except FileNotFoundError:
    print(f"❌ Error: The file '{sampling_file_path}' was not found. Please ensure it is in the correct directory.")
    exit()
except Exception as e:
    print(f"❌ Error: Could not read '{sampling_file_path}'. Please check the file's integrity and format. Error: {e}")
    exit()

# (n, 2) array of [lon, lat] query locations for the interpolation
river_coords = np.array([[p.x, p.y] for p in river_sample_gdf.geometry])
print(f"✅ Successfully loaded {num_samples} points from the shapefile.")
print("Step 1 Complete.\n")

# ===================== 2. Interpolate Initial Features ===================== #
print("Step 2: Interpolating initial features...")
try:
    data = pd.read_csv("../data/RainySeason.csv")
except FileNotFoundError:
    print("❌ Error: RainySeason.csv not found. Please check the file path.")
    exit()

# Known sample locations and the measured columns to carry over to the river points
coords = data[['Long', 'Lat']].values
features_to_interpolate = ['CrR', 'NiR', 'CuR', 'AsR', 'CdR', 'PbR', 'MR', 'SandR', 'SiltR', 'ClayR', 'FeR', "RI"]
numeric_features = data[features_to_interpolate]

def idw_interpolation(known_coords, known_values, query_coords, power=2, k=4):
    """Inverse-distance-weighted interpolation from the nearest known points.

    Args:
        known_coords: (n, d) array of locations with measured values.
        known_values: length-n array of values measured at ``known_coords``.
        query_coords: (m, d) array of locations to estimate.
        power: exponent applied to the distance in the weight ``1 / d**power``.
        k: number of nearest neighbours to blend; clamped to ``n`` so that
            small datasets do not fail.

    Returns:
        Length-m array of interpolated values.

    Raises:
        ValueError: if ``known_coords`` is empty.
    """
    known_coords = np.asarray(known_coords, dtype=float)
    known_values = np.asarray(known_values, dtype=float)
    query_coords = np.asarray(query_coords, dtype=float)
    if len(known_coords) == 0:
        raise ValueError("idw_interpolation requires at least one known point")

    # Asking cKDTree for more neighbours than exist pads the result with
    # infinite distances and an out-of-range index, which breaks the lookup.
    k = max(1, min(int(k), len(known_coords)))
    tree = cKDTree(known_coords)
    dists, idxs = tree.query(query_coords, k=k)
    # query() drops the neighbour axis when k == 1; restore it
    dists = dists.reshape(-1, k)
    idxs = idxs.reshape(-1, k)

    # Avoid division by zero when a query point coincides with a known point
    dists[dists == 0] = 1e-10
    weights = 1 / (dists ** power)
    weights /= weights.sum(axis=1)[:, None]
    return np.sum(weights * known_values[idxs], axis=1)

# One IDW pass per measured column, stacked side by side in column order
interpolated_features = np.column_stack([
    idw_interpolation(coords, numeric_features[name].values, river_coords)
    for name in numeric_features.columns
])

# Attach the query coordinates and a provenance tag to the interpolated values
river_df = pd.DataFrame(interpolated_features, columns=numeric_features.columns)
river_df = river_df.assign(
    Long=river_coords[:, 0],
    Lat=river_coords[:, 1],
    Source='River_Interpolated',
)
print("Step 2 Complete.\n")

# ===================== 3. Calculate Hydrological and LULC Features (Optimized) ===================== #
print("Step 3: Calculating hydrological and LULC features...")
# Input rasters and the output path for the DEM-aligned NDWI copy
dem_path, ndwi_path, aligned_ndwi_path = "DEMF.tif", "CalIndices/ndwi.tif", "ndwi_aligned.tif"

print("  - Aligning rasters...")
def align_rasters(base_raster_path, match_raster_path, out_raster_path):
    """Resample band 1 of one raster onto another raster's grid and save it.

    Args:
        base_raster_path: raster whose CRS, transform and size define the
            output grid.
        match_raster_path: raster whose first band is resampled (bilinear).
        out_raster_path: path of the single-band float32 raster to write.

    Returns:
        ``out_raster_path``.

    On a ``RasterioIOError`` an error message is printed and ``exit()`` is
    called.
    """
    try:
        with rasterio.open(base_raster_path) as base, rasterio.open(match_raster_path) as match:
            out_meta = base.meta.copy()
            # The output is one float32 band; reusing the base raster's own
            # dtype/count/nodata would mis-describe (or fail to store) it.
            out_meta.update(dtype="float32", count=1, nodata=np.nan)
            # Start from NaN so cells the source does not cover are well defined
            reprojected = np.full((base.height, base.width), np.nan, dtype=np.float32)
            reproject(
                source=match.read(1),
                destination=reprojected,
                src_transform=match.transform,
                src_crs=match.crs,
                src_nodata=match.nodata,
                dst_transform=base.transform,
                dst_crs=base.crs,
                dst_nodata=np.nan,
                resampling=Resampling.bilinear,
            )
        with rasterio.open(out_raster_path, 'w', **out_meta) as dst:
            dst.write(reprojected, 1)
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error: Could not open raster files for alignment. Please check the file paths. Error: {e}")
        exit()
    return out_raster_path

# Write a copy of the NDWI raster resampled onto the DEM grid
align_rasters(dem_path, ndwi_path, aligned_ndwi_path)

print("  - Loading DEM and NDWI...")
# Only the DEM's georeferencing is read here
with rasterio.open(dem_path) as dem_src:
    dem_transform = dem_src.transform
    dem_crs = dem_src.crs
    dem_resolution = dem_src.res[0]

# Create samples_gdf from the loaded points
samples_gdf = gpd.GeoDataFrame(river_df, geometry=gpd.points_from_xy(river_df.Long, river_df.Lat), crs="EPSG:4326")
samples_gdf = samples_gdf.to_crs(dem_crs)

print("  - Reading vector files (brickfields and industries)...")
try:
    brickfields_path = "brick_field_point.shp"
    industries_path = "industry_point.shp"
    brickfields = gpd.read_file(brickfields_path).to_crs(dem_crs)
    industries = gpd.read_file(industries_path).to_crs(dem_crs)
except Exception as e:
    print(f"❌ Error: Could not read shapefiles. Please check the file paths and ensure they are valid. Error: {e}")
    exit()

# Reduce every layer to single points
samples_gdf["geometry"] = samples_gdf.geometry.centroid
brickfields["geometry"] = brickfields.geometry.centroid
industries["geometry"] = industries.geometry.centroid

# --- ADDED: Calculate number of brickfields and industries within a buffer ---
print("  - Counting nearby brickfields and industries...")
# NOTE(review): the buffer is in dem_crs units; this is metres only if the
# DEM uses a metre-based projected CRS — confirm.
buffer_distance = 1000 # 1000 meters
samples_gdf['num_brick_field'] = 0
samples_gdf['num_industry'] = 0

samples_buffered = samples_gdf.copy()
samples_buffered['geometry'] = samples_buffered.geometry.buffer(buffer_distance)

# Spatial join to count brickfields.
# An inner join keeps only real (buffer, brickfield) pairs. A left join also
# keeps one placeholder row for each buffer with no match, so size() would
# report 1 instead of 0 for samples with nothing nearby.
sjoin_brick = gpd.sjoin(samples_buffered, brickfields, how="inner", predicate="intersects")
brick_counts = sjoin_brick.groupby(sjoin_brick.index).size()
samples_gdf['num_brick_field'] = samples_gdf.index.map(brick_counts).fillna(0).astype(int)

# Spatial join to count industries (same inner-join reasoning as above)
sjoin_ind = gpd.sjoin(samples_buffered, industries, how="inner", predicate="intersects")
ind_counts = sjoin_ind.groupby(sjoin_ind.index).size()
samples_gdf['num_industry'] = samples_gdf.index.map(ind_counts).fillna(0).astype(int)

print("  - Counting complete.")
# -----------------------------------------------------------------------------

def world_to_pixel(transform, x, y):
    """Convert a world coordinate to the (row, col) of the pixel containing it.

    Args:
        transform: pixel-to-world transform supporting ``~transform * (x, y)``.
        x, y: world coordinates in the transform's CRS.

    Returns:
        ``(row, col)`` as integers; negative when the point lies above or to
        the left of the raster origin.
    """
    col, row = ~transform * (x, y)
    # Floor instead of int(): int() rounds toward zero, which would fold
    # fractional indices in (-1, 0) into row/column 0.
    return int(np.floor(row)), int(np.floor(col))

def compute_distances_euclidean(points_gdf, targets_gdf, transform, resolution):
    """Distance from each point to its nearest target, measured on the pixel grid.

    Both layers are converted to (row, col) pixel indices with
    ``world_to_pixel``; the nearest-neighbour distance in pixels is then
    multiplied by ``resolution``.

    Args:
        points_gdf: layer whose ``geometry.x`` / ``geometry.y`` give the query points.
        targets_gdf: layer whose ``geometry.x`` / ``geometry.y`` give the targets.
        transform: raster transform used for the world-to-pixel conversion.
        resolution: size of one pixel, used to scale pixel distances.

    Returns:
        1-D array with one distance per point; all NaN when there are no
        targets, empty when there are no points.
    """
    point_xs = points_gdf.geometry.x
    point_ys = points_gdf.geometry.y
    target_pixels = np.array([world_to_pixel(transform, x, y) for x, y in zip(targets_gdf.geometry.x, targets_gdf.geometry.y)])
    # A KD-tree cannot be built from an empty layer; report "no distance"
    if target_pixels.size == 0:
        return np.full(len(point_xs), np.nan)

    point_pixels = np.array([world_to_pixel(transform, px, py) for px, py in zip(point_xs, point_ys)])
    if point_pixels.size == 0:
        return np.array([])

    # One vectorised nearest-neighbour query for all points
    dist_pixels, _ = cKDTree(target_pixels).query(point_pixels)
    return dist_pixels * resolution

print("  - Calculating Euclidean-based flow-path distances...")
# Distance from each sample to its nearest brickfield / industry
samples_gdf["hydro_dist_brick"] = compute_distances_euclidean(samples_gdf, brickfields, dem_transform, dem_resolution)
samples_gdf["hydro_dist_ind"] = compute_distances_euclidean(samples_gdf, industries, dem_transform, dem_resolution)

print("  - Starting LULC extraction loop...")
lulc_dir = "LULCMerged"
years = [2017, 2018, 2019, 2020, 2021, 2022]
for y in years:
    lulc_path = os.path.join(lulc_dir, f"LULC{y}.tif")
    print(f"    - Processing LULC for year {y}...")
    try:
        with rasterio.open(lulc_path) as lulc_src:
            # sample() reads coordinates in the raster's own CRS, while
            # samples_gdf is in the DEM CRS. Reproject the points first so the
            # lookup stays correct if the LULC raster uses a different CRS
            # (a no-op when the two already match).
            sample_points = samples_gdf.geometry
            if lulc_src.crs:
                sample_points = sample_points.to_crs(lulc_src.crs)
            # sample() yields one array per point; keep the first band's value
            lulc_values = [x[0] for x in lulc_src.sample([(p.x, p.y) for p in sample_points])]
            samples_gdf[f"LULC_{y}"] = lulc_values
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error processing {lulc_path}: {e}")
        # Add a placeholder column with NaNs if the file can't be read
        samples_gdf[f"LULC_{y}"] = np.nan
        continue
print("  - LULC extraction loop complete.")

# --- UPDATED: Calculate LULC Change using Change Vector Analysis (CVA) ---
print("  - Calculating LULC changes using Change Vector Analysis (CVA)...")
# NOTE(review): LULC values are presumably categorical class codes, so their
# arithmetic difference may not be a meaningful magnitude — confirm.
if "LULC_2017" in samples_gdf.columns and "LULC_2022" in samples_gdf.columns:
    lulc_change_vector = samples_gdf["LULC_2022"].astype(float) - samples_gdf["LULC_2017"].astype(float)
    samples_gdf["CVA_Magnitude"] = np.abs(lulc_change_vector)
    samples_gdf["CVA_Direction"] = lulc_change_vector
else:
    print("❌ Error: LULC_2017 and LULC_2022 columns not found. Cannot perform CVA.")
print("Step 3 Complete.\n")

# ===================== 4. Save Final Output ===================== #
print("Step 4: Saving final output...")
os.makedirs("data", exist_ok=True)
output_name = f"Samples_{num_samples}"
# Shapefile field names are limited to 10 characters, so longer column names
# are truncated in the .shp; the CSV keeps the full names.
samples_gdf.to_file(f"data/{output_name}.shp")
samples_gdf.drop(columns="geometry").to_csv(f"data/{output_name}test.csv", index=False)

print(f"✅ Final dataset with {num_samples} sample(s) saved.")

# ===================== 5. Calculate and Print Mean LULC Changes ===================== #
print("\nStep 5: Calculating and printing mean LULC changes...")
cva_columns = ["CVA_Magnitude", "CVA_Direction"]
for col in cva_columns:
    if col in samples_gdf.columns:
        mean_change = samples_gdf[col].mean()
        print(f"  - Mean value for '{col}': {mean_change:.4f}")
print("Step 5 Complete.\n")


Step 1: Loading 200 river points from '200Sampling.shp'...



  river_sample_gdf["geometry"] = river_sample_gdf.geometry.centroid


✅ Successfully loaded 200 points from the shapefile.
Step 1 Complete.

Step 2: Interpolating initial features...
Step 2 Complete.

Step 3: Calculating hydrological and LULC features...
  - Aligning rasters...
  - Loading DEM and NDWI...
  - Reading vector files (brickfields and industries)...
  - Counting nearby brickfields and industries...
  - Counting complete.
  - Calculating Euclidean-based flow-path distances...
  - Starting LULC extraction loop...
    - Processing LULC for year 2017...
    - Processing LULC for year 2018...
    - Processing LULC for year 2019...
    - Processing LULC for year 2020...
    - Processing LULC for year 2021...
    - Processing LULC for year 2022...
  - LULC extraction loop complete.
  - Calculating LULC changes using Change Vector Analysis (CVA)...
Step 3 Complete.

Step 4: Saving final output...
✅ Final dataset with 200 sample(s) saved.

Step 5: Calculating and printing mean LULC changes...
  - Mean value for 'CVA_Magnitude': 0.5950
  - Mean value f

  samples_gdf.to_file(f"data/{output_name}.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


# 100 Samples

In [1]:
import os
import numpy as np
import pandas as pd
import rasterio
import geopandas as gpd
from rasterio.features import shapes
from rasterio.warp import reproject, Resampling
from shapely.geometry import shape, Point
from scipy.spatial import cKDTree
from pyproj import CRS, Transformer

# ===================== 1. Load 100 River Points from File ===================== #
# The message previously announced 200 points although this cell loads 100Sampling.shp
print("Step 1: Loading 100 river points from '100Sampling.shp'...")
sampling_file_path = "100Sampling.shp"
try:
    # Load the user's pre-created shapefile
    river_sample_gdf = gpd.read_file(sampling_file_path)
    num_samples = len(river_sample_gdf)

    # Collapse non-point geometries to centroids while still in the file's
    # native CRS. Taking centroids after the switch to EPSG:4326 makes
    # geopandas warn that centroids in a geographic CRS are likely incorrect;
    # for plain points the centroid is the point itself, so it is skipped.
    if not (river_sample_gdf.geom_type == "Point").all():
        river_sample_gdf["geometry"] = river_sample_gdf.geometry.centroid

    # The interpolation below works on Long/Lat columns, so use EPSG:4326
    if river_sample_gdf.crs != "EPSG:4326":
        river_sample_gdf = river_sample_gdf.to_crs("EPSG:4326")

except FileNotFoundError:
    print(f"❌ Error: The file '{sampling_file_path}' was not found. Please ensure it is in the correct directory.")
    exit()
except Exception as e:
    print(f"❌ Error: Could not read '{sampling_file_path}'. Please check the file's integrity and format. Error: {e}")
    exit()

# (n, 2) array of [lon, lat] query locations for the interpolation
river_coords = np.array([[p.x, p.y] for p in river_sample_gdf.geometry])
print(f"✅ Successfully loaded {num_samples} points from the shapefile.")
print("Step 1 Complete.\n")

# ===================== 2. Interpolate Initial Features ===================== #
print("Step 2: Interpolating initial features...")
try:
    data = pd.read_csv("../data/RainySeason.csv")
except FileNotFoundError:
    print("❌ Error: RainySeason.csv not found. Please check the file path.")
    exit()

# Known sample locations and the measured columns to carry over to the river points
coords = data[['Long', 'Lat']].values
features_to_interpolate = ['CrR', 'NiR', 'CuR', 'AsR', 'CdR', 'PbR', 'MR', 'SandR', 'SiltR', 'ClayR', 'FeR', "RI"]
numeric_features = data[features_to_interpolate]

def idw_interpolation(known_coords, known_values, query_coords, power=2, k=4):
    """Inverse-distance-weighted interpolation from the nearest known points.

    Args:
        known_coords: (n, d) array of locations with measured values.
        known_values: length-n array of values measured at ``known_coords``.
        query_coords: (m, d) array of locations to estimate.
        power: exponent applied to the distance in the weight ``1 / d**power``.
        k: number of nearest neighbours to blend; clamped to ``n`` so that
            small datasets do not fail.

    Returns:
        Length-m array of interpolated values.

    Raises:
        ValueError: if ``known_coords`` is empty.
    """
    known_coords = np.asarray(known_coords, dtype=float)
    known_values = np.asarray(known_values, dtype=float)
    query_coords = np.asarray(query_coords, dtype=float)
    if len(known_coords) == 0:
        raise ValueError("idw_interpolation requires at least one known point")

    # Asking cKDTree for more neighbours than exist pads the result with
    # infinite distances and an out-of-range index, which breaks the lookup.
    k = max(1, min(int(k), len(known_coords)))
    tree = cKDTree(known_coords)
    dists, idxs = tree.query(query_coords, k=k)
    # query() drops the neighbour axis when k == 1; restore it
    dists = dists.reshape(-1, k)
    idxs = idxs.reshape(-1, k)

    # Avoid division by zero when a query point coincides with a known point
    dists[dists == 0] = 1e-10
    weights = 1 / (dists ** power)
    weights /= weights.sum(axis=1)[:, None]
    return np.sum(weights * known_values[idxs], axis=1)

# One IDW pass per measured column, stacked side by side in column order
interpolated_features = np.column_stack([
    idw_interpolation(coords, numeric_features[name].values, river_coords)
    for name in numeric_features.columns
])

# Attach the query coordinates and a provenance tag to the interpolated values
river_df = pd.DataFrame(interpolated_features, columns=numeric_features.columns)
river_df = river_df.assign(
    Long=river_coords[:, 0],
    Lat=river_coords[:, 1],
    Source='River_Interpolated',
)
print("Step 2 Complete.\n")

# ===================== 3. Calculate Hydrological and LULC Features (Optimized) ===================== #
print("Step 3: Calculating hydrological and LULC features...")
# Input rasters and the output path for the DEM-aligned NDWI copy
dem_path, ndwi_path, aligned_ndwi_path = "DEMF.tif", "CalIndices/ndwi.tif", "ndwi_aligned.tif"

print("  - Aligning rasters...")
def align_rasters(base_raster_path, match_raster_path, out_raster_path):
    """Resample band 1 of one raster onto another raster's grid and save it.

    Args:
        base_raster_path: raster whose CRS, transform and size define the
            output grid.
        match_raster_path: raster whose first band is resampled (bilinear).
        out_raster_path: path of the single-band float32 raster to write.

    Returns:
        ``out_raster_path``.

    On a ``RasterioIOError`` an error message is printed and ``exit()`` is
    called.
    """
    try:
        with rasterio.open(base_raster_path) as base, rasterio.open(match_raster_path) as match:
            out_meta = base.meta.copy()
            # The output is one float32 band; reusing the base raster's own
            # dtype/count/nodata would mis-describe (or fail to store) it.
            out_meta.update(dtype="float32", count=1, nodata=np.nan)
            # Start from NaN so cells the source does not cover are well defined
            reprojected = np.full((base.height, base.width), np.nan, dtype=np.float32)
            reproject(
                source=match.read(1),
                destination=reprojected,
                src_transform=match.transform,
                src_crs=match.crs,
                src_nodata=match.nodata,
                dst_transform=base.transform,
                dst_crs=base.crs,
                dst_nodata=np.nan,
                resampling=Resampling.bilinear,
            )
        with rasterio.open(out_raster_path, 'w', **out_meta) as dst:
            dst.write(reprojected, 1)
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error: Could not open raster files for alignment. Please check the file paths. Error: {e}")
        exit()
    return out_raster_path

# Write a copy of the NDWI raster resampled onto the DEM grid
align_rasters(dem_path, ndwi_path, aligned_ndwi_path)

print("  - Loading DEM and NDWI...")
# Only the DEM's georeferencing is read here
with rasterio.open(dem_path) as dem_src:
    dem_transform = dem_src.transform
    dem_crs = dem_src.crs
    dem_resolution = dem_src.res[0]

# Create samples_gdf from the loaded points
samples_gdf = gpd.GeoDataFrame(river_df, geometry=gpd.points_from_xy(river_df.Long, river_df.Lat), crs="EPSG:4326")
samples_gdf = samples_gdf.to_crs(dem_crs)

print("  - Reading vector files (brickfields and industries)...")
try:
    brickfields_path = "brick_field_point.shp"
    industries_path = "industry_point.shp"
    brickfields = gpd.read_file(brickfields_path).to_crs(dem_crs)
    industries = gpd.read_file(industries_path).to_crs(dem_crs)
except Exception as e:
    print(f"❌ Error: Could not read shapefiles. Please check the file paths and ensure they are valid. Error: {e}")
    exit()

# Reduce every layer to single points
samples_gdf["geometry"] = samples_gdf.geometry.centroid
brickfields["geometry"] = brickfields.geometry.centroid
industries["geometry"] = industries.geometry.centroid

# --- ADDED: Calculate number of brickfields and industries within a buffer ---
print("  - Counting nearby brickfields and industries...")
# NOTE(review): the buffer is in dem_crs units; this is metres only if the
# DEM uses a metre-based projected CRS — confirm.
buffer_distance = 500 # 500 meters
samples_gdf['num_brick_field'] = 0
samples_gdf['num_industry'] = 0

samples_buffered = samples_gdf.copy()
samples_buffered['geometry'] = samples_buffered.geometry.buffer(buffer_distance)

# Spatial join to count brickfields.
# An inner join keeps only real (buffer, brickfield) pairs. A left join also
# keeps one placeholder row for each buffer with no match, so size() would
# report 1 instead of 0 for samples with nothing nearby.
sjoin_brick = gpd.sjoin(samples_buffered, brickfields, how="inner", predicate="intersects")
brick_counts = sjoin_brick.groupby(sjoin_brick.index).size()
samples_gdf['num_brick_field'] = samples_gdf.index.map(brick_counts).fillna(0).astype(int)

# Spatial join to count industries (same inner-join reasoning as above)
sjoin_ind = gpd.sjoin(samples_buffered, industries, how="inner", predicate="intersects")
ind_counts = sjoin_ind.groupby(sjoin_ind.index).size()
samples_gdf['num_industry'] = samples_gdf.index.map(ind_counts).fillna(0).astype(int)

print("  - Counting complete.")
# -----------------------------------------------------------------------------

def world_to_pixel(transform, x, y):
    """Convert a world coordinate to the (row, col) of the pixel containing it.

    Args:
        transform: pixel-to-world transform supporting ``~transform * (x, y)``.
        x, y: world coordinates in the transform's CRS.

    Returns:
        ``(row, col)`` as integers; negative when the point lies above or to
        the left of the raster origin.
    """
    col, row = ~transform * (x, y)
    # Floor instead of int(): int() rounds toward zero, which would fold
    # fractional indices in (-1, 0) into row/column 0.
    return int(np.floor(row)), int(np.floor(col))

def compute_distances_euclidean(points_gdf, targets_gdf, transform, resolution):
    """Distance from each point to its nearest target, measured on the pixel grid.

    Both layers are converted to (row, col) pixel indices with
    ``world_to_pixel``; the nearest-neighbour distance in pixels is then
    multiplied by ``resolution``.

    Args:
        points_gdf: layer whose ``geometry.x`` / ``geometry.y`` give the query points.
        targets_gdf: layer whose ``geometry.x`` / ``geometry.y`` give the targets.
        transform: raster transform used for the world-to-pixel conversion.
        resolution: size of one pixel, used to scale pixel distances.

    Returns:
        1-D array with one distance per point; all NaN when there are no
        targets, empty when there are no points.
    """
    point_xs = points_gdf.geometry.x
    point_ys = points_gdf.geometry.y
    target_pixels = np.array([world_to_pixel(transform, x, y) for x, y in zip(targets_gdf.geometry.x, targets_gdf.geometry.y)])
    # A KD-tree cannot be built from an empty layer; report "no distance"
    if target_pixels.size == 0:
        return np.full(len(point_xs), np.nan)

    point_pixels = np.array([world_to_pixel(transform, px, py) for px, py in zip(point_xs, point_ys)])
    if point_pixels.size == 0:
        return np.array([])

    # One vectorised nearest-neighbour query for all points
    dist_pixels, _ = cKDTree(target_pixels).query(point_pixels)
    return dist_pixels * resolution

print("  - Calculating Euclidean-based flow-path distances...")
# Distance from each sample to its nearest brickfield / industry
samples_gdf["hydro_dist_brick"] = compute_distances_euclidean(samples_gdf, brickfields, dem_transform, dem_resolution)
samples_gdf["hydro_dist_ind"] = compute_distances_euclidean(samples_gdf, industries, dem_transform, dem_resolution)

print("  - Starting LULC extraction loop...")
lulc_dir = "LULCMerged"
years = [2017, 2018, 2019, 2020, 2021, 2022]
for y in years:
    lulc_path = os.path.join(lulc_dir, f"LULC{y}.tif")
    print(f"    - Processing LULC for year {y}...")
    try:
        with rasterio.open(lulc_path) as lulc_src:
            # sample() reads coordinates in the raster's own CRS, while
            # samples_gdf is in the DEM CRS. Reproject the points first so the
            # lookup stays correct if the LULC raster uses a different CRS
            # (a no-op when the two already match).
            sample_points = samples_gdf.geometry
            if lulc_src.crs:
                sample_points = sample_points.to_crs(lulc_src.crs)
            # sample() yields one array per point; keep the first band's value
            lulc_values = [x[0] for x in lulc_src.sample([(p.x, p.y) for p in sample_points])]
            samples_gdf[f"LULC_{y}"] = lulc_values
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error processing {lulc_path}: {e}")
        # Add a placeholder column with NaNs if the file can't be read
        samples_gdf[f"LULC_{y}"] = np.nan
        continue
print("  - LULC extraction loop complete.")

# --- UPDATED: Calculate LULC Change using Change Vector Analysis (CVA) ---
print("  - Calculating LULC changes using Change Vector Analysis (CVA)...")
# NOTE(review): LULC values are presumably categorical class codes, so their
# arithmetic difference may not be a meaningful magnitude — confirm.
if "LULC_2017" in samples_gdf.columns and "LULC_2022" in samples_gdf.columns:
    lulc_change_vector = samples_gdf["LULC_2022"].astype(float) - samples_gdf["LULC_2017"].astype(float)
    samples_gdf["CVA_Magnitude"] = np.abs(lulc_change_vector)
    samples_gdf["CVA_Direction"] = lulc_change_vector
else:
    print("❌ Error: LULC_2017 and LULC_2022 columns not found. Cannot perform CVA.")
print("Step 3 Complete.\n")

# ===================== 4. Save Final Output ===================== #
print("Step 4: Saving final output...")
os.makedirs("data", exist_ok=True)
output_name = f"Samples_{num_samples}"
# Shapefile field names are limited to 10 characters, so longer column names
# are truncated in the .shp; the CSV keeps the full names.
samples_gdf.to_file(f"data/{output_name}.shp")
# Rounding happens after the shapefile is written, so only the CSV (and the
# means printed below) use values rounded to 2 decimals.
samples_gdf = samples_gdf.round(2)
samples_gdf.drop(columns="geometry").to_csv(f"data/{output_name}.csv", index=False)

print(f"✅ Final dataset with {num_samples} sample(s) saved.")

# ===================== 5. Calculate and Print Mean LULC Changes ===================== #
print("\nStep 5: Calculating and printing mean LULC changes...")
cva_columns = ["CVA_Magnitude", "CVA_Direction"]
for col in cva_columns:
    if col in samples_gdf.columns:
        mean_change = samples_gdf[col].mean()
        print(f"  - Mean value for '{col}': {mean_change:.4f}")
print("Step 5 Complete.\n")

Step 1: Loading 200 river points from '100Sampling.shp'...



  river_sample_gdf["geometry"] = river_sample_gdf.geometry.centroid


✅ Successfully loaded 100 points from the shapefile.
Step 1 Complete.

Step 2: Interpolating initial features...
Step 2 Complete.

Step 3: Calculating hydrological and LULC features...
  - Aligning rasters...
  - Loading DEM and NDWI...
  - Reading vector files (brickfields and industries)...
  - Counting nearby brickfields and industries...
  - Counting complete.
  - Calculating Euclidean-based flow-path distances...
  - Starting LULC extraction loop...
    - Processing LULC for year 2017...
    - Processing LULC for year 2018...
    - Processing LULC for year 2019...
    - Processing LULC for year 2020...
    - Processing LULC for year 2021...
    - Processing LULC for year 2022...
  - LULC extraction loop complete.
  - Calculating LULC changes using Change Vector Analysis (CVA)...
Step 3 Complete.

Step 4: Saving final output...
✅ Final dataset with 100 sample(s) saved.

Step 5: Calculating and printing mean LULC changes...
  - Mean value for 'CVA_Magnitude': 0.0400
  - Mean value f

  samples_gdf.to_file(f"data/{output_name}.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


In [13]:
# ===================== 3. Calculate Hydrological and LULC Features (Optimized) ===================== #
print("Step 3: Calculating hydrological and LULC features...")
# Input rasters and the output path for the DEM-aligned NDWI copy
dem_path, ndwi_path, aligned_ndwi_path = "DEMF.tif", "CalIndices/ndwi.tif", "ndwi_aligned.tif"

print("  - Aligning rasters...")
def align_rasters(base_raster_path, match_raster_path, out_raster_path):
    """Resample band 1 of one raster onto another raster's grid and save it.

    Args:
        base_raster_path: raster whose CRS, transform and size define the
            output grid.
        match_raster_path: raster whose first band is resampled (bilinear).
        out_raster_path: path of the single-band float32 raster to write.

    Returns:
        ``out_raster_path``.

    On a ``RasterioIOError`` an error message is printed and ``exit()`` is
    called.
    """
    try:
        with rasterio.open(base_raster_path) as base, rasterio.open(match_raster_path) as match:
            out_meta = base.meta.copy()
            # The output is one float32 band; reusing the base raster's own
            # dtype/count/nodata would mis-describe (or fail to store) it.
            out_meta.update(dtype="float32", count=1, nodata=np.nan)
            # Start from NaN so cells the source does not cover are well defined
            reprojected = np.full((base.height, base.width), np.nan, dtype=np.float32)
            reproject(
                source=match.read(1),
                destination=reprojected,
                src_transform=match.transform,
                src_crs=match.crs,
                src_nodata=match.nodata,
                dst_transform=base.transform,
                dst_crs=base.crs,
                dst_nodata=np.nan,
                resampling=Resampling.bilinear,
            )
        with rasterio.open(out_raster_path, 'w', **out_meta) as dst:
            dst.write(reprojected, 1)
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error: Could not open raster files for alignment. Please check the file paths. Error: {e}")
        exit()
    return out_raster_path

# Write a copy of the NDWI raster resampled onto the DEM grid
align_rasters(dem_path, ndwi_path, aligned_ndwi_path)

print("  - Loading DEM and NDWI...")
# Only the DEM's georeferencing is read here
with rasterio.open(dem_path) as dem_src:
    dem_transform = dem_src.transform
    dem_crs = dem_src.crs
    dem_resolution = dem_src.res[0]

# Create samples_gdf from the loaded points
samples_gdf = gpd.GeoDataFrame(river_df, geometry=gpd.points_from_xy(river_df.Long, river_df.Lat), crs="EPSG:4326")
samples_gdf = samples_gdf.to_crs(dem_crs)

print("  - Reading vector files (brickfields and industries)...")
try:
    brickfields_path = "brick_field_point.shp"
    industries_path = "industry_point.shp"
    brickfields = gpd.read_file(brickfields_path).to_crs(dem_crs)
    industries = gpd.read_file(industries_path).to_crs(dem_crs)
except Exception as e:
    print(f"❌ Error: Could not read shapefiles. Please check the file paths and ensure they are valid. Error: {e}")
    exit()

# Reduce every layer to single points
samples_gdf["geometry"] = samples_gdf.geometry.centroid
brickfields["geometry"] = brickfields.geometry.centroid
industries["geometry"] = industries.geometry.centroid

# --- ADDED: Calculate number of brickfields and industries within a buffer ---
print("  - Counting nearby brickfields and industries...")
# NOTE(review): the buffer is in dem_crs units; this is metres only if the
# DEM uses a metre-based projected CRS — confirm.
buffer_distance = 1000 # 1000 meters
samples_gdf['num_brick_field'] = 0
samples_gdf['num_industry'] = 0

samples_buffered = samples_gdf.copy()
samples_buffered['geometry'] = samples_buffered.geometry.buffer(buffer_distance)

# Spatial join to count brickfields.
# An inner join keeps only real (buffer, brickfield) pairs. A left join also
# keeps one placeholder row for each buffer with no match, so size() would
# report 1 instead of 0 for samples with nothing nearby.
sjoin_brick = gpd.sjoin(samples_buffered, brickfields, how="inner", predicate="intersects")
brick_counts = sjoin_brick.groupby(sjoin_brick.index).size()
samples_gdf['num_brick_field'] = samples_gdf.index.map(brick_counts).fillna(0).astype(int)

# Spatial join to count industries (same inner-join reasoning as above)
sjoin_ind = gpd.sjoin(samples_buffered, industries, how="inner", predicate="intersects")
ind_counts = sjoin_ind.groupby(sjoin_ind.index).size()
samples_gdf['num_industry'] = samples_gdf.index.map(ind_counts).fillna(0).astype(int)

print("  - Counting complete.")
# -----------------------------------------------------------------------------

def world_to_pixel(transform, x, y):
    """Convert world coordinates to the (row, col) of the containing pixel.

    Args:
        transform: Pixel-to-world transform that supports
            ``~transform * (x, y)`` (here: the DEM dataset's ``.transform``).
        x: World x coordinate.
        y: World y coordinate.

    Returns:
        Tuple ``(row, col)`` of Python ints. Values are negative or exceed the
        raster shape when the point lies outside the raster.
    """
    col, row = ~transform * (x, y)
    # Floor rather than int(): int() truncates toward zero, so a fractional
    # index in (-1, 0) — a point just left of / above the raster origin —
    # would be reported as pixel 0 instead of -1.
    return int(np.floor(row)), int(np.floor(col))

def compute_distances_euclidean(points_gdf, targets_gdf, transform, resolution):
    """Return each point's straight-line distance to its nearest target.

    Both layers are converted to integer pixel indices with
    ``world_to_pixel``; the nearest-neighbour distance is measured in pixels
    and then multiplied by ``resolution``. Distances are therefore quantised
    to the pixel grid, and the single ``resolution`` factor is applied to
    both axes.

    Args:
        points_gdf: Layer whose ``geometry.x`` / ``geometry.y`` give the
            query locations.
        targets_gdf: Layer whose ``geometry.x`` / ``geometry.y`` give the
            candidate target locations.
        transform: Raster transform passed through to ``world_to_pixel``.
        resolution: Size of one pixel, used to scale pixel distances.

    Returns:
        1-D float array with one distance per row of ``points_gdf``, in row
        order. All values are NaN when ``targets_gdf`` is empty, since no
        nearest target exists.
    """
    def _to_pixels(gdf):
        # Always yields shape (n, 2), including n == 0.
        return np.array(
            [world_to_pixel(transform, x, y) for x, y in zip(gdf.geometry.x, gdf.geometry.y)],
            dtype=float,
        ).reshape(-1, 2)

    point_pixels = _to_pixels(points_gdf)
    target_pixels = _to_pixels(targets_gdf)

    # A KD-tree cannot be built from zero targets; report "undefined" instead
    # of crashing the whole pipeline.
    if len(target_pixels) == 0:
        return np.full(len(point_pixels), np.nan)
    if len(point_pixels) == 0:
        return np.array([])

    # One vectorised query replaces the per-point Python loop.
    dist_pixels, _ = cKDTree(target_pixels).query(point_pixels)
    return dist_pixels * resolution

print("  - Calculating Euclidean-based flow-path distances...")
samples_gdf["hydro_dist_brick"] = compute_distances_euclidean(samples_gdf, brickfields, dem_transform, dem_resolution)
samples_gdf["hydro_dist_ind"] = compute_distances_euclidean(samples_gdf, industries, dem_transform, dem_resolution)

print("  - Starting LULC extraction loop...")
lulc_dir = "LULCMerged"
years = [2017, 2018, 2019, 2020, 2021, 2022]
for y in years:
    lulc_path = os.path.join(lulc_dir, f"LULC{y}.tif")
    print(f"    - Processing LULC for year {y}...")
    try:
        with rasterio.open(lulc_path) as lulc_src:
            # samples_gdf is in the DEM CRS. Sample in the raster's own CRS so
            # a LULC raster stored in a different CRS is not silently read at
            # the wrong locations. to_crs() leaves matching CRSs unchanged.
            sample_geoms = samples_gdf.geometry
            if lulc_src.crs is not None:
                sample_geoms = sample_geoms.to_crs(lulc_src.crs)
            sample_xy = list(zip(sample_geoms.x, sample_geoms.y))
            # sample() yields one array of band values per point; keep band 1.
            lulc_values = [band_values[0] for band_values in lulc_src.sample(sample_xy)]
            samples_gdf[f"LULC_{y}"] = lulc_values
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error processing {lulc_path}: {e}")
        # Keep the column present (all NaN) so later steps see a stable schema.
        samples_gdf[f"LULC_{y}"] = np.nan
print("  - LULC extraction loop complete.")


# --- Calculate LULC change (2017 -> 2022) as a transition matrix ---
print("  - Calculating LULC changes using a Transition Matrix...")
# Reset first: in a notebook a matrix left over from an earlier run would
# otherwise still exist and be saved below as if it belonged to this run.
lulc_transition_matrix = None
lulc_endpoints = ["LULC_2017", "LULC_2022"]
# The extraction loop writes an all-NaN placeholder column when a raster cannot
# be read, so the columns always exist; require actual values as well.
endpoints_usable = all(
    name in samples_gdf.columns and samples_gdf[name].notna().any()
    for name in lulc_endpoints
)
if endpoints_usable:
    # Rows are the 2017 classes, columns the 2022 classes; each cell counts
    # the samples that moved from the row class to the column class.
    lulc_transition_matrix = pd.crosstab(
        samples_gdf["LULC_2017"],
        samples_gdf["LULC_2022"],
        rownames=['LULC 2017'],
        colnames=['LULC 2022']
    )
    print("✅ LULC Transition Matrix:")
    print(lulc_transition_matrix)
else:
    print("❌ Error: LULC_2017 and LULC_2022 columns not found. Cannot perform Transition Matrix analysis.")
print("Step 3 Complete.\n")

# ===================== 4. Save Final Output ===================== #
print("Step 4: Saving final output...")
os.makedirs("data", exist_ok=True)
output_name = f"Samples_{num_samples}"
samples_gdf.to_file(f"data/{output_name}.shp")
# NOTE(review): the CSV is written as "<output_name>1.csv" (extra "1") —
# confirm this is the name downstream readers expect.
samples_gdf.drop(columns="geometry").to_csv(f"data/{output_name}1.csv", index=False)

# Save the transition matrix only if this run actually produced one.
if lulc_transition_matrix is not None:
    lulc_transition_matrix.to_csv("data/LULC_Transition_Matrix.csv")
    print("✅ LULC transition matrix saved to 'data/LULC_Transition_Matrix.csv'.")
else:
    print("❌ LULC transition matrix was not created, so it cannot be saved.")
print(f"✅ Final dataset with {num_samples} sample(s) saved.")

Step 3: Calculating hydrological and LULC features...
  - Aligning rasters...
  - Loading DEM and NDWI...
  - Reading vector files (brickfields and industries)...
  - Counting nearby brickfields and industries...
  - Counting complete.
  - Calculating Euclidean-based flow-path distances...
  - Starting LULC extraction loop...
    - Processing LULC for year 2017...
    - Processing LULC for year 2018...
    - Processing LULC for year 2019...
    - Processing LULC for year 2020...
    - Processing LULC for year 2021...
    - Processing LULC for year 2022...
  - LULC extraction loop complete.
  - Calculating LULC changes using a Transition Matrix...
✅ LULC Transition Matrix:
LULC 2022   1  4  7
LULC 2017          
1          97  0  0
4           0  1  0
5           1  0  0
7           0  0  1
Step 3 Complete.

Step 4: Saving final output...
✅ LULC transition matrix saved to 'data/LULC_Transition_Matrix.csv'.
✅ Final dataset with 100 sample(s) saved.


  samples_gdf.to_file(f"data/{output_name}.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


In [10]:
# Reload the exported sample table from disk and preview its first rows.
samples_csv_path = "data/Samples_100.csv"
df = pd.read_csv(samples_csv_path, index_col=False)
df.head(n=5)

Unnamed: 0,CrR,NiR,CuR,AsR,CdR,PbR,MR,SandR,SiltR,ClayR,...,hydro_dist_brick,hydro_dist_ind,LULC_2017,LULC_2018,LULC_2019,LULC_2020,LULC_2021,LULC_2022,CVA_Magnitude,CVA_Direction
0,84.18,21.78,43.81,9.03,2.63,51.47,32.24,21.32,45.71,32.47,...,1622.59,2350.53,1,1,1,1,1,1,0.0,0.0
1,86.05,21.19,43.03,8.82,2.64,51.08,32.25,21.37,45.32,32.9,...,770.25,1351.45,1,1,1,1,1,1,0.0,0.0
2,88.61,20.39,41.98,8.53,2.65,50.63,32.29,21.52,44.66,33.55,...,1248.12,503.12,1,1,1,1,1,1,0.0,0.0
3,90.62,19.78,41.16,8.28,2.66,50.47,32.35,21.69,44.0,34.15,...,2003.28,1400.06,1,1,1,1,1,1,0.0,0.0
4,89.63,20.08,41.54,8.48,2.66,50.03,32.24,21.54,44.6,33.6,...,1662.69,1915.44,7,7,7,7,7,7,0.0,0.0


In [11]:
# Preview the last rows of the reloaded sample table.
df.tail(n=5)

Unnamed: 0,CrR,NiR,CuR,AsR,CdR,PbR,MR,SandR,SiltR,ClayR,...,hydro_dist_brick,hydro_dist_ind,LULC_2017,LULC_2018,LULC_2019,LULC_2020,LULC_2021,LULC_2022,CVA_Magnitude,CVA_Direction
95,24.59,41.34,76.67,16.08,2.81,107.79,32.37,23.55,41.39,35.95,...,344.83,3440.29,1,1,1,1,1,1,0.0,0.0
96,16.93,43.33,79.61,15.51,2.95,121.0,32.82,17.04,44.98,38.81,...,260.11,3112.6,1,1,1,1,1,1,0.0,0.0
97,14.04,44.79,80.52,16.79,2.91,117.86,32.97,14.37,49.67,37.68,...,316.23,2152.32,1,1,1,1,1,1,0.0,0.0
98,19.89,43.18,78.25,17.24,2.78,107.0,32.67,19.43,47.15,35.23,...,411.17,1775.18,1,1,1,1,1,1,0.0,0.0
99,14.4,41.37,83.48,13.05,3.26,145.71,32.64,17.71,37.4,44.8,...,1686.9,3080.91,4,5,5,4,5,4,0.0,0.0


In [7]:
import os
import numpy as np
import pandas as pd
import rasterio
import geopandas as gpd
from rasterio.features import shapes
from rasterio.warp import reproject, Resampling
from shapely.geometry import shape, Point
from scipy.spatial import cKDTree
from pyproj import CRS, Transformer

# ===================== 1. Load River Sample Points from File ===================== #
sampling_file_path = "100Sampling.shp"
# Report the file actually being read; the old message hard-coded "200" points
# while loading a different shapefile.
print(f"Step 1: Loading river points from '{sampling_file_path}'...")
try:
    # Load the user's pre-created shapefile
    river_sample_gdf = gpd.read_file(sampling_file_path)
    num_samples = len(river_sample_gdf)

    # Reduce non-point geometries to centroids *before* reprojecting, i.e. in
    # the file's native CRS. Taking centroids after converting to EPSG:4326
    # (a geographic CRS) is inexact and makes geopandas warn. A point's
    # centroid is the point itself, so pure point layers are left untouched.
    if not (river_sample_gdf.geom_type == "Point").all():
        river_sample_gdf["geometry"] = river_sample_gdf.geometry.centroid

    # Step 2 interpolates against Long/Lat columns, so work in EPSG:4326.
    if river_sample_gdf.crs != "EPSG:4326":
        river_sample_gdf = river_sample_gdf.to_crs("EPSG:4326")

except FileNotFoundError:
    print(f"❌ Error: The file '{sampling_file_path}' was not found. Please ensure it is in the correct directory.")
    exit()
except Exception as e:
    print(f"❌ Error: Could not read '{sampling_file_path}'. Please check the file's integrity and format. Error: {e}")
    exit()

# One (x, y) row per sample point, in GeoDataFrame row order.
river_coords = np.array([[p.x, p.y] for p in river_sample_gdf.geometry])
print(f"✅ Successfully loaded {num_samples} points from the shapefile.")
print("Step 1 Complete.\n")

# ===================== 2. Interpolate Initial Features ===================== #
print("Step 2: Interpolating initial features...")
season_csv_path = "../data/WinterSeason1.csv"
try:
    data = pd.read_csv(season_csv_path)
except FileNotFoundError:
    # Name the file that was actually requested; the message used to refer
    # to RainySeason.csv, which this cell does not read.
    print(f"❌ Error: {season_csv_path} not found. Please check the file path.")
    exit()

# Known sample locations as (Long, Lat) rows, matching river_coords' order.
coords = data[['Long', 'Lat']].values
features_to_interpolate = ['CrW', 'NiW', 'CuW', 'AsW', 'CdW', 'PbW', 'MW', 'SandW', 'SiltW', 'ClayW', 'FeW', "RI"]
numeric_features = data[features_to_interpolate]

def idw_interpolation(known_coords, known_values, query_coords, power=2, k=4):
    """Interpolate values at query points by inverse-distance weighting.

    Each query point receives the weighted mean of its ``k`` nearest known
    points, with weights ``1 / distance ** power`` normalised to sum to 1.

    Args:
        known_coords: Array-like of shape (n_known, n_dims) with the locations
            of the known samples.
        known_values: Array-like of length n_known with the value at each
            known location.
        query_coords: Array-like of shape (n_query, n_dims) with the locations
            to interpolate at.
        power: Exponent applied to the distance in the weights.
        k: Number of nearest neighbours to use. Clamped to the number of
            known points.

    Returns:
        1-D float array of length n_query with the interpolated values.

    Raises:
        ValueError: If ``known_coords`` is empty.
    """
    known_coords = np.asarray(known_coords, dtype=float)
    known_values = np.asarray(known_values, dtype=float)
    if len(known_coords) == 0:
        raise ValueError("idw_interpolation needs at least one known point")

    # When k exceeds the number of points, cKDTree pads the result with
    # infinite distances and an out-of-range index, which made the value
    # lookup below raise IndexError. Clamp k instead.
    k = max(1, min(int(k), len(known_coords)))

    tree = cKDTree(known_coords)
    dists, idxs = tree.query(query_coords, k=k)
    # query() drops the neighbour axis when k == 1; restore (n_query, k).
    dists = np.asarray(dists, dtype=float).reshape(-1, k)
    idxs = np.asarray(idxs).reshape(-1, k)

    # Avoid division by zero when a query point coincides with a known point;
    # the tiny distance makes that point dominate the weighted mean.
    dists[dists == 0] = 1e-10
    weights = 1 / (dists ** power)
    weights /= weights.sum(axis=1, keepdims=True)
    return np.sum(weights * known_values[idxs], axis=1)

# Interpolate every feature onto the river points. Each column of the
# resulting matrix holds one feature, in the order of numeric_features.
interpolated_features = np.column_stack([
    idw_interpolation(coords, numeric_features[name].values, river_coords)
    for name in numeric_features.columns
])

# Attach the point coordinates and a provenance tag to the interpolated table.
river_df = pd.DataFrame(interpolated_features, columns=numeric_features.columns).assign(
    Long=river_coords[:, 0],
    Lat=river_coords[:, 1],
    Source='River_Interpolated',
)
print("Step 2 Complete.\n")

# ===================== 3. Calculate Hydrological and LULC Features (Optimized) ===================== #
print("Step 3: Calculating hydrological and LULC features...")
# Input rasters and the output path for the NDWI grid aligned to the DEM.
dem_path, ndwi_path = "DEMF.tif", "CalIndices/ndwi.tif"
aligned_ndwi_path = "ndwi_aligned.tif"

print("    - Aligning rasters...")
def align_rasters(base_raster_path, match_raster_path, out_raster_path):
    """Resample band 1 of one raster onto another raster's grid and save it.

    Args:
        base_raster_path: Raster whose CRS, transform, width and height define
            the output grid.
        match_raster_path: Raster whose first band is reprojected (bilinear)
            onto that grid.
        out_raster_path: Path the aligned single-band raster is written to.

    Returns:
        ``out_raster_path``.

    Note:
        If a raster cannot be opened (``RasterioIOError``), an error message
        is printed and ``exit()`` is called.
    """
    try:
        with rasterio.open(base_raster_path) as base, rasterio.open(match_raster_path) as match:
            # Pass the source nodata value through so bilinear resampling does
            # not blend nodata cells into valid ones. None keeps the defaults.
            src_nodata = match.nodata
            reprojected = np.empty((base.height, base.width), dtype=np.float32)
            reproject(
                source=match.read(1),
                destination=reprojected,
                src_transform=match.transform,
                src_crs=match.crs,
                src_nodata=src_nodata,
                dst_transform=base.transform,
                dst_crs=base.crs,
                dst_nodata=src_nodata,
                resampling=Resampling.bilinear
            )
            # The output is exactly one float32 band. Reusing the base
            # raster's dtype and band count unchanged would describe a
            # different file: with an integer base raster the fractional
            # resampled values would be coerced to that integer type.
            out_meta = base.meta.copy()
            out_meta.update(dtype="float32", count=1, nodata=src_nodata)
            with rasterio.open(out_raster_path, 'w', **out_meta) as dst:
                dst.write(reprojected, 1)
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error: Could not open raster files for alignment. Please check the file paths. Error: {e}")
        exit()
    return out_raster_path

# Write the NDWI raster resampled onto the DEM grid to aligned_ndwi_path.
align_rasters(dem_path, ndwi_path, aligned_ndwi_path)

print("    - Loading DEM and NDWI...")
# Only the DEM's georeferencing (transform, CRS, pixel size) is read here;
# no pixel data is loaded.
with rasterio.open(dem_path) as dem_src:
    dem_transform, dem_crs = dem_src.transform, dem_src.crs
    dem_resolution = dem_src.res[0]

# Build point geometries from the Long/Lat columns (declared as EPSG:4326)
# and move them into the DEM's CRS in one chained expression.
sample_points = gpd.points_from_xy(river_df.Long, river_df.Lat)
samples_gdf = gpd.GeoDataFrame(river_df, geometry=sample_points, crs="EPSG:4326").to_crs(dem_crs)

print("    - Reading vector files (brickfields and industries)...")
brickfields_path = "brick_field_point.shp"
industries_path = "industry_point.shp"
try:
    brickfields = gpd.read_file(brickfields_path).to_crs(dem_crs)
    industries = gpd.read_file(industries_path).to_crs(dem_crs)
except Exception as e:
    print(f"❌ Error: Could not read shapefiles. Please check the file paths and ensure they are valid. Error: {e}")
    exit()

# Replace each layer's geometry with its centroid so every feature is a point.
for layer in (samples_gdf, brickfields, industries):
    layer["geometry"] = layer.geometry.centroid

# --- Count brickfields and industries within a buffer around each sample ---
print("    - Counting nearby brickfields and industries...")
# Buffer radius in units of the DEM CRS.
# NOTE(review): assumed to be metres — confirm the DEM CRS is projected.
buffer_distance = 500

samples_buffered = samples_gdf.copy()
samples_buffered['geometry'] = samples_buffered.geometry.buffer(buffer_distance)

# Use an inner join so only real (buffer, facility) intersections produce rows.
# The previous how="left" join kept one placeholder row for every sample with
# no match, so groupby().size() reported 1 where the true count is 0.
sjoin_brick = gpd.sjoin(samples_buffered, brickfields, how="inner", predicate="intersects")
brick_counts = sjoin_brick.groupby(level=0).size()
# reindex() restores samples that had no match at all, with a count of 0.
samples_gdf['num_brick_field'] = brick_counts.reindex(samples_gdf.index, fill_value=0).astype(int)

sjoin_ind = gpd.sjoin(samples_buffered, industries, how="inner", predicate="intersects")
ind_counts = sjoin_ind.groupby(level=0).size()
samples_gdf['num_industry'] = ind_counts.reindex(samples_gdf.index, fill_value=0).astype(int)

print("    - Counting complete.")
# -----------------------------------------------------------------------------

def world_to_pixel(transform, x, y):
    """Convert world coordinates to the (row, col) of the containing pixel.

    Args:
        transform: Pixel-to-world transform that supports
            ``~transform * (x, y)`` (here: the DEM dataset's ``.transform``).
        x: World x coordinate.
        y: World y coordinate.

    Returns:
        Tuple ``(row, col)`` of Python ints. Values are negative or exceed the
        raster shape when the point lies outside the raster.
    """
    col, row = ~transform * (x, y)
    # Floor rather than int(): int() truncates toward zero, so a fractional
    # index in (-1, 0) — a point just left of / above the raster origin —
    # would be reported as pixel 0 instead of -1.
    return int(np.floor(row)), int(np.floor(col))

def compute_distances_euclidean(points_gdf, targets_gdf, transform, resolution):
    """Return each point's straight-line distance to its nearest target.

    Both layers are converted to integer pixel indices with
    ``world_to_pixel``; the nearest-neighbour distance is measured in pixels
    and then multiplied by ``resolution``. Distances are therefore quantised
    to the pixel grid, and the single ``resolution`` factor is applied to
    both axes.

    Args:
        points_gdf: Layer whose ``geometry.x`` / ``geometry.y`` give the
            query locations.
        targets_gdf: Layer whose ``geometry.x`` / ``geometry.y`` give the
            candidate target locations.
        transform: Raster transform passed through to ``world_to_pixel``.
        resolution: Size of one pixel, used to scale pixel distances.

    Returns:
        1-D float array with one distance per row of ``points_gdf``, in row
        order. All values are NaN when ``targets_gdf`` is empty, since no
        nearest target exists.
    """
    def _to_pixels(gdf):
        # Always yields shape (n, 2), including n == 0.
        return np.array(
            [world_to_pixel(transform, x, y) for x, y in zip(gdf.geometry.x, gdf.geometry.y)],
            dtype=float,
        ).reshape(-1, 2)

    point_pixels = _to_pixels(points_gdf)
    target_pixels = _to_pixels(targets_gdf)

    # A KD-tree cannot be built from zero targets; report "undefined" instead
    # of crashing the whole pipeline.
    if len(target_pixels) == 0:
        return np.full(len(point_pixels), np.nan)
    if len(point_pixels) == 0:
        return np.array([])

    # One vectorised query replaces the per-point Python loop.
    dist_pixels, _ = cKDTree(target_pixels).query(point_pixels)
    return dist_pixels * resolution

print("    - Calculating Euclidean-based flow-path distances...")
samples_gdf["hydro_dist_brick"] = compute_distances_euclidean(samples_gdf, brickfields, dem_transform, dem_resolution)
samples_gdf["hydro_dist_ind"] = compute_distances_euclidean(samples_gdf, industries, dem_transform, dem_resolution)

print("    - Starting LULC extraction loop...")
lulc_dir = "LULCMerged"
years = [2017, 2018, 2019, 2020, 2021, 2022]
for y in years:
    lulc_path = os.path.join(lulc_dir, f"LULC{y}.tif")
    print(f"      - Processing LULC for year {y}...")
    try:
        with rasterio.open(lulc_path) as lulc_src:
            # samples_gdf is in the DEM CRS. Sample in the raster's own CRS so
            # a LULC raster stored in a different CRS is not silently read at
            # the wrong locations. to_crs() leaves matching CRSs unchanged.
            sample_geoms = samples_gdf.geometry
            if lulc_src.crs is not None:
                sample_geoms = sample_geoms.to_crs(lulc_src.crs)
            sample_xy = list(zip(sample_geoms.x, sample_geoms.y))
            # sample() yields one array of band values per point; keep band 1.
            lulc_values = [band_values[0] for band_values in lulc_src.sample(sample_xy)]
            samples_gdf[f"LULC_{y}"] = lulc_values
    except rasterio.errors.RasterioIOError as e:
        print(f"❌ Error processing {lulc_path}: {e}")
        # Keep the column present (all NaN) so later steps see a stable schema.
        samples_gdf[f"LULC_{y}"] = np.nan
print("    - LULC extraction loop complete.")

# --- LULC change between 2017 and 2022 as a signed difference and its magnitude ---
print("    - Calculating LULC changes using Change Vector Analysis (CVA)...")
if {"LULC_2017", "LULC_2022"}.issubset(samples_gdf.columns):
    lulc_start = samples_gdf["LULC_2017"].astype(float)
    lulc_end = samples_gdf["LULC_2022"].astype(float)
    # NOTE(review): the LULC values look like class codes; the difference of
    # two codes is only meaningful if the codes are ordinal — confirm.
    lulc_change_vector = lulc_end - lulc_start
    samples_gdf["CVA_Magnitude"] = lulc_change_vector.abs()
    samples_gdf["CVA_Direction"] = lulc_change_vector
else:
    print("❌ Error: LULC_2017 and LULC_2022 columns not found. Cannot perform CVA.")
print("Step 3 Complete.\n")

# ===================== 4. Save Final Output ===================== #
print("Step 4: Saving final output...")
os.makedirs("data", exist_ok=True)
output_name = f"Samples_{num_samples}W"
# output_name already carries the "W" suffix; appending a second one wrote the
# shapefile as "...WW.shp" while the CSV was named "...W.csv".
samples_gdf.to_file(f"data/{output_name}.shp")

# Round measurement columns to 2 decimals for the CSV, but leave Long/Lat
# untouched: these are degrees, and two decimals is roughly 1 km, which would
# move or merge neighbouring sample points in the export.
coordinate_columns = {"Long", "Lat"}
columns_to_round = [
    name for name in samples_gdf.select_dtypes(include="number").columns
    if name not in coordinate_columns
]
samples_gdf[columns_to_round] = samples_gdf[columns_to_round].round(2)
samples_gdf.drop(columns="geometry").to_csv(f"data/{output_name}.csv", index=False)

print(f"✅ Final dataset with {num_samples} sample(s) saved.")

# ===================== 5. Calculate and Print Mean LULC Changes ===================== #
print("\nStep 5: Calculating and printing mean LULC changes...")
cva_columns = ["CVA_Magnitude", "CVA_Direction"]
for col in cva_columns:
    # The CVA columns are absent when the change computation was skipped.
    if col in samples_gdf.columns:
        mean_change = samples_gdf[col].mean()
        print(f"    - Mean value for '{col}': {mean_change:.4f}")
print("Step 5 Complete.\n")

Step 1: Loading 200 river points from '100Sampling.shp'...
✅ Successfully loaded 100 points from the shapefile.
Step 1 Complete.

Step 2: Interpolating initial features...
Step 2 Complete.

Step 3: Calculating hydrological and LULC features...
    - Aligning rasters...



  river_sample_gdf["geometry"] = river_sample_gdf.geometry.centroid


    - Loading DEM and NDWI...
    - Reading vector files (brickfields and industries)...
    - Counting nearby brickfields and industries...
    - Counting complete.
    - Calculating Euclidean-based flow-path distances...
    - Starting LULC extraction loop...
      - Processing LULC for year 2017...
      - Processing LULC for year 2018...
      - Processing LULC for year 2019...
      - Processing LULC for year 2020...
      - Processing LULC for year 2021...
      - Processing LULC for year 2022...
    - LULC extraction loop complete.
    - Calculating LULC changes using Change Vector Analysis (CVA)...
Step 3 Complete.

Step 4: Saving final output...
✅ Final dataset with 100 sample(s) saved.

Step 5: Calculating and printing mean LULC changes...
    - Mean value for 'CVA_Magnitude': 0.0400
    - Mean value for 'CVA_Direction': -0.0400
Step 5 Complete.



  samples_gdf.to_file(f"data/{output_name}W.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
