In [2]:
import pandas as pd
import geopandas as gpd

# # --- Configuration ---
# UPDATE this with the actual path to your input CSV file
input_csv_path = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v5.csv"
# UPDATE this with the desired path for your output shapefile
output_shapefile_path = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_sites.shp"

# --- Main Script ---

try:
    # 1. Read the CSV file into a Pandas DataFrame
    print(f"Reading data from: {input_csv_path}")
    df = pd.read_csv(input_csv_path)

    # 2. Define the columns to keep
    columns_to_keep = ['site_reference', 'latitudelongitude']
    
    # It seems 'latitude' and 'longitude' are in a single column. Let's split it.
    # If they are separate columns already, you can adjust this part.
    # Assuming the format is "latitude,longitude"
    if 'latitudelongitude' in df.columns:
        # print("Splitting 'latitudelongitude' column into 'latitude' and 'longitude'...")
        # df[['latitude', 'longitude']] = df['latitudelongitude'].astype(str).str.split(',', expand=True)
        
        # Convert new columns to numeric, coercing errors
        df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
        df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')
        
        # Drop rows where conversion failed
        df.dropna(subset=['latitude', 'longitude'], inplace=True)
        
        columns_to_keep = ['site_reference', 'latitude', 'longitude']
    else:
        # If you have separate 'latitude' and 'longitude' columns already
        columns_to_keep = ['site_reference', 'latitude', 'longitude']


    print(f"Selecting columns: {columns_to_keep}")
    sites_df = df[columns_to_keep].copy()

    # 3. Drop duplicate rows to get unique sites
    print(f"Original number of rows: {len(sites_df)}")
    sites_df.drop_duplicates(inplace=True)
    print(f"Number of unique sites after dropping duplicates: {len(sites_df)}")

    # 4. Create a GeoDataFrame
    # The geometry is created from the longitude and latitude columns
    print("Converting Pandas DataFrame to GeoPandas GeoDataFrame...")
    gdf = gpd.GeoDataFrame(
        sites_df,
        geometry=gpd.points_from_xy(sites_df.longitude, sites_df.latitude)
    )

    # 5. Set the Coordinate Reference System (CRS)
    # WGS84 (EPSG:4326) is the standard for latitude/longitude data
    print("Setting Coordinate Reference System (CRS) to EPSG:4326 (WGS84)...")
    gdf.set_crs(epsg=4326, inplace=True)

    # 6. Save the GeoDataFrame to a shapefile
    print(f"Saving GeoDataFrame to shapefile: {output_shapefile_path}")
    gdf.to_file(output_shapefile_path, driver='ESRI Shapefile')

    print("\nProcessing complete!")
    print(f"Shapefile '{output_shapefile_path}' created successfully.")
    print("\nFirst 5 rows of the final GeoDataFrame:")
    print(gdf.head())

except FileNotFoundError:
    print(f"Error: The file was not found at '{input_csv_path}'. Please update the path.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")



Reading data from: /explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v5.csv


  df = pd.read_csv(input_csv_path)


Selecting columns: ['site_reference', 'latitude', 'longitude']
Original number of rows: 642655
Number of unique sites after dropping duplicates: 2139
Converting Pandas DataFrame to GeoPandas GeoDataFrame...
Setting Coordinate Reference System (CRS) to EPSG:4326 (WGS84)...
Saving GeoDataFrame to shapefile: /explore/nobackup/people/spotter5/anna_v/v2/v2_model_sites.shp

Processing complete!
Shapefile '/explore/nobackup/people/spotter5/anna_v/v2/v2_model_sites.shp' created successfully.

First 5 rows of the final GeoDataFrame:
                                      site_reference  latitude   longitude  \
0  Bonanza Creek Experimental Forest_FP1A-Sandbar...     64.68 -148.316667   
1  Bonanza Creek Experimental Forest_FP2A-Alder-P...     64.68 -148.316667   
2  Bonanza Creek Experimental Forest_FP3A-Poplar_...     64.68 -148.316667   
3  Bonanza Creek Experimental Forest_FP4A-White S...     64.68 -148.316667   
4  Bonanza Creek Experimental Forest_FP5A- Black ...     64.68 -148.316667   

 

  gdf.to_file(output_shapefile_path, driver='ESRI Shapefile')
  ogr_write(


In [3]:
import geopandas as gpd

# Load the shapefile
gdf = gpd.read_file("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_sites.shp")

# Count the occurrences of each site in the 'site_reference' column
site_counts = gdf['site_refer'].value_counts()

# Filter to find which sites appear more than once
duplicate_sites = site_counts[site_counts > 1]

# Print the results
if duplicate_sites.empty:
    print("✅ All sites appear only once.")
else:
    print("🚨 Found sites with more than 1 count:")
    print(duplicate_sites)

🚨 Found sites with more than 1 count:
Storflaket_StorflaketDSS1_agg_chamber    2
Storflaket_StorflaketDSS2_agg_chamber    2
Storflaket_StorflaketDSS3_agg_chamber    2
Storflaket_StorflaketDSS4_agg_chamber    2
Storflaket_StorflaketDSS5_agg_chamber    2
                                        ..
Stordalen_StordalenSE2_agg_chamber       2
Stordalen_StordalenSE3_agg_chamber       2
Stordalen_StordalenSE4_agg_chamber       2
Stordalen_StordalenSE5_agg_chamber       2
Stordalen_StordalenPBS5_agg_chamber      2
Name: site_refer, Length: 157, dtype: int64
