Read in packages

In [22]:
import pandas as pd
import os
import numpy as np
import xarray as xr
import rioxarray
import glob
import random
import geopandas as gpd
from sklearn.utils import shuffle
from MightyMosaic import MightyMosaic
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold


In [10]:
# Load your data
df = gpd.read_file('/explore/nobackup/people/spotter5/cnn_mapping/raw_files/nbac_1985.shp')
df2 = gpd.read_file('/explore/nobackup/people/spotter5/cnn_mapping/raw_files/ak_mtbs_1985.shp').to_crs(df.crs)

df['AOI'] = 'NBAC'
df2['AOI'] = 'MTBS'


df = pd.concat([df, df2], ignore_index = True)

df = df[['AOI', 'ID']]
# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Preparing indices for entire data shuffling
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split data into 5 folds
fold = 0
results = []
for train_val_index, test_index in kf.split(df_shuffled):
    # Extract test set (20% of total)
    test_df = df_shuffled.iloc[test_index]

    # Extract the remaining data for this fold
    train_val_df = df_shuffled.iloc[train_val_index]

    # Further split remaining data into training (70% of total, 87.5% of remaining) and validation (10% of total, 12.5% of remaining)
    train_df, val_df = train_test_split(train_val_df, test_size=0.125, random_state=42)  # 0.125 * 0.8 = 0.1 ~ 10% of total

    # Save each fold's datasets to CSV files
    train_df.to_csv(f'/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/train_fold_{fold}.csv', index=False)
    val_df.to_csv(f'/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/val_fold_{fold}.csv', index=False)
    test_df.to_csv(f'/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/test_fold_{fold}.csv', index=False)

    fold += 1
    print(f"Fold {fold}: Train {train_df.shape}, Validation {val_df.shape}, Test {test_df.shape}")
    results.append((train_df.shape, val_df.shape, test_df.shape))


Fold 1: Train (8697, 2), Validation (1243, 2), Test (2485, 2)
Fold 2: Train (8697, 2), Validation (1243, 2), Test (2485, 2)
Fold 3: Train (8697, 2), Validation (1243, 2), Test (2485, 2)
Fold 4: Train (8697, 2), Validation (1243, 2), Test (2485, 2)
Fold 5: Train (8697, 2), Validation (1243, 2), Test (2485, 2)


New try

In [29]:
import geopandas as gpd
import pandas as pd
from sklearn.model_selection import train_test_split

# Load and prepare data
df = gpd.read_file('/explore/nobackup/people/spotter5/cnn_mapping/raw_files/nbac_1985.shp')
df2 = gpd.read_file('/explore/nobackup/people/spotter5/cnn_mapping/raw_files/ak_mtbs_1985.shp').to_crs(df.crs)

df['AOI'] = 'NBAC'
df2['AOI'] = 'MTBS'

# Concatenate the datasets
df = pd.concat([df, df2], ignore_index=True)
df = df[['AOI', 'ID']]

# Create a unique identifier
df['unique_id'] = df['AOI'].astype(str) + '_' + df['ID'].astype(str)

# Verify uniqueness of 'unique_id'
if df['unique_id'].duplicated().any():
    print("There are duplicates in the unique identifier.")
else:
    print("All unique identifiers are unique.")

# Shuffle once to randomize
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Initialize a set to track used indices
used_indices = set()
num_samples = len(df)
fold_size = num_samples // 5

# Ensure non-overlapping test sets
for fold in range(5):
    # Identify available indices
    available_indices = list(set(range(num_samples)) - used_indices)
    
    # For the last fold, include all remaining samples
    if fold == 4:
        test_indices = available_indices
    else:
        test_indices = pd.Series(available_indices).sample(n=fold_size, random_state=fold).tolist()
    used_indices.update(test_indices)
    
    # Select test, training, and validation sets
    test_df = df.iloc[test_indices]
    train_val_df = df.drop(test_indices)
    
    # Split train_val_df into training and validation sets (80% train, 20% val)
    train_df, val_df = train_test_split(train_val_df, test_size=0.125, random_state=fold)  # 0.125 * 0.8 = 0.1
    
    # Save datasets to CSV
    train_df.to_csv(f'/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/train_fold_{fold}.csv', index=False)
    val_df.to_csv(f'/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/val_fold_{fold}.csv', index=False)
    test_df.to_csv(f'/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/test_fold_{fold}.csv', index=False)
    
    print(f"Fold {fold}: Train {train_df.shape}, Validation {val_df.shape}, Test {test_df.shape}")


All unique identifiers are unique.
Fold 0: Train (8697, 3), Validation (1243, 3), Test (2485, 3)
Fold 1: Train (8697, 3), Validation (1243, 3), Test (2485, 3)
Fold 2: Train (8697, 3), Validation (1243, 3), Test (2485, 3)
Fold 3: Train (8697, 3), Validation (1243, 3), Test (2485, 3)
Fold 4: Train (8697, 3), Validation (1243, 3), Test (2485, 3)


In [33]:
import pandas as pd

# Define file paths for each fold's test set
# test_files = [
#     '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/test_fold_0.csv',
#     '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/test_fold_1.csv',
#     '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/test_fold_2.csv',
#     '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/test_fold_3.csv',
#     '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/test_fold_4.csv'
# ]

test_files = [
    '/explore/nobackup/people/spotter5/cnn_mapping/Russia/test_fold_0.csv',
    '/explore/nobackup/people/spotter5/cnn_mapping/Russia/test_fold_1.csv',
    '/explore/nobackup/people/spotter5/cnn_mapping/Russia/test_fold_2.csv',
    '/explore/nobackup/people/spotter5/cnn_mapping/Russia/test_fold_3.csv',
    '/explore/nobackup/people/spotter5/cnn_mapping/Russia/test_fold_4.csv'
]


# Load each test set and concatenate them into a single DataFrame
test_dfs = [pd.read_csv(file) for file in test_files]
all_test_df = pd.concat(test_dfs, ignore_index=True)

# Check for duplicate entries based on both 'AOI' and 'ID' columns
# duplicates = all_test_df[all_test_df.duplicated(subset=['AOI', 'ID'], keep=False)]
duplicates = all_test_df[all_test_df.duplicated(subset=['ID'], keep=False)]

# Print results
if duplicates.empty:
    print("No overlap found between the test sets.")
else:
    print("Overlap found between the test sets:")
    print(duplicates)


No overlap found between the test sets.


In [25]:
duplicates.shape

(2354, 2)

In [20]:
test_id = test_df['ID']
train_id = train_df['ID']
val_id = val_df['ID']

#
common_elements = set(train_id).intersection(val_id)

# Convert the set back to a list, if needed
common_elements_list = list(common_elements)

print(common_elements_list)

[1025, 518, 7, 511, 523, 1036, 1039, 529, 19, 532, 533, 1047, 535, 1052, 541, 30, 1055, 543, 1057, 549, 550, 555, 45, 48, 562, 563, 50, 1077, 53, 1083, 1084, 574, 66, 1092, 582, 585, 83, 599, 89, 602, 1115, 603, 1119, 1120, 610, 100, 103, 104, 619, 1132, 1134, 115, 1142, 119, 1144, 121, 1146, 637, 1150, 641, 643, 645, 137, 651, 1167, 656, 145, 655, 1173, 151, 671, 673, 675, 676, 165, 173, 695, 184, 189, 702, 703, 709, 202, 724, 216, 730, 733, 228, 231, 753, 245, 249, 767, 257, 773, 276, 277, 795, 797, 798, 800, 289, 802, 808, 819, 820, 308, 312, 836, 328, 329, 841, 333, 334, 340, 352, 354, 355, 867, 869, 870, 368, 882, 370, 884, 372, 371, 887, 383, 384, 897, 901, 397, 915, 916, 924, 415, 421, 425, 432, 946, 950, 440, 953, 442, 464, 466, 474, 476, 993, 481, 995, 998, 999, 487, 493, 498, 1020, 509, 1023]


In [4]:
df.to_csv('/explore/nobackup/people/spotter5/cnn_mapping/Russia/delete.csv')

In [5]:
df.shape

(7740, 4)

For some reason I don't have all the fires in my val sets, need to see why



In [23]:
# Load each test fold
one = pd.read_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/test_fold_0.csv', usecols=['ID'])
two = pd.read_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/test_fold_1.csv', usecols=['ID'])
three = pd.read_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/test_fold_2.csv', usecols=['ID'])
four = pd.read_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/test_fold_3.csv', usecols=['ID'])
five = pd.read_csv('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/test_fold_4.csv', usecols=['ID'])

# Concatenate all test IDs with an indicator column for each fold
one['fold'] = 0
two['fold'] = 1
three['fold'] = 2
four['fold'] = 3
five['fold'] = 4
merged = pd.concat([one, two, three, four, five])

# Count occurrences of each ID across all folds
id_counts = merged.groupby('ID').size()

# Get IDs that appear in more than one fold
common_ids = id_counts[id_counts > 1].index.tolist()

if common_ids:
    print("Common IDs found across folds:", common_ids)
else:
    print("No common IDs across folds.")

Common IDs found across folds: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 2

In [17]:
# Merge the dataframes on the 'ID' column
# merged = pd.merge(one, two, on='ID')
# merged = pd.merge(merged, three, on='ID')
# merged = pd.merge(merged, four, on='ID')
# merged = pd.merge(merged, five, on='ID')

merged = pd.concat([one, two, three, four, five])

# Get the common IDs
common_ids = merged['ID'].unique()
print(common_ids)

[    1     4     9 ... 11240 11244 11247]


In [29]:

#12450 ,12375. #12450 is in one
if 12450 in common_ids:
    
    print("in list")
else:
    print('not in list')

not in list


In [4]:
import geopandas as gpd

df = pd.read_csv('/explore/nobackup/people/spotter5/cnn_mapping/Russia/anna_poly_check_ee.csv')
df = df.rename(columns={'Image': 'ID'})
df = df[df['ID'].isin(['29690000000000-0000000000', '29690000000000-0000023296']) == False]
df['ID'] = df['ID'].astype(int)

# Load shapefile data and merge with main dataframe
in_shape = gpd.read_file('/explore/nobackup/people/spotter5/cnn_mapping/Russia/anna_polygons.shp')
in_shape['geometry'] = in_shape['geometry'].apply(lambda geom: geom.wkt)
in_shape = pd.DataFrame(in_shape[['Year', 'ID', 'area']])
df = pd.merge(in_shape, df, on='ID', how='inner')
df['Year'] = df['Year'].astype(int)
df = df[df['Keep'] == 'Yes']

df = df.reset_index(drop = True)
df.shape

  in_shape['geometry'] = in_shape['geometry'].apply(lambda geom: geom.wkt)


(7740, 4)

make sure all the merged ids are in df

In [15]:
in_test = merged['ID'].unique()

in_anna = df['ID'].unique()

result = [item for item in in_test if item not in in_anna]

result

[]

In [16]:
result2 = [item for item in in_anna if item not in in_test]
result2


[]

In [9]:
merged.head()

Unnamed: 0,ID
