In [None]:
import dask.dataframe as dd
import dask.array as da
import dask_ml.model_selection as dcv
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Set up Dask client to use multiple GPUs. Note to set temp space
local_directory='/explore/nobackup/people/spotter5/temp_dir',
client = Client(
                n_workers=1, threads_per_worker=1, processes=True, memory_limit='28GB')


# Set up a Dask cluster that assigns each worker to a separate GPU
# cluster = LocalCUDACluster(
#     n_workers=2,  # Number of GPUs you have
#     threads_per_worker=1,  # One thread per worker
#     memory_limit='28GB',  # Set memory limit for each worker
#     local_directory='/explore/nobackup/people/spotter5/temp_dir'  # Temporary directory
# )

# Connect to the Dask client
# client = Client(cluster)
# Create an output directory if it doesn't exist
out_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/xgboost'
os.makedirs(out_path, exist_ok=True)

# Load the data as a Dask DataFrame
# df = dd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_na.parquet', 
#                      columns=['dNBR', 'dNDVI', 'dNDII', 'y'])

# # Shuffle the dataframe (keeps it in Dask format)
# df = df.shuffle(on='dNBR')

# # Sample 100,000 rows from the Dask DataFrame
# df = df.sample(frac=100000 / len(df), random_state=42)

# sampled_out_path = os.path.join(out_path, 'sampled_100k.parquet')
# df.to_parquet(sampled_out_path, write_index = False)
# print(f"Sampled data saved to {sampled_out_path}")

# df1 = dd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_mtbs_stratified_sampled_ndsi.parquet')
# df2 = dd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_nbac_stratified_sampled_ndsi.parquet')
# df3 = dd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_anna_stratified_sampled_ndsi.parquet')

df1 = dd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_mtbs_sampled_ndsi.parquet')
df2 = dd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_nbac_sampled_ndsi.parquet')
df3 = dd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_anna_sampled_ndsi.parquet')

# df1 = df1.repartition(npartitions=10) 
# df2 = df2.repartition(npartitions=10) 

df = dd.concat([df1, df2, df3])
# df = df.repartition(npartitions=10)

# Specify predictors and target variables, converting directly from Dask DataFrame to Dask Array
X = df[['dNBR', 'dNDVI', 'dNDII']].to_dask_array(lengths=True)
y = df['y'].to_dask_array(lengths=True)

# X = df[['dNBR', 'dNDVI', 'dNDII']].values
# y = df['y'].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = dcv.train_test_split(X, y, test_size=0.2, random_state=42, shuffle = True)
# X_train, X_test, y_train, y_test = dcv.train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to DaskDMatrix (XGBoost specific data structure for distributed training)
dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
dtest = xgb.dask.DaskDMatrix(client, X_test, y_test)

# Set up XGBoost parameters for GPU training
params = {
    'objective': 'binary:logistic',  # Binary classification
    'learning_rate': 0.01,
    'max_depth': 8,
    'n_estimators': 1000,
    'tree_method': 'hist',  # Use histogram-based method
    'eval_metric': 'logloss',  # Metric for binary classification
    'device': 'cuda',  # Use CUDA for GPU support
}

# params = {
#     'objective': 'binary:logistic',  # Binary classification
#     'learning_rate': 0.1,
#     'max_depth': 8,
#     'n_estimators': 1000,
#     'tree_method': 'gpu_hist',  # Use GPU-accelerated histogram algorithm
#     'eval_metric': 'logloss',
#     'predictor': 'gpu_predictor'  # Use GPU predictor
# }


# Train the model with early stopping
model = xgb.dask.train(
    client, 
    params, 
    dtrain, 
    num_boost_round=100,
    evals=[(dtest, 'test')],
    early_stopping_rounds=10
)

# Make predictions (the output will be probabilities for binary classification)
y_pred_proba = xgb.dask.predict(client, model, X_test)

# Convert predicted probabilities to binary predictions
y_pred = (y_pred_proba > 0.5).astype(int)

# Convert Dask arrays to NumPy arrays for sklearn metrics
y_pred_np = y_pred.compute()
y_test_np = y_test.compute()

# Calculate classification metrics using sklearn
accuracy = accuracy_score(y_test_np, y_pred_np)
precision = precision_score(y_test_np, y_pred_np, average='binary')
recall = recall_score(y_test_np, y_pred_np, average='binary')
f1 = f1_score(y_test_np, y_pred_np, average='binary')
# Calculate IoU using confusion matrix
cm = confusion_matrix(y_test_np, y_pred_np)
TP = cm[1, 1]  # True Positives
FP = cm[0, 1]  # False Positives
FN = cm[1, 0]  # False Negatives
IoU = TP / (TP + FP + FN)

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"IoU: {IoU}")

# Save the classification metrics to a CSV file
results = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'IoU'],
    'Value': [accuracy, precision, recall, f1, IoU]
})

results.to_csv(os.path.join(out_path, 'xgboost_combined_sampled_results_ndsi.csv'), index=False)
print(f"Classification metrics saved to {os.path.join(out_path, 'xgboost_combined_sampled_results_ndsi.csv')}")

2024-09-17 14:20:39,075 - distributed.spill - ERROR - Spill to disk failed; keeping data in memory
Traceback (most recent call last):
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/distributed/spill.py", line 124, in _handle_errors
    yield
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/distributed/spill.py", line 199, in evict
    _, _, weight = self.fast.evict()
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/zict/common.py", line 127, in wrapper
    return func(*args, **kwargs)
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/zict/lru.py", line 227, in evict
    cb(key, value)
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/zict/buffer.py", line 139, in fast_to_slow
    self.slow[key] = value
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/zict/common.py", line 127, in wrapper
    return func(*args, **kwargs)
  File "/hom

In [None]:
't'

In [3]:
results

NameError: name 'results' is not defined

Above trains a small file, do it in batches

In [None]:
import dask.dataframe as dd
from dask.distributed import Client
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd
import os

# Set up Dask client for distributed training
client = Client(n_workers=4, threads_per_worker=1, processes=True, memory_limit='28GB')

# Output directory
out_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/xgboost'
os.makedirs(out_path, exist_ok=True)

# Read the Parquet directory with specific columns to control memory usage
df = dd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_na.parquet', 
                     columns=['dNBR', 'dNDVI', 'dNDII', 'y'])

# XGBoost parameters
params = {
    'objective': 'binary:logistic',  # Binary classification
    'learning_rate': 0.1,
    'max_depth': 8,
    'eval_metric': 'logloss',  # Metric for binary classification
    'tree_method': 'hist',  # Use histogram-based method for efficient training
    'device': 'cuda',  # Use GPU for training
}

# Get the number of partitions
npartitions = df.npartitions  # Number of partitions in the dataset

# Initialize booster
booster = None

# Process data in chunks (by partition)
for i in range(npartitions):
    print(f"Processing partition {i + 1} of {npartitions}")
    
    # Compute a single partition at a time
    df_partition = df.get_partition(i).compute()

    # Split the partition into features (X) and target (y)
    X_batch = df_partition[['dNBR', 'dNDVI', 'dNDII']].values
    y_batch = df_partition['y'].values
    
    # Convert to DMatrix for XGBoost
    dtrain = xgb.DMatrix(X_batch, label=y_batch)
    
    # If this is the first partition, train the initial model
    if booster is None:
        booster = xgb.train(params, dtrain, num_boost_round=10)  # Initial training on the first batch
    else:
        # Update the model with the next partition
        booster.update(dtrain, iteration=i)

# After training on all partitions, make predictions on the test data
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = dcv.train_test_split(df[['dNBR', 'dNDVI', 'dNDII']].compute(), df['y'].compute(), test_size=0.2, random_state=42)

# Convert test data to DMatrix
dtest = xgb.DMatrix(X_test)

# Make predictions on the test set
y_pred_proba = booster.predict(dtest)

# Convert predicted probabilities to binary predictions
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculate classification metrics using sklearn
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

# Calculate IoU using confusion matrix
cm = confusion_matrix(y_test, y_pred)
TP = cm[1, 1]  # True Positives
FP = cm[0, 1]  # False Positives
FN = cm[1, 0]  # False Negatives

# IoU = True Positives / (True Positives + False Positives + False Negatives)
IoU = TP / (TP + FP + FN)

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"IoU: {IoU}")

# Save the classification metrics to a CSV file
results = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'IoU'],
    'Value': [accuracy, precision, recall, f1, IoU]
})

results.to_csv(os.path.join(out_path, 'xgboost_classification_results_batch.csv'), index=False)
print(f"Classification metrics saved to {os.path.join(out_path, 'xgboost_classification_results_batch.csv')}")


Processing partition 1 of 11642
Processing partition 2 of 11642
Processing partition 3 of 11642


In [1]:
import dask.dataframe as dd
from dask.distributed import Client
import xgboost as xgb
from dask_ml.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd
import os

# Set up Dask client for distributed training
client = Client(local_directory='/explore/nobackup/people/spotter5/temp_dir', n_workers=4, threads_per_worker=1, processes=True, memory_limit='28GB')

# Output directory
out_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/xgboost'
os.makedirs(out_path, exist_ok=True)

# Read the Parquet directory lazily (without loading it all into memory)
df = dd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_na.parquet', 
                     columns=['dNBR', 'dNDVI', 'dNDII', 'y'])

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df[['dNBR', 'dNDVI', 'dNDII']], df['y'], test_size=0.2, random_state=42)

# Convert to DaskDMatrix for XGBoost
dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
dtest = xgb.dask.DaskDMatrix(client, X_test, y_test)

# XGBoost parameters
params = {
    'objective': 'binary:logistic',  # Binary classification
    'learning_rate': 0.1,
    'max_depth': 8,
    'eval_metric': 'logloss',  # Metric for binary classification
    'tree_method': 'hist',  # Use histogram-based method
    'device': 'cuda',  # Use GPU for training
}

# Dictionary to store evaluation results
evals_result = {}

# Train the model with Dask, capturing evaluation results
output = xgb.dask.train(client, params, dtrain, num_boost_round=100, evals=[(dtest, 'test')], evals_result=evals_result, verbose_eval=10)

# Get the trained booster model
booster = output['booster']

# Print loss for each iteration
for i, logloss in enumerate(evals_result['test']['logloss']):
    print(f"Iteration {i + 1}: Log Loss = {logloss}")

# Make predictions on the test set
y_pred_proba = xgb.dask.predict(client, booster, X_test)

# Convert predicted probabilities to binary predictions
y_pred = (y_pred_proba > 0.5).astype(int)

# Convert Dask arrays to NumPy arrays for sklearn metrics
y_pred_np = y_pred.compute()
y_test_np = y_test.compute()

# Calculate classification metrics using sklearn
accuracy = accuracy_score(y_test_np, y_pred_np)
precision = precision_score(y_test_np, y_pred_np, average='binary')
recall = recall_score(y_test_np, y_pred_np, average='binary')
f1 = f1_score(y_test_np, y_pred_np, average='binary')

# Calculate IoU using confusion matrix
cm = confusion_matrix(y_test_np, y_pred_np)
TP = cm[1, 1]  # True Positives
FP = cm[0, 1]  # False Positives
FN = cm[1, 0]  # False Negatives

# IoU = True Positives / (True Positives + False Positives + False Negatives)
IoU = TP / (TP + FP + FN)

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"IoU: {IoU}")

# Save the classification metrics to a CSV file
results = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'IoU'],
    'Value': [accuracy, precision, recall, f1, IoU]
})

results.to_csv(os.path.join(out_path, 'xgboost_classification_results_batch.csv'), index=False)
print(f"Classification metrics saved to {os.path.join(out_path, 'xgboost_classification_results_batch.csv')}")


This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.


TypeError: train() got an unexpected keyword argument 'evals_result'

In [2]:
import dask.dataframe as dd
# Read the data in chunks using Dask DataFrame (partitioning the data to avoid loading the full dataset into memory)
df = dd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_na.parquet', 
                     columns=['dNBR', 'dNDVI', 'dNDII', 'y'])
df.npartitions

11642

In [19]:
't'

't'

In [3]:
import xgboost as xgb
print(xgb.__version__)
from xgboost import DMatrix

# Check if GPU support is available
params = {'tree_method': 'gpu_hist'}
dtrain = DMatrix([[1, 2], [3, 4]], label=[1, 0])
bst = xgb.train(params, dtrain, num_boost_round=2)

print("GPU support is enabled.")


2.1.1



    E.g. tree_method = "hist", device = "cuda"



GPU support is enabled.


In [1]:
import pandas as pd
df = pd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_mtbs_ndsi.parquet',   columns=['dNBR', 'dNDVI', 'dNDII', 'y'])

# df.shape

df = df.sample(n = 100000000)

df.to_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_mtbs_sampled_ndsi.parquet', index = False)

df.shape

(100000000, 4)

In [2]:
import pandas as pd
df = pd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_mtbs_ndsi.parquet',   columns=['dNBR', 'dNDVI', 'dNDII', 'y'])


# Sample 100,000,000 rows from each class (y=0 and y=1)
df_0 = df[df['y'] == 0].sample(n=int(100000000/2), random_state=42)
df_1 = df[df['y'] == 1].sample(n= int(100000000/2), random_state=42)

# Concatenate the stratified samples
df_stratified = pd.concat([df_0, df_1])

# Shuffle the DataFrame (optional)
df_stratified = df_stratified.sample(frac=1, random_state=42)

# Save the stratified sample to a new parquet file
df_stratified.to_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_mtbs_stratified_sampled_ndsi.parquet', 
                         index=False)

In [1]:
import pandas as pd
df = pd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_nbac_ndsi.parquet',   columns=['dNBR', 'dNDVI', 'dNDII', 'y'])

# df.shape

df = df.sample(n = 100000000)

df.to_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_nbac_sampled_ndsi.parquet', index = False)

df.shape

(10247554867, 4)

In [2]:
df = df.sample(n = 100000000)

df.to_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_nbac_sampled_ndsi.parquet', index = False)


In [1]:
import pandas as pd
df = pd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_nbac_ndsi.parquet',  
                     columns=['dNBR', 'dNDVI', 'dNDII', 'y'],
                     engine='pyarrow')


# Sample 100,000,000 rows from each class (y=0 and y=1)
df_0 = df[df['y'] == 0].sample(n=int(100000000/2), random_state=42)
df_1 = df[df['y'] == 1].sample(n= int(100000000/2), random_state=42)

# Concatenate the stratified samples
df_stratified = pd.concat([df_0, df_1])

# Shuffle the DataFrame (optional)
# df_stratified = df_stratified.sample(frac=1, random_state=42)

# Save the stratified sample to a new parquet file
df_stratified.to_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_nbac_stratified_sampled_ndsi.parquet', 
                         index=False)

MemoryError: Unable to allocate 270. GiB for an array with shape (4, 9059240636) and data type float64

In [2]:
import dask.dataframe as dd

# Read the parquet file in smaller partitions
df = dd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_nbac_ndsi.parquet', 
                     columns=['dNBR', 'dNDVI', 'dNDII', 'y'], engine='pyarrow', chunksize="100MB")

# Create two separate dataframes for y=0 and y=1
df_0 = df[df['y'] == 0]
df_1 = df[df['y'] == 1]

# Sample from each partition incrementally to avoid memory overload
def stratified_sample(df, frac):
    return df.sample(frac=frac, random_state=42)

# Apply chunk-wise sampling for both classes
df_0_sample = df_0.map_partitions(stratified_sample, frac=50000000 / df_0.shape[0].compute())
df_1_sample = df_1.map_partitions(stratified_sample, frac=50000000 / df_1.shape[0].compute())

# Concatenate the samples
df_stratified = dd.concat([df_0_sample, df_1_sample])

# Shuffle the concatenated DataFrame

# Write the output to a parquet file, with the option to write in smaller partitions to reduce memory load
# df_stratified.compute().to_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_nbac_stratified_sampled_ndsi.parquet',
#                          write_index=False, compute_kwargs={'scheduler': 'threads'})

df_stratified.compute().to_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_nbac_stratified_sampled_ndsi.parquet', index = False)




Light GBM

In [2]:
import dask.dataframe as dd
import dask.array as da
import dask_ml.model_selection as dcv
from dask_ml.metrics import r2_score, mean_squared_error
from dask.distributed import Client, wait
import lightgbm as lgb
import pandas as pd
import dask.array as da
import numpy as np
import os

out_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/light_gmb'
os.makedirs(out_path, exist_ok = True)

# Set up Dask client to use multiple GPUs
client = Client(n_workers=4, threads_per_worker=1, processes=True, memory_limit='28GB')
client

# Load the Parquet fil/explore/nobackup/people/spotter5/cnn_mapping/nbac_training
df = dd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_na.parquet', 
                     columns=['dNBR', 'dNDVI', 'dNDII', 'y'])

#shuffle dataframe
df = df.shuffle(on = 'dNBR')

# Reset the index and select the first 1,000,000 rows
df = df.reset_index(drop=True).head(100000, compute=True)

# Specify predictor variables and target variable
X = df[['dNBR', 'dNDVI', 'dNDII']].to_dask_array(lengths=True)
y = df['y'].to_dask_array(lengths=True)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = dcv.train_test_split(X, y, test_size=0.2, random_state=42)

# Convert Dask arrays to Dask LightGBM dataset format
dtrain = lgb.DaskDMatrix(client, X_train, y_train)
dtest = lgb.DaskDMatrix(client, X_test, y_test)

# Set up LightGBM parameters for multi-GPU training
params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'max_depth': -1,
    'n_estimators': 1000,
    'metric': 'rmse',
    'device': 'gpu',  # Use GPU
    'gpu_use_dp': True,  # Enable distributed training across multiple GPUs
    'num_gpu': 4  # Number of GPUs available
}
# Train the model in batches using all GPUs
model = lgb.dask.train(
    client=client,
    params=params,
    train_set=dtrain,
    num_boost_round=100,
    valid_sets=[dtest],
    early_stopping_rounds=10,
    verbose_eval=True
)

# Make predictions on the test set
y_pred = lgb.dask.predict(client, model, X_test)

# Calculate R² and RMSE
r2 = r2_score(y_test, y_pred)
rmse = da.sqrt(mean_squared_error(y_test, y_pred))

# Calculate IOU (for regression tasks, we can approximate IOU with a threshold)
threshold = 0.5  # You can adjust this based on your specific problem
iou = da.mean((y_test > threshold) & (y_pred > threshold)) / da.mean((y_test > threshold) | (y_pred > threshold))

# Compute the results (wait for all Dask computations to finish)
r2_result, rmse_result, iou_result = client.compute([r2, rmse, iou], sync=True)

# Print the results
print(f"R²: {r2_result}")
print(f"RMSE: {rmse_result}")
print(f"IOU: {iou_result}")

# Save the results to a CSV file
results = pd.DataFrame({
    'Metric': ['R²', 'RMSE', 'IOU'],
    'Value': [r2_result, rmse_result, iou_result]
})

results.to_csv(os.path.join(out_path, 'delete.csv'), index=False)


Process Dask Worker process (from Nanny):
Process Dask Worker process (from Nanny):
Process Dask Worker process (from Nanny):
Process Dask Worker process (from Nanny):
2024-09-09 14:36:10,847 - distributed.nanny - ERROR - Worker process died unexpectedly
2024-09-09 14:36:10,847 - distributed.nanny - ERROR - Worker process died unexpectedly
2024-09-09 14:36:10,847 - distributed.nanny - ERROR - Worker process died unexpectedly
2024-09-09 14:36:10,847 - distributed.nanny - ERROR - Worker process died unexpectedly
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/spotter5/.conda/envs/lgb_gpu/lib/python3.9/site-packages/distributed/compatibility.py", line 236, in asyncio_run
    return loop.run_until_complete(main)
  File "/home/spotter5/.conda/envs/lgb_gpu/lib/python3.9/asyncio/base_events.py", line 634, in run_until_complete
    self.run_forever()
  File "/home/spotter5/.conda/envs/lgb_gpu/lib/python3.9/asyncio/base_events.py", line 601, in run_forever
  

KeyboardInterrupt: 

In [None]:
results

In [1]:
import lightgbm as lgb
print(lgb.__version__)
print(lgb.LGBMRegressor().gpu_use_dp)

4.5.0


AttributeError: 'LGBMRegressor' object has no attribute 'gpu_use_dp'

In [1]:
import subprocess

# Run nvidia-smi command to get GPU details
gpu_info = subprocess.run(['nvidia-smi', '--query-gpu=name', '--format=csv'], capture_output=True, text=True)

# Print the output
print("Available GPUs:")
print(gpu_info.stdout)

Available GPUs:
name
Tesla V100-SXM2-32GB
Tesla V100-SXM2-32GB
Tesla V100-SXM2-32GB
Tesla V100-SXM2-32GB



In [None]:
import dask.dataframe as dd
import dask.array as da
import dask_ml.model_selection as dcv
from dask_ml.metrics import r2_score, mean_squared_error
from dask.distributed import Client, wait
import lightgbm as lgb
import pandas as pd
import dask.array as da
import numpy as np
import os

out_path = '/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/light_gmb'
os.makedirs(out_path, exist_ok = True)

# Set up Dask client to use multiple GPUs
client = Client(n_workers=4, threads_per_worker=1, processes=True, memory_limit='25GB')
client

# Load the Parquet fil/explore/nobackup/people/spotter5/cnn_mapping/nbac_training
df = dd.read_parquet('/explore/nobackup/people/spotter5/cnn_mapping/nbac_training/parquet_files/all_training_na.parquet', 
                     columns=['dNBR', 'dNDVI', 'dNDII', 'y'])

# # Add a temporary column with random values to shuffle by
# df['shuffle_col'] = np.random.rand(len(df))

# # Shuffle the DataFrame based on the temporary column
# df = df.shuffle(on='shuffle_col')

# # Drop the temporary column after shuffling
# df = df.drop(columns='shuffle_col')

# Reset the index
df = df.reset_index(drop=True)

# Select the first 1,000,000 rows without computing (keep as Dask DataFrame)
df = df.head(100000, npartitions=-1)

# Specify predictor variables and target variable, convert to Dask Arrays
X = df[['dNBR', 'dNDVI', 'dNDII']].to_dask_array(lengths=True)
y = df['y'].to_dask_array(lengths=True)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Convert Dask arrays to Dask LightGBM dataset format
dtrain = lgb.DaskDMatrix(client, X_train, y_train)
dtest = lgb.DaskDMatrix(client, X_test, y_test)

# Set up LightGBM parameters for multi-GPU training
params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'max_depth': -1,
    'n_estimators': 1000,
    'metric': 'rmse',
    'device': 'gpu',  # Use GPU
    'gpu_platform_id': 0,  # Assuming single GPU platform, adjust if needed
    'gpu_use_dp': True,  # Enable distributed training across multiple GPUs
    'num_gpu': 4,  # Number of GPUs available
    'gpu_device_id': [0, 1, 2, 3]  # List of GPU IDs
}

# Train the model in batches using all GPUs
model = lgb.dask.train(
    client=client,
    params=params,
    train_set=dtrain,
    num_boost_round=100,
    valid_sets=[dtest],
    early_stopping_rounds=10,
    verbose_eval=True
)

# Make predictions on the test set
y_pred = lgb.dask.predict(client, model, X_test)

# Calculate R² and RMSE
r2 = r2_score(y_test, y_pred)
rmse = da.sqrt(mean_squared_error(y_test, y_pred))

# Calculate IOU (for regression tasks, we can approximate IOU with a threshold)
threshold = 0.5  # You can adjust this based on your specific problem
iou = da.mean((y_test > threshold) & (y_pred > threshold)) / da.mean((y_test > threshold) | (y_pred > threshold))

# Compute the results (wait for all Dask computations to finish)
r2_result, rmse_result, iou_result = client.compute([r2, rmse, iou], sync=True)

# Print the results
print(f"R²: {r2_result}")
print(f"RMSE: {rmse_result}")
print(f"IOU: {iou_result}")

# Save the results to a CSV file
results = pd.DataFrame({
    'Metric': ['R²', 'RMSE', 'IOU'],
    'Value': [r2_result, rmse_result, iou_result]
})

results.to_csv(os.path.join(out_path, 'delete.csv'), index=False)


Perhaps you already have a cluster running?
Hosting the HTTP server on port 35901 instead
2024-09-03 17:37:10,687 - distributed.spill - ERROR - Spill to disk failed; keeping data in memory
Traceback (most recent call last):
  File "/home/spotter5/.conda/envs/xgboost/lib/python3.9/site-packages/distributed/spill.py", line 124, in _handle_errors
    yield
  File "/home/spotter5/.conda/envs/xgboost/lib/python3.9/site-packages/distributed/spill.py", line 199, in evict
    _, _, weight = self.fast.evict()
  File "/home/spotter5/.conda/envs/xgboost/lib/python3.9/site-packages/zict/common.py", line 127, in wrapper
    return func(*args, **kwargs)
  File "/home/spotter5/.conda/envs/xgboost/lib/python3.9/site-packages/zict/lru.py", line 227, in evict
    cb(key, value)
  File "/home/spotter5/.conda/envs/xgboost/lib/python3.9/site-packages/zict/buffer.py", line 139, in fast_to_slow
    self.slow[key] = value
  File "/home/spotter5/.conda/envs/xgboost/lib/python3.9/site-packages/zict/common.py", 

In [6]:
results

NameError: name 'results' is not defined