In [None]:
import numpy as np
import pandas as pd
import xarray as xr
from datetime import datetime, timedelta

from cuml.cluster import DBSCAN as cuDBSCAN
from cuml.ensemble import RandomForestClassifier as cuRF
from cuml.metrics import accuracy_score
import cudf

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Load the NetCDF dataset and flatten it to a pandas DataFrame.
# mean_WVEL is the magnitude of the (mean_U10N_x, mean_U10N_y) vector.
fname = '../generalStatistics/goodData_QSwithTAO_with2hrStatsConv.nc'
ds = xr.open_dataset(fname)
ds['mean_WVEL'] = np.sqrt(ds['mean_U10N_x'] ** 2 + ds['mean_U10N_y'] ** 2)
data = ds.to_dataframe()
df = data.copy()

# Features used for clustering: the speed difference plus the direction
# difference expressed as (cos, sin).
selectX = ['Speed Difference (QuikSCAT - TAO)',
           'cos(Direction Difference (QuikSCAT - TAO))',
           'sin(Direction Difference (QuikSCAT - TAO))']

# Z-score each feature (zero mean, unit standard deviation) so that `eps`
# below is measured in the same units along every axis.
X_norm = df[selectX]
X_norm = (X_norm - X_norm.mean()) / X_norm.std()
X_norm_cudf = cudf.DataFrame.from_pandas(X_norm)

# Run DBSCAN clustering using cuML.
dbscan = cuDBSCAN(eps=0.15, min_samples=500)
cluster_labels = dbscan.fit_predict(X_norm_cudf)

# Assign the labels positionally (as a NumPy array) rather than as a Series.
# A Series assignment is aligned on index labels; `df` carries the index
# produced by `ds.to_dataframe()`, so if the returned Series has a different
# index (e.g. a default RangeIndex) the labels would silently become NaN or
# be attached to the wrong rows. Row order is preserved by fit_predict, so
# positional assignment is always correct.
df['label'] = cluster_labels.to_pandas().to_numpy()




In [None]:
# Two-panel summary of the clustering result:
#   (A) 2-D histogram of speed difference vs. direction difference, coloured
#       by cluster label;
#   (B) number of samples in each cluster.
fig, axes = plt.subplots(ncols=2, figsize=(16, 6))

xlabel = 'Speed Difference (QuikSCAT - TAO)'
ylabel = 'Direction Difference (QuikSCAT - TAO)'

# --- Panel A: bivariate histogram per cluster ---
ax = axes[0]
s = sns.histplot(df, x=xlabel, y=ylabel,
                 hue='label', common_norm=True, cbar=True,
                 palette=sns.color_palette("bright"), ax=ax)

ax.grid(visible=True, which='both', axis='both')

ax.text(0.1, 0.95, 'A', horizontalalignment='left',
        verticalalignment='center', transform=ax.transAxes,
        weight='heavy', fontsize=20)

ax.set_xlim(-10, 15)

# --- Panel B: sample count per cluster ---
ax = axes[1]
s = sns.countplot(df, x='label', palette=sns.color_palette("bright"), ax=ax)

# Annotate every bar with its count. Depending on the seaborn version the
# bars live in a single container or in one container per category, so
# labelling only `containers[0]` can leave all but the first bar unlabelled.
for bar_container in s.containers:
    s.bar_label(bar_container)

ax.grid(visible=True, which='both', axis='both')
ax.set_xlabel('cluster label')

ax.text(0.1, 0.95, 'B', horizontalalignment='left',
        verticalalignment='center', transform=ax.transAxes,
        weight='heavy', fontsize=20)


In [None]:
# Train a GPU random forest to predict, from TAO buoy statistics, whether a
# sample belongs to DBSCAN cluster 0.

# Keep only samples whose TAO neutral 10 m wind speed exceeds 2 (m/s per the
# original annotation); work on a copy so `df` keeps its cluster labels.
subDF = df[df['Neutral Wind Speed at 10m (TAO)'] > 2].copy()
subDF['SST-AIRT'] = subDF['Sea Surface Temperature (TAO)'] - subDF['Air Temperature (TAO)']

# Binary target: cluster 0 -> 1 ("rain" per the original annotation),
# every other label (including noise) -> 0 ("no rain").
# Vectorised equivalent of a row-wise `1 if x == 0 else 0`.
subDF['label'] = (subDF['label'] == 0).astype(int)

# Features for classification
features = ['SST-AIRT',
            'Relative Humidity (TAO)',
            'Neutral Wind Speed at 10m (TAO)',
            'mean_WSPD', 'mean_SST', 'mean_AIRT', 'mean_SST - AIRT', 'mean_RELH',
            'std_WSPD', 'std_cosWDIR', 'std_sinWDIR', 'std_SST', 'std_AIRT',
            'std_SST - AIRT', 'std_RELH']

X = subDF[features]
y = subDF['label']

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Move the splits to the GPU as cuDF objects.
X_train_cudf = cudf.DataFrame.from_pandas(X_train)
y_train_cudf = cudf.Series(y_train)
X_test_cudf = cudf.DataFrame.from_pandas(X_test)
y_test_cudf = cudf.Series(y_test)

# Train cuML Random Forest.
# 'sqrt' is what the deprecated 'auto' setting resolved to for classifiers,
# so this keeps the same model while avoiding the deprecated option.
model = cuRF(
    n_estimators=300,
    max_depth=20,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=5,
    bootstrap=True,
    random_state=42
)
model.fit(X_train_cudf, y_train_cudf)
y_pred = model.predict(X_test_cudf)

# Evaluate. scikit-learn needs host arrays; `Series.to_array()` no longer
# exists in current cuDF, so convert through pandas/NumPy instead.
y_pred_host = y_pred.to_pandas().to_numpy()
print("Accuracy:", accuracy_score(y_test_cudf, y_pred))
print("\nClassification Report:\n", classification_report(y_test.to_numpy(), y_pred_host))

# Optional: plot histogram of clusters (uses `df`, i.e. the original DBSCAN
# labels, not the binary target stored in `subDF`).
fig, axes = plt.subplots(ncols=2, figsize=(16, 6))
sns.histplot(df, x='Speed Difference (QuikSCAT - TAO)', y='Direction Difference (QuikSCAT - TAO)',
             hue='label', palette=sns.color_palette("bright"), ax=axes[0], cbar=True)
sns.countplot(x='label', data=df, palette=sns.color_palette("bright"), ax=axes[1])

# Label every bar: seaborn may create one container per category, in which
# case `containers[0]` would annotate only the first bar.
for bar_container in axes[1].containers:
    axes[1].bar_label(bar_container)
plt.tight_layout()
plt.show()