In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cdist, cosine
from sklearn.preprocessing import normalize
import tensorflow as tf
from joblib import Parallel, delayed

from matplotlib import cm
import matplotlib.cbook as cbook
import matplotlib.colors as colors

import pandas as pd
import xarray as xr
from netCDF4 import Dataset, num2date, date2num
from datetime import datetime, timedelta
import os

import seaborn as sns
import cmocean as cmocn

%matplotlib inline

2024-08-06 11:12:43.610949: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-06 11:12:43.640668: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-06 11:12:43.640695: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-06 11:12:43.642101: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-06 11:12:43.647324: I tensorflow/core/platform/cpu_feature_gua

In [2]:
ds = xr.open_dataset('../rainFlagRemovedBuoyDataBadQualityRemovedMatchup.nc')
df = ds.to_dataframe()
df_orig = df.copy()

In [3]:
df['Speed Difference (QuikSCAT - TAO)'] = df['Wind Speed (QuikSCAT)'] - df['Wind Speed (TAO)']

df['Wind Direction (TAO)'] = (-(df['Wind Direction (TAO)'] - 90.0) + 360)%360
df['Wind Direction (QuikSCAT)'] = (-(df['Wind Direction (QuikSCAT)'] - 90.0) + 360)%360
df['mean WDIR 30min'] = (-(df['mean WDIR 30min'] - 90.0) + 360)%360
df['mean WDIR 60min'] = (-(df['mean WDIR 60min'] - 90.0) + 360)%360
df['mean WDIR 120min'] = (-(df['mean WDIR 120min'] - 90.0) + 360)%360

df['Direction Difference (QuikSCAT - TAO)'] = ((df['Wind Direction (QuikSCAT)'] - df['Wind Direction (TAO)'])+360)%360
dirDiff = np.array(df['Direction Difference (QuikSCAT - TAO)'])
dirDiff[dirDiff > 180] -= 360
df['Direction Difference (QuikSCAT - TAO)'] = dirDiff

df['Speed Difference (QuikSCAT - TAO 30 min mean)'] = df['Wind Speed (QuikSCAT)'] - df['mean WSPD 30min']
df['Direction Difference (QuikSCAT - TAO 30 min mean)'] = ((df['Wind Direction (QuikSCAT)'] - df['mean WDIR 30min'])+360)%360
dirDiff = np.array(df['Direction Difference (QuikSCAT - TAO 30 min mean)'])
dirDiff[dirDiff > 180] -= 360
df['Direction Difference (QuikSCAT - TAO 30 min mean)'] = dirDiff

df['Speed Difference (QuikSCAT - TAO 1 hr mean)'] = df['Wind Speed (QuikSCAT)'] - df['mean WSPD 60min']
df['Direction Difference (QuikSCAT - TAO 1 hr mean)'] = ((df['Wind Direction (QuikSCAT)'] - df['mean WDIR 60min'])+360)%360
dirDiff = np.array(df['Direction Difference (QuikSCAT - TAO 1 hr mean)'])
dirDiff[dirDiff > 180] -= 360
df['Direction Difference (QuikSCAT - TAO 1 hr mean)'] = dirDiff

df['Speed Difference (QuikSCAT - TAO 2 hr mean)'] = df['Wind Speed (QuikSCAT)'] - df['mean WSPD 120min']
df['Direction Difference (QuikSCAT - TAO 2 hr mean)'] = ((df['Wind Direction (QuikSCAT)'] - df['mean WDIR 120min'])+360)%360
dirDiff = np.array(df['Direction Difference (QuikSCAT - TAO 2 hr mean)'])
dirDiff[dirDiff > 180] -= 360
df['Direction Difference (QuikSCAT - TAO 2 hr mean)'] = dirDiff


df['Zonal Neutral Wind Speed at 10m (TAO)'] = df['Neutral Wind Speed at 10m (TAO)']*np.cos(np.deg2rad(df['Wind Direction (TAO)']))
df['Meridional Neutral Wind Speed at 10m (TAO)'] = df['Neutral Wind Speed at 10m (TAO)']*np.sin(np.deg2rad(df['Wind Direction (TAO)']))

df['Zonal Neutral Wind Speed at 10m (QuikSCAT)'] = df['Wind Speed (QuikSCAT)']*np.cos(np.deg2rad(df['Wind Direction (QuikSCAT)']))
df['Meridional Neutral Wind Speed at 10m (QuikSCAT)'] = df['Wind Speed (QuikSCAT)']*np.sin(np.deg2rad(df['Wind Direction (QuikSCAT)']))

df['Zonal Wind Speed Difference (QuikSCAT - TAO)'] = df['Zonal Neutral Wind Speed at 10m (QuikSCAT)'] - df['Zonal Neutral Wind Speed at 10m (TAO)']
df['Meridional Wind Speed Difference (QuikSCAT - TAO)'] = df['Meridional Neutral Wind Speed at 10m (QuikSCAT)'] - df['Meridional Neutral Wind Speed at 10m (TAO)']

df['cos(Direction Difference (QuikSCAT - TAO))'] = np.cos(np.deg2rad(df['Direction Difference (QuikSCAT - TAO)']))
df['sin(Direction Difference (QuikSCAT - TAO))'] = np.sin(np.deg2rad(df['Direction Difference (QuikSCAT - TAO)']))

In [4]:
x = abs(df['Speed Difference (QuikSCAT - TAO)'])
x = x - np.mean(x)/np.std(x)

y = abs(df['Speed Difference (QuikSCAT - TAO)'])
y = y - np.mean(y)/np.std(y)

df['distance from origin'] = np.sqrt(x**2 + y**2)

In [6]:
selectX = ['Speed Difference (QuikSCAT - TAO)',
           'cos(Direction Difference (QuikSCAT - TAO))',
           'sin(Direction Difference (QuikSCAT - TAO))'] #,
           'distance from origin']

X = df[selectX]
normX = (X - X.mean(axis=0))/ X.std(axis=0)
#normX = normalize(X.to_numpy())

# Function to compute silhouette score for one sample
def silhouette_score_sample(i, X, labels, metric='euclidean'):
    mask = np.ones(len(X), dtype=bool)
    mask[i] = False
    current_label = labels[i]
    
    # Calculate a(i)
    a_i = np.mean(pairwise_distances([X[i]], X[labels == current_label], metric=metric)[0])
    
    # Calculate b(i)
    b_i = np.inf
    for label in np.unique(labels):
        if label == current_label:
            continue
        b_i = min(b_i, np.mean(pairwise_distances([X[i]], X[labels == label], metric=metric)[0]))
    
    return (b_i - a_i) / max(a_i, b_i)
    
def getScore(n_cluster, X):
    # Fit KMeans
    kmeans = KMeans(n_clusters=n_cluster, random_state=42, n_init='auto').fit(X)
    labels = kmeans.labels_

    # # Fit Cosine Similarity
    # data = X
    # labels, centroids = kmeans_cosine(data, n_cluster)

    # gmm = GaussianMixture(n_components=n_cluster, random_state=0)
    # gmm.fit(X)
    
    # Predict the cluster for each data point
    # labels = gmm.predict(X)
    
    # Calculate silhouette scores in parallel
    n_jobs = -1  # Use all available cores
    silhouette_scores = Parallel(n_jobs=n_jobs)(
        delayed(silhouette_score_sample)(i, X, labels) for i in range(len(X))
    )  
    # Calculate the average silhouette score
    average_silhouette_score = np.mean(silhouette_scores)

    return average_silhouette_score

### Function to calculate cosine similarity
def kmeans_cosine(X, n_clusters, max_iter=300, tol=1e-4):
    # Normalize the data to make it suitable for cosine similarity
    X_normalized = normalize(X)

    # Randomly initialize the centroids
    centroids = X_normalized[np.random.choice(X_normalized.shape[0], n_clusters, replace=False)]
    #centroids = X[np.random.choice(X.shape[0], n_clusters, replace=False)]

    for i in range(max_iter):
        # Compute cosine distances between points and centroids
        distances = cdist(X_normalized, centroids, metric='cosine')
        #distances = cdist(X, centroids, metric='cosine')

        # Assign clusters based on the closest centroids
        clusters = np.argmin(distances, axis=1)

        # Calculate new centroids
        new_centroids = np.array([X_normalized[clusters == j].mean(axis=0) for j in range(n_clusters)])
        #new_centroids = np.array([X[clusters == j].mean(axis=0) for j in range(n_clusters)])

        # # Normalize new centroids
        new_centroids = normalize(new_centroids)

        # Check for convergence
        if np.allclose(centroids, new_centroids, atol=tol):
            break

        centroids = new_centroids

    return clusters, centroids

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 46)

In [None]:
range_n_clusters = np.arange(2,10)
silhouette_scores = []
for n_cluster in range_n_clusters:
    print(f'cluster : {n_cluster}')
    silhouette_scores.append(getScore(n_cluster, normX.to_numpy()))

silhouette_scores = np.array(silhouette_scores)
index = np.argmax(silhouette_scores)
best_n_clusters = range_n_clusters[index]
best_score = silhouette_scores[index]

In [None]:
# Plot the silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(range_n_clusters, silhouette_scores, marker='o')
plt.title('Silhouette Score vs. Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(range_n_clusters)
plt.grid(True)
#plt.show()

print(f'The optimal number of clusters is: {best_n_clusters} with a silhouette score of {best_score}')