In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from glob import glob
%matplotlib inline

In [None]:
# Directory holding the per-buoy NetCDF files, and the list of files to process.
# NOTE(review): this is an absolute, machine-specific path — adjust when running elsewhere.
folderLoc ='/srv/data2/srai_poseidon/srai_poseidon/observation/SatelliteVsBuoy/downloads/Buoy/extractedGZ/WINDS/'
# glob() returns matches in arbitrary (filesystem-dependent) order; sort them so that
# anything built by iterating over fileList has a reproducible row order.
fileList = sorted(glob(folderLoc + '*_xrr_COARE3p5_2000_withRAIN_2hrMeanVar.nc'))

In [None]:
# Build one combined DataFrame (allDF) from every file in fileList.
# For each file: derive mean_WVEL, keep the selected variables, drop TIME steps where
# mean_WVEL is NaN, replace NaNs in the std_* variables with 0.0, convert to pandas.
fileSuffix = '_xrr_COARE3p5_2000_withRAIN_2hrMeanVar.nc'
selectLabels = ['mean_WVEL', 'mean_WSPD', 'std_WSPD',
                'std_cosWDIR' , 'std_sinWDIR',
                'mean_SST', 'std_SST',
                'mean_AIRT', 'std_AIRT',
                'mean_RELH', 'std_RELH',
                'mean_RAIN']
stdLabels = ['std_WSPD', 'std_cosWDIR', 'std_sinWDIR', 'std_SST', 'std_AIRT', 'std_RELH']

frames = []
for fname in fileList:
    # str.lstrip/rstrip remove *sets of characters*, not a prefix/suffix, so they can eat
    # into the station name (e.g. a name ending in 'E' or '5'). Slice the exact
    # prefix and suffix off instead.
    stationName = fname[len(folderLoc):] if fname.startswith(folderLoc) else fname
    if stationName.endswith(fileSuffix):
        stationName = stationName[:-len(fileSuffix)]
    print(stationName)

    # The context manager closes the file even if processing raises.
    with xr.open_dataset(fname) as ds:
        ds['mean_WVEL'] = np.sqrt(ds['mean_U10N_x'] **2 + ds['mean_U10N_y']**2)
        subDS = ds[selectLabels]
        selectMask = ~np.isnan(subDS['mean_WVEL'].to_numpy())
        subDS = subDS.isel(TIME=selectMask)
        for label in stdLabels:
            mask = np.isnan(subDS[label].to_numpy())
            subDS[label] = xr.where(mask, 0.0, subDS[label])
        # to_dataframe() materialises the values in memory before the file is closed.
        frames.append(subDS.to_dataframe())

# Number of files processed (kept for compatibility with the original loop counter).
count = len(frames)
# Concatenate once instead of growing allDF inside the loop (which copies every
# iteration); an empty file list yields an empty DataFrame rather than an undefined name.
allDF = pd.concat(frames) if frames else pd.DataFrame()
    
            

In [None]:

# Open two NetCDF files for station T_09N_140W from folderLoc:
#   ds  -> the '..._withRAIN_2hrMeanVar' file
#   ds2 -> the '..._MatchUp_720_mins_2000' file
fname, fname2 = (
    'T_09N_140W_xrr_COARE3p5_2000_withRAIN_2hrMeanVar.nc',
    'T_09N_140W_xrr_MatchUp_720_mins_2000.nc',
)
ds = xr.open_dataset(f'{folderLoc}{fname}')
ds2 = xr.open_dataset(f'{folderLoc}{fname2}')

In [None]:
# Store the magnitude of the (mean_U10N_x, mean_U10N_y) pair on ds as 'mean_WVEL'.
ds['mean_WVEL'] = np.sqrt(
    np.square(ds['mean_U10N_x']) + np.square(ds['mean_U10N_y'])
)

In [None]:
# Names of the variables to keep, then the corresponding subset of ds.
selectLabels = [
    'mean_WVEL',
    'mean_WSPD',
    'std_WSPD',
    'std_cosWDIR',
    'std_sinWDIR',
    'mean_SST',
    'std_SST',
    'mean_AIRT',
    'std_AIRT',
    'mean_RELH',
    'std_RELH',
    'mean_RAIN',
]
subDS = ds[selectLabels]

In [None]:
# Keep only the TIME steps where mean_WVEL is not NaN.
selectMask = np.logical_not(np.isnan(subDS['mean_WVEL'].to_numpy()))
subDS = subDS.isel(TIME=selectMask)

# For every std_* variable, report how many NaNs it holds and replace them with 0.0.
stdLabels = ['std_' + stem for stem in ('WSPD', 'cosWDIR', 'sinWDIR', 'SST', 'AIRT', 'RELH')]
for label in stdLabels:
    mask = np.isnan(subDS[label].to_numpy())
    print(label, mask.sum())
    subDS[label] = xr.where(mask, 0.0, subDS[label])
    

In [None]:
# Overlay three series from ds (TIME indices 1..1999) on one axis.
# Each line gets a label and a legend is drawn, because the series are different
# quantities sharing an axis and are otherwise indistinguishable.
plt.figure(figsize=(20,5))
window = slice(1, 2000)
for varName in ('mean_RAIN', 'std_WSPD', 'mean_SST'):
    ds[varName].isel(TIME=window).plot(x='TIME', alpha=0.5, label=varName)
# Other series that were tried here and left disabled:
#   ds.WSPD_10N.sel(HEIGHT=10), ds.mean_WSPD_10N, ds.mean_WVEL, ds.mean_RELH
plt.legend();

In [None]:
# Convert the xarray subset (subDS) into a pandas DataFrame for the classifier cells below.
data = subDS.to_dataframe()

In [None]:
# Display the DataFrame as the cell output.
data

In [None]:
# Binary rain label: 1 where mean_RAIN exceeds the threshold, 0 otherwise.
# A NaN mean_RAIN compares False and is therefore labelled 0.
# The threshold is documented as 5 mm/hr in the original comment.
RAIN_THRESHOLD = 5
# Vectorised comparison instead of a per-row Python lambda; the resulting labels are identical.
data['RAIN_LABEL'] = (data['mean_RAIN'] > RAIN_THRESHOLD).astype(int)

# Features to use for classification
features = ['mean_WSPD', 'mean_SST', 'mean_AIRT', 'mean_RELH', 'std_WSPD', 'std_cosWDIR', 'std_sinWDIR', 'std_SST', 'std_AIRT', 'std_RELH']

# Split the dataset into features (X) and target (y)
X = data[features]
y = data['RAIN_LABEL']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a RandomForest Classifier (default hyper-parameters, fixed seed)
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the classifier performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# The original cell was a truncated expression (`len(y_t`), which is a SyntaxError and
# stops a top-to-bottom run. Completed to the size of the test split.
# NOTE(review): `y_train` may have been intended instead — confirm.
len(y_test)

In [None]:
# Element-wise masks over the test set:
#   falsePos -> true label 0 but predicted 1
#   falseNeg -> true label 1 but predicted 0
falsePos = (y_test == 0) & (y_pred == 1)
falseNeg = (y_test == 1) & (y_pred == 0)

In [None]:
# Number of false positives in the test set.
falsePos.sum()

In [None]:
# Number of false negatives in the test set.
falseNeg.sum()

In [None]:
# Number of samples in the test split.
y_test.shape[0]

In [None]:
# Misclassification rate on the test set, in percent.
# Computed from the masks rather than from hand-copied numbers (previously the
# literals 30, 283 and 51317, presumably pasted from earlier cell outputs), so it
# stays correct whenever the data or the split changes.
(np.sum(falsePos) + np.sum(falseNeg)) / len(y_test) * 100

In [None]:
# Pull the 120-minute std-dev and mean WSPD_10N variables out of ds2.
WSPD2_std, WSPD2_mean = (
    ds2[varName]
    for varName in ('std. dev. WSPD_10N 120min', 'mean WSPD_10N 120min')
)

In [None]:
# Pull the mean and std WSPD_10N variables out of ds.
WSPD_mean, WSPD_std = (
    ds[varName]
    for varName in ('mean_WSPD_10N', 'std_WSPD_10N')
)

In [None]:
# Compare the WSPD_10N standard deviation from the two files at the ds2 QS_TIME
# stamps that fall between 2000-01-01 and 2000-05-31.
plt.figure(figsize=(20,5))
timeArr = ds2.QS_TIME.sel(QS_TIME = slice(datetime(2000,1,1), datetime(2000,5,31)))
# Both series are sampled at the timestamps nearest to timeArr. Labels and a legend
# are added so the two curves being compared can be told apart.
WSPD2_std.sel(QS_TIME = timeArr, method='nearest').plot(x='QS_TIME', label='ds2: std. dev. WSPD_10N 120min')
WSPD_std.sel(TIME = timeArr, method='nearest').plot(x='TIME', label='ds: std_WSPD_10N')
plt.legend();