In [1]:
import numpy as np
import pandas as pd
import xarray as xr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from netCDF4 import Dataset, date2num, num2date
from datetime import datetime, timedelta
import os

2024-04-24 16:01:39.220062: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
latList = [-9, -8, -5, -2, 0, 2, 5, 8, 9]
lonList = [-95, -110, -125, -140, -155, -170, -180, 165]

ylen = len(latList)
xlen = len(lonList)

taskList = []

for latId  in range(ylen):
    for lonId in range(xlen):
        taskList.append([latList[latId], lonList[lonId]])

ntasks = len(taskList)

i = 0
tlen = 0


metaDataIndex = ['time',
                 'lat', 
                 'lon',
                 'U10N',
                 'U10N_dir',
                 'SST',
                 'RH',
                 'AIRT',
                 'RAIN']

nvars =len(metaDataIndex)
metaData = np.empty((0,nvars), dtype=float)

allDS = xr.Dataset()

nfiles = 0
for task in taskList:
    lat = task[0]
    lon = task[1]

    LAT = lat
    LON = lon

    if lat < 0:
        latUnits = 'S'
    else:
        latUnits = 'N'

    if lon < 0:
        lonUnits = 'W'
        LON += 360
    else:
        lonUnits = 'E'
    
    lat=abs(lat)
    lon=abs(lon)

    dataFileName = f'../../downloads/Buoy/extractedGZ/WINDS/T_{lat:02d}{latUnits}_{lon:03d}{lonUnits}_withRAIN_COARE3p5_2000.nc'
    deployFileName = f'../../downloads/Buoy/extractedGZ/WINDS/T_{lat:02d}{latUnits}_{lon:03d}{lonUnits}/T_{lat:02d}{latUnits}_{lon:03d}{lonUnits}_DeploymentDates.nc'
    
    if os.path.isfile(dataFileName):
        print('yes')
        df = xr.open_dataset(dataFileName)
        df = df.sortby('time')
        
        ndata = len(df['time'])
        lat_da = xr.DataArray(data=np.ones(ndata) * LAT, dims=['time'],
                             attrs = {
                                 'units': 'degrees North',
                                 'long_name': 'buoy lat location'
                             })
        lon_da = xr.DataArray(data=np.ones(ndata) * LON, dims=['time'],
                             attrs = {
                                 'units': 'degrees East',
                                 'long_name': 'buoy longitude location'
                             })
        df['lat'] = lat_da
        df['lon'] = lon_da

        

        tval = df['time'].to_numpy()

        # twoHrVarWspd = np.zeros((ndata), dtype=float)*np.nan
        # twoHrVarWdir = np.zeros((ndata), dtype=float)*np.nan

        twoHrStdWspd = np.zeros((ndata), dtype=float)*np.nan
        twoHrStdWdir = np.zeros((ndata), dtype=float)*np.nan

        npoints4VarDir = np.zeros((ndata), dtype=float)*np.nan
        npoints4VarSpd = np.zeros((ndata), dtype=float)*np.nan

        wspd = df['U10N'].to_numpy()
        wdir = df['U10_direction'].to_numpy()
        
        for i in range(6,ndata-6):
            thisTvalBox = tval[i-6:i+6]
            tdiff = thisTvalBox - tval[i]
            mask = abs(tdiff) < np.timedelta64(timedelta(seconds=3600))

            thisWspdBox = wspd[i-6:i+6]
            thisWdirBox = wdir[i-6:i+6]
        
            spdArr = thisWspdBox[mask]
            dirArr = thisWdirBox[mask]

            npoints4varSpd = np.sum(~np.isnan(spdArr))
            npoints4varDir = np.sum(~np.isnan(dirArr))

            if npoints4varSpd > 5 and npoints4varDir > 5:
                twoHrStdWspd[i] = np.nanstd(spdArr)
                twoHrStdWdir[i] = np.nanstd(dirArr) 
        
               


        twoHrStdWspd_da = xr.DataArray(data=twoHrStdWspd, dims=['time'],
                             attrs = {
                                 'units': 'm/sec',
                                 'long_name': 'std. deviation of wind speed in two-hour running window'
                             })

        twoHrStdWdir_da = xr.DataArray(data=twoHrStdWdir, dims=['time'],
                             attrs = {
                                 'units': 'm/sec',
                                 'long_name': 'std. deviation of wind direction in two-hour running window'
                             })

        npoints4VarSpd_da = xr.DataArray(data=npoints4VarSpd, dims=['time'],
                             attrs = {
                                 'units': 'N/A',
                                 'long_name': 'number of data used for std. deviation of wind speed in two-hour running window'
                             })

        npoints4VarDir_da = xr.DataArray(data=npoints4VarDir, dims=['time'],
                             attrs = {
                                 'units': 'N/A',
                                 'long_name': 'number of data used for std. deviation of wind direction in two-hour running window'
                             })
            
        df['twoHrStdWspd'] = twoHrStdWspd_da
        df['twoHrStdWdir'] = twoHrStdWdir_da
        df['npoints4VarSpd'] = npoints4VarSpd_da
        df['npoints4VarDir'] = npoints4VarDir_da
            
        ds2 = Dataset(deployFileName)
        cdfTime = ds2.variables['startDate']
        timeUnit = cdfTime.units
        timeArr = np.array(cdfTime)
        cftimes=num2date(timeArr, timeUnit)
        startDates = np.array([datetime(dtm.year, dtm.month, dtm.day, dtm.hour, dtm.minute, dtm.second) for dtm in cftimes])
        startDates = np.array([np.datetime64(dt) for dt in startDates])

        cdfTime = ds2.variables['endDate']
        timeUnit = cdfTime.units
        timeArr = np.array(cdfTime)
        cftimes=num2date(timeArr, timeUnit)
        endDates = np.array([datetime(dtm.year, dtm.month, dtm.day, dtm.hour, dtm.minute, dtm.second) for dtm in cftimes])
        endDates = np.array([np.datetime64(dt) for dt in endDates])

        depNum = np.zeros((ndata), dtype=int)
        timeArr = df['time'].to_numpy()
        
        c = 1
        for i in range(1,len(startDates)):
            mask = timeArr >= startDates[i]
            mask *= timeArr <= endDates[i]
            depNum[mask] = c
            c = c+1

        depClass_da = xr.DataArray(data=depNum, dims=['time'],
                             attrs = {
                                 'units': 'N/A',
                                 'long_name': 'deployment label. Data in same deployment have same deployment label'
                             })
        
        df['Deployment Classifier'] = depClass_da

        if nfiles == 0:
            allDS = df
        else:
            allDS = xr.concat((allDS, df), dim='time')

        nfiles += 1


yes
yes


ValueError: cannot reindex or align along dimension 'Deployment Classifier' because the (pandas) index has duplicate values

In [4]:
df

In [6]:
allDS = xr.Dataset(coords=("time"))

ValueError: dictionary update sequence element #0 has length 1; 2 is required

In [None]:
i = 100
thisTvalBox = tval[i-6:i+6]
tdiff = thisTvalBox - tval[i]

In [None]:
mask = abs(tdiff) < np.timedelta64(timedelta(seconds=3600))

In [None]:
mask

In [None]:
len(df['time']), len(empty_DS['time'])

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(empty_DS['time'])

In [None]:
empty_DS['time'][0:50]

In [None]:
df = xr.concat([empty_DS, df], dim='time')

In [None]:
df

In [None]:
list(df.keys())

In [None]:
timestamp = (tval[0] - np.datetime64('1970-01-01T00:00:00'))/ np.timedelta64(1, 's')
v1 = datetime.utcfromtimestamp(timestamp)

timestamp = (tval[1] - np.datetime64('1970-01-01T00:00:00'))/ np.timedelta64(1, 's')
v2 = datetime.utcfromtimestamp(timestamp)

timestamp, v1,np.datetime64(v2)

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
plt.plot(df['RAIN'])

In [None]:
df

In [None]:
startDates

In [None]:
# Assuming you have loaded your data into a DataFrame called 'data'

# Define your threshold for rain (8mm/hr)
rain_threshold = 8

# Label data based on precipitation rate
data['label'] = np.where(data['precipitation_rate'] >= rain_threshold, 1, 0)

# Drop rows with missing values
data.dropna(inplace=True)

# Define features and target variable
features = ['sea_surface_temperature', 'wind_speed', 'wind_direction', 
            'wind_speed_variance', 'wind_direction_variance', 
            'air_temperature', 'air_humidity']
target = 'label'

# Split data into features and target
X = data[features]
y = data[target]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Build the model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')