In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import glob
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.metrics import pairwise_distances

%matplotlib inline

In [2]:
def extract_data(file):
    """Flatten one gridded file into per-profile arrays, keeping only usable columns.

    A grid column (one latitude/longitude point) is kept when its surface rain
    is neither NaN nor zero and none of its latent-heating levels is missing.

    Parameters
    ----------
    file : object
        Must expose ``altitude_lh``, ``surf_rain``, ``latent_heating``,
        ``latitude``, ``longitude`` and ``time`` attributes, each with a
        ``.data`` array (the notebook passes the result of ``xr.open_dataset``).
        ``surf_rain`` must hold nlat*nlon values and ``latent_heating``
        nalt*nlat*nlon values, because both are reshaped to those sizes.
        NOTE(review): the reshape assumes the values are stored in
        (altitude, latitude, longitude) order, which is the layout of the
        coordinate grids built below -- confirm against the input files.

    Returns
    -------
    tuple
        ``(Lat_Heat, surf_r, ALTITUDE, LAT, LON, Time)`` where ``Lat_Heat``,
        ``ALTITUDE``, ``LAT`` and ``LON`` have shape (n_profiles, nalt),
        ``surf_r`` has shape (n_profiles,) and ``Time`` is ``time`` repeated
        once per kept profile.
    """
    altitude_lh = file.altitude_lh.data
    lat = file.latitude.data
    lon = file.longitude.data
    time = file.time.data

    nalt, nlat, nlon = len(altitude_lh), len(lat), len(lon)
    npoints = nlat * nlon

    # One column per grid point, one row per altitude level.
    surf_rain = np.reshape(file.surf_rain.data, [npoints])
    LH = np.reshape(file.latent_heating.data, [nalt, npoints])

    # With the default 'xy' indexing the outputs have shape (nalt, nlat, nlon),
    # so they flatten to the same column order as LH.
    LAT, ALTITUDE, LON = np.meshgrid(lat, altitude_lh, lon)
    ALTITUDE = np.reshape(ALTITUDE, [nalt, npoints])
    LAT = np.reshape(LAT, [nalt, npoints])
    LON = np.reshape(LON, [nalt, npoints])

    # A single boolean mask replaces the earlier chain of tuple indexing and
    # np.squeeze, which collapsed axes (and changed the output shapes) whenever
    # exactly one profile survived the filter.
    has_rain = ~np.isnan(surf_rain) & (surf_rain != 0)
    complete_profile = ~pd.isnull(LH).any(axis=0)
    keep = has_rain & complete_profile

    surf_r = surf_rain[keep]
    Lat_Heat = LH[:, keep]
    ALTITUDE = ALTITUDE[:, keep]
    LAT = LAT[:, keep]
    LON = LON[:, keep]
    Time = np.repeat(time, len(surf_r))

    return Lat_Heat.T, surf_r.T, ALTITUDE.T, LAT.T, LON.T, Time.T

In [None]:
# Build one NetCDF file per calendar month from the raw files of that month.
# `months` has to be defined here: it was previously commented out, so the cell
# raised NameError on a fresh kernel.
months = ['01','02','03','04','05','06','07','08','09','10','11','12']

for month in months:
    # NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
    pattern = "/Users/Lauren/Documents/NOAA/Precipitation/**/" + month + "/*.nc4"

    # Collect the per-file pieces and concatenate once at the end, instead of
    # re-copying the growing arrays after every file.
    pieces = []
    # sorted(): glob returns files in arbitrary order; sorting makes the row
    # order of the output file reproducible.
    for file in sorted(glob.glob(pattern, recursive=True)):
        # The context manager closes the file handle once the arrays are extracted.
        with xr.open_dataset(file) as ds:
            pieces.append(extract_data(ds))

    if not pieces:
        # Nothing to write; ALTITUDE[0,:] below would otherwise fail on an empty list.
        print("No input files found for month " + month + ", skipping")
        continue

    # Shape of the first file's latent-heat block, printed as a quick sanity check.
    print(pieces[0][0].shape)

    # extract_data returns (Lat_Heat, surf_r, ALTITUDE, LAT, LON, Time).
    Lat_Heat = np.concatenate([p[0] for p in pieces], axis=0)
    # ravel + float keeps surf_r a flat float64 vector, as np.append onto an empty list did.
    surf_r = np.concatenate([np.ravel(p[1]) for p in pieces]).astype(float)
    ALTITUDE = np.concatenate([p[2] for p in pieces], axis=0)
    LAT = np.concatenate([p[3] for p in pieces], axis=0)
    LON = np.concatenate([p[4] for p in pieces], axis=0)
    TIME = np.concatenate([p[5] for p in pieces], axis=0)

    # The 'time' dimension indexes profiles: TIME carries one (repeated)
    # timestamp per kept profile. The altitude coordinate is taken from the
    # first profile's row of ALTITUDE.
    test = xr.Dataset(
        data_vars = {'Latitude': (('time', 'altitude'), LAT),
                     'Longitude': (('time', 'altitude'), LON),
                     'Latent_Heat': (('time', 'altitude'), Lat_Heat),
                     'surface_rain': (('time'), surf_r)},
        coords = {'time': TIME,
                  'altitude': ALTITUDE[0,:]})
    print(test)
    test.to_netcdf(path = "EPO_1998_"+month+".nc4", compute = True)


In [2]:
# Load every monthly file and stack the profiles into flat arrays:
#   LH        (n_profiles, n_altitude)  latent-heating profiles
#   SR        (n_profiles,)             surface rain
#   Latitude  (n_profiles,)             per-profile latitude
#   Longitude (n_profiles,)             per-profile longitude
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
monthly_files = sorted(glob.glob("/Users/Lauren/Documents/NOAA/Precipitation/*.nc4"))
if not monthly_files:
    # Fail here with a clear message instead of later, when the empty lists are used as arrays.
    raise FileNotFoundError("No monthly .nc4 files found in /Users/Lauren/Documents/NOAA/Precipitation/")

lh_parts, sr_parts, lat_parts, lon_parts = [], [], [], []
for file in monthly_files:
    # Close each file as soon as its arrays are in memory (handles used to leak).
    with xr.open_dataset(file) as ds:
        lh_parts.append(np.asarray(ds.Latent_Heat.data))
        sr_parts.append(np.ravel(ds.surface_rain.data))
        # Latitude/Longitude are stored per (profile, altitude); one altitude
        # column is enough to recover the per-profile coordinate.
        lat_parts.append(np.asarray(ds.Latitude.data[:,1]))
        lon_parts.append(np.asarray(ds.Longitude.data[:,1]))

# Concatenate once at the end instead of re-copying the arrays for every file.
LH = np.concatenate(lh_parts, axis=0)
# astype(float) keeps the float64 vectors that np.append onto an empty list produced.
SR = np.concatenate(sr_parts).astype(float)
Latitude = np.concatenate(lat_parts).astype(float)
Longitude = np.concatenate(lon_parts).astype(float)

In [3]:
# Combine the latent-heating profile and the surface rain rate into one
# training matrix: one row per profile, rain rate in the last column.
features_all = np.concatenate((LH, SR.reshape(len(SR), 1)), axis=1)

# Keep only profiles whose surface rain exceeds 5 (NOTE(review): units not
# visible here -- confirm). A boolean mask keeps Xdata two-dimensional even
# when a single row passes; np.where + np.squeeze collapsed it in that case.
Xdata = features_all[SR > 5, :]

# Min-max scale every column to [0, 1] so that no single feature dominates the
# distance metric used for clustering.
MIN = np.min(Xdata, axis=0)
MAX = np.max(Xdata, axis=0)
column_range = MAX - MIN
# A constant column has zero range; dividing by 1 instead maps it to 0 rather
# than NaN. This also removes the need to silence NumPy's divide/invalid
# warnings for the whole kernel via np.seterr.
Xdata_scaled = (Xdata - MIN) / np.where(column_range == 0, 1, column_range)

In [2]:
# Density-based clustering of the first 100000 scaled profiles.
# (fit() returns the estimator itself, so construction and fitting can be chained.)
model = DBSCAN(min_samples=100, eps=.05).fit(Xdata_scaled[:100000])

labels = model.labels_

# Label -1 marks noise points, so it must not be counted as a cluster.
n_clusters_ = len(set(labels) - {-1})
print(n_clusters_)
print('Done')

NameError: name 'DBSCAN' is not defined

In [1]:
# Same rows the model was fitted on, so `labels` lines up with `test` row by row.
test = Xdata_scaled[:100000]

# Mean scaled feature vector of clusters 0, 1 and 2.
cat0, cat1, cat2 = [test[labels == k].mean(axis=0) for k in range(3)]

# Number of noise points (-1), then number of points in cluster 0.
for k in (-1, 0):
    print(test[labels == k].shape)

# One line per noise profile, feature index on the x-axis.
plt.plot(test[labels == -1].T)

NameError: name 'Xdata_scaled' is not defined

# Count how often each cluster label occurs at each (lat, lon) location.
labels = model.labels_

# The model was fitted on the first len(labels) rows of the SR > 5 subset, so
# the coordinates must be restricted the same way; Latitude/Longitude cover
# every profile and would otherwise not match `labels` in length.
rainy = SR > 5
d = pd.DataFrame(data={'lat': Latitude[rainy][:len(labels)],
                       'lon': Longitude[rainy][:len(labels)],
                       'label': labels})

# size() on a default (as_index=True) groupby is a Series in every pandas
# version; reset_index turns it into explicit lat / lon / label / count columns.
df = d.groupby(d.columns.tolist()).size().reset_index(name='count')

# Layout expected by the plotting cell: axes[0, i] = (lat, lon, label) of
# group i and values[i] = its count.
axes = df[['lat', 'lon', 'label']].to_numpy()[np.newaxis, :, :]
values = df['count'].to_numpy()
print(axes)
print(values)


In [None]:
# Map of how often each (lat, lon, label) combination occurs.
from mpl_toolkits.basemap import Basemap

# `axes` carries one (lat, lon, label) triple per group, possibly behind a
# leading singleton axis; flatten it to (n_groups, 3) so each column is a plain
# 1-D vector that lines up with `values`.
coords = np.asarray(axes, dtype=float).reshape(-1, 3)
lats = coords[:, 0]
lons = coords[:, 1]
cate = coords[:, 2]
counts = np.asarray(values, dtype=float).ravel()

# How much to zoom from coordinates (in degrees)
zoom_scale = 3

# Setup the bounding box for the zoom and bounds of the map
bbox = [np.min(lats)-zoom_scale, np.max(lats)+zoom_scale,
        np.min(lons)-zoom_scale, np.max(lons)+zoom_scale]

fig, ax = plt.subplots(figsize=(12,7))
# Define the projection, scale, the corners of the map, and the resolution.
m = Basemap(projection='merc', llcrnrlat=bbox[0], urcrnrlat=bbox[1],
            llcrnrlon=bbox[2], urcrnrlon=bbox[3], lat_ts=10, resolution='i')

# Draw coastlines and fill continents and water with color
m.drawcoastlines()
m.fillcontinents(color='#CCCCCC', lake_color='lightblue')

# draw parallels, meridians, and color boundaries
m.drawparallels(np.arange(bbox[0], bbox[1], (bbox[1]-bbox[0])/5), labels=[1,0,0,0])
m.drawmeridians(np.arange(bbox[2], bbox[3], (bbox[3]-bbox[2])/5), labels=[0,0,0,1], rotation=15)
m.drawmapboundary(fill_color='lightblue')

# Colour scale for the counts. Only colormap entries 50..200 are used, as
# before; change that range to get different colours.
count_min = np.min(counts)
count_max = np.max(counts)
cmap = plt.get_cmap('gist_earth')
point_cmap = matplotlib.colors.ListedColormap(cmap(np.arange(50, 201)))
normalize = matplotlib.colors.Normalize(vmin=count_min, vmax=count_max)

# Project all points at once and draw them in a single call. The earlier
# per-point plt.plot(x, y, 3, ...) indexed past the end of lats/lons and drew
# the stray positional 3 as an extra data series rather than a marker size.
x, y = m(lons, lats)
points = ax.scatter(x, y, s=3**2, c=counts, cmap=point_cmap, norm=normalize,
                    marker='o', zorder=3)

# The colorbar is built from the scatter itself, so it shows exactly the
# colours used for the points.
cbar = fig.colorbar(points, ax=ax, label='Frequency')


In [20]:
# Fraction of all profiles whose surface rain exceeds 5.
rainy = SR > 5
test = SR[rainy]
# Rebuild the feature rows from LH and SR: Xdata already holds only the rainy
# rows, so indexing it again with positions taken from the full-length SR
# would run out of bounds.
xdata = np.concatenate((LH, SR.reshape(len(SR), 1)), axis=1)[rainy, :]
print(len(test)/len(SR))

0.12550147392335598
