# W210 Wind Data Cleaning

Import Packages

In [6]:
import pandas as pd
import numpy as np
import netCDF4 as ncdf
import os
import datetime
from math import pi
import requests

In [3]:
# from google.colab import drive
# drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [4]:
# ! ls "/content/gdrive/MyDrive/W210-data"

MERRA2_400.tavg3_3d_asm_Nv.20211231.nc4  shapefiles_zcta
MERRA2_400.tavgM_2d_slv_Nx.202112.nc4


Look at data features

In [5]:
in_dir = "/content/gdrive/MyDrive/W210-data/"
os.listdir(in_dir)
wind_1day = ncdf.Dataset(in_dir+'MERRA2_400.tavg3_3d_asm_Nv.20211231.nc4', mode = 'r')
list(wind_1day.variables)

['lon',
 'lat',
 'lev',
 'time',
 'CLOUD',
 'DELP',
 'EPV',
 'H',
 'O3',
 'OMEGA',
 'PHIS',
 'PL',
 'PS',
 'QI',
 'QL',
 'QV',
 'RH',
 'SLP',
 'T',
 'U',
 'V']

Looking for lat/lon ranges

In [None]:
# wind_1day['lat'][:]
# wind_1day['lon'][:]

In [6]:
# Look for latitudes from -34 to 6
wind_1day['lat'][110:193]

masked_array(data=[-3.5000000e+01, -3.4500000e+01, -3.4000000e+01,
                   -3.3500000e+01, -3.3000000e+01, -3.2500000e+01,
                   -3.2000000e+01, -3.1500000e+01, -3.1000000e+01,
                   -3.0500000e+01, -3.0000000e+01, -2.9500000e+01,
                   -2.9000000e+01, -2.8500000e+01, -2.8000000e+01,
                   -2.7500000e+01, -2.7000000e+01, -2.6500000e+01,
                   -2.6000000e+01, -2.5500000e+01, -2.5000000e+01,
                   -2.4500000e+01, -2.4000000e+01, -2.3500000e+01,
                   -2.3000000e+01, -2.2500000e+01, -2.2000000e+01,
                   -2.1500000e+01, -2.1000000e+01, -2.0500000e+01,
                   -2.0000000e+01, -1.9500000e+01, -1.9000000e+01,
                   -1.8500000e+01, -1.8000000e+01, -1.7500000e+01,
                   -1.7000000e+01, -1.6500000e+01, -1.6000000e+01,
                   -1.5500000e+01, -1.5000000e+01, -1.4500000e+01,
                   -1.4000000e+01, -1.3500000e+01, -1.3000000e

In [7]:
# Check the number of coordinates and number of elements in lat variables after slicing 
lats = [x/10 for x in range(-350,65,5)]

In [8]:
len(wind_1day['lat'][110:193]) == len(lats)

True

In [9]:
#Look for longitudes -74 to -34
wind_1day['lon'][168:235]

masked_array(data=[-75.   , -74.375, -73.75 , -73.125, -72.5  , -71.875,
                   -71.25 , -70.625, -70.   , -69.375, -68.75 , -68.125,
                   -67.5  , -66.875, -66.25 , -65.625, -65.   , -64.375,
                   -63.75 , -63.125, -62.5  , -61.875, -61.25 , -60.625,
                   -60.   , -59.375, -58.75 , -58.125, -57.5  , -56.875,
                   -56.25 , -55.625, -55.   , -54.375, -53.75 , -53.125,
                   -52.5  , -51.875, -51.25 , -50.625, -50.   , -49.375,
                   -48.75 , -48.125, -47.5  , -46.875, -46.25 , -45.625,
                   -45.   , -44.375, -43.75 , -43.125, -42.5  , -41.875,
                   -41.25 , -40.625, -40.   , -39.375, -38.75 , -38.125,
                   -37.5  , -36.875, -36.25 , -35.625, -35.   , -34.375,
                   -33.75 ],
             mask=False,
       fill_value=1e+20)

In [10]:
# Build list of longitude labels
lons = [x/1000 for x in range(-75000,-33125,625)]

In [11]:
# Label validation
len(wind_1day['lat'][110:193]) == len(lats), len(wind_1day['lon'][168:235]) == len(lons)

(True, True)

Getting surface wind speeds for the desired slices

This is the part we will want to loop through for all of the wind datafiles. If possible, we should try to put them all in one CSV at the end.

In [12]:
u_cube = wind_1day['U'][:,71,110:193,168:235]
v_cube = wind_1day['V'][:,71,110:193,168:235]

In [13]:
pd.DataFrame(u_cube[0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,64,65,66
0,2.162307,2.731154,3.552444,5.208694,6.456740,5.249709,3.082717,1.254836,4.429397,-1.075974,...,-3.073533,-2.856736,-2.985642,-3.350877,-3.838181,-4.127244,-4.363572,-4.693650,-5.207322,-5.752244
1,1.863479,2.387893,3.234084,4.665725,5.839553,5.874709,4.136428,-0.604295,2.211623,-0.485154,...,-2.730760,-2.774705,-3.135056,-3.658494,-4.316697,-4.590135,-4.752244,-5.095994,-5.609666,-6.137010
2,1.480666,2.010940,2.857131,4.288772,4.554397,5.021194,3.852248,-1.117967,0.889113,0.319106,...,-3.030564,-3.122361,-3.605760,-4.269822,-4.941697,-5.299119,-5.396775,-5.510056,-5.650681,-5.726853
3,1.031447,1.604201,2.456252,3.808303,3.771194,4.362014,3.258498,-1.474900,0.409682,-0.153855,...,-3.687791,-3.912400,-4.510056,-4.955369,-5.441697,-5.679978,-5.595994,-5.429978,-5.084275,-4.834275
4,0.540824,1.210647,2.082229,3.245803,3.262404,3.733108,2.090041,-2.190721,-0.124314,-0.194627,...,-4.265916,-4.760056,-5.047166,-5.010056,-5.152635,-5.105760,-4.922166,-4.619431,-4.490525,-4.853806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,0.631850,-2.867478,-2.835252,-3.568650,-6.265916,-5.344041,-3.652635,-1.928025,-1.196580,-1.194627,...,-7.303025,-6.685838,-6.359666,-6.248338,-6.234666,-6.334275,-6.461228,-6.267869,-6.154588,-5.914353
79,0.886794,-2.946580,-2.849900,-2.675096,-6.154588,-6.240525,-5.502244,-4.250291,-3.080369,-1.697556,...,-7.545213,-6.994431,-6.597947,-6.406541,-6.443650,-6.613572,-6.642869,-6.344041,-6.090135,-5.773728
80,1.057815,-2.216111,-1.894822,-1.222947,-3.469041,-5.392869,-5.400681,-5.250291,-4.568650,-3.096971,...,-7.752244,-7.293260,-6.918260,-6.597947,-6.463181,-6.543260,-6.556931,-6.342088,-6.131150,-5.898728
81,0.800063,-1.237107,-1.255174,-1.427049,-1.296189,-4.213181,-5.111619,-4.678025,-4.250291,-4.490525,...,-8.013963,-7.506150,-7.105760,-6.758103,-6.519822,-6.445603,-6.428025,-6.312791,-6.181931,-6.076463


In [14]:
# Convert numpy to DataFrames using melt and then merged together
dfs = []
times = [0,  180,  360,  540,  720,  900, 1080, 1260]
for x in range(0, len(times)):
    melted_V = pd.DataFrame(v_cube[x]).reset_index().melt('index')
    melted_V.columns = ['lat_key', 'lon_key', 'V']
    melted_V['lat'] = melted_V['lat_key'].apply(lambda x: lats[x])
    melted_V['lon'] = melted_V['lon_key'].apply(lambda x: lons[x])
    melted_U = pd.DataFrame(u_cube[x]).reset_index().melt('index')
    melted_U.columns = ['lat_key', 'lon_key', 'U']
    melted_U['lat'] = melted_U['lat_key'].apply(lambda x: lats[x])
    melted_U['lon'] = melted_U['lon_key'].apply(lambda x: lons[x])
    dfs.append(pd.merge(melted_V, melted_U)[['lat','lon','U','V']])
    dfs[x]['Time'] = times[x] / 60
merged_df = pd.concat(dfs)

In [15]:
# Add wind vector calculations.
merged_df['wspd'] = np.sqrt(merged_df['U']**2+merged_df['V']**2)
merged_df['wdir'] = np.arctan2(merged_df['U'], merged_df['V']) * 180/pi

In [16]:
# Examine reuslts.
merged_df

Unnamed: 0,lat,lon,U,V,Time,wspd,wdir
0,-35.0,-75.00,2.162307,2.162307,0.0,3.057964,45.0
1,-34.5,-75.00,1.863479,1.863479,0.0,2.635357,45.0
2,-34.0,-75.00,1.480666,1.480666,0.0,2.093978,45.0
3,-33.5,-75.00,1.031447,1.031447,0.0,1.458687,45.0
4,-33.0,-75.00,0.540824,0.540824,0.0,0.764841,45.0
...,...,...,...,...,...,...,...
5556,4.0,-33.75,-5.914353,-5.914353,21.0,8.364159,-135.0
5557,4.5,-33.75,-5.773728,-5.773728,21.0,8.165285,-135.0
5558,5.0,-33.75,-5.898728,-5.898728,21.0,8.342062,-135.0
5559,5.5,-33.75,-6.076463,-6.076463,21.0,8.593416,-135.0


In [38]:
# read through the text file with all the download links
file_text = open('/Users/ChangLiu/Documents/UC Berkeley MIDS/210 Capstone/amazon/subset_M2T3NVASM_5.12.4_20221015_145010.txt', 'r')
Lines = file_text.readlines()
# len(Lines)
# Lines[1:]
Links_2021 = Lines[1:]
len(Links_2021)

365

In [21]:
# i = 0
# FILENAME = "/Users/ChangLiu/Documents/UC Berkeley MIDS/210 Capstone/amazon/2021_"+str(i+1)+".nc4"
# FILENAME

# example_str = "ttps://goldsmr5.gesdisc.eosdis.nasa.gov/daac-bin/OTF/HTTP_services.cgi?FILENAME=%2Fdata%2FMERRA2%2FM2T3NVASM.5.12.4%2F2021%2F01%2FMERRA2_400.tavg3_3d_asm_Nv.20210101.nc4&FORMAT=bmM0Lw&BBOX=-34%2C-74%2C6%2C-34&LABEL=MERRA2_400.tavg3_3d_asm_Nv.20210101.SUB.nc&SHORTNAME=M2T3NVASM&SERVICE=L34RS_MERRA2&LAYERS=LAYER_72&VERSION=1.02&DATASET_VERSION=5.12.4&VARIABLES=RH%2CT%2CU%2CV"
# example_str.split("asm_Nv.")[1][:8]

'20210101'

In [34]:
!mkdir wind_data

In [39]:
def read_clean_wind():
    
    df = pd.DataFrame()

    for i in range(len(Links_2021)):
        url_i = str(Links_2021[i])
        date_i = url_i.split("asm_Nv.")[1][:8]
        FILENAME = "/Users/ChangLiu/Documents/UC Berkeley MIDS/210 Capstone/amazon/wind_data/wind_" + str(date_i) + ".nc4"
        result = requests.get(url_i)
        try:
            result.raise_for_status()
            f = open(FILENAME,'wb')
            f.write(result.content)
            f.close()
            print('contents of URL written to '+FILENAME)
        except:
            print('requests.get() returned an error code '+str(result.status_code))
            
        data = ncdf.Dataset(FILENAME, mode='r')
        u_cube = data['U'][:,0,:,:]
        v_cube = data['V'][:,0,:,:]
        lats = [x/10 for x in range(-340, 65, 5)]
        lons = [x/1000 for x in range(-73750, -33875, 625)]
        dfs = []
        times = [0,  180,  360,  540,  720,  900, 1080, 1260]
          # times = [1080, 1260]
        for x in range(0, len(times)):
            melted_V = pd.DataFrame(u_cube[x]).reset_index().melt('index')
            melted_V.columns = ['lat_key', 'lon_key', 'V']
            melted_V['lat'] = melted_V['lat_key'].apply(lambda x: lats[x])
            melted_V['lon'] = melted_V['lon_key'].apply(lambda x: lons[x])
            melted_U = pd.DataFrame(u_cube[x]).reset_index().melt('index')
            melted_U.columns = ['lat_key', 'lon_key', 'U']
            melted_U['lat'] = melted_U['lat_key'].apply(lambda x: lats[x])
            melted_U['lon'] = melted_U['lon_key'].apply(lambda x: lons[x])
            dfs.append(pd.merge(melted_V, melted_U)[['lat','lon','U','V']])
            dfs[x]['Time_Hour'] = times[x] / 60
            merged_df = pd.concat(dfs)
            merged_df['date'] = date_i
        df = pd.concat([df, merged_df], axis=0)
    return(df)


In [40]:
df_2021_test = read_clean_wind()

contents of URL written to /Users/ChangLiu/Documents/UC Berkeley MIDS/210 Capstone/amazon/wind_data/wind_20210101.nc4
contents of URL written to /Users/ChangLiu/Documents/UC Berkeley MIDS/210 Capstone/amazon/wind_data/wind_20210102.nc4
contents of URL written to /Users/ChangLiu/Documents/UC Berkeley MIDS/210 Capstone/amazon/wind_data/wind_20210103.nc4
contents of URL written to /Users/ChangLiu/Documents/UC Berkeley MIDS/210 Capstone/amazon/wind_data/wind_20210104.nc4
contents of URL written to /Users/ChangLiu/Documents/UC Berkeley MIDS/210 Capstone/amazon/wind_data/wind_20210105.nc4
contents of URL written to /Users/ChangLiu/Documents/UC Berkeley MIDS/210 Capstone/amazon/wind_data/wind_20210106.nc4
contents of URL written to /Users/ChangLiu/Documents/UC Berkeley MIDS/210 Capstone/amazon/wind_data/wind_20210107.nc4
contents of URL written to /Users/ChangLiu/Documents/UC Berkeley MIDS/210 Capstone/amazon/wind_data/wind_20210108.nc4
contents of URL written to /Users/ChangLiu/Documents/UC 

In [41]:
df_2021_test

Unnamed: 0,lat,lon,U,V,Time_Hour,date
0,-34.0,-73.750,2.627565,2.627565,0.0,20210101
1,-33.5,-73.750,2.264283,2.264283,0.0,20210101
2,-33.0,-73.750,1.987672,1.987672,0.0,20210101
3,-32.5,-73.750,1.760988,1.760988,0.0,20210101
4,-32.0,-73.750,1.514512,1.514512,0.0,20210101
...,...,...,...,...,...,...
5179,4.0,-34.375,-6.566580,-6.566580,21.0,20211231
5180,4.5,-34.375,-6.697440,-6.697440,21.0,20211231
5181,5.0,-34.375,-6.933768,-6.933768,21.0,20211231
5182,5.5,-34.375,-7.027518,-7.027518,21.0,20211231


In [42]:
df_2021_test.to_csv("/Users/ChangLiu/Documents/UC Berkeley MIDS/210 Capstone/amazon/wind_data/combined_wind_data_2021.csv")