### Downloading the data

In [None]:
import os
import urllib.request

def _download_if_missing(url, dest, message):
    """Download ``url`` to ``dest`` unless ``dest`` already exists.

    The payload is first written to ``dest + ".part"`` and only renamed to
    ``dest`` once the transfer has finished. An interrupted download therefore
    never leaves a truncated file at ``dest`` that a later run would mistake
    for a complete one.

    Parameters
    ----------
    url : str
        Location to fetch.
    dest : str
        Final path of the downloaded file.
    message : str
        Text printed just before the transfer starts.

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` raises on a failed transfer; the
    partial file is removed before the exception propagates.
    """
    if os.path.isfile(dest):
        return
    print(message)
    partial = dest + ".part"
    try:
        urllib.request.urlretrieve(url, partial)
        os.replace(partial, dest)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # cleans up the incomplete file.
        if os.path.exists(partial):
            os.remove(partial)


data_dir = "./data/weather/"
if not os.path.exists(data_dir):
    print("creating weather directory")
    # Pure-Python equivalent of `mkdir -p`: portable, and raises on failure
    # instead of silently returning a shell exit status.
    os.makedirs(data_dir, exist_ok=True)

# download weather observations
base_url = "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/"
years = list(range(2018, 2020))
for year in years:
    fn = str(year) + ".csv.gz"
    _download_if_missing(
        base_url + fn,
        data_dir + fn,
        f"Downloading {base_url+fn} to {data_dir+fn}",
    )

# download weather station metadata
station_meta_url = (
    "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt"
)
_download_if_missing(
    station_meta_url,
    data_dir + "ghcnd-stations.txt",
    "Downloading station meta..",
)

### Loading into cudf


In [1]:
import cudf
import cupy as cp
import pycuda.autoprimaryctx

# Names for all eight columns of the yearly observations file, in file order.
column_names = [
    "station_id",
    "date",
    "type",
    "val",
    "m_flag",
    "q_flag",
    "s_flag",
    "obs_time",
]
# Only the first four columns (station, date, element type, value) are loaded.
usecols = column_names[:4]
weather_df = cudf.read_csv(
    'data/weather/2018.csv.gz',
    names=column_names,
    usecols=usecols,
)

In [2]:
# Show the last rows of the loaded frame to confirm the four selected columns parsed.
weather_df.tail()

Unnamed: 0,station_id,date,type,val
35117206,WZ004455110,20181231,TAVG,244
35117207,ZI000067775,20181231,TMAX,285
35117208,ZI000067775,20181231,TMIN,166
35117209,ZI000067775,20181231,PRCP,0
35117210,ZI000067775,20181231,TAVG,226


### Using a grid-stride loop to double all rainfall measurements

In [3]:
# Keep only the precipitation ("PRCP") observations.
is_prcp = weather_df["type"] == "PRCP"
rainfall_df = weather_df[is_prcp]

In [4]:
# Show the last PRCP rows before any kernel modifies `val`.
rainfall_df.tail()

Unnamed: 0,station_id,date,type,val
35117184,WA007848390,20181231,PRCP,0
35117186,WA010101860,20181231,PRCP,79
35117190,WA012084750,20181231,PRCP,1011
35117198,WF000917530,20181231,PRCP,351
35117209,ZI000067775,20181231,PRCP,0


In [5]:
import pycuda.autoprimaryctx
from pycuda.compiler import SourceModule

mod = SourceModule("""
    // Doubles every element of a[0..N) in place.
    //
    // Each thread starts at its global index and advances by the total number
    // of launched threads (a grid-stride loop), so any grid size covers all N
    // elements.
    __global__ void doublify(int64_t *a, int N)
    {
      // Index and stride are 64-bit: with a 32-bit int, `i += stride` could
      // overflow (undefined behaviour) when N is within one stride of INT_MAX.
      const long long stride = (long long)blockDim.x * gridDim.x;
      for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
           i < N;
           i += stride) {
        // The loop condition already guarantees i < N; no extra bounds check.
        a[i] *= 2;
      }
    }
    """)
# Handle to the compiled kernel; launched with (device array, element count).
func = mod.get_function("doublify")

In [6]:
# The element count is passed as a 32-bit integer to match the kernel's `int N`.
rain_vals = rainfall_df['val']
size = cp.int32(len(rain_vals))

# 4096 blocks of 256 threads; the kernel's stride loop covers any remaining elements.
func(rain_vals, size, block=(256, 1, 1), grid=(4096,))

In [7]:
# Re-display the same rows after the kernel launch; `val` is modified in place by the kernel.
rainfall_df.tail()

Unnamed: 0,station_id,date,type,val
35117184,WA007848390,20181231,PRCP,0
35117186,WA010101860,20181231,PRCP,158
35117190,WA012084750,20181231,PRCP,2022
35117198,WF000917530,20181231,PRCP,702
35117209,ZI000067775,20181231,PRCP,0


### Let's try a more complex operation: converting the measurements to inches

In [71]:
# Reload the observations so the conversion starts from unmodified values.
weather_df = cudf.read_csv(
    'data/weather/2018.csv.gz',
    names=column_names,
    usecols=usecols,
)

# Select the PRCP rows, then cast val to float.
prcp_rows = weather_df[weather_df["type"] == "PRCP"]
rainfall_df = prcp_rows.astype({'val': 'float64'})

In [72]:
# Show the last PRCP rows after the cast; `val` is now float64.
rainfall_df.tail()

Unnamed: 0,station_id,date,type,val
35117184,WA007848390,20181231,PRCP,0.0
35117186,WA010101860,20181231,PRCP,79.0
35117190,WA012084750,20181231,PRCP,1011.0
35117198,WF000917530,20181231,PRCP,351.0
35117209,ZI000067775,20181231,PRCP,0.0


In [73]:
mod2 = SourceModule("""
    // Inches per millimetre. Declared double so the factor is not rounded to
    // single precision before it is multiplied into the double-precision data.
    static constexpr double mm_to_inches_factor = 0.0393701;

    // Converts a[0..N) in place to inches.
    //
    // The extra 0.1 scales the stored value to millimetres first: GHCN-Daily
    // reports PRCP in tenths of a millimetre.
    __global__ void mm_to_inches(double *a, int N)
    {
      // Index and stride are 64-bit: with a 32-bit int, `i += stride` could
      // overflow (undefined behaviour) when N is within one stride of INT_MAX.
      const long long stride = (long long)blockDim.x * gridDim.x;
      for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
           i < N;
           i += stride) {
        // The loop condition already guarantees i < N; no extra bounds check.
        a[i] = a[i] * mm_to_inches_factor * 0.1;
      }
    }
    """)
# Handle to the compiled kernel; launched with (device array, element count).
func = mod2.get_function("mm_to_inches")

In [74]:
# Recompute the element count from the current frame instead of reusing the
# `size` left over from an earlier cell: `rainfall_df` has been rebuilt since
# then, and a stale count would either skip rows or write past the column.
size = cp.int32(len(rainfall_df['val']))
func(rainfall_df['val'], size, block=(256,1,1), grid=(4096,))

In [75]:
# Re-display the same rows after the kernel launch; `val` now holds the converted values.
rainfall_df.tail()

Unnamed: 0,station_id,date,type,val
35117184,WA007848390,20181231,PRCP,0.0
35117186,WA010101860,20181231,PRCP,0.311024
35117190,WA012084750,20181231,PRCP,3.980317
35117198,WF000917530,20181231,PRCP,1.381891
35117209,ZI000067775,20181231,PRCP,0.0
