### Downloading the data

In [None]:
import os
import urllib.request

data_dir = "./data/weather/"
if not os.path.exists(data_dir):
    print("creating weather directory")
    os.system("mkdir -p ./data/weather")

# download weather observations
base_url = "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/"
years = list(range(2018, 2020))
for year in years:
    fn = str(year) + ".csv.gz"
    if not os.path.isfile(data_dir + fn):
        print(f"Downloading {base_url+fn} to {data_dir+fn}")
        urllib.request.urlretrieve(base_url + fn, data_dir + fn)

# download weather station metadata
station_meta_url = (
    "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt"
)
if not os.path.isfile(data_dir + "ghcnd-stations.txt"):
    print("Downloading station meta..")
    urllib.request.urlretrieve(
        station_meta_url, data_dir + "ghcnd-stations.txt"
    )

### Loading into cudf


In [2]:
import cudf
import cupy as cp
import pycuda.autoprimaryctx

column_names = [
    "station_id", "date",
    "type", "val",
    "m_flag", "q_flag",
    "s_flag", "obs_time"
]
usecols = column_names[0:4]
weather_df = cudf.read_csv('data/weather/2018.csv.gz', names=column_names, usecols=usecols)

In [3]:
weather_df.tail()

Unnamed: 0,station_id,date,type,val
35117206,WZ004455110,20181231,TAVG,244
35117207,ZI000067775,20181231,TMAX,285
35117208,ZI000067775,20181231,TMIN,166
35117209,ZI000067775,20181231,PRCP,0
35117210,ZI000067775,20181231,TAVG,226


### Using a grid stride pattern to double all rainfall measurements

In [4]:
rainfall_df = weather_df[weather_df["type"] == "PRCP"]

In [5]:
rainfall_df.tail()

Unnamed: 0,station_id,date,type,val
35117184,WA007848390,20181231,PRCP,0
35117186,WA010101860,20181231,PRCP,79
35117190,WA012084750,20181231,PRCP,1011
35117198,WF000917530,20181231,PRCP,351
35117209,ZI000067775,20181231,PRCP,0


In [6]:
import pycuda.autoprimaryctx
from pycuda.compiler import SourceModule

mod = SourceModule("""
    __global__ void doublify(int64_t *a, int N)
    {
      int stride = blockDim.x * gridDim.x;
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride) {
        if (i < N) {
          a[i] *= 2;
        }
      }
    }
    """)
func = mod.get_function("doublify")

In [7]:
size = cp.int32(len(rainfall_df['val']))

func(rainfall_df['val'], size, block=(256,1,1), grid=(4096,))

In [8]:
rainfall_df.tail()

Unnamed: 0,station_id,date,type,val
35117184,WA007848390,20181231,PRCP,0
35117186,WA010101860,20181231,PRCP,158
35117190,WA012084750,20181231,PRCP,2022
35117198,WF000917530,20181231,PRCP,702
35117209,ZI000067775,20181231,PRCP,0


### Let's try a more complex operation, converting the measurements to inches

In [9]:
weather_df = cudf.read_csv('data/weather/2018.csv.gz', names=column_names, usecols=usecols)

# cast val to float
rainfall_df = weather_df[weather_df["type"] == "PRCP"].astype({'val': 'float64'})

In [10]:
rainfall_df.tail()

Unnamed: 0,station_id,date,type,val
35117184,WA007848390,20181231,PRCP,0.0
35117186,WA010101860,20181231,PRCP,79.0
35117190,WA012084750,20181231,PRCP,1011.0
35117198,WF000917530,20181231,PRCP,351.0
35117209,ZI000067775,20181231,PRCP,0.0


In [11]:
mod2 = SourceModule("""
    static constexpr float mm_to_inches_factor = 0.0393701;

    __global__ void mm_to_inches(double *a, int N)
    {
      int stride = blockDim.x * gridDim.x;
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride) {
        if (i < N) {
          a[i] = a[i] * mm_to_inches_factor * 0.1;
        }
      }
    }
    """)
func = mod2.get_function("mm_to_inches")

In [12]:
func(rainfall_df['val'], size, block=(256,1,1), grid=(4096,))

In [13]:
rainfall_df.tail()

Unnamed: 0,station_id,date,type,val
35117184,WA007848390,20181231,PRCP,0.0
35117186,WA010101860,20181231,PRCP,0.311024
35117190,WA012084750,20181231,PRCP,3.980317
35117198,WF000917530,20181231,PRCP,1.381891
35117209,ZI000067775,20181231,PRCP,0.0


### Integration with external libraries, generating a column of random values

In [14]:
weather_df = cudf.read_csv('data/weather/2018.csv.gz', names=column_names, usecols=usecols)

In [15]:
size = cp.int32(len(weather_df))

In [16]:
mod3 = SourceModule('''
#include <thrust/random.h>

extern "C" {

    __global__ void random_column(double* a, int N)
    {
      thrust::default_random_engine rng;
      
      int stride = blockDim.x * gridDim.x;
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride) {
        if (i < N) {
          rng.discard(i);
          thrust::uniform_real_distribution<float> rand01(0,1);
          double r = rand01(rng);
          a[i] = r;
        }
      }
    }
}

''', no_extern_c=True)

In [17]:
func = mod3.get_function("random_column")

In [18]:
weather_df['random_col'] = cp.zeros(len(weather_df))

In [19]:
func(weather_df['random_col'], size, block=(256,1,1), grid=(4096,))

In [20]:
weather_df

Unnamed: 0,station_id,date,type,val,random_col
0,AE000041196,20180101,TMAX,259,0.000022
1,AE000041196,20180101,TMIN,112,0.085032
2,AE000041196,20180101,TAVG,186,0.601353
3,AEM00041194,20180101,TMAX,250,0.891611
4,AEM00041194,20180101,PRCP,0,0.967956
...,...,...,...,...,...
35117206,WZ004455110,20181231,TAVG,244,0.895549
35117207,ZI000067775,20181231,TMAX,285,0.610949
35117208,ZI000067775,20181231,TMIN,166,0.600397
35117209,ZI000067775,20181231,PRCP,0,0.592128


### Reversing the rows in a column

In [57]:
import pycuda.autoprimaryctx
from pycuda.compiler import SourceModule
import cudf
import cupy as cp

df = cudf.DataFrame({'col': [i for i in range(10000000)]})
length = cp.int32(len(df['col'])) - 1

In [58]:
df

Unnamed: 0,col
0,0
1,1
2,2
3,3
4,4
...,...
9999995,9999995
9999996,9999996
9999997,9999997
9999998,9999998


In [59]:
mod = SourceModule('''
__global__ void reverse_row(long* reverse_row, int N)
{      
  int stride = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride) {
    if (i < N - i) {
        int row1 = reverse_row[i];
        int row2 = reverse_row[N - i];
        reverse_row[i] = row2;
        reverse_row[N - i] = row1;
    }
  }
}
''')

In [60]:
func = mod.get_function('reverse_row')

In [61]:
func(df['col'], length, block=(256,1,1), grid=(4096,))

In [62]:
df

Unnamed: 0,col
0,9999999
1,9999998
2,9999997
3,9999996
4,9999995
...,...
9999995,4
9999996,3
9999997,2
9999998,1
