# Data Generation

Based on a lookup table of cities and their mean temperatures, we want to generate a CSV file containing `n` rows of random temperatures.

To generate each row, we choose a random city from the lookup table and then draw a random temperature from a normal distribution centred on that city's mean temperature. We assume the standard deviation is `10.0` for all cities.

In [6]:
import math
import cupy as cp
from numba import cuda
import cudf
from pathlib import Path
import time

In [7]:
# Threads per CUDA block used when launching the offset kernel
_THREADS_PER_BLOCK = 128

# Holds the compiled kernel after first use. Defining the @cuda.jit function
# inside generate_chunk on every call creates a fresh dispatcher each time,
# which forces a re-compile for every chunk.
_offset_kernel_cache = []


def _get_offset_kernel():
    """Return the offset kernel, creating it the first time it is needed."""
    if not _offset_kernel_cache:

        @cuda.jit
        def offset_kernel(temps, cities, lookup_values):
            """Lookup the mean of each city and offset the random temperature by the mean"""
            i = cuda.grid(1)
            # Guard threads of the last block that fall past the end of the array
            if i < len(temps):
                temps[i] = temps[i] + lookup_values[cities[i]]

        _offset_kernel_cache.append(offset_kernel)
    return _offset_kernel_cache[0]


def generate_chunk(filename, chunksize, std, lookup_df):
    """Generate one chunk of sample data and append it to ``filename``.

    Each row pairs a randomly chosen city from the lookup table with a
    temperature drawn from a normal distribution centred on that city's mean.
    Rows are appended as ``city;temp`` with no header and no index column.

    Args:
        filename: Path of the output file; opened in append mode.
        chunksize: Number of rows to generate (converted with ``int``).
        std: Standard deviation applied to every city's distribution.
        lookup_df: cudf DataFrame with ``city`` and ``mean_temp`` columns.
            NOTE(review): the city lookup maps generated positions 0..len-1
            through ``lookup_df.city``, so this presumably expects the default
            integer index - confirm if the lookup table is ever re-indexed.

    Raises:
        ValueError: If ``chunksize`` is negative or ``lookup_df`` is empty.
    """
    n_rows = int(chunksize)
    if n_rows < 0:
        raise ValueError(f"chunksize must be non-negative, got {chunksize!r}")
    if n_rows == 0:
        # Nothing to generate or write for an empty chunk
        return

    n_cities = len(lookup_df)
    if n_cities == 0:
        raise ValueError("lookup_df must contain at least one city")

    # Generate a normal distribution around zero for each row in our output,
    # the per-city mean is added by the kernel below
    temps = cp.random.normal(0, std, n_rows)
    # Choose a random city index for each row; randint's upper bound is
    # exclusive, so this covers 0..n_cities-1 (same range as the deprecated
    # random_integers(0, n_cities - 1))
    cities = cp.random.randint(0, n_cities, n_rows)

    # Offset each temperature by its city's mean value, in place on the GPU
    n_blocks = math.ceil(n_rows / _THREADS_PER_BLOCK)
    _get_offset_kernel()[n_blocks, _THREADS_PER_BLOCK](
        temps, cities, lookup_df.mean_temp.values
    )

    # Convert our arrays to a DataFrame
    output_df = cudf.DataFrame({"city": cities, "temp": temps})
    # Convert the random city index to the city name
    output_df.city = output_df.city.astype(int).map(lookup_df.city)
    # Round the temperature to one decimal place
    output_df.temp = output_df.temp.round(decimals=1)
    # Append this chunk to the output file
    with open(filename, "a") as fh:
        output_df.to_csv(fh, sep=";", chunksize=10_000_000, header=False, index=False)

## Configuration

In [8]:
# Total number of rows of data to generate
n = 1_000_000_000

# We assume temperatures are normally distributed with a standard deviation of 10
std = 10.0

# Number of rows to generate in one go (tweak this based on your GPU RAM)
chunksize = 2e8

# Load our lookup table of cities and their mean temperatures
lookup_df = cudf.read_csv("lookup.csv")

# Choose where to write to; the name carries the row count in billions
filename = Path(f"data_{int(n / 1e9)}b.txt")

# Chunks are appended to the file, so remove any output left from a previous run
if filename.exists():
    filename.unlink()

## Run the data generation

In [9]:
%%time
# Loop over chunks and generate data until exactly `n` rows have been written.
# Counting rows (rather than whole chunks) means a final partial chunk is still
# generated when `n` is not a multiple of `chunksize`, or is smaller than it.
start = time.time()
total_rows = int(n)
rows_per_chunk = int(chunksize)
if rows_per_chunk <= 0:
    raise ValueError(f"chunksize must be positive, got {chunksize!r}")

rows_written = 0
while rows_written < total_rows:
    # The last chunk may be smaller than the rest
    rows_this_chunk = min(rows_per_chunk, total_rows - rows_written)
    generate_chunk(filename, rows_this_chunk, std, lookup_df)
    rows_written += rows_this_chunk

    # Use the un-truncated fraction for the estimate: a truncated integer
    # percentage is 0 while under 1% done, which would divide by zero
    fraction_complete = rows_written / total_rows
    elapsed = time.time() - start
    percent_complete = int(fraction_complete * 100)
    time_taken = int(elapsed)
    time_remaining = int(elapsed / fraction_complete - elapsed)
    print(
        f"Writing {int(n / 1e9)} billion rows to {filename}: {percent_complete}% in {time_taken}s ({time_remaining}s remaining)",
        end="\r",
    )
print()

Writing 1 billion rows to data_1b.txt: 100% in 23s (0s remaining)
CPU times: user 8.43 s, sys: 15.7 s, total: 24.2 s
Wall time: 23.5 s


## Check the files

In [5]:
!ls -lh data_*.txt

-rw-r--r-- 1 rapids conda 129G Jan 18 14:55 data_10b.txt
-rw-r--r-- 1 rapids conda  13G Jan 18 14:41 data_1b.txt
-rw-r--r-- 1 rapids conda  26G Jan 18 14:18 data_2b.txt
