In [1]:
import cudf
import numpy as np
from numba import cuda
import math

In [2]:
# Threshold compared against one uniform random draw per row when the data is
# split: rows whose draw falls below this value go to the training set, the
# remaining rows go to the test set. It is a fraction in [0, 1], not a percent.
PERCENT_TRAIN = 0.8

In [3]:
# Numba CUDA kernel: per-row Haversine distance between two coordinate pairs
@cuda.jit
def haversine_kernel(lat1, lon1, lat2, lon2, outputCol):
    """Store the Haversine distance of each row in ``outputCol``.

    Every thread handles the row at its 1-D grid index; threads whose index
    is at or past ``outputCol.size`` return without touching memory.

    Parameters
    ----------
    lat1, lon1 : array
        Coordinates of the first point. They are scaled by Pi/180 before
        the trigonometric calls, i.e. treated as degrees.
    lat2, lon2 : array
        Coordinates of the second point, handled the same way.
    outputCol : array
        Receives ``12734 * asin(sqrt(a))`` per row.
        NOTE(review): 12734 looks like 2 * 6367 (an Earth radius in km),
        which would make the result kilometres -- confirm.
    """
    idx = cuda.grid(1)
    if idx >= outputCol.size:
        return

    deg_to_rad = 0.017453292519943295  # Pi/180
    start_lat = lat1[idx]
    end_lat = lat2[idx]

    # The two halves of the haversine term are computed in the same order
    # as a single combined expression would evaluate them.
    lat_part = 0.5 - math.cos((end_lat - start_lat) * deg_to_rad) / 2
    lon_part = math.cos(start_lat * deg_to_rad) * math.cos(end_lat * deg_to_rad) * \
        (1 - math.cos((lon2[idx] - lon1[idx]) * deg_to_rad)) / 2

    outputCol[idx] = 12734 * math.asin(math.sqrt(lat_part + lon_part))
    
def haversine_distance(gdf):
    """Append an ``h_distance`` column computed by ``haversine_kernel``.

    Parameters
    ----------
    gdf : DataFrame
        Must provide ``pickup_latitude``, ``pickup_longitude``,
        ``dropoff_latitude`` and ``dropoff_longitude`` columns.

    Returns
    -------
    The same ``gdf`` object, with ``h_distance`` added in place.
    """
    row_count = gdf.shape[0]
    threads_per_block = 128
    # Floor division plus one always launches at least one block; the kernel
    # itself ignores thread indices beyond the output size.
    block_count = row_count // threads_per_block + 1

    coord_columns = (
        'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude', 'dropoff_longitude',
    )
    coords = [gdf[name].to_gpu_array() for name in coord_columns]

    # The output buffer uses the dtype of the pickup latitude array.
    distances = cuda.device_array(shape=(row_count), dtype=coords[0].dtype.name)

    configured_kernel = haversine_kernel[(block_count), (threads_per_block)]
    configured_kernel(coords[0], coords[1], coords[2], coords[3], distances)

    gdf.add_column(name='h_distance', data=distances)
    return gdf

#Numba Kernel to calculate day of the week from Date
@cuda.jit
def day_of_the_week_kernel(output ,year, month, day):
    """Write the weekday index of each (year, month, day) row into ``output``.

    Each thread handles the row at its 1-D grid index; indices at or past
    ``output.size`` are ignored.

    The formula shifts the calendar so the year starts in March (January and
    February count as months 11 and 12 of the previous year) and reduces the
    result modulo 7. With the trailing ``- 1`` the encoding is 0 = Monday
    through 6 = Sunday (e.g. 2016-01-01, a Friday, yields 4).

    Parameters
    ----------
    output : array
        Receives the weekday index per row.
    year, month, day : array
        Calendar components per row. They are only read: the shifted
        year/month values live in per-thread locals, so the caller's arrays
        (which may be the DataFrame's own column buffers) are not modified.
    """
    iRow = cuda.grid(1)
    if iRow < output.size:
        raw_month = month[iRow]
        # Jan/Feb belong to the previous year in the March-based calendar.
        shifted_year = year[iRow] - (raw_month < 3)
        # Remap months so March -> 1, ..., February -> 12.
        shifted_month = (raw_month + 9) % 12 + 1
        output[iRow] = (shifted_year + int(shifted_year/4) - int(shifted_year/100) + int(shifted_year/400) + math.floor(2.6*shifted_month - 0.2) + day[iRow] - 1) % 7
    
def day_of_week(gdf):
    """Append a float32 ``day_of_week`` column derived from the date parts.

    Parameters
    ----------
    gdf : DataFrame
        Must provide ``year``, ``month`` and ``day`` columns.

    Returns
    -------
    The same ``gdf`` object, with ``day_of_week`` added in place.
    """
    row_count = gdf.shape[0]
    threads_per_block = 128
    # Floor division plus one always launches at least one block; the kernel
    # skips thread indices beyond the output size.
    block_count = row_count // threads_per_block + 1

    years, months, days = (
        gdf[part].to_gpu_array() for part in ('year', 'month', 'day')
    )
    # The kernel output buffer borrows the dtype of the ``day`` column.
    weekday_buffer = cuda.device_array(shape=(row_count), dtype=days.dtype.name)

    configured_kernel = day_of_the_week_kernel[(block_count), (threads_per_block)]
    configured_kernel(weekday_buffer, years, months, days)

    gdf.add_column(name='day_of_week', data=weekday_buffer)
    # Stored as float32 once attached to the frame.
    gdf['day_of_week'] = gdf['day_of_week'].astype('float32')
    return gdf
    
import pandas as pd
def gpu_read_csv(file_path):
    """Read one taxi CSV file into a cuDF DataFrame with a fixed schema.

    The first row of the file is skipped and the column names and dtypes
    below are applied positionally.

    Parameters
    ----------
    file_path : str
        Path handed to ``cudf.read_csv``.

    Returns
    -------
    The DataFrame returned by ``cudf.read_csv``.
    """
    # (column name, dtype) pairs in file order.
    schema = [
        ('vendor_id', 'category'),
        ('pickup_datetime', 'date'),
        ('dropoff_datetime', 'date'),
        ('passenger_count', 'int'),
        ('trip_distance', 'float64'),
        ('pickup_longitude', 'float64'),
        ('pickup_latitude', 'float64'),
        ('rate_code', 'category'),
        ('store_and_fwd', 'category'),
        ('dropoff_longitude', 'float64'),
        ('dropoff_latitude', 'float64'),
        ('payment_type', 'category'),
        ('fare_amount', 'float64'),
        ('surcharge', 'float64'),
        ('mta_tax', 'float64'),
        ('tip_amount', 'float64'),
        ('tolls_amount', 'float64'),
        ('total_amount', 'float64'),
    ]
    column_names = [name for name, _ in schema]
    column_dtypes = [dtype for _, dtype in schema]

    return cudf.read_csv(file_path, dtype=column_dtypes, names=column_names, skiprows=1)

def null_workaround(df, **kwargs):
    """Replace nulls with -1 in every integer or float column of ``df``.

    Columns whose dtype name is not one of the listed int/float types are
    left untouched. Extra keyword arguments are accepted and ignored.

    Parameters
    ----------
    df : DataFrame
        Modified in place, column by column.

    Returns
    -------
    The same ``df`` object.
    """
    numeric_dtype_names = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']
    for col_name, col_dtype in df.dtypes.items():
        if str(col_dtype) not in numeric_dtype_names:
            continue
        df[col_name] = df[col_name].fillna(-1)
    return df

def clean_data(df):
    """Drop unused columns, fill numeric nulls and keep only in-range rows.

    Steps, in order:
      1. Remove columns that are not used downstream (in place on ``df``).
      2. Run ``null_workaround`` so numeric nulls become -1.
      3. Apply the range filters below one after another; each keeps only
         rows strictly inside the stated bounds.

    Parameters
    ----------
    df : DataFrame
        Raw trip data; its columns are dropped in place.

    Returns
    -------
    A DataFrame containing the rows that pass every filter.
    """
    unused_columns = (
        'dropoff_datetime', 'payment_type', 'surcharge', 'mta_tax',
        'tip_amount', 'tolls_amount', 'total_amount',
    )
    for name in unused_columns:
        df.drop_column(name)

    remaining = null_workaround(df)
    del df

    range_filters = (
        'fare_amount > 0 and fare_amount < 500',
        'passenger_count > 0 and passenger_count < 6',
        'pickup_longitude > -75 and pickup_longitude < -73',
        'dropoff_longitude > -75 and dropoff_longitude < -73',
        'pickup_latitude > 40 and pickup_latitude < 42',
        'dropoff_latitude > 40 and dropoff_latitude < 42',
    )
    # Rebinding ``remaining`` on each pass releases this function's reference
    # to the previous intermediate frame.
    for condition in range_filters:
        remaining = remaining.query(condition)

    return remaining
    
def add_features(df):
    """Derive time and distance features from the cleaned trip data.

    Adds ``hour``, ``year``, ``month`` and ``day`` from ``pickup_datetime``
    (which is then dropped), followed by ``day_of_week``, ``is_weekend`` and
    ``h_distance``.

    Parameters
    ----------
    df : DataFrame
        Must provide ``pickup_datetime`` plus the pickup/dropoff coordinate
        columns required by ``haversine_distance``. Modified in place.

    Returns
    -------
    The DataFrame returned by ``haversine_distance`` with all features added.
    """
    df['hour'] = df['pickup_datetime'].dt.hour
    df['year'] = df['pickup_datetime'].dt.year
    df['month'] = df['pickup_datetime'].dt.month
    df['day'] = df['pickup_datetime'].dt.day

    df.drop_column('pickup_datetime')

    df = day_of_week(df)
    # day_of_week is encoded 0 = Monday ... 6 = Sunday, so floor(x / 5) is 1
    # only for Saturday (5) and Sunday (6). Dividing by 4 would also flag
    # Friday (4) as a weekend day.
    df['is_weekend'] = (df['day_of_week']/5).floor()
    df = haversine_distance(df)
    return df
    

def process_data(train_path):
    """Run the full pipeline on one CSV file: read, clean, add features.

    Parameters
    ----------
    train_path : str
        Path of the raw CSV file to load.

    Returns
    -------
    The feature-enriched DataFrame produced by ``add_features``.
    """
    return add_features(clean_data(gpu_read_csv(train_path)))

In [None]:
# First month to process, and the inclusive range of years to cover.
month = 1
start = 2016
end = 2016
year = start

DATA_TRAIN_PATH = "/datasets/test/raw/taxi"
DATA_FEATURE_PATH = "/datasets/test/taxi"

# Seeded generator so the train/test row assignment is repeatable across runs.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

while year <= end:
    # Raw input files use a zero-padded month; the output files do not.
    current_part_path = DATA_TRAIN_PATH + "/yellow_tripdata_" + str(year) + "-" + f"{month:02d}" + ".csv"

    train_part_path_pq = DATA_FEATURE_PATH + "/parquet/train/yellow_tripdata_" + str(year) + "-" + str(month) + ".parquet"
    test_part_path_pq = DATA_FEATURE_PATH + "/parquet/test/yellow_tripdata_" + str(year) + "-" + str(month) + ".parquet"

    train_part_path_csv = DATA_FEATURE_PATH + "/csv/train/yellow_tripdata_" + str(year) + "-" + str(month) + ".csv"
    test_part_path_csv = DATA_FEATURE_PATH + "/csv/test/yellow_tripdata_" + str(year) + "-" + str(month) + ".csv"

    print(current_part_path)
    df = process_data(current_part_path)

    # Rows whose uniform draw is below PERCENT_TRAIN go to train, the rest to test.
    msk = split_rng.rand(len(df)) < PERCENT_TRAIN

    train = df[msk]
    test = df[~msk]

    print(train.shape)
    print(test.shape)

    train.to_parquet(train_part_path_pq)
    test.to_parquet(test_part_path_pq)

    train.to_pandas().to_csv(train_part_path_csv, header=False)
    # The test split goes to its own file; writing it to the train path would
    # overwrite the train CSV that was just produced.
    test.to_pandas().to_csv(test_part_path_csv, header=False)

    # Drop the references before the next month is loaded.
    del train
    del test
    del df

    # Advance to the next month, rolling over into the next year after December.
    month += 1
    if month > 12:
        month = 1
        year += 1
    

/datasets/test/raw/taxi/yellow_tripdata_2016-01.csv
(8277180, 17)
(2070522, 17)
/datasets/test/raw/taxi/yellow_tripdata_2016-02.csv
(8643079, 17)
(2162796, 17)
/datasets/test/raw/taxi/yellow_tripdata_2016-03.csv
