In [1]:
from time import perf_counter
import numpy as np
import pandas as pd
import random
from numba import jit
import string

from lightautoml.reader.gpu.seq_reader_gpu import DictToCudfSeqReader
from lightautoml.reader.gpu.seq_reader_gpu import DictToDaskCudfSeqReader
from lightautoml.reader.base import DictToPandasSeqReader
from lightautoml.tasks import Task

from lightautoml.transformers.seq import SeqNumCountsTransformer
from lightautoml.transformers.seq import SeqStatisticsTransformer
from lightautoml.transformers.seq import GetSeqTransformer
from lightautoml.transformers.gpu.seq_gpu import SeqNumCountsTransformer_gpu
from lightautoml.transformers.gpu.seq_gpu import SeqStatisticsTransformer_gpu
from lightautoml.transformers.gpu.seq_gpu import GetSeqTransformer_gpu

RANDS_CHARS = np.array(list(string.ascii_letters + string.digits),
                       dtype=(np.str_, 1))

def gen_cols(n_cols):
    """Return the default column names ``["col_0", ..., "col_<n_cols-1>"]``.

    Plain Python is used on purpose: the work is trivial string formatting,
    so JIT-compiling it only adds compilation latency on the first call.

    Args:
        n_cols: Number of column names to generate.

    Returns:
        A list of ``n_cols`` strings (empty when ``n_cols`` is 0).
    """
    return ["col_" + str(i) for i in range(n_cols)]

def gen_string_data(n, n_str):
    """Build an ``(n, n_str)`` object array of words sampled from a fixed vocabulary.

    Indices are drawn with ``np.random.randint`` from NumPy's global RNG, so
    the result is reproducible after ``np.random.seed(...)``.

    Args:
        n: Number of rows.
        n_str: Number of string columns.

    Returns:
        ``numpy.ndarray`` of dtype ``object`` and shape ``(n, n_str)`` whose
        elements are Python strings taken from the vocabulary.
    """
    string_db = np.array(
        ["algorithm", "analog", "app", "application", "array",
         "backup", "bandwidth", "binary", "bit", "byte"],
        dtype=object)
    inds = np.random.randint(0, len(string_db), (n, n_str))
    # Integer-array indexing looks up every word at once and returns a new
    # array with the shape of `inds` (replaces the former element-wise loop).
    return string_db[inds]

def gen_data_single_target(n: int, n_num: int, n_cat: int, n_date: int,
         n_str: int, max_n_cat: int, n_ids: int, max_ids: list = None,
         cols: list = None):
    """Generate a random mixed-type table with a single integer target.

    Columns are laid out in this order: ``n_num`` numeric, ``n_cat``
    categorical, ``n_str`` string, ``n_date`` date and ``n_ids`` id columns,
    followed by a final ``'TARGET'`` column. All randomness comes from
    NumPy's global RNG and the draws happen in that same column order.

    Args:
        n: Number of rows.
        n_num: Number of numeric columns (float32, values in [-50, 50)).
        n_cat: Number of categorical columns. They hold integer codes in
            ``[0, k)`` stored as float32, where a single ``k`` is drawn
            from ``[1, max_n_cat)`` and shared by all categorical columns.
        n_date: Number of date columns (2018-01-01 plus 0..999 days).
        n_str: Number of string columns (see ``gen_string_data``).
        max_n_cat: Exclusive upper bound for ``k``; it is passed to
            ``np.random.randint(1, max_n_cat)``, so it must be > 1.
        n_ids: Number of id columns.
        max_ids: If ``None``, every id column is an independent random
            permutation of ``0..n-1``. Otherwise a sequence of length
            ``n_ids`` and id column ``j`` holds integers in
            ``[0, max_ids[j])``.
        cols: Column names to use, in the layout order above. Defaults to
            ``gen_cols(n_num + n_cat + n_str + n_date + n_ids)``.

    Returns:
        Tuple ``('TARGET', cols, data)``: the target column name, the list
        of feature column names and the generated ``pandas.DataFrame``.
    """
    n_cols = n_num + n_cat + n_str + n_date + n_ids
    cols = gen_cols(n_cols) if cols is None else cols

    num_data = np.random.random((n, n_num)) * 100 - 50

    # One cardinality is drawn first, then all categorical codes.
    n_levels = np.random.randint(1, max_n_cat)
    category_data = np.random.randint(0, n_levels, (n, n_cat))

    string_data = gen_string_data(n, n_str)

    day_offsets = np.random.randint(0, 1000, (n, n_date))
    date_data = day_offsets.astype(np.dtype("timedelta64[D]")) \
                + np.datetime64("2018-01-01")

    if max_ids is None:
        # Each id column starts as 0..n-1 and is shuffled in place; the
        # rows of `id_data.T` are views onto the columns of `id_data`.
        id_data = np.tile(np.arange(n, dtype=int)[:, np.newaxis], (1, n_ids))
        for id_col in id_data.T:
            np.random.shuffle(id_col)
    else:
        # `max_ids` broadcasts across columns; `// 1` floors before the cast.
        id_data = np.array(np.random.random((n, n_ids)) * max_ids // 1,
                           dtype=int)

    data = pd.DataFrame(num_data, columns=cols[:n_num]).astype('f')

    # Append the remaining column groups, tracking the position in `cols`.
    offset = n_num
    for i in range(n_cat):
        data[cols[offset + i]] = pd.Series(category_data[:, i]).astype('f')
    offset += n_cat
    for i in range(n_str):
        data[cols[offset + i]] = pd.Series(string_data[:, i]).astype(object)
    offset += n_str
    for i in range(n_date):
        data[cols[offset + i]] = pd.Series(date_data[:, i])
    offset += n_date
    for i in range(n_ids):
        data[cols[offset + i]] = pd.Series(id_data[:, i])

    data['TARGET'] = pd.Series(np.random.randint(0, 5, n)).astype('i')

    return 'TARGET', cols, data

## Data preparation
# Seed NumPy's global RNG (the only randomness source used by the generators
# above) so every table below is reproducible after "Restart & Run All".
SEED = 42
np.random.seed(SEED)

# Sequence table 1: 5000 rows, unique shuffled ids in `data1_id`.
n, n_num, n_cat, n_date, n_str = 5000, 3, 2, 2, 1
max_n_cat, n_ids = 10, 1
cols_data1 = ["a","b","c","d","e","str1",
              "date1", "date2", "data1_id"]
_, _, data1 = gen_data_single_target(n, n_num, n_cat,
              n_date, n_str, max_n_cat, n_ids, cols=cols_data1)

# Sequence table 2: 35000 rows, unique shuffled ids in `data2_id`.
n, n_num, n_cat, n_date, n_str = 35000, 2, 2, 0, 0
max_n_cat, n_ids = 5, 1
cols_data2 = ["h","i","j","k", "data2_id"]
_, _, data2 = gen_data_single_target(n, n_num, n_cat,
              n_date, n_str, max_n_cat, n_ids, cols=cols_data2)

# Plain train table: its two id columns hold values in [0, 50) and [0, 100).
max_ids = [50, 100]
n, n_num, n_cat, n_date = 100000, 4, 6, 2
n_str, max_n_cat, n_ids = 2, 15, 2
target, cols, train = gen_data_single_target(n, n_num, n_cat,
                     n_date, n_str, max_n_cat, n_ids, max_ids)

# The id columns are the last `n_ids` generated names ('col_14', 'col_15'
# for the sizes above); derive them so the scheme follows the sizes.
train_id_cols = cols[-n_ids:]

# Plain test table with the same layout as `train`.
n = 200
_, _, test = gen_data_single_target(n, n_num, n_cat,
                     n_date, n_str, max_n_cat, n_ids, max_ids)

# Each sequence table is linked to the plain table by an id column pair.
seq_params = {
         'data1':{'case': 'ids',
                  'params': {},
                  'scheme': {'to': 'plain',
                             'from_id': 'data1_id',
                             'to_id': train_id_cols[0]},
                 },
         'data2':{'case': 'ids',
                  'params': {},
                  'scheme': {'to': 'plain',
                             'from_id': 'data2_id',
                             'to_id': train_id_cols[1]},
                      },
          }
seq_data = {'data1': data1[cols_data1],
            'data2': data2[cols_data2]
           }
X_train = {'plain':train ,
           'seq': seq_data
          }
X_test = {'plain':test ,
           'seq': seq_data
          }

# Sequence table used by the cells below.
# NOTE: selecting 'data1' here raises an error, but only on the GPU path.
name = 'data2'
## Data preparation finished

task = Task('reg', metric='mae')
task_gpu = Task('reg', metric='mae', device='gpu')
task_mgpu = Task('reg', metric='mae', device='mgpu')
roles={'target': target}

reader = DictToPandasSeqReader(task=task, seq_params=seq_params)

In [2]:
# Fit the pandas (CPU) sequence reader on the training dict and keep its result.
res = reader.fit_read(X_train, roles=roles)

Feats was rejected during automatic roles guess: ['col_0', 'col_1', 'col_14', 'col_15', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7', 'col_8', 'col_9', 'col_10', 'col_11']


  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))


In [3]:
from lightautoml.reader.gpu.seq_gpu import IDSInd_gpu, TopInd_gpu
import cudf

In [4]:
# Sequence table to inspect (same value as the `name` set in the first cell).
name = 'data2'

# Instantiate IDSInd_gpu with the scheme and extra params that the fitted
# CPU reader holds for this sequence table.
ids_gpu = IDSInd_gpu(
    scheme = reader.seq_params[name].get("scheme", None),
    **reader.seq_params[name]["params"])

In [5]:
# Take the data of the selected sequence table from the CPU reader result ...
cpu_data = res.seq_data[name].data
# ... and copy it, plus the plain train frame, into cudf DataFrames
# (nan_as_null=False is passed to both conversions).
cudf_data = cudf.DataFrame.from_pandas(cpu_data, nan_as_null=False)
cudf_train = cudf.DataFrame.from_pandas(train, nan_as_null=False)

In [6]:
%%time

# CPU variant: convert to pandas first, then build {from_id value: [row positions]}.
ids = cudf_data.to_pandas().reset_index().groupby(ids_gpu.scheme['from_id'])['index'].apply(list).to_dict()

CPU times: user 456 ms, sys: 11.2 ms, total: 467 ms
Wall time: 465 ms


In [7]:
%%time

# GPU variant of the grouping above: group in cudf with the 'collect'
# aggregation and convert only the aggregated result to pandas / dict.
# NOTE(review): 'collect' may be deprecated in newer cudf releases - confirm.
r = cudf_data.reset_index().groupby(ids_gpu.scheme['from_id'])['index'].agg('collect').to_pandas().to_dict()

CPU times: user 27.8 ms, sys: 0 ns, total: 27.8 ms
Wall time: 25.4 ms


In [8]:
#ids

In [9]:
#r

In [10]:
%%time

# Move the `to_id` column to pandas first, then map it through the `ids` dict on the CPU.
res1 = cudf_train[ids_gpu.scheme["to_id"]].to_pandas().map(ids).values


CPU times: user 21 ms, sys: 448 µs, total: 21.4 ms
Wall time: 20.2 ms


In [11]:
%%time

# Map the `to_id` column through the `ids` dict on the cudf Series, then convert to pandas.
res2 = cudf_train[ids_gpu.scheme["to_id"]].map(ids).to_pandas().values

CPU times: user 35.7 ms, sys: 0 ns, total: 35.7 ms
Wall time: 34.3 ms


In [12]:
# Show the pandas-mapped result; in the recorded output its elements are Python lists.
res1

array([list([9540]), list([1689]), list([33804]), ..., list([7121]),
       list([27653]), list([12244])], dtype=object)

In [13]:
# Show the cudf-mapped result; in the recorded output its elements are arrays.
res2

array([array([9540]), array([1689]), array([33804]), ..., array([7121]),
       array([27653]), array([12244])], dtype=object)

In [14]:
# NOTE(review): `task`, `task_gpu`, `task_mgpu`, `roles`, `reader` and `res`
# are re-created here with the same expressions as in the earlier cells;
# consider dropping the duplicates once it is confirmed nothing relies on
# the re-fit.
task = Task('reg', metric='mae')
task_gpu = Task('reg', metric='mae', device='gpu')
task_mgpu = Task('reg', metric='mae', device='mgpu')
roles={'target': target}

# Read the same training dict with the pandas, cudf and dask-cudf readers.
reader = DictToPandasSeqReader(task=task, seq_params=seq_params)    
res = reader.fit_read(X_train, roles=roles)
reader_gpu = DictToCudfSeqReader(task=task_gpu,
                                seq_params=seq_params, n_jobs=1)
res_gpu = reader_gpu.fit_read(X_train, roles=roles)
reader_mgpu = DictToDaskCudfSeqReader(task=task_mgpu, cv=3,
               n_jobs = 1, npartitions=2, seq_params=seq_params)
res_mgpu = reader_mgpu.fit_read(X_train, roles=roles)

# CPU / GPU transformer pairs compared in the timing cells below.
counts = SeqNumCountsTransformer()
counts_gpu = SeqNumCountsTransformer_gpu()

stats = SeqStatisticsTransformer()
stats_gpu = SeqStatisticsTransformer_gpu()

# The Get-seq transformers are bound to the sequence table selected by `name`.
seq = GetSeqTransformer(name=name)
seq_gpu = GetSeqTransformer_gpu(name=name)

  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))


Feats was rejected during automatic roles guess: ['col_0', 'col_1', 'col_14', 'col_15', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7', 'col_8', 'col_9', 'col_10', 'col_11']




In [15]:
%%time

# CPU baseline: fit SeqNumCountsTransformer on the selected sequence dataset
# and transform the same data (timed by the cell's %%time magic).
counts.fit(res.seq_data[name])
out_counts = counts.transform(res.seq_data[name])

CPU times: user 161 ms, sys: 215 µs, total: 162 ms
Wall time: 160 ms


In [16]:
%%time

# Same counts transform with the GPU transformer on the cudf reader output.
counts_gpu.fit(res_gpu.seq_data[name])
out_counts_gpu = counts_gpu.transform(res_gpu.seq_data[name])


CPU times: user 18 ms, sys: 313 µs, total: 18.3 ms
Wall time: 16.6 ms


In [17]:
%%time

# Re-fit the same `counts_gpu` instance on the dask-cudf reader output
# (this replaces whatever the previous fit on `res_gpu` stored in it).
counts_gpu.fit(res_mgpu.seq_data[name])
out_counts_mgpu = counts_gpu.transform(res_mgpu.seq_data[name])


CPU times: user 813 ms, sys: 27.5 ms, total: 840 ms
Wall time: 838 ms


In [18]:
%%time

# CPU baseline: fit SeqStatisticsTransformer on the selected sequence dataset
# and transform the same data.
stats.fit(res.seq_data[name])
out_stats = stats.transform(res.seq_data[name])

CPU times: user 210 ms, sys: 296 µs, total: 210 ms
Wall time: 209 ms


In [19]:
%%time

# Same statistics transform with the GPU transformer on the cudf reader output.
# FIXME: the recorded run of this cell failed with `KeyError: 0`, so
# `out_stats_gpu` is not defined after a Restart & Run All.
stats_gpu.fit(res_gpu.seq_data[name])
out_stats_gpu = stats_gpu.transform(res_gpu.seq_data[name])

KeyError: 0

In [20]:
%%time

# Re-use the same `stats_gpu` instance on the dask-cudf reader output.
stats_gpu.fit(res_mgpu.seq_data[name])
out_stats_mgpu = stats_gpu.transform(res_mgpu.seq_data[name])



CPU times: user 723 ms, sys: 15.7 ms, total: 738 ms
Wall time: 732 ms


In [21]:
%%time

# CPU baseline: GetSeqTransformer takes the whole reader result (not a single
# sequence dataset); the table was chosen via `name` at construction.
seq.fit(res)
out_seq = seq.transform(res)

CPU times: user 4.9 ms, sys: 126 µs, total: 5.02 ms
Wall time: 4.25 ms


In [22]:
%%time

# Same Get-seq transform with the GPU transformer on the cudf reader result.
seq_gpu.fit(res_gpu)
out_seq_gpu = seq_gpu.transform(res_gpu)


CPU times: user 439 µs, sys: 0 ns, total: 439 µs
Wall time: 444 µs


In [23]:
%%time

# Re-use the same `seq_gpu` instance on the dask-cudf reader result.
seq_gpu.fit(res_mgpu)
out_seq_mgpu = seq_gpu.transform(res_mgpu)

CPU times: user 445 ms, sys: 20 ms, total: 465 ms
Wall time: 461 ms


