# Questions/Thoughts

In [1]:
import pandas as pd
import numpy as np
import queue
import cProfile
import multiprocessing as mp
from tools import table_io

In [2]:
NUM_THREADS = 4

In [3]:
def join_on_column(A, B, a_column, b_column):
    """Merges 2 tables where a_column and b_column are equal.
    
    Assumes that a_column and b_column contain unique keys.
    This is *much* faster than filtered_cartesian product.
    However, it can be replaced with a map/reduce:
        1. map over rows of A
        2. select row with matching key in B
        3. return concatenation of row from A with row from B
        4. reduce list of concatenated rows by converting to dataframe
    
    Params:
        - A: a table (pandas dataframe object)
        - B: a table (pandas dataframe object)
        - a_column: string name of column of keys in A
        - b_column: string name of column of keys in B
    
    Returns:
        pandas dataframe
        
    Note:
        - The returned dataframe does not contain a column with the label
            b_column.
    """
    B_renamed  = B.rename(columns={b_column: a_column}, inplace=False)
    return pd.merge(A, B_renamed, on=a_column)

In [4]:
def filter_chunked_table(table_name, filtered_name, num_chunks, filter_fun):
    """Filters table in chunks.
    
    Args:
        filter_fun: takes a table chunk and returns a filtered table chunk
    """
    for i in range(num_chunks):
        chunk = table_io.read_table_chunk(table_name, i+1, num_chunks)
        chunk_filtered = filter_fun(chunk)
        table_io.write_table_chunk(filtered_name, chunk_filtered, i+1, num_chunks)

In [5]:
def assign_bucket_hash(item, total_buckets):
    return hash(item) % total_buckets

def assign_bucket_range(item, total_buckets):
    """Assigns item to a bucket associated to a particular range.
    Note: bucket ranges must be consistent across chunks.
    """
    raise NotImplementedError

In [6]:
# Modified this for multithreading
def split_chunk_into_buckets(i, table_name, column, total_chunks, assign_bucket_fn, total_buckets):
    """Splits a chunk into buckets and writes them
    """
    chunk = table_io.read_table_chunk(table_name, i, total_chunks)
    buckets_list = [list() for _ in range(total_buckets)]
    
    for idx, row in chunk.iterrows():
        bucket = assign_bucket_fn(row[column], total_buckets)
        buckets_list[bucket].append(row)
    buckets = [pd.DataFrame(rows, columns=chunk.columns) for rows in buckets_list]
        
    table_io.write_buckets(buckets, table_name, i, total_chunks, column)
    return True

def reduce_buckets_to_chunk(bucket_id, table_name, column, total_chunks, total_buckets, new_table_name):
    bucket_list = []
    for chunk_id in range(1, total_chunks + 1):
        bucket = table_io.read_bucket(table_name, chunk_id, total_chunks, column, bucket_id, total_buckets)
        bucket_list.append(bucket)
    chunk = pd.concat(bucket_list, ignore_index=True)
    table_io.write_table_chunk(new_table_name, chunk, bucket_id, total_buckets)

In [7]:
def split_table_into_buckets_threaded(table_name, column, total_chunks, assign_bucket_fn,
                                      total_buckets, num_threads=NUM_THREADS):
    # Is this the proper way, or is it better to modify split_chunk_into_buckets?
    args = [(i + 1, table_name, column, total_chunks, assign_bucket_fn, total_buckets)
            for i in range(total_buckets)]
    with mp.Pool(num_threads) as p:
        return all(p.starmap(split_chunk_into_buckets, args))
        
def merge_buckets_threaded(table_name, column, total_chunks, total_buckets,
                           new_table_name="", num_threads=NUM_THREADS):
    if not new_table_name:
        new_table_name = table_name
    args = [(i + 1, table_name, column, total_chunks, total_buckets, new_table_name)
           for i in range(total_buckets)]
    with mp.Pool(num_threads) as p:
        return all(p.starmap(reduce_buckets_to_chunk, args))

In [13]:
def join_buckets_on_column(a_name, a_column, b_name, b_column, bucket_id, total_buckets, out_name):
    a_bucket = table_io.read_table_chunk(a_name, bucket_id, total_buckets)
    b_bucket = table_io.read_table_chunk(b_name, bucket_id, total_buckets)
    joined_bucket = join_on_column(a_bucket, b_bucket, a_column, b_column)
    table_io.write_table_chunk(out_name, joined_bucket, bucket_id, total_buckets)
    return True

In [14]:
def join_on_column_threaded(a_name, a_column, a_total_chunks,
                               b_name, b_column, b_total_chunks,
                               out_name, total_buckets, num_threads=NUM_THREADS):
    # Split into buckets by hash
    split_table_into_buckets_threaded(a_name, a_column, a_total_chunks,
                                      assign_bucket_hash, total_buckets, num_threads)
    split_table_into_buckets_threaded(b_name, b_column, b_total_chunks,
                                      assign_bucket_hash, total_buckets, num_threads)
    # Merge buckets
    a_temp_name = a_name + "_hash_grouped"
    b_temp_name = b_name + "_hash_grouped"
    merge_buckets_threaded(a_name, a_column, a_total_chunks, total_buckets, a_temp_name, num_threads)
    merge_buckets_threaded(b_name, b_column, b_total_chunks, total_buckets, b_temp_name, num_threads)
    args = [(a_temp_name, a_column, b_temp_name, b_column, i + 1, total_buckets, out_name)
            for i in range(total_buckets)]
    with mp.Pool(num_threads) as p:
        return all(p.starmap(join_buckets_on_column, args))

# Try using varying numbers of threads with tables split into 20 chunks

In [15]:
# Filter date_dim
NUM_CHUNKS = 20
NUM_BUCKETS = 20
filter_chunked_table("date_dim", "date_dim_filtered", NUM_CHUNKS, lambda tbl: tbl[tbl["d_year"] == 2000])

In [22]:
exec_template = 'join_on_column_threaded("date_dim_filtered", "d_date_sk", {num_chunks}, ' + \
                          '"store_returns", "sr_returned_date_sk", {num_chunks},' + \
                          '"ctr_joined", {num_buckets}, {num_threads})'

In [23]:
# Using 1 thread
cProfile.run(exec_template.format(num_chunks=NUM_CHUNKS, num_buckets=NUM_BUCKETS, num_threads=1))

         3636 function calls in 48.340 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       40    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:416(parent)
       40    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:997(_handle_fromlist)
        1    0.001    0.001   48.339   48.339 <ipython-input-14-5bd60ebc15df>:1(join_on_column_threaded)
        1    0.000    0.000    0.000    0.000 <ipython-input-14-5bd60ebc15df>:14(<listcomp>)
        2    0.000    0.000   36.488   18.244 <ipython-input-7-81de4cfdaee6>:1(split_table_into_buckets_threaded)
        2    0.000    0.000    0.000    0.000 <ipython-input-7-81de4cfdaee6>:13(<listcomp>)
        2    0.000    0.000    0.000    0.000 <ipython-input-7-81de4cfdaee6>:4(<listcomp>)
        2    0.000    0.000   10.839    5.419 <ipython-input-7-81de4cfdaee6>:9(merge_buckets_threaded)
        1    0.000    0.000   48.340   48.340 <string>:1(<module>)

In [17]:
# Using 2 threads
cProfile.run(exec_template.format(num_chunks=NUM_CHUNKS, num_buckets=NUM_BUCKETS, num_threads=2))

         4267 function calls in 27.244 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       45    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:416(parent)
       45    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:997(_handle_fromlist)
        1    0.001    0.001   27.244   27.244 <ipython-input-14-5bd60ebc15df>:1(join_on_column_threaded)
        1    0.000    0.000    0.000    0.000 <ipython-input-14-5bd60ebc15df>:14(<listcomp>)
        2    0.000    0.000   20.382   10.191 <ipython-input-7-81de4cfdaee6>:1(split_table_into_buckets_threaded)
        2    0.000    0.000    0.000    0.000 <ipython-input-7-81de4cfdaee6>:13(<listcomp>)
        2    0.000    0.000    0.000    0.000 <ipython-input-7-81de4cfdaee6>:4(<listcomp>)
        2    0.000    0.000    6.245    3.123 <ipython-input-7-81de4cfdaee6>:9(merge_buckets_threaded)
        1    0.000    0.000   27.244   27.244 <string>:1(<module>)

In [18]:
# Using 4 threads
cProfile.run(exec_template.format(num_chunks=NUM_CHUNKS, num_buckets=NUM_BUCKETS, num_threads=4))

         5641 function calls in 15.538 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       55    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:416(parent)
       55    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:997(_handle_fromlist)
        1    0.001    0.001   15.538   15.538 <ipython-input-14-5bd60ebc15df>:1(join_on_column_threaded)
        1    0.000    0.000    0.000    0.000 <ipython-input-14-5bd60ebc15df>:14(<listcomp>)
        2    0.000    0.000   11.565    5.783 <ipython-input-7-81de4cfdaee6>:1(split_table_into_buckets_threaded)
        2    0.000    0.000    0.000    0.000 <ipython-input-7-81de4cfdaee6>:13(<listcomp>)
        2    0.000    0.000    0.000    0.000 <ipython-input-7-81de4cfdaee6>:4(<listcomp>)
        2    0.000    0.000    3.551    1.776 <ipython-input-7-81de4cfdaee6>:9(merge_buckets_threaded)
        1    0.000    0.000   15.538   15.538 <string>:1(<module>)

In [19]:
# Using 6 threads
cProfile.run(exec_template.format(num_chunks=NUM_CHUNKS, num_buckets=NUM_BUCKETS, num_threads=6))

         7110 function calls in 10.764 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       65    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:416(parent)
       65    0.000    0.000    0.001    0.000 <frozen importlib._bootstrap>:997(_handle_fromlist)
        1    0.001    0.001   10.763   10.763 <ipython-input-14-5bd60ebc15df>:1(join_on_column_threaded)
        1    0.000    0.000    0.000    0.000 <ipython-input-14-5bd60ebc15df>:14(<listcomp>)
        2    0.000    0.000    7.979    3.989 <ipython-input-7-81de4cfdaee6>:1(split_table_into_buckets_threaded)
        2    0.000    0.000    0.000    0.000 <ipython-input-7-81de4cfdaee6>:13(<listcomp>)
        2    0.000    0.000    0.000    0.000 <ipython-input-7-81de4cfdaee6>:4(<listcomp>)
        2    0.000    0.000    2.456    1.228 <ipython-input-7-81de4cfdaee6>:9(merge_buckets_threaded)
        1    0.000    0.000   10.764   10.764 <string>:1(<module>)

In [21]:
# Using 8 threads
cProfile.run(exec_template.format(num_chunks=NUM_CHUNKS, num_buckets=NUM_BUCKETS, num_threads=8))

         8461 function calls in 9.290 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       75    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:416(parent)
       75    0.000    0.000    0.001    0.000 <frozen importlib._bootstrap>:997(_handle_fromlist)
        1    0.001    0.001    9.290    9.290 <ipython-input-14-5bd60ebc15df>:1(join_on_column_threaded)
        1    0.000    0.000    0.000    0.000 <ipython-input-14-5bd60ebc15df>:14(<listcomp>)
        2    0.000    0.000    6.688    3.344 <ipython-input-7-81de4cfdaee6>:1(split_table_into_buckets_threaded)
        2    0.000    0.000    0.000    0.000 <ipython-input-7-81de4cfdaee6>:13(<listcomp>)
        2    0.000    0.000    0.000    0.000 <ipython-input-7-81de4cfdaee6>:4(<listcomp>)
        2    0.000    0.000    2.265    1.133 <ipython-input-7-81de4cfdaee6>:9(merge_buckets_threaded)
        1    0.000    0.000    9.290    9.290 <string>:1(<module>)
