In [1]:
from pathlib import Path

import janitor as jn
import numpy as np
import pandas as pd
from numba import njit, prange

In [2]:
from subprocess import Popen
from contextlib import contextmanager
from os import getpid
from time import sleep
from signal import SIGINT

@contextmanager
def perf_stat():
    """Run ``perf stat`` against the current process for the duration of a ``with`` block.

    A ``perf stat -p <pid>`` child process is started for this interpreter's
    PID, the body of the ``with`` statement runs, and the child is then sent
    ``SIGINT`` and waited on.

    The signal/wait pair lives in a ``finally`` clause so the child is always
    stopped and reaped, even when the profiled body raises.

    Raises
    ------
    FileNotFoundError
        Propagated from ``Popen`` when no ``perf`` executable is on ``PATH``.
    """
    p = Popen(["perf", "stat", "-p", str(getpid())])
    try:
        # Fixed pause before the body runs -- presumably to let perf attach
        # to the process first.
        sleep(0.5)
        yield
    finally:
        # Always stop the profiler, then wait so the child is reaped and has
        # exited before control returns to the caller.
        p.send_signal(SIGINT)
        p.wait()

In [3]:
@njit(parallel=True)
def _get_indices_dual_non_monotonic_non_equi(
    left_region: np.ndarray,
    right_region: np.ndarray,
    left_index: np.ndarray,
    right_index: np.ndarray,
    starts: np.ndarray,
    counts: np.ndarray,
):
    """
    Retrieves the matching indices
    for the left and right regions.
    Strictly for non-equi joins,
    where only two join conditions are present.

    For each position ``num``, ``left_region[num]`` is compared against the
    ``counts[num]`` consecutive entries of ``right_region`` that begin at
    offset ``starts[num]``. Every entry with
    ``left_region[num] <= right_region[pos]`` produces one output pair
    ``(left_index[num], right_index[pos])``.

    Parameters
    ----------
    left_region : values compared on the left side, one per left position.
    right_region : values compared on the right side.
    left_index : value written to the left output for each match of
        position ``num``.
    right_index : values written to the right output; indexed with the same
        positions as ``right_region``.
    starts : per left position, the offset in ``right_region`` /
        ``right_index`` where scanning begins.
    counts : per left position, how many consecutive right entries to scan.
        NOTE(review): the first loop runs over ``counts.size`` and the second
        over ``starts.size``, so the two are presumably the same length.

    Returns
    -------
    ``(l_index, r_index)`` : two ``np.intp`` arrays of equal length (the total
    number of matches). Matches of one left position are stored contiguously,
    in scan order of the right entries.
    """
    # Two passes:
    # first pass counts the matches (per left position and in total),
    # second pass fills the output arrays with the actual index values.
    count_indices = np.empty(counts.size, dtype=np.intp)
    total_length = 0
    for num in prange(counts.size):
        l_region = left_region[num]
        size = counts[num]
        start = starts[num]
        counter = 0
        for n in range(size):
            r_region = right_region[start + n]
            # boolean: does this right entry match?
            out = (l_region <= r_region)
            # `+=` on a scalar inside prange: accumulated across iterations
            # (numba infers this as a reduction).
            total_length += out
            counter += out
        count_indices[num] = counter
    # Exclusive running sum of the per-position match counts: the output
    # offset at which each left position starts writing its matches.
    start_indices = np.zeros(starts.size, dtype=np.intp)
    start_indices[1:] = np.cumsum(count_indices)[:-1]
    l_index = np.empty(total_length, dtype=np.intp)
    r_index = np.empty(total_length, np.intp)
    for num in prange(starts.size):
        indexer = start_indices[num]
        size = counts[num]
        l_ind = left_index[num]
        r_indexer = starts[num]
        l_region = left_region[num]
        width = count_indices[num]
        # If width == size, every scanned right entry matched,
        # so no comparison is needed within the iteration.
        if width == size:
            for n in range(size):
                l_index[indexer+n] = l_ind
                r_index[indexer+n] = right_index[r_indexer + n]
        else:
            for n in range(size):
                # All matches for this position are already written.
                if not width:
                    break
                pos_right = r_indexer + n
                r_region = right_region[pos_right]
                # Not a match: skip without advancing the output cursor.
                if l_region > r_region:
                    continue
                l_index[indexer] = l_ind
                r_index[indexer] = right_index[pos_right]
                indexer += 1
                width -= 1
    return l_index, r_index

In [28]:
# Profile the numba kernel on the small C/D example inputs.
# The kernel takes six separate arrays (left_region, right_region,
# left_index, right_index, starts, counts); they are passed positionally
# rather than wrapped together in a single tuple.
# Requires the `perf` executable to be available on PATH.
with perf_stat():
    _get_indices_dual_non_monotonic_non_equi(
        np.array([1, 3, 1, 3, 2, 2]),
        np.array([1, 3, 3, 2, 2, 1, 3]),
        np.array([2, 0, 1, 3, 4, 6]),
        np.array([3, 4, 6, 0, 5, 2, 1]),
        np.array([0, 1, 1, 5, 5, 6]),
        np.array([7, 6, 6, 2, 2, 1]),
    )

FileNotFoundError: [Errno 2] No such file or directory: 'perf'

In [4]:
# Load the events table from the current user's Downloads folder.
# The location is built from Path.home() so the cell does not depend on one
# machine's absolute, user-specific path.
events = pd.read_csv(
    Path.home() / 'Downloads' / 'results.csv',
    parse_dates=['start', 'end'],
).iloc[:, 1:]  # drop the first column of the file

In [5]:
# Check the column dtypes, e.g. that 'start' and 'end' were parsed as datetimes.
events.dtypes

id                   int64
name                object
audience             int64
start       datetime64[ns]
sponsor             object
end         datetime64[ns]
dtype: object

In [6]:
# Self-join of `events` on interval overlap, using the numba code path:
#   left.start <= right.end  and  left.end >= right.start
# Only id/start/end are requested from each side.
# Extra conditions that were tried and are disabled here:
#   ('id', 'id', '!='), ('audience', 'audience', '>')
a = events.conditional_join(
    events,
    ('start', 'end', '<='),
    ('end', 'start', '>='),
    use_numba=True,
    df_columns=['id', 'start', 'end'],
    right_columns=['id', 'start', 'end'],
)

a

(array([28283, 28282, 28281, ...,     3,     2,     1]),
 array([28283, 28282, 28281, ...,     3,     2,     1]),
 array([25296, 10726, 24103, ...,  3544, 15884, 13367]),
 array([25296, 10726, 24103, ...,  3544, 15884, 13367]),
 array([    0,     1,     2, ..., 29996, 29997, 29998]),
 array([1, 1, 1, ..., 1, 1, 1]))

In [7]:
# %%timeit
# (events
# .conditional_join(
#     events,
#     ('start', 'end', '<='),
#     ('end', 'start', '>='),
#     # ('id', 'id', '!='),
#     # ('audience','audience','>'),
#     use_numba = True,
#     df_columns = ['id', 'start', 'end'],
#     right_columns = ['id', 'start', 'end'])
# )

In [8]:
# %%timeit
# (events
# .conditional_join(
#     events,
#     ('start', 'end', '<='),
#     ('end', 'start', '>='),
#     # ('id', 'id', '!='),
#     # ('audience','audience','>'),
#     use_numba = False,
#     df_columns = ['id', 'start', 'end'],
#     right_columns = ['id', 'start', 'end'])
# )

In [9]:
# Same overlap self-join without numba, additionally requiring that the two
# rows have different ids:
#   left.start <= right.end, left.end >= right.start, left.id != right.id
# Disabled extra condition: ('audience', 'audience', '>')
events.conditional_join(
    events,
    ('start', 'end', '<='),
    ('end', 'start', '>='),
    ('id', 'id', '!='),
    use_numba=False,
    df_columns=['id', 'start', 'end'],
    right_columns=['id', 'start', 'end'],
)

(array([    0,     1,     2, ..., 29996, 29997, 29998]),
 array([    0,     1,     2, ..., 29996, 29997, 29998]))

In [10]:
# # %%timeit
# (events
# .conditional_join(
#     events,
#     ('start', 'end', '>='),
#     ('end', 'start', '<='),
#     ('id', 'id', '!='),
#     # ('audience','audience','>'),
#     use_numba = True,
#     df_columns = ['id', 'start', 'end'],
#     right_columns = ['id', 'start', 'end'])
# )

In [11]:
# Left-hand toy frame: seven rows keyed c1..c7, with keyy counting up 1..7.
C = pd.DataFrame(
    {
        'key': [f"c{i}" for i in range(1, 8)],
        'vol': [35, 15, 5, 35, 18, 90, 17],
        'profit': [45, 35, 55, 12, 15, 55, 11],
        'unitsSold': [15, 10, 30, 10, 15, 80, 2],
        'keyy': range(1, 8),
    }
)

# Right-hand toy frame: eight rows keyed d1..d8, with keyy counting down 8..1.
D = pd.DataFrame(
    {
        'key': [f'd{i}' for i in range(1, 9)],
        'vol': [20, 50, 15, 16, 40, 20, 40, 2],
        'profit': [30, 10, 12, 52, 35, 20, 30, 57],
        'unitsSold': [20, 35, 10, 12, 40, 30, 5, 15],
        'keyy': range(8, 0, -1),
    }
)



In [12]:
# C = pd.concat([C]*40*40)
# D = pd.concat([D]*40*40)

In [13]:
# Two-condition non-equi join of C against D via the numba code path:
#   C.profit >= D.profit  and  C.vol <= D.vol
# Conditions that were tried and are disabled here:
#   ('keyy', 'keyy', '=='), ('unitsSold', 'unitsSold', '>='), ('vol', 'profit', '>')
C.conditional_join(
    D,
    ('profit', 'profit', '>='),
    ('vol', 'vol', '<='),
    use_numba=True,
)

(array([1, 3, 1, 3, 2, 2]),
 array([1, 3, 3, 2, 2, 1, 3]),
 array([2, 0, 1, 3, 4, 6]),
 array([3, 4, 6, 0, 5, 2, 1]),
 array([0, 1, 1, 5, 5, 6]),
 array([7, 6, 6, 2, 2, 1]))

In [14]:
# The same two-condition non-equi join of C against D, without numba:
#   C.profit >= D.profit  and  C.vol <= D.vol
# Conditions that were tried and are disabled here:
#   ('keyy', 'keyy', '=='), ('unitsSold', 'unitsSold', '>='), ('vol', 'profit', '>')
C.conditional_join(
    D,
    ('profit', 'profit', '>='),
    ('vol', 'vol', '<='),
    use_numba=False,
)

(array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 4, 6]),
 array([1, 6, 4, 1, 2, 5, 0, 6, 4, 1, 2, 5, 0, 6, 4, 3, 1, 1, 1]))

In [15]:
# %%timeit
# (C
# .conditional_join(
#     D, 
    
#     ('profit','profit','>='),
#     # ('keyy','keyy','=='),
#   # ('unitsSold','unitsSold','>='),
#   ('vol','vol','<='),
#   # ('vol','profit','>'),
#     use_numba=True)
# )

In [16]:
# %%timeit
# (C
# .conditional_join(
#     D, 
    
#     ('profit','profit','>='),
#     # ('keyy','keyy','=='),
#   # ('unitsSold','unitsSold','>='),
#   ('vol','vol','<='),
#   # ('vol','profit','>'),
#     use_numba=False)
# )

In [17]:
# Display the left-hand frame C to check the join results by eye.
C

Unnamed: 0,key,vol,profit,unitsSold,keyy
0,c1,35,45,15,1
1,c2,15,35,10,2
2,c3,5,55,30,3
3,c4,35,12,10,4
4,c5,18,15,15,5
5,c6,90,55,80,6
6,c7,17,11,2,7


In [18]:
# Display D ordered by ascending 'profit' (returns a sorted copy; D itself is unchanged).
D.sort_values('profit')

Unnamed: 0,key,vol,profit,unitsSold,keyy
1,d2,50,10,35,7
2,d3,15,12,10,6
5,d6,20,20,30,3
0,d1,20,30,20,8
6,d7,40,30,5,2
4,d5,40,35,40,4
3,d4,16,52,12,5
7,d8,2,57,15,1


In [19]:
# Display D ordered by ascending 'vol' (returns a sorted copy; D itself is unchanged).
D.sort_values('vol')

Unnamed: 0,key,vol,profit,unitsSold,keyy
7,d8,2,57,15,1
2,d3,15,12,10,6
3,d4,16,52,12,5
0,d1,20,30,20,8
5,d6,20,20,30,3
4,d5,40,35,40,4
6,d7,40,30,5,2
1,d2,50,10,35,7


In [20]:
# %%timeit
# (C
# .conditional_join(
#     D, 
    
#     ('profit','profit','>='),
#     # ('keyy','keyy','=='),
#   # ('unitsSold','unitsSold','>='),
#   ('vol','vol','<='),
#   # ('vol','profit','>'),
#     use_numba=False)
# )

In [21]:
# Display the left-hand frame C again (repeat of an earlier display cell).
C

Unnamed: 0,key,vol,profit,unitsSold,keyy
0,c1,35,45,15,1
1,c2,15,35,10,2
2,c3,5,55,30,3
3,c4,35,12,10,4
4,c5,18,15,15,5
5,c6,90,55,80,6
6,c7,17,11,2,7


In [22]:
# Display D ordered by ascending 'profit' again (repeat of an earlier display cell).
D.sort_values('profit')

Unnamed: 0,key,vol,profit,unitsSold,keyy
1,d2,50,10,35,7
2,d3,15,12,10,6
5,d6,20,20,30,3
0,d1,20,30,20,8
6,d7,40,30,5,2
4,d5,40,35,40,4
3,d4,16,52,12,5
7,d8,2,57,15,1


In [23]:
# f=(C
# .conditional_join(
#     D, 
    
#     ('profit','profit','>='),
#     # ('keyy','keyy','=='),
#    ('unitsSold','unitsSold','>='),
#  # ('vol','vol','<='),
#   # ('vol','profit','>'),
#     use_numba=True)
# )
# f

In [24]:
from string import ascii_lowercase

# Seed the legacy global NumPy RNG so the sample frames are reproducible.
np.random.seed(1)
n = 20
k = 20
# n = 20_000_000; k = 20_000

# Lookup from each lowercase letter to its 0-based position in the alphabet.
mapp = {letter: position for position, letter in enumerate(ascii_lowercase)}

# Two independent random draws; their element-wise min/max become the
# interval bounds, which guarantees start <= end.
idx1 = np.random.randint(0, high=200, size=n)
idx2 = np.random.randint(0, high=300, size=n)

# d1: n intervals labelled with one of the first five letters.
d1 = pd.DataFrame(
    {
        'x': np.random.choice(list(ascii_lowercase[:5]), size=n),
        'start': np.minimum(idx1, idx2),
        'end': np.maximum(idx1, idx2),
    }
).assign(xx=lambda frame: frame['x'].map(mapp))

# d2: k points (pos1 in 60..150) labelled with one of the first fifteen letters.
d2 = pd.DataFrame(
    {
        'x': np.random.choice(list(ascii_lowercase[:15]), size=k),
        'pos1': np.random.randint(low=60, high=151, size=k),
    }
).assign(xx=lambda frame: frame['x'].map(mapp))

In [25]:
# Join each point in d2 to the d1 intervals that strictly contain it,
# via the numba code path:
#   d2.pos1 > d1.start  and  d2.pos1 < d1.end
# Disabled extra condition: ('xx', 'xx', '==')
bla = d2.conditional_join(
    d1,
    ('pos1', 'start', '>'),
    ('pos1', 'end', '<'),
    use_numba=True,
)

bla

(array([6, 5, 5, 5, 5, 5, 5, 5, 2, 4, 4, 1, 3, 1, 1, 1, 1, 1, 1, 1]),
 array([6, 6, 6, 6, 2, 4, 5, 6, 5, 6, 5, 6, 6, 6, 5, 1, 6, 6, 3]),
 array([ 8, 17,  2, 15,  5, 13, 11, 18,  0, 16,  9,  7,  1, 19, 12,  6, 14,
         4,  3, 10]),
 array([19, 12,  3, 10, 14,  4,  5,  2, 16,  9,  1, 17, 15, 11,  7, 13,  6,
        18,  8]),
 array([ 0,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  6, 11, 11,
        11, 11, 11]),
 array([18, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 15, 15, 15, 13,  8,  8,
         8,  8,  8]))

In [26]:
# Small fixed 19-element array used by the partition / filter experiments
# (same values as the second array shown in the `bla` output).
x = np.array(
    [6, 6, 6, 6, 2, 4, 5, 6, 5, 6, 5, 6, 6, 6, 5, 1, 6, 6, 3]
)
x

array([6, 6, 6, 6, 2, 4, 5, 6, 5, 6, 5, 6, 6, 6, 5, 1, 6, 6, 3])

In [27]:
# Partition bla[1] around position 630 (= 248842 - 248212) and keep the tail
# from that position onwards.
# NOTE(review): np.partition raises ValueError when bla[1] has fewer than 631
# elements, which is what the recorded output shows for a 19-element array.
np.partition(bla[1],248842-248212)[248842-248212:]

ValueError: kth(=630) out of bounds (19)

In [None]:
# Time selecting the tail of bla[1] from partition position 630
# (= 248842 - 248212) through argpartition indices.
%timeit bla[1][np.argpartition(bla[1],248842-248212)[248842-248212:]]

640 µs ± 56.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
# Time a boolean-mask filter that keeps the entries of bla[1] that are >= 91.
%timeit bla[1][bla[1]>=91]

333 µs ± 16.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
# Show the entries of bla[1] that are >= 91.
bla[1][bla[1]>=91]

array([91, 91, 91, ..., 91, 91, 91])

In [None]:
# Time a boolean-mask filter on x (entries >= 6).
%timeit x[x>=6]

KeyboardInterrupt: 

In [None]:
# Partition x around position 7: that element ends up where it would be in
# sorted order, with no larger values before it and no smaller values after it.
np.partition(x,7)

array([1, 4, 5, 5, 2, 3, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6])

In [None]:
# Time taking the elements of x from position 8 onwards of an argpartition
# around position 7.
%timeit x[np.argpartition(x,7)[8:]]

1.17 µs ± 4.39 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [None]:
# Time reordering x by the full argpartition result (no tail slice), for comparison.
%timeit x[np.argpartition(x,7)]

872 ns ± 2.06 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [None]:
# Count the elements of this literal list.
len([3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6])

19

In [None]:
# Insertion point for 1 in this sorted list; 1 is smaller than every element,
# so the result is position 0.
np.searchsorted([3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], 1)

0

In [None]:
# Number of elements in the fourth item of bla.
# NOTE(review): the recorded output is an IndexError, so bla had fewer than
# four items when this cell was run.
bla[3].size

IndexError: tuple index out of range

In [None]:
# Count how many of the first 2,410,367 elements of bla[5] are >= 91.
(bla[5][:2410367]>=91).sum()

0

In [None]:
# Time the boolean-mask filter (>= 91) over the first 2,410,367 elements of bla[5].
%timeit bla[5][:2410367][bla[5][:2410367]>=91]

742 ns ± 3.72 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [None]:
# Time sorting the 14,458-element slice of bla[1] that starts at offset 97.
# NOTE(review): the recorded output is a TypeError ('bool' object is not
# subscriptable), so bla (or bla[1]) was a bool when this cell was run.
%timeit np.sort(bla[1][97:97+14458])

TypeError: 'bool' object is not subscriptable

In [None]:
# Distinct values of bla[5], in order of first appearance.
pd.unique(bla[5])

array([91, 90, 87, 88, 86, 85, 89, 83, 84, 82, 81, 79, 76, 80, 77, 78, 75,
       73, 72, 71, 70, 69, 74, 68, 67, 66, 65, 64, 63, 61, 60, 62, 58, 59,
       56, 55, 54, 57, 53, 51, 52, 49, 48, 50, 45, 47, 44, 46, 42, 43, 40,
       41, 39, 38, 37, 36, 34, 35, 32, 31, 30, 29, 33, 28, 27, 26, 25, 21,
       24, 23, 22, 18, 19, 20, 16, 17, 14, 15, 12, 13, 11,  8, 10,  6,  9,
        7,  4,  5,  3,  1,  2])

In [None]:
# Number of elements in bla[1].
bla[1].size

1232

In [None]:
# Sorted distinct values of bla[5]: the second item returned by
# pd.factorize(..., sort=True).
uniques = pd.factorize(bla[5],sort=True)[1]
uniques

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
       86, 87, 88, 89, 90, 91])

In [None]:
# Time a binary search for a single value (91) in the sorted `uniques` array.
%timeit uniques.searchsorted(91)

289 ns ± 2.08 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [None]:
# Time a vectorised binary search of the first 2,410,367 elements of bla[5]
# within the sorted `uniques` array.
%timeit uniques.searchsorted(bla[5][:2410367])

57.4 ms ± 182 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Distinct integer codes that pd.factorize(..., sort=True) assigns to bla[1],
# in order of first appearance.
pd.unique(pd.factorize(bla[1],sort=True)[0])

array([81, 60, 70, 74, 66, 76, 52, 77, 43, 78, 46, 61, 63, 65, 59, 56, 72,
       39, 50, 51, 68, 33, 31, 64, 37, 26, 27, 75, 30, 22, 71, 55, 28, 49,
       58, 41, 18, 57, 13, 40, 67, 42, 10, 25, 73, 36, 38, 34, 19,  8, 11,
       79, 45, 54, 47, 29, 20,  5, 16,  3, 35,  7,  6, 24, 23, 53,  1, 69,
       48, 21,  4,  0, 12,  9, 17,  2, 32, 14, 80, 15, 44, 62])