## Joining on Non-Equi Operators

In [15]:
import pandas as pd
import janitor
import numpy as np

In [16]:
# Sample data taken from https://stackoverflow.com/q/61948103/7175713
# Each seed frame is repeated many times so the joins below run on a
# non-trivial number of rows. The index is not reset, so the seed's row
# labels repeat in the enlarged frames.
df1 = pd.concat(
    [
        pd.DataFrame(
            {
                "id": [1, 1, 1, 2, 2, 3],
                "value_1": [2, 5, 7, 1, 3, 4],
            }
        )
    ]
    * 50_000
)

df2 = pd.concat(
    [
        pd.DataFrame(
            {
                "id": [1, 1, 1, 1, 2, 2, 2, 3],
                "value_2A": [0, 3, 7, 12, 0, 2, 3, 1],
                "value_2B": [1, 5, 9, 15, 1, 4, 6, 3],
            }
        )
    ]
    * 500
)

In [17]:
# Preview df1: the 6-row seed frame repeated 50_000 times (300,000 rows).
df1

Unnamed: 0,id,value_1
0,1,2
1,1,5
2,1,7
3,2,1
4,2,3
...,...,...
1,1,5
2,1,7
3,2,1
4,2,3


In [18]:
# Preview df2: the 8-row seed frame repeated 500 times (4,000 rows).
df2

Unnamed: 0,id,value_2A,value_2B
0,1,0,1
1,1,3,5
2,1,7,9
3,1,12,15
4,2,0,1
...,...,...,...
3,1,12,15
4,2,0,1
5,2,2,4
6,2,3,6


In [19]:
%%timeit
df1.conditional_join(
        df2,
        ('value_1', 'value_2A', '>='),
        use_numba=True
    )

8.2 s ± 164 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%%timeit
df1.conditional_join(
        df2,
        ('value_1', 'value_2A', '>='),
        use_numba=False
    )

11 s ± 89.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Equi and non-equi operators can be combined in a single join:

In [21]:
# Inner join (no `how` given): rows pair up when the ids are equal and
# value_1 lies within [value_2A, value_2B].
df1.conditional_join(
    df2,
    ("id", "id", "=="),
    ("value_1", "value_2A", ">="),
    ("value_1", "value_2B", "<="),
    sort_by_appearance=True,
)

  return method(self._obj, *args, **kwargs)


Unnamed: 0_level_0,left,left,right,right,right
Unnamed: 0_level_1,id,value_1,id,value_2A,value_2B
0,1,5,1,3,5
1,1,5,1,3,5
2,1,5,1,3,5
3,1,5,1,3,5
4,1,5,1,3,5
...,...,...,...,...,...
124999995,2,3,2,3,6
124999996,2,3,2,2,4
124999997,2,3,2,3,6
124999998,2,3,2,2,4


The default join type is `inner`; `left` and `right` joins are supported as well:

In [22]:
# Left join: ids must be equal and value_1 must lie within
# [value_2A, value_2B].
df1.conditional_join(
    df2,
    ("id", "id", "=="),
    ("value_1", "value_2A", ">="),
    ("value_1", "value_2B", "<="),
    how="left",
    sort_by_appearance=True,
)

  return method(self._obj, *args, **kwargs)


Unnamed: 0_level_0,left,left,right,right,right
Unnamed: 0_level_1,id,value_1,id,value_2A,value_2B
0,1,2,,,
1,1,5,1.0,3.0,5.0
2,1,5,1.0,3.0,5.0
3,1,5,1.0,3.0,5.0
4,1,5,1.0,3.0,5.0
...,...,...,...,...,...
125099995,2,3,2.0,2.0,4.0
125099996,2,3,2.0,3.0,6.0
125099997,2,3,2.0,2.0,4.0
125099998,2,3,2.0,3.0,6.0


In [23]:
# Right join: ids must be equal and value_1 must lie within
# [value_2A, value_2B].
df1.conditional_join(
    df2,
    ("id", "id", "=="),
    ("value_1", "value_2A", ">="),
    ("value_1", "value_2B", "<="),
    how="right",
    sort_by_appearance=True,
)

  return method(self._obj, *args, **kwargs)


Unnamed: 0_level_0,left,left,right,right,right
Unnamed: 0_level_1,id,value_1,id,value_2A,value_2B
0,,,1,0,1
1,1.0,5.0,1,3,5
2,1.0,5.0,1,3,5
3,1.0,5.0,1,3,5
4,1.0,5.0,1,3,5
...,...,...,...,...,...
125001495,2.0,3.0,2,3,6
125001496,2.0,3.0,2,3,6
125001497,2.0,3.0,2,3,6
125001498,2.0,3.0,2,3,6


Joining on non-equi operators alone is also possible:

In [24]:
# No equality condition on id here: only the strict range
# value_2A < value_1 < value_2B is used, with how="inner" spelled out.
df1.conditional_join(
    df2,
    ("value_1", "value_2A", ">"),
    ("value_1", "value_2B", "<"),
    how="inner",
    sort_by_appearance=True,
)

  return method(self._obj, *args, **kwargs)


Unnamed: 0_level_0,left,left,right,right,right
Unnamed: 0_level_1,id,value_1,id,value_2A,value_2B
0,1,2,3,1,3
1,1,2,3,1,3
2,1,2,3,1,3
3,1,2,3,1,3
4,1,2,3,1,3
...,...,...,...,...,...
124999995,3,4,2,3,6
124999996,3,4,1,3,5
124999997,3,4,2,3,6
124999998,3,4,1,3,5


Joining on the not-equal operator (`!=`) is also possible:

In [25]:
# Single != condition: pair every df1 row with the df2 rows whose id differs.
df1.conditional_join(
    df2,
    ("id", "id", "!="),
)

Unnamed: 0_level_0,left,left,right,right,right
Unnamed: 0_level_1,id,value_1,id,value_2A,value_2B
0,1,2,2,0,1
1,1,2,2,2,4
2,1,2,2,3,6
3,1,2,2,0,1
4,1,2,2,2,4
...,...,...,...,...,...
724999995,3,4,2,2,4
724999996,3,4,2,3,6
724999997,3,4,2,0,1
724999998,3,4,2,2,4


If the two dataframes have no column names in common, the result has flat, single-level column labels instead of a `left`/`right` MultiIndex:

In [26]:
# Drop the shared id column from both sides before joining, so the two
# inputs have no column names in common.
(
    df1.select_columns("value_1")
    .conditional_join(
        df2.select_columns("val*"),
        ("value_1", "value_2A", ">"),
        ("value_1", "value_2B", "<"),
    )
)

  return method(self._obj, *args, **kwargs)


Unnamed: 0,value_1,value_2A,value_2B
0,2,1,3
1,2,1,3
2,2,1,3
3,2,1,3
4,2,1,3
...,...,...,...
124999995,4,3,6
124999996,4,3,5
124999997,4,3,6
124999998,4,3,5


Specific columns can be selected within `conditional_join` via `df_columns` and `right_columns`:

In [27]:
# Join where the left id is strictly less than the right id, keeping only
# the id column from each side.
df1.conditional_join(
    df2,
    ("id", "id", "<"),
    df_columns="id",
    right_columns="id",
)

Unnamed: 0_level_0,left,right
Unnamed: 0_level_1,id,id
0,1,2
1,1,2
2,1,2
3,1,2
4,1,2
...,...,...
349999995,2,3
349999996,2,3
349999997,2,3
349999998,2,3


Column renaming is also possible:

In [28]:
# Passing a dict selects the column and renames it in the result:
# the left id becomes df_id and the right id becomes right_id.
df1.conditional_join(
    df2,
    ("id", "id", "<"),
    df_columns={"id": "df_id"},
    right_columns={"id": "right_id"},
)

Unnamed: 0,df_id,right_id
0,1,2
1,1,2
2,1,2
3,1,2
4,1,2
...,...,...
349999995,2,3
349999996,2,3
349999997,2,3
349999998,2,3
