## Joining on Non-Equi Operators

In [1]:
import pandas as pd
import janitor
import numpy as np

In [2]:
from io import StringIO

# Left-hand frame for the first joins. Fields are separated by runs of two or
# more whitespace characters, so the single space inside each timestamp stays
# within one field.
data = """                  A    B   C      D                             E
               0   0   0.0    False  r   1970-01-01 00:00:00.000000001
               1   -1    0.0    False  r   1970-01-01 00:00:00.000000000
"""

# Every data row has one more field than the header row, so the leading field
# becomes the index (see the displayed frame). A regex separator needs the
# Python parsing engine.
df = pd.read_csv(
    StringIO(data),
    sep=r"\s{2,}",
    engine="python",
    parse_dates=["E"],
)
df

Unnamed: 0,A,B,C,D,E
0,0,0.0,False,r,1970-01-01 00:00:00.000000001
1,-1,0.0,False,r,1970-01-01 00:00:00.000000000


In [3]:
# Right-hand frame, written in the same whitespace-aligned layout.
data = """Integers  Numeric  Floats   Strings  Booleans      Dates   Dates_Right
               0        0      0.0     1.0    r         False   1970-01-01  1970-01-01
               1       -1      0.0     0.0    r         False   1970-01-01  1970-01-01"""

# As with the left frame, each row has one more field than the header, so the
# leading field becomes the index; both date columns are parsed to datetimes.
right = pd.read_csv(
    StringIO(data),
    sep=r"\s{2,}",
    engine="python",
    parse_dates=["Dates", "Dates_Right"],
)
right

Unnamed: 0,Integers,Numeric,Floats,Strings,Booleans,Dates,Dates_Right
0,0,0.0,1.0,r,False,1970-01-01,1970-01-01
1,-1,0.0,0.0,r,False,1970-01-01,1970-01-01


In [4]:
# Inner join on two non-equi conditions: left A <= right Integers and
# left E > right Dates. This run uses the default (non-numba) implementation.
df[["A", "E"]].conditional_join(
    right[["Integers", "Dates"]],
    ("A", "Integers", "<="),
    ("E", "Dates", ">"),
    how="inner",
    use_numba=False,
    sort_by_appearance=False,
)

Unnamed: 0,A,E,Integers,Dates
0,0,1970-01-01 00:00:00.000000001,0,1970-01-01


In [5]:
# The same two-condition inner join (A <= Integers, E > Dates), this time
# with use_numba=True; the displayed result matches the non-numba run.
df[["A", "E"]].conditional_join(
    right[["Integers", "Dates"]],
    ("A", "Integers", "<="),
    ("E", "Dates", ">"),
    how="inner",
    use_numba=True,
    sort_by_appearance=False,
)

Unnamed: 0,A,E,Integers,Dates
0,0,1970-01-01 00:00:00.000000001,0,1970-01-01


In [6]:
# Two small frames with identical column names (vol, profit, unitsSold);
# `c` is the left side and `d` the right side of the joins that follow.
c = pd.DataFrame(
    {
        "vol": [35, 15, 5, 35, 18, 90, 17],
        "profit": [45, 35, 55, 12, 15, 55, 11],
        "unitsSold": [15, 10, 30, 10, 15, 80, 2],
    }
)
d = pd.DataFrame(
    {
        "vol": [20, 50, 15, 16, 40, 20, 40, 2],
        "profit": [30, 10, 12, 52, 35, 20, 30, 57],
        "unitsSold": [20, 35, 10, 12, 40, 30, 5, 15],
    }
)

In [7]:
# Display the left-hand frame `c`.
c

Unnamed: 0,vol,profit,unitsSold
0,35,45,15
1,15,35,10
2,5,55,30
3,35,12,10
4,18,15,15
5,90,55,80
6,17,11,2


In [8]:
# Display the right-hand frame `d`.
d

Unnamed: 0,vol,profit,unitsSold
0,20,30,20
1,50,10,35
2,15,12,10
3,16,52,12
4,40,35,40
5,20,20,30
6,40,30,5
7,2,57,15


In [9]:
# Pair rows where c.vol < d.vol and c.profit > d.profit (numba path). Both
# frames share column names, so the result has two column levels; sorting on
# the second level places each left/right pair of a column side by side.
(
    c.conditional_join(
        d,
        ("vol", "vol", "<"),
        ("profit", "profit", ">"),
        use_numba=True,
    )
    .sort_index(axis=1, level=1)
)

Unnamed: 0_level_0,left,right,left,right,left,right
Unnamed: 0_level_1,profit,profit,unitsSold,unitsSold,vol,vol
0,35,30,10,20,15,20
1,55,30,30,20,5,20
2,12,10,10,35,35,50
3,45,10,15,35,35,50
4,15,10,15,35,18,50
5,11,10,2,35,17,50
6,35,10,10,35,15,50
7,55,10,30,35,5,50
8,55,12,30,10,5,15
9,55,52,30,12,5,16


In [10]:
# Three non-equi conditions at once (numba path):
# c.vol < d.vol, c.profit > d.profit and c.unitsSold > d.unitsSold.
# Columns are then ordered by name so left/right pairs sit side by side.
(
    c.conditional_join(
        d,
        ("vol", "vol", "<"),
        ("profit", "profit", ">"),
        ("unitsSold", "unitsSold", ">"),
        use_numba=True,
    )
    .sort_index(axis=1, level=1)
)

01

1
1
1
1
1


Unnamed: 0_level_0,left,right,left,right,left,right
Unnamed: 0_level_1,profit,profit,unitsSold,unitsSold,vol,vol
0,55,30,30,20,5,20
1,55,12,30,10,5,15
2,55,52,30,12,5,16
3,45,30,15,5,35,40
4,35,30,10,5,15,40
5,55,30,30,5,5,40


In [11]:
# The same three-condition join with numba disabled. The matched pairs are the
# same as in the numba run; only the order of the rows differs.
(
    c.conditional_join(
        d,
        ("vol", "vol", "<"),
        ("profit", "profit", ">"),
        ("unitsSold", "unitsSold", ">"),
        use_numba=False,
    )
    .sort_index(axis=1, level=1)
)

Unnamed: 0_level_0,left,right,left,right,left,right
Unnamed: 0_level_1,profit,profit,unitsSold,unitsSold,vol,vol
0,45,30,15,5,35,40
1,35,30,10,5,15,40
2,55,12,30,10,5,15
3,55,30,30,20,5,20
4,55,30,30,5,5,40
5,55,52,30,12,5,16


In [12]:
# Larger example: an events table fetched over HTTP, with `start` and `end`
# parsed as datetimes.
url = "https://raw.githubusercontent.com/samukweku/data-wrangling-blog/master/notebooks/Data_files/results.csv"
events = pd.read_csv(url, parse_dates=["start", "end"])
# Drop the first column by position (presumably a saved index -- confirm
# against the file).
events = events.iloc[:, 1:]
events.head()

Unnamed: 0,id,name,audience,start,sponsor,end
0,1,Event 1,1178,2022-11-19 10:00:00,Sponsor 2,2022-11-19 10:15:00
1,2,Event 2,1446,2015-09-27 15:00:00,Sponsor 11,2015-09-27 15:11:00
2,3,Event 3,2261,2019-11-12 18:00:00,Sponsor 10,2019-11-12 18:53:00
3,4,Event 4,1471,2019-12-24 22:00:00,Sponsor 6,2019-12-24 22:11:00
4,5,Event 5,2605,2028-06-20 12:00:00,Sponsor 8,2028-06-20 12:31:00


In [13]:
# Self-join to find overlapping events: pair each event with every *other*
# event (id != id) whose time window intersects its own, i.e.
# left.start <= right.end and left.end >= right.start.
# Only id, start and end are kept from each side.
A = events.conditional_join(
    events,
    ("start", "end", "<="),
    ("end", "start", ">="),
    ("id", "id", "!="),
    use_numba=True,
    df_columns=["id", "start", "end"],
    right_columns=["id", "start", "end"],
)

A

Unnamed: 0_level_0,left,left,left,right,right,right
Unnamed: 0_level_1,id,start,end,id,start,end
0,2345,1993-11-27 10:00:00,1993-11-27 12:00:00,10,1993-11-27 12:00:00,1993-11-27 12:37:00
1,11178,1993-04-04 17:00:00,1993-04-04 17:22:00,15,1993-04-04 16:00:00,1993-04-04 18:00:00
2,19605,2030-10-25 06:00:00,2030-10-25 08:00:00,17,2030-10-25 07:00:00,2030-10-25 07:27:00
3,8218,2005-10-04 17:00:00,2005-10-04 17:27:00,26,2005-10-04 17:00:00,2005-10-04 17:18:00
4,27696,2024-05-02 15:00:00,2024-05-02 15:07:00,35,2024-05-02 15:00:00,2024-05-02 15:35:00
...,...,...,...,...,...,...
3697,29375,2000-08-26 13:00:00,2000-08-26 13:53:00,29966,2000-08-26 11:00:00,2000-08-26 13:00:00
3698,24173,2018-05-18 04:00:00,2018-05-18 04:36:00,29971,2018-05-18 04:00:00,2018-05-18 04:18:00
3699,981,1992-06-07 22:00:00,1992-06-07 22:30:00,29978,1992-06-07 22:00:00,1992-06-07 22:23:00
3700,19051,2025-06-05 01:00:00,2025-06-05 03:00:00,29984,2025-06-05 03:00:00,2025-06-05 03:17:00


In [14]:
# Source: https://stackoverflow.com/q/61948103/7175713
# df1 holds one value per row; df2 holds an interval [value_2A, value_2B] per row.
df1 = pd.DataFrame(
    [(1, 2), (1, 5), (1, 7), (2, 1), (2, 3), (3, 4)],
    columns=["id", "value_1"],
)

df2 = pd.DataFrame(
    [
        (1, 0, 1),
        (1, 3, 5),
        (1, 7, 9),
        (1, 12, 15),
        (2, 0, 1),
        (2, 2, 4),
        (2, 3, 6),
        (3, 1, 3),
    ],
    columns=["id", "value_2A", "value_2B"],
)

In [15]:
# Display the left-hand frame `df1`.
df1

Unnamed: 0,id,value_1
0,1,2
1,1,5
2,1,7
3,2,1
4,2,3
5,3,4


In [16]:
# Display the right-hand frame `df2`.
df2

Unnamed: 0,id,value_2A,value_2B
0,1,0,1
1,1,3,5
2,1,7,9
3,1,12,15
4,2,0,1
5,2,2,4
6,2,3,6
7,3,1,3


Equi (`==`) and non-equi operators can be combined in a single join:

In [17]:
# Match on equal id, then keep pairs where value_1 falls inside the closed
# interval [value_2A, value_2B]. The default join type (inner) applies.
df1.conditional_join(
    df2,
    ("id", "id", "=="),
    ("value_1", "value_2A", ">="),
    ("value_1", "value_2B", "<="),
    sort_by_appearance=True,
)

Unnamed: 0_level_0,left,left,right,right,right
Unnamed: 0_level_1,id,value_1,id,value_2A,value_2B
0,1,5,1,3,5
1,1,7,1,7,9
2,2,1,2,0,1
3,2,3,2,2,4
4,2,3,2,3,6


The default join is `inner`; `left` and `right` joins are supported as well:

In [18]:
# Left join: every df1 row is kept; rows without a match in df2 get missing
# values in the right-hand columns.
df1.conditional_join(
    df2,
    ("id", "id", "=="),
    ("value_1", "value_2A", ">="),
    ("value_1", "value_2B", "<="),
    how="left",
    sort_by_appearance=True,
)

Unnamed: 0_level_0,left,left,right,right,right
Unnamed: 0_level_1,id,value_1,id,value_2A,value_2B
0,1,2,,,
1,1,5,1.0,3.0,5.0
2,1,7,1.0,7.0,9.0
3,2,1,2.0,0.0,1.0
4,2,3,2.0,2.0,4.0
5,2,3,2.0,3.0,6.0
6,3,4,,,


In [19]:
# Right join: every df2 row is kept; rows without a match in df1 get missing
# values in the left-hand columns.
df1.conditional_join(
    df2,
    ("id", "id", "=="),
    ("value_1", "value_2A", ">="),
    ("value_1", "value_2B", "<="),
    how="right",
    sort_by_appearance=True,
)

Unnamed: 0_level_0,left,left,right,right,right
Unnamed: 0_level_1,id,value_1,id,value_2A,value_2B
0,,,1,0,1
1,1.0,5.0,1,3,5
2,1.0,7.0,1,7,9
3,,,1,12,15
4,2.0,1.0,2,0,1
5,2.0,3.0,2,2,4
6,2.0,3.0,2,3,6
7,,,3,1,3


Joining on non-equi operators alone is also possible:

In [20]:
# No equality condition here: keep pairs where value_1 lies strictly between
# value_2A and value_2B, regardless of id.
df1.conditional_join(
    df2,
    ("value_1", "value_2A", ">"),
    ("value_1", "value_2B", "<"),
    how="inner",
    sort_by_appearance=True,
)

Unnamed: 0_level_0,left,left,right,right,right
Unnamed: 0_level_1,id,value_1,id,value_2A,value_2B
0,1,2,3,1,3
1,1,5,2,3,6
2,2,3,2,2,4
3,3,4,1,3,5
4,3,4,2,3,6


Joining on the not-equal operator (`!=`):

In [21]:
# Pair each df1 row with every df2 row whose id is different.
df1.conditional_join(
    df2,
    ("id", "id", "!="),
)

Unnamed: 0_level_0,left,left,right,right,right
Unnamed: 0_level_1,id,value_1,id,value_2A,value_2B
0,1,2,2,0,1
1,1,2,2,2,4
2,1,2,2,3,6
3,1,2,3,1,3
4,1,5,2,0,1
5,1,5,2,2,4
6,1,5,2,3,6
7,1,5,3,1,3
8,1,7,2,0,1
9,1,7,2,2,4


If the two DataFrames have no column names in common, the result has a single-level (flat) column index instead of the two-level `left`/`right` one:

In [22]:
# Select columns on both sides first so that the inputs share no column names
# (value_1 on the left; the columns matching 'val*' on the right).
df1.select_columns("value_1").conditional_join(
    df2.select_columns("val*"),
    ("value_1", "value_2A", ">"),
    ("value_1", "value_2B", "<"),
)

Unnamed: 0,value_1,value_2A,value_2B
0,2,1,3
1,5,3,6
2,3,2,4
3,4,3,5
4,4,3,6


The columns to keep from each side can be selected within `conditional_join` via `df_columns` and `right_columns`:

In [23]:
# Join where the left id is smaller than the right id, keeping only the `id`
# column from each side.
df1.conditional_join(
    df2,
    ("id", "id", "<"),
    df_columns="id",
    right_columns="id",
)

Unnamed: 0_level_0,left,right
Unnamed: 0_level_1,id,id
0,1,2
1,1,2
2,1,2
3,1,3
4,1,2
5,1,2
6,1,2
7,1,3
8,1,2
9,1,2


Columns can also be renamed by passing a dictionary (old name → new name) to `df_columns` and `right_columns`:

In [24]:
# Same join as the previous cell, but each side's `id` is renamed through a
# dict, which yields distinct flat column names in the result.
df1.conditional_join(
    df2,
    ("id", "id", "<"),
    df_columns={"id": "df_id"},
    right_columns={"id": "right_id"},
)

Unnamed: 0,df_id,right_id
0,1,2
1,1,2
2,1,2
3,1,3
4,1,2
5,1,2
6,1,2
7,1,3
8,1,2
9,1,2
