## Joining on Non-Equi Operators

In [1]:
import pandas as pd
import janitor
import numpy as np

In [2]:
from io import StringIO
from janitor.functions.conditional_join import _not_equal_indices, _less_than_indices, _greater_than_indices, _generic_func_cond_join

In [3]:
# Small left-hand frame; fields are separated by runs of two or more spaces.
data = """A    B   C      D          E
        0  1  0.0  r   False   1970-01-01
        1  1  0.0  r  False   1970-01-01
        """

# The separator is a regex, so it is written as a raw string: in a plain
# string literal '\s' is an invalid escape sequence and triggers a warning.
df = pd.read_csv(
    StringIO(data),
    sep=r"\s{2,}",
    engine="python",
    parse_dates=["E"],
)

df

Unnamed: 0,A,B,C,D,E
0,1,0.0,r,False,1970-01-01
1,1,0.0,r,False,1970-01-01


In [4]:
# Small right-hand frame; the Numeric column is entirely missing (NaN).
data = """Integers  Numeric  Floats  Strings  Booleans      Dates   Dates_Right
        0         0      NaN     0.0     r        False   1970-01-01  1970-01-01
        1         0      NaN     0.0     r        False   1970-01-01  1970-01-01"""

# The separator is a regex, so it is written as a raw string: in a plain
# string literal '\s' is an invalid escape sequence and triggers a warning.
right = pd.read_csv(
    StringIO(data),
    sep=r"\s{2,}",
    engine="python",
    parse_dates=["Dates", "Dates_Right"],
)

right

Unnamed: 0,Integers,Numeric,Floats,Strings,Booleans,Dates,Dates_Right
0,0,,0.0,r,False,1970-01-01,1970-01-01
1,0,,0.0,r,False,1970-01-01,1970-01-01


In [5]:
# Column from `df` and column from `right` that the join conditions compare.
left_on, right_on = "B", "Numeric"

In [6]:
# Reference result built with plain pandas: pair every row of `df` with every
# row of `right` through a constant key, then keep the pairs whose values differ.
# No dropna is applied, so pairs with a missing value stay in the result.
cross_joined = df.assign(t=1).merge(right.assign(t=1), on="t")
expected = cross_joined.query(f"{left_on} != {right_on}").reset_index(drop=True)
expected.filter([left_on, right_on])


Unnamed: 0,B,Numeric
0,0.0,
1,0.0,
2,0.0,
3,0.0,


In [7]:
# Inner join of `df` and `right` on the single condition left_on != right_on.
df.conditional_join(
    right,
    (left_on, right_on, "!="),
    how="inner",
    sort_by_appearance=True,
)

(array([0, 0, 1, 1]), array([0, 1, 0, 1]))

In [8]:
# Call the helper directly on the two join columns to look at the raw index pairs.
# NOTE(review): underscore-prefixed janitor helper; presumably private API, confirm it is stable.
_not_equal_indices(df["B"], right["Numeric"])

(array([0, 0, 1, 1]), array([0, 1, 0, 1]))

In [9]:
# Same check through the generic dispatcher, this time on the integer columns.
# NOTE(review): meaning of the trailing positional flag is not visible here; confirm against the helper's signature.
_generic_func_cond_join(df["A"], right["Integers"], "!=", True)

(array([0, 0, 1, 1]), array([0, 1, 0, 1]))

In [10]:
# Less-than helper called on the integer columns.
# NOTE(review): meaning of the two positional flags is not visible here; confirm against the helper's signature.
_less_than_indices(df["A"], right["Integers"], True, True)

In [11]:
# Example frames taken from https://stackoverflow.com/q/61948103/7175713
df1 = pd.DataFrame(
    {
        "id": [1, 1, 1, 2, 2, 3],
        "value_1": [2, 5, 7, 1, 3, 4],
    }
)

df2 = pd.DataFrame(
    {
        "id": [1, 1, 1, 1, 2, 2, 2, 3],
        "value_2A": [0, 3, 7, 12, 0, 2, 3, 1],
        "value_2B": [1, 5, 9, 15, 1, 4, 6, 3],
    }
)

In [12]:
df1

Unnamed: 0,id,value_1
0,1,2
1,1,5
2,1,7
3,2,1
4,2,3
5,3,4


In [13]:
df2

Unnamed: 0,id,value_2A,value_2B
0,1,0,1
1,1,3,5
2,1,7,9
3,1,12,15
4,2,0,1
5,2,2,4
6,2,3,6
7,3,1,3


Joining on a combination of equi and non-equi operators is possible:

In [14]:
# Match on id, and require value_1 to lie within [value_2A, value_2B].
df1.conditional_join(
    df2,
    ("id", "id", "=="),
    ("value_1", "value_2A", ">="),
    ("value_1", "value_2B", "<="),
    sort_by_appearance=True,
)

(   id  value_1
 0   1        2
 1   1        5
 2   1        7
 3   2        1
 4   2        3
 5   3        4,
    id  value_2A  value_2B
 0   1         0         1
 1   1         3         5
 2   1         7         9
 3   1        12        15
 4   2         0         1
 5   2         2         4
 6   2         3         6
 7   3         1         3)

The default join is an inner join. Left and right joins are supported as well:

In [15]:
# Same three conditions as the inner join, requested as a left join.
df1.conditional_join(
    df2,
    ("id", "id", "=="),
    ("value_1", "value_2A", ">="),
    ("value_1", "value_2B", "<="),
    how="left",
    sort_by_appearance=True,
)

(   id  value_1
 0   1        2
 1   1        5
 2   1        7
 3   2        1
 4   2        3
 5   3        4,
    id  value_2A  value_2B
 0   1         0         1
 1   1         3         5
 2   1         7         9
 3   1        12        15
 4   2         0         1
 5   2         2         4
 6   2         3         6
 7   3         1         3)

In [16]:
# Same three conditions as the inner join, requested as a right join.
df1.conditional_join(
    df2,
    ("id", "id", "=="),
    ("value_1", "value_2A", ">="),
    ("value_1", "value_2B", "<="),
    how="right",
    sort_by_appearance=True,
)

(   id  value_1
 0   1        2
 1   1        5
 2   1        7
 3   2        1
 4   2        3
 5   3        4,
    id  value_2A  value_2B
 0   1         0         1
 1   1         3         5
 2   1         7         9
 3   1        12        15
 4   2         0         1
 5   2         2         4
 6   2         3         6
 7   3         1         3)

Joining on non-equi operators alone is also possible:

In [17]:
# No equality condition: value_1 must fall strictly between value_2A and value_2B.
df1.conditional_join(
    df2,
    ("value_1", "value_2A", ">"),
    ("value_1", "value_2B", "<"),
    how="inner",
    sort_by_appearance=True,
)

1

Joining on equality alone is not supported and raises a `ValueError`, as shown below; use Pandas' merge/join for equi-joins instead, as it is more efficient:

In [18]:
# A join made up only of an equality condition is rejected with a ValueError.
# The error is the point of this cell, so it is caught and printed; an uncaught
# exception would stop a Restart & Run All before the cells that follow.
try:
    df1.conditional_join(
        df2,
        ("id", "id", "=="),
    )
except ValueError as exc:
    print(f"{type(exc).__name__}: {exc}")

ValueError: Equality only joins are not supported.

Joining on the not-equal operator (`!=`):

In [None]:
# Pair each row of df1 with the rows of df2 that carry a different id.
df1.conditional_join(
    df2,
    ("id", "id", "!="),
)

Unnamed: 0_level_0,left,left,right,right,right
Unnamed: 0_level_1,id,value_1,id,value_2A,value_2B
0,1,2,2,0,1
1,1,2,2,2,4
2,1,2,2,3,6
3,1,2,3,1,3
4,1,5,2,0,1
5,1,5,2,2,4
6,1,5,2,3,6
7,1,5,3,1,3
8,1,7,2,0,1
9,1,7,2,2,4


If the two dataframes have no column names in common, the result has a single-level column index (no `left`/`right` top level):

In [None]:
# Drop `id` from both sides first so that the two frames share no column names.
(
    df1.select_columns("value_1").conditional_join(
        df2.select_columns("val*"),
        ("value_1", "value_2A", ">"),
        ("value_1", "value_2B", "<"),
    )
)

Unnamed: 0,value_1,value_2A,value_2B
0,2,1,3
1,5,3,6
2,3,2,4
3,4,3,5
4,4,3,6
