# High-Performance Pandas: eval() and query()
## Motivating query() and eval(): Compound Expressions

In [2]:
import numpy as np
# Seeded generator so the two random arrays are reproducible across runs
rng = np.random.RandomState(42)
x = rng.rand(1000000)
y = rng.rand(1000000)
# Baseline timing: vectorized element-wise sum of two 1,000,000-element arrays
%timeit x + y

100 loops, best of 3: 3.92 ms per loop


In [3]:
# Same element-wise sum, but every pair is added inside a Python-level
# generator; compare this timing with the vectorized `x + y`
%timeit np.fromiter((xi + yi for xi, yi in zip(x, y)), dtype=x.dtype, count=len(x))

1 loop, best of 3: 481 ms per loop


In [4]:
# Compound expression: two element-wise comparisons combined with &
mask = (x > 0.5) & (y < 0.5)

In [5]:
# The one-line compound expression is equivalent to the steps below:
# each comparison is stored as its own full-sized array before the final &
tmp1 = (x > 0.5)
tmp2 = (y < 0.5)
mask = tmp1 & tmp2

In [6]:
# numexpr receives the whole compound expression as a string, which lets it
# evaluate the result without building full-sized intermediates like
# tmp1 / tmp2
import numexpr
mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')
# Both masks are boolean, so they must match exactly: use array_equal
# instead of the tolerance-based allclose
np.array_equal(mask, mask_numexpr)

True

## pandas.eval() for Efficient Operations

In [7]:
import pandas as pd

# Shape shared by the four benchmark frames
nrows, ncols = 100000, 100
# Re-seed so the frames do not depend on how much of the earlier stream was used
rng = np.random.RandomState(42)
# Each frame draws a fresh block of uniform values from the same generator
df1, df2, df3, df4 = (
    pd.DataFrame(rng.rand(nrows, ncols))
    for _ in range(4)
)

In [8]:
# Baseline: compute the sum with ordinary pandas arithmetic, where each `+`
# is evaluated as a separate DataFrame operation
%timeit df1 + df2 + df3 + df4

10 loops, best of 3: 107 ms per loop


In [9]:
# Compute the same sum by handing the whole expression to pd.eval() as a string
%timeit pd.eval('df1 + df2 + df3 + df4')

10 loops, best of 3: 52.6 ms per loop


In [10]:
# In the timings shown, pd.eval() took roughly half the time of the plain
# expression (about 2x faster); confirm both approaches give the same values
np.allclose(df1 + df2 + df3 + df4, pd.eval('df1 + df2 + df3 + df4'))

True

#### Operations supported by pd.eval()

In [11]:
# Five small integer frames (100 rows x 3 columns, values drawn from [0, 1000))
# used by the operator examples; these rebind df1..df4 from the benchmark above
df1, df2, df3, df4, df5 = (
    pd.DataFrame(rng.randint(low=0, high=1000, size=(100, 3)))
    for _ in range(5)
)

#### Arithmetic Operators

In [12]:
# Arithmetic written as ordinary pandas code ...
result1 = -df1 * df2 / (df3 + df4) - df5
# ... and the identical expression passed to pd.eval() as a string
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')
np.allclose(result1, result2)

True

#### Comparison Operators

In [13]:
# Plain pandas needs each pairwise comparison spelled out and joined with & ...
result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
# ... while pd.eval() accepts the chained-comparison form directly
result2 = pd.eval('df1 < df2 <= df3 != df4')

In [23]:
# Preview df1, the first operand of the chained comparison
df1.head()

Unnamed: 0,0,1,2
0,180,112,748
1,447,205,487
2,656,100,98
3,90,450,613
4,529,224,530


In [24]:
# Preview df2, the second operand of the chained comparison
df2.head()

Unnamed: 0,0,1,2
0,75,15,719
1,741,587,37
2,879,695,688
3,475,110,918
4,806,420,361


In [25]:
# Preview df3, the third operand of the chained comparison
df3.head()

Unnamed: 0,0,1,2
0,912,97,806
1,766,714,218
2,502,508,541
3,562,195,277
4,200,24,113


In [26]:
# Preview df4, the last operand of the chained comparison
df4.head()

Unnamed: 0,0,1,2
0,461,139,372
1,487,838,959
2,97,231,638
3,109,776,472
4,199,167,816


In [27]:
# Preview the boolean frame produced by the explicit pandas comparison
result1.head()

Unnamed: 0,0,1,2
0,False,False,False
1,True,True,False
2,False,False,False
3,True,False,False
4,False,False,False


In [29]:
# result1 and result2 are boolean frames, so check for exact equality
# rather than closeness within a floating-point tolerance
np.array_equal(result1, result2)

True

#### Bitwise Operators

In [30]:
# & binds tighter than |, so both forms group as
# ((df1 < 0.5) & (df2 < 0.5)) | (df3 < df4)
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
# The results are boolean frames: compare them exactly instead of using the
# tolerance-based allclose
np.array_equal(result1, result2)

True

#### Object attributes and indices

In [31]:
# Attribute access (.T, .iloc) and indexing ([0], [1]) in plain pandas ...
result1 = df2.T[0] + df3.iloc[1]
# ... are also understood inside a pd.eval() expression string
result2 = pd.eval('df2.T[0] + df3.iloc[1]')
np.allclose(result1, result2)

True

## DataFrame.eval() for Column-Wise Operations

In [32]:
# 1000 x 3 frame of uniform random values with labeled columns, used for the
# column-wise eval() / query() examples
df = pd.DataFrame(
    data=rng.rand(1000, 3),
    columns=['A', 'B', 'C'],
)
df.head()

Unnamed: 0,A,B,C
0,0.375506,0.406939,0.069938
1,0.069087,0.235615,0.154374
2,0.677945,0.433839,0.652324
3,0.264038,0.808055,0.347197
4,0.589161,0.252418,0.557789


In [33]:
# Column arithmetic in plain pandas ...
result1 = (df['A'] + df['B']) / (df['C'] - 1)
# ... and via the top-level pd.eval(), where columns are reached through df.<name>
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
np.allclose(result1, result2)

True

In [34]:
# DataFrame.eval() lets the expression refer to columns by bare name
result3 = df.eval('(A + B) / (C - 1)')
np.allclose(result1, result3)

True

#### Assignment in DataFrame.eval()

In [35]:
# State of df before any column is assigned through eval()
df.head()

Unnamed: 0,A,B,C
0,0.375506,0.406939,0.069938
1,0.069087,0.235615,0.154374
2,0.677945,0.433839,0.652324
3,0.264038,0.808055,0.347197
4,0.589161,0.252418,0.557789


In [36]:
# Assignment inside eval(): create column D from the existing columns.
# inplace=True modifies df itself rather than returning a new frame.
df.eval('D = (A+B) / C', inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.375506,0.406939,0.069938,11.18762
1,0.069087,0.235615,0.154374,1.973796
2,0.677945,0.433839,0.652324,1.704344
3,0.264038,0.808055,0.347197,3.087857
4,0.589161,0.252418,0.557789,1.508776


In [37]:
# Assigning to an existing column name overwrites it: D is recomputed here
df.eval('D = (A - B) / C', inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.375506,0.406939,0.069938,-0.449425
1,0.069087,0.235615,0.154374,-1.078728
2,0.677945,0.433839,0.652324,0.374209
3,0.264038,0.808055,0.347197,-1.566886
4,0.589161,0.252418,0.557789,0.603708


#### Local variables in DataFrame.eval()

In [38]:
# NOTE: despite its name, column_mean is computed along axis=1, i.e. it
# averages across the columns and holds one value per row of df
column_mean = df.mean(axis=1)
result1 = df['A'] + column_mean
# Inside eval(), a leading @ marks a Python variable name rather than a column name
result2 = df.eval('A + @column_mean')
np.allclose(result1, result2)

True

## DataFrame.query() Method

In [39]:
# Row filtering with a compound boolean mask in plain pandas ...
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
# ... and the same filtering expression evaluated through pd.eval()
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)

True

In [40]:
# query() expresses the same row filter using bare column names and `and`
result2 = df.query('A < 0.5 and B < 0.5')
np.allclose(result1, result2)

True

In [41]:
# query() also accepts the @ prefix to refer to Python variables
# (here the scalar Cmean) instead of column names
Cmean = df['C'].mean()
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)

True

## Performance: When to Use These Functions

When deciding whether to use these functions, weigh two factors: *computation time* and *memory use*.

In [42]:
# Filter rows with a compound boolean mask.
# Note: this rebinds x, which earlier in the notebook named a NumPy array.
x = df[(df.A < 0.5) & (df.B < 0.5)]

In [43]:
# The one-line filter is roughly equivalent to the steps below, which build
# three temporary boolean Series (tmp1, tmp2, tmp3) before indexing df
tmp1 = df.A < 0.5
tmp2 = df.B < 0.5
tmp3 = tmp1 & tmp2
x = df[tmp3]

In [44]:
# Approximate size in bytes of the data held by df; the 32000 shown is
# consistent with 1000 rows x 4 columns (A, B, C, D) x 8 bytes per value
df.values.nbytes

32000