# High-Performance Pandas: Enhancing Performance

In [None]:
# Libraries used in this notebook.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Demo frame of 1000 rows: two standard-normal float columns (a, b), an integer
# column N drawn from [100, 1000), and a constant string column x.
df = pd.DataFrame(
    dict(
        a=np.random.randn(1000),
        b=np.random.randn(1000),
        N=np.random.randint(100, 1000, size=1000),
        x="x",
    )
)
df.head()

Unnamed: 0,a,b,N,x
0,-0.697955,-0.79037,881,x
1,0.242037,2.048257,552,x
2,0.297028,0.449086,364,x
3,0.821676,-1.634745,174,x
4,0.761176,2.457768,938,x


In [None]:
def f(x):
    """Return x multiplied by (x - 1)."""
    shifted = x - 1
    return x * shifted

In [None]:
def integrate_f(a, b, N):
    """Approximate the integral of f between a and b with N equal-width steps.

    f is sampled at the left edge of each step (a, a + dx, ..., a + (N-1)*dx)
    and the summed samples are scaled by the step width.
    """
    step = (b - a) / N
    total = 0
    for k in range(N):
        total += f(a + k * step)
    return total * step

In [None]:
# Baseline timing: apply the pure-Python integrate_f to every row of df.
%timeit df.apply(lambda x: integrate_f(x['a'],x['b'],x['N']),axis=1)

177 ms ± 2.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Profile the same row-wise apply to see which calls account for the run time.
%prun df.apply(lambda x: integrate_f(x['a'],x['b'],x['N']),axis=1)

 

In [None]:
import Cython

In [None]:
Cython.__version__

'0.29.24'

In [None]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [None]:
%%cython
def f_plain(x):
    """Return x multiplied by (x - 1).

    The multiplication operator must be explicit: writing ``x(x-1)`` calls
    ``x`` as a function and raises ``TypeError`` for numeric input.
    """
    return x * (x - 1)

def integrate_f_plain(a, b, N):
    """Approximate the integral of f_plain between a and b with N equal-width steps.

    f_plain is sampled at the left edge of each step and the summed samples
    are scaled by the step width.
    """
    step = (b - a) / N
    total = 0
    for k in range(N):
        total += f_plain(a + k * step)
    return total * step

In [None]:
# Time the same row-wise apply, this time calling integrate_f_plain.
%timeit df.apply(lambda x: integrate_f_plain(x["a"], x["b"], x["N"]),axis=1)

TypeError: 'float' object is not callable

In [None]:
# Two arrays of one million random samples each, drawn from a RandomState
# seeded with 42. y is drawn after x from the same generator, so the order of
# these statements determines the values of both arrays.
rng = np.random.RandomState(42)
x = rng.rand(1000000)
y = rng.rand(1000000)

In [None]:
# Vectorized element-wise addition of the two arrays.
%timeit x+y

1.46 ms ± 9.07 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
# The same addition done one pair at a time through a Python generator, for comparison.
%timeit np.fromiter((xi + yi for xi,yi in zip(x,y)),dtype =x.dtype,count = len(x))

542 ms ± 15.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Boolean mask: True where x is above 0.5 and y is below 0.5.
mask = (x > 0.5) & (y < 0.5)
mask

array([False,  True,  True, ..., False, False, False])

In [None]:
# The same mask built step by step: each comparison is stored in its own
# intermediate array before the two are combined with &.
tmp1 = (x > 0.5)
tmp2 = (y < 0.5)
mask = tmp1 & tmp2

In [None]:
mask

array([False,  True,  True, ..., False, False, False])

In [None]:
import numexpr


In [None]:
%timeit
mask_numexpr = numexpr.evaluate("(x > 0.5) & (y < 0.5)")
np.allclose(mask,mask_numexpr)

True

## Pandas Eval and Query

In [None]:
# Four DataFrames of random values, each nrows x ncols, drawn one after another
# from a single RandomState seeded with 42 (so creation order fixes the values).
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols))
                      for i in range(4))

In [None]:
# Sum of the four frames using ordinary pandas operators.
%timeit df1 + df2 + df3 + df4 

161 ms ± 2.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# The same sum written as a string expression and evaluated by pd.eval.
%timeit pd.eval('df1 + df2 + df3 + df4')

73 ms ± 3.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Check that the operator form and the pd.eval form give numerically equal results.
np.allclose(df1 + df2 + df3 + df4,
            pd.eval('df1 + df2 + df3 + df4'))

True

In [None]:
# Rebind df1..df4 and introduce df5: five small 100 x 3 frames of integers
# drawn from [0, 1000), using the rng created in an earlier cell.
# NOTE(review): this overwrites the large df1..df4, so cells that expect the
# large frames must be run before this one.
df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 1000, (100, 3)))
                           for i in range(5))

In [None]:
# Compound arithmetic on the five small frames using ordinary pandas operators.
%timeit result1 = -df1 * df2 / (df3 + df4) - df5

1.13 ms ± 43.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
# The same compound expression evaluated through pd.eval.
%timeit result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')

6.47 ms ± 108 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Rebind df to a 1000 x 3 frame of random values with columns A, B and C,
# drawn from the rng created in an earlier cell.
# NOTE(review): this replaces the earlier df that had columns a, b, N, x, so
# the integrate_f timing cells no longer work if re-run after this point.
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head()

Unnamed: 0,A,B,C
0,0.375506,0.406939,0.069938
1,0.069087,0.235615,0.154374
2,0.677945,0.433839,0.652324
3,0.264038,0.808055,0.347197
4,0.589161,0.252418,0.557789


In [None]:
# Filter rows with query-string expressions:
# q1 keeps rows where A and B are both below 0.5,
# q2 keeps rows where B and C are both above 0.5.
q1 = df.query("A<0.5 and B<0.5")
q2 = df.query("B>0.5 and C>0.5")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f5f90ba1-3290-463e-8fc6-44108f4fa21b' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>