In [1]:
def generator_func():
    """Yield the integers 1, 2 and 3, one per iteration."""
    for value in (1, 2, 3):
        yield value


# Calling a generator function runs none of its body; it returns a generator object.
type(generator_func())

generator

In [2]:
# Check that the generator object provides __iter__, i.e. that it is iterable.
hasattr(generator_func(), '__iter__')

True

In [13]:
# Topic: pandas eval() and query()

import numpy as np

# Seeded generator, so the two one-million-element random arrays are reproducible.
rng = np.random.RandomState(42)
x = rng.rand(1000000)
y = rng.rand(1000000)

# Time the vectorized element-wise addition.
%timeit x + y

3.46 ms ± 376 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
# Same addition, but computed one element at a time through a Python generator
# and collected into an array with np.fromiter, for comparison with x + y.
%timeit np.fromiter((xi + yi for xi, yi in zip(x, y)), dtype=x.dtype, count=len(x))

320 ms ± 54.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# Boolean mask from a compound expression: True where x > 0.5 and y < 0.5.
mask = (x > 0.5) & (y < 0.5)

In [16]:
# The compound mask spelled out step by step, with each comparison stored in
# its own temporary array before the two are combined.
tmp1 = (x > 0.5)
tmp2 = (y < 0.5)
mask = tmp1 & tmp2 # roughly equivalent to mask = (x > 0.5) & (y < 0.5)

In [17]:
# Every intermediate step is explicitly allocated in memory.
# When the x and y arrays are very large, this causes significant memory and computational overhead.
# The Numexpr library can compute this kind of compound expression element by element without allocating the intermediate arrays.

In [19]:
import numexpr

# Evaluate the compound comparison from a string expression; no variable
# dictionary is passed, so x and y are looked up by name in the calling scope.
mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')
# Both masks are boolean, so compare them exactly instead of using the
# floating-point tolerance test that np.allclose applies.
np.array_equal(mask, mask_numexpr)

True

In [20]:
# pandas.eval() for efficient operations

import pandas as pd

# Four same-shaped frames of uniform random floats, drawn one after another
# from a freshly seeded generator.
nrows = 100000
ncols = 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = [
    pd.DataFrame(rng.rand(nrows, ncols))
    for _ in range(4)
]

In [21]:
# Preview the first rows of df1.
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.37454,0.950714,0.731994,0.598658,0.156019,0.155995,0.058084,0.866176,0.601115,0.708073,...,0.119594,0.713245,0.760785,0.561277,0.770967,0.493796,0.522733,0.427541,0.025419,0.107891
1,0.031429,0.63641,0.314356,0.508571,0.907566,0.249292,0.410383,0.755551,0.228798,0.07698,...,0.093103,0.897216,0.900418,0.633101,0.33903,0.34921,0.725956,0.89711,0.887086,0.779876
2,0.642032,0.08414,0.161629,0.898554,0.606429,0.009197,0.101472,0.663502,0.005062,0.160808,...,0.0305,0.037348,0.822601,0.360191,0.127061,0.522243,0.769994,0.215821,0.62289,0.085347
3,0.051682,0.531355,0.540635,0.63743,0.726091,0.975852,0.5163,0.322956,0.795186,0.270832,...,0.990505,0.412618,0.372018,0.776413,0.340804,0.930757,0.858413,0.428994,0.750871,0.754543
4,0.103124,0.902553,0.505252,0.826457,0.32005,0.895523,0.389202,0.010838,0.905382,0.091287,...,0.455657,0.620133,0.277381,0.188121,0.463698,0.353352,0.583656,0.077735,0.974395,0.986211


In [22]:
# Baseline: add the four DataFrames with ordinary pandas operators.
%timeit df1 + df2 + df3 + df4

80.6 ms ± 5.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
# The same sum, written as a string expression and evaluated by pd.eval().
%timeit pd.eval('df1 + df2 + df3 + df4')

32.6 ms ± 2.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [26]:
# np.allclose returns True when two arrays are element-wise equal within a tolerance;
# used here to confirm that both ways of computing the sum agree.
np.allclose( df1 + df2 + df3 + df4, pd.eval('df1 + df2 + df3 + df4'))

True

In [27]:
# Rebind df1..df4 (and add df5) to five small frames: 100 rows x 3 columns of
# random integers drawn with rng.randint(0, 1000, ...).
df1, df2, df3, df4, df5 = [
    pd.DataFrame(rng.randint(0, 1000, (100, 3)))
    for _ in range(5)
]

In [28]:
# pd.eval() supports all arithmetic, comparison, and bitwise operators.

# Arithmetic: the same expression evaluated by pandas directly and from a
# string by pd.eval(). The added parentheses only make the grouping explicit.
result1 = (-df1 * df2) / (df3 + df4) - df5
result2 = pd.eval('-df1 * df2 / (df3 + df4) -df5')

np.allclose(result1, result2)

True

In [29]:
# Comparison operators: element-wise comparisons combined with &, evaluated
# by pandas directly and from a string by pd.eval().
result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
result2 = pd.eval('(df1 < df2) & (df2 <= df3) & (df3 != df4)')

# The results are boolean, so check exact equality rather than applying the
# floating-point tolerance test of np.allclose.
np.array_equal(result1, result2)

True

In [30]:
# Bitwise operators: & and | combining element-wise comparisons, evaluated by
# pandas directly and from a string by pd.eval().
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')

# The results are boolean, so check exact equality rather than applying the
# floating-point tolerance test of np.allclose.
np.array_equal(result1, result2)

True

In [33]:
# Inside a pd.eval() string the keywords `and` / `or` may be used in boolean
# expressions; the result is compared with result1, which was built with & and |.
result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')

# Boolean results: compare exactly instead of with a floating-point tolerance.
np.array_equal(result1, result3)

True

In [34]:
# pd.eval() supports attribute access with obj.attr syntax and indexing with
# obj[index] syntax.

result1 = df2.T[0] + df3.iloc[1]
result2 = pd.eval('df2.T[0] + df3.iloc[1]')
np.allclose(result1, result2)

True

In [35]:
# Other constructs -- function calls, conditional statements, loops, and other
# more complex expressions -- are not supported; the Numexpr library can be
# used for those instead.

# Three-column frame (A, B, C) of random floats used by the cells that follow.
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head()

Unnamed: 0,A,B,C
0,0.375506,0.406939,0.069938
1,0.069087,0.235615,0.154374
2,0.677945,0.433839,0.652324
3,0.264038,0.808055,0.347197
4,0.589161,0.252418,0.557789


In [37]:
# (rows, columns) of df.
df.shape

(1000, 3)

In [40]:
# Expressions that use columns: the same column arithmetic written with
# bracket indexing and as a pd.eval() string using attribute access.

result1 = (df['A'] + df['B']) / (df['C'] -1)
result2 = pd.eval("(df.A + df.B) / (df.C -1)")

np.allclose(result1, result2)

True

In [41]:
# DataFrame.eval() evaluates the string against df itself, so columns are
# referred to by bare name (A, B, C) instead of df.A, df.B, df.C.
result3 = df.eval('(A + B) / (C - 1)')
np.allclose(result1, result3)

True

In [43]:
# Create a new column and assign values to it from inside the expression.
# NOTE: inplace=True mutates df, so the cells below see the added column D.

df.eval('D = (A + B) / C', inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.375506,0.406939,0.069938,11.18762
1,0.069087,0.235615,0.154374,1.973796
2,0.677945,0.433839,0.652324,1.704344
3,0.264038,0.808055,0.347197,3.087857
4,0.589161,0.252418,0.557789,1.508776


In [44]:
# Assigning to a column name that already exists overwrites it: D is recomputed
# from (A - B) / C, again modifying df in place.
df.eval('D = (A - B) / C', inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.375506,0.406939,0.069938,-0.449425
1,0.069087,0.235615,0.154374,-1.078728
2,0.677945,0.433839,0.652324,0.374209
3,0.264038,0.808055,0.347197,-1.566886
4,0.589161,0.252418,0.557789,0.603708


In [45]:
# Additional syntax is supported for working with local Python variables:
# inside DataFrame.eval() the "@" prefix refers to a variable, not a column.

column_mean = df.mean(axis=1)  # mean taken along axis 1

result1 = df['A'] + column_mean
result2 = df.eval('A + @column_mean')
np.allclose(result1, result2)

True

In [46]:
# The query() method

# Row filtering written with ordinary boolean-mask indexing and, equivalently,
# as a pd.eval() string.
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)

True

In [47]:
# The same filter expressed with DataFrame.query(), which takes only the
# condition and refers to columns by bare name.
result2 = df.query('A < 0.5 and B < 0.5')
np.allclose(result1, result2)

True

In [48]:
# query() also accepts the "@" prefix to reference a local Python variable.
Cmean = df['C'].mean()
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
# No parentheses around the comparisons here: per the pandas docs, & inside
# query() has the precedence of `and`, so this reads as (A < Cmean) and (B < Cmean).
result2 = df.query('A < @Cmean & B < @Cmean')
np.allclose(result1, result2)

True

In [49]:
# Performance: when deciding whether to use these functions, two things need
# to be considered -- computation time and memory use.

# NOTE: this rebinds x, which earlier in the notebook held a NumPy array.
x = df[(df.A < 0.5) & (df.B < 0.5)]

In [50]:
# The same selection broken into explicit steps, with every intermediate
# boolean result bound to a temporary name.
tmp1 = df.A < 0.5
tmp2 = df.B < 0.5
tmp3 = tmp1 & tmp2
x = df[tmp3]

In [52]:
# Size in bytes of the array returned by df.values.
df.values.nbytes

32000

In [None]:
# In terms of performance, eval() can be faster as long as system memory is not exceeded.
# In practice, the difference in computation time between the traditional methods and eval/query is usually not significant.
# If anything, the traditional methods are faster for small arrays.
# The benefit of eval/query lies mainly in the memory saved, and sometimes in the cleaner syntax.