In [1]:
import os
import gc
import time
import tempfile
import warnings
import functools
warnings.filterwarnings(action='ignore')

import pandas as pd

from memory_profiler import memory_usage

%load_ext memory_profiler

In [2]:
pd.__version__

'2.0.3'

In [3]:
TMP_DIR = "./tmp"
os.makedirs(TMP_DIR, exist_ok=True)

SIZE_MAP = {
    "KB": 1024 ** 1,
    "MB": 1024 ** 2,
    "GB": 1024 ** 3,
}


def profile(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        initial_memory = memory_usage()[0]
        result = func(*args, **kwargs)
        peak_memory = max(memory_usage())
        increment_memory = peak_memory - initial_memory
        end_time = time.time()
        print(f"[{func.__name__}]\nElapsed Time: {end_time - start_time:,.3f} seconds,\nPeak Memory: {peak_memory*1.04858:,.2f} MB,\nIncrement Memory: {increment_memory*1.04858:,.2f} MB")
        return result
    return wrapper


def memory_usage_pdobj(pdobj, size="MB"):
    return pdobj.memory_usage(deep=True).sum() / SIZE_MAP.get(size, "MB")

In [4]:
@profile
def save_as(df: pd.DataFrame, file_format: str) -> str:
    temp_filename = tempfile.mktemp(dir=TMP_DIR)
    if file_format == "csv":
        df.to_csv(temp_filename, index=False)
    elif file_format == "pickle":
        df.to_pickle(temp_filename)
    elif file_format == "feather":
        df.to_feather(temp_filename)
    elif file_format == "hdf5":
        df.to_hdf(temp_filename, key='df', mode='w')
    elif file_format == "parquet":
        df.to_parquet(temp_filename)
    return temp_filename


@profile
def load_as(temp_filename: str, file_format: str) -> None:
    if file_format == "csv":
        df = pd.read_csv(temp_filename)
    elif file_format == "pickle":
        df = pd.read_pickle(temp_filename)
    elif file_format == "feather":
        df = pd.read_feather(temp_filename)
    elif file_format == "hdf5":
        df = pd.read_hdf(temp_filename, 'df')
    elif file_format == "parquet":
        df = pd.read_parquet(temp_filename)

    del(df)
    gc.collect()
    os.remove(temp_filename)


def get_filesize(filepath: str, size="MB"):
    return os.path.getsize(filepath) / SIZE_MAP.get(size, "MB")


In [5]:
GJ_FILEPATH = "../data/open-nhis-gj-all.csv"
T20_FILEPATH = "../data/open-nhis-t20-all.csv"

## 1. Memory Optimization

In [6]:
%%time
%%memit
gj_np = pd.read_csv(GJ_FILEPATH)

peak memory: 12490.92 MiB, increment: 12362.89 MiB
CPU times: user 24 s, sys: 3.79 s, total: 27.8 s
Wall time: 28.2 s


In [7]:
%%time
%%memit
gj_pa = pd.read_csv(GJ_FILEPATH, dtype_backend='pyarrow', engine='pyarrow')

peak memory: 16495.77 MiB, increment: 6740.85 MiB
CPU times: user 13.2 s, sys: 3.8 s, total: 17 s
Wall time: 2.38 s


In [8]:
print(f"GJ Numpy: {memory_usage_pdobj(gj_np):,.2f} MB")
print(f"GJ PyArrow: {memory_usage_pdobj(gj_pa):,.2f} MB")

GJ Numpy: 10,442.86 MB
GJ PyArrow: 3,393.22 MB


### 1.1 데이터 형식 변환

데이터 형식 변환에 따른 메모리 사용량

- 코드북 사용
- 파일을 읽을 때 데이터 타입 지정

(+) pyarrow 데이터 타입을 지정했을 때 변화

#### 코드북 사용

In [9]:
CODEBOOK = {
    "SEX": {1: "남자", 2: "여자"},
    "SIDO": {11: "서울특별시", 26: "부산광역시", 27: "대구광역시", 28: "인천광역시", 29: "광주광역시", 30: "대전광역시",
             31: "울산광역시", 36: "세종특별자치시", 42: "강원도", 43: "충청북도", 44: "충청남도", 45: "전라북도",
             46: "전라남도", 47: "경상북도", 48: "경상남도", 49: "제주특별자치도"},
    "HEAR_LEFT": {1: "정상", 2: "비정상"},
    "HEAR_RIGHT": {1: "정상", 2: "비정상"},
    "SMK_STAT_TYPE_CD": {1: "비흡연", 2: "금연", 3: "흡연"},
    "DRK_YN": {0: "N", 1: "Y"},
}

CODEBOOK = {col: {v: str(k) for k, v in d.items()} for col, d in CODEBOOK.items()}

In [10]:
%%time
%%memit
gj_np = gj_np.replace(CODEBOOK)

peak memory: 19293.38 MiB, increment: 2795.93 MiB
CPU times: user 21.6 s, sys: 1.65 s, total: 23.2 s
Wall time: 23.6 s


In [11]:
%%time
%%memit
gj_pa = gj_pa.replace(CODEBOOK)

peak memory: 17312.34 MiB, increment: 754.93 MiB
CPU times: user 11.4 s, sys: 414 ms, total: 11.8 s
Wall time: 12.2 s


In [12]:
print(f"GJ Numpy: {memory_usage_pdobj(gj_np):,.2f} MB")
print(f"GJ PyArrow: {memory_usage_pdobj(gj_pa):,.2f} MB") 

GJ Numpy: 8,005.85 MB
GJ PyArrow: 2,846.67 MB


#### 데이터 타입 지정

In [13]:
GJ_USECOLS = ["HCHK_YEAR", "IDV_ID", "SEX", "AGE_GROUP", "SIDO", "HEIGHT", "WEIGHT", "SIGHT_LEFT", "SIGHT_RIGHT",
              "HEAR_LEFT", "HEAR_RIGHT", "BP_HIGH", "BP_LWST", "BLDS", "TOT_CHOLE", "HMG", "OLIG_PROTE_CD",
              "SGOT_AST", "SGPT_ALT", "GAMMA_GTP", "SMK_STAT_TYPE_CD", "DRK_YN"]

# 2.0부터는 data type으로 float8, float16 사용 못함
GJ_DTYPES_NP = {
    "HCHK_YEAR": "int16",
    "IDV_ID": "int32",
    "SEX": "category",
    "AGE_GROUP": "category",
    "SIDO": "category",
    "HEIGHT": "float32",
    "WEIGHT": "float32",
    "SIGHT_LEFT": "category",
    "SIGHT_RIGHT": "category",
    "HEAR_LEFT": "category",
    "HEAR_RIGHT": "category",
    "BP_HIGH": "float32",
    "BP_LWST": "float32",
    "BLDS": "float32",
    "TOT_CHOLE": "float32",
    "HMG": "float32",
    "OLIG_PROTE_CD": "float32",
    "SGOT_AST": "float32",
    "SGPT_ALT": "float32",
    "GAMMA_GTP": "float32",
    "SMK_STAT_TYPE_CD": "category",
    "DRK_YN": "category",
}

GJ_DTYPES_PA = {k: v + "[pyarrow]" if v.startswith(("int", "float")) else v for k, v in GJ_DTYPES_NP.items()}

In [14]:
%%time
%%memit
gj_np = pd.read_csv(GJ_FILEPATH, dtype=GJ_DTYPES_NP)

peak memory: 19578.28 MiB, increment: 2263.48 MiB
CPU times: user 23.1 s, sys: 2.44 s, total: 25.6 s
Wall time: 26 s


In [15]:
gj_np.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18887937 entries, 0 to 18887936
Data columns (total 22 columns):
 #   Column            Dtype   
---  ------            -----   
 0   HCHK_YEAR         int16   
 1   IDV_ID            int32   
 2   SEX               category
 3   AGE_GROUP         category
 4   SIDO              category
 5   HEIGHT            float32 
 6   WEIGHT            float32 
 7   SIGHT_LEFT        category
 8   SIGHT_RIGHT       category
 9   HEAR_LEFT         category
 10  HEAR_RIGHT        category
 11  BP_HIGH           float32 
 12  BP_LWST           float32 
 13  BLDS              float32 
 14  TOT_CHOLE         float32 
 15  HMG               float32 
 16  OLIG_PROTE_CD     float32 
 17  SGOT_AST          float32 
 18  SGPT_ALT          float32 
 19  GAMMA_GTP         float32 
 20  SMK_STAT_TYPE_CD  category
 21  DRK_YN            category
dtypes: category(9), float32(11), int16(1), int32(1)
memory usage: 1.0 GB


In [16]:
%%time
%%memit
gj_pa = pd.read_csv(GJ_FILEPATH, dtype=GJ_DTYPES_PA, dtype_backend="pyarrow", engine='pyarrow') 

peak memory: 19466.49 MiB, increment: 4346.62 MiB
CPU times: user 15.8 s, sys: 4.65 s, total: 20.5 s
Wall time: 5.51 s


In [17]:
gj_pa.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18887937 entries, 0 to 18887936
Data columns (total 22 columns):
 #   Column            Dtype         
---  ------            -----         
 0   HCHK_YEAR         int16[pyarrow]
 1   IDV_ID            int32[pyarrow]
 2   SEX               category      
 3   AGE_GROUP         category      
 4   SIDO              category      
 5   HEIGHT            float[pyarrow]
 6   WEIGHT            float[pyarrow]
 7   SIGHT_LEFT        category      
 8   SIGHT_RIGHT       category      
 9   HEAR_LEFT         category      
 10  HEAR_RIGHT        category      
 11  BP_HIGH           float[pyarrow]
 12  BP_LWST           float[pyarrow]
 13  BLDS              float[pyarrow]
 14  TOT_CHOLE         float[pyarrow]
 15  HMG               float[pyarrow]
 16  OLIG_PROTE_CD     float[pyarrow]
 17  SGOT_AST          float[pyarrow]
 18  SGPT_ALT          float[pyarrow]
 19  GAMMA_GTP         float[pyarrow]
 20  SMK_STAT_TYPE_CD  category      
 21  DRK_YN

In [18]:
print(f"GJ Numpy: {memory_usage_pdobj(gj_np):,.2f} MB")
print(f"GJ PyArrow: {memory_usage_pdobj(gj_pa):,.2f} MB") 

GJ Numpy: 1,062.77 MB
GJ PyArrow: 1,092.04 MB


### 1.2 파일 포맷에 따른 처리시간, 메모리사용량, 파일크기 비교
csv, pickle, feather, hdf5, parquet 포맷간 비교분석
- 읽기/쓰기 시간 비교
- IO 시에 메모리 사용량 비교
- 저장된 파일의 크기 비교

(+) pyarrow 데이터 타입을 지정했을 때 변화

In [19]:
gj_np = pd.read_csv(GJ_FILEPATH)

In [20]:
gj_pa = pd.read_csv(GJ_FILEPATH, dtype_backend='pyarrow', engine='pyarrow')

In [21]:
for file_format in ["csv", "pickle", "feather", "parquet"]:
    print(f"\n\n=== {file_format} ===")
    temp = save_as(gj_np, file_format)
    print(f"File size: {get_filesize(temp):,.2f} MB")
    load_as(temp, file_format)



=== csv ===
[save_as]
Elapsed Time: 165.138 seconds,
Peak Memory: 23,910.24 MB,
Increment Memory: 3.28 MB
File size: 2,175.33 MB
[load_as]
Elapsed Time: 27.930 seconds,
Peak Memory: 24,498.01 MB,
Increment Memory: 587.77 MB


=== pickle ===
[save_as]
Elapsed Time: 3.454 seconds,
Peak Memory: 24,498.03 MB,
Increment Memory: 0.02 MB
File size: 2,926.95 MB
[load_as]
Elapsed Time: 4.167 seconds,
Peak Memory: 23,534.17 MB,
Increment Memory: -963.86 MB


=== feather ===
[save_as]
Elapsed Time: 4.849 seconds,
Peak Memory: 23,563.28 MB,
Increment Memory: 29.10 MB
File size: 1,068.58 MB
[load_as]
Elapsed Time: 2.638 seconds,
Peak Memory: 23,842.08 MB,
Increment Memory: 278.81 MB


=== parquet ===
[save_as]
Elapsed Time: 11.225 seconds,
Peak Memory: 23,862.82 MB,
Increment Memory: 20.73 MB
File size: 301.21 MB
[load_as]
Elapsed Time: 3.534 seconds,
Peak Memory: 24,060.43 MB,
Increment Memory: 197.62 MB


In [22]:
for file_format in ["csv", "pickle", "feather", "parquet"]:
    print(f"\n\n=== {file_format} ===")
    temp = save_as(gj_pa, file_format)
    print(f"File size: {get_filesize(temp):,.2f} MB")
    load_as(temp, file_format)



=== csv ===
[save_as]
Elapsed Time: 188.767 seconds,
Peak Memory: 24,068.79 MB,
Increment Memory: 8.34 MB
File size: 2,175.33 MB
[load_as]
Elapsed Time: 26.222 seconds,
Peak Memory: 25,215.53 MB,
Increment Memory: 1,146.74 MB


=== pickle ===
[save_as]
Elapsed Time: 1.840 seconds,
Peak Memory: 25,215.54 MB,
Increment Memory: 0.02 MB
File size: 3,412.85 MB
[load_as]
Elapsed Time: 0.653 seconds,
Peak Memory: 25,366.99 MB,
Increment Memory: 151.45 MB


=== feather ===
[save_as]
Elapsed Time: 1.673 seconds,
Peak Memory: 25,367.83 MB,
Increment Memory: 0.84 MB
File size: 1,072.04 MB
[load_as]
Elapsed Time: 1.071 seconds,
Peak Memory: 25,477.20 MB,
Increment Memory: 109.38 MB


=== parquet ===
[save_as]
Elapsed Time: 7.304 seconds,
Peak Memory: 25,477.35 MB,
Increment Memory: 0.15 MB
File size: 302.34 MB
[load_as]
Elapsed Time: 1.388 seconds,
Peak Memory: 25,485.41 MB,
Increment Memory: 8.06 MB


## 2. Performance Optimization

In [23]:
import numpy as np
try:
    import pyarrow as pa
except:
    pass

### 2.1 건강위험지수 산출
numeric arithmetic operations의 성능
- iteration
- np vectorization
- pa vectorization

In [24]:
SCORING_COLS = ['IDV_ID', 'SEX', 'AGE_GROUP', 'HEIGHT', 'WEIGHT', 'BLDS', 'TOT_CHOLE',
                'HMG', 'OLIG_PROTE_CD', 'SGOT_AST', 'SGPT_ALT', 'SMK_STAT_TYPE_CD', 'DRK_YN']


def scoring_health_pd(patient):
    bmi = ((patient['WEIGHT'] / (patient['HEIGHT']/100)**2) >= 30) * 1  # 체질량(BMI)
    blds = (patient['BLDS'] >= 125) * 1  # 공복혈당 BLDS
    chole = (patient['TOT_CHOLE'] >= 130) * 1  # 총콜레스테롤 TOT_CHOLE
    hmg = (patient['HMG'] < 12) * 1  # 혈색소 HMG
    sg = ((patient['SGOT_AST'] >= 40) | (patient['SGPT_ALT'] >= 40)) * 1  # 혈청지오티 SGOT_AST, SGPT_ALT
    smoke = (patient['SMK_STAT_TYPE_CD'] == 3) * 1  # 흡연 SMOKE
    drink = (patient['DRK_YN'] == 1) * 1 # 음주 DRINK
    
    patient_score = np.sum([bmi, blds, chole, hmg, sg, smoke, drink], axis=0)
    return patient_score


def scoring_health_np(patient):
    bmi = ((patient['WEIGHT'].values / (patient['HEIGHT'].values/100)**2) >= 30) * 1  # 체질량(BMI)
    blds = (patient['BLDS'].values >= 125) * 1  # 공복혈당 BLDS
    chole = (patient['TOT_CHOLE'].values >= 130) * 1  # 총콜레스테롤 TOT_CHOLE
    hmg = (patient['HMG'].values < 12) * 1  # 혈색소 HMG
    sg = ((patient['SGOT_AST'].values >= 40) | (patient['SGPT_ALT'].values >= 40)) * 1  # 혈청지오티 SGOT_AST, SGPT_ALT
    smoke = (patient['SMK_STAT_TYPE_CD'].values == 3) * 1  # 흡연 SMOKE
    drink = (patient['DRK_YN'].values == 1) * 1 # 음주 DRINK
    
    patient_score = np.sum([bmi, blds, chole, hmg, sg, smoke, drink], axis=0)
    return patient_score

def scoring_health_pa(patient):
    bmi = ((patient['WEIGHT'].array / (patient['HEIGHT'].array/100)**2) >= 30).astype('int8[pyarrow]')  # 체질량(BMI)
    blds = (patient['BLDS'].array >= 125).astype('int8[pyarrow]')  # 공복혈당 BLDS
    chole = (patient['TOT_CHOLE'].array >= 130).astype('int8[pyarrow]')  # 총콜레스테롤 TOT_CHOLE
    hmg = (patient['HMG'].array < 12).astype('int8[pyarrow]')  # 혈색소 HMG
    sg = ((patient['SGOT_AST'].array >= 40) | (patient['SGPT_ALT'].array >= 40)).astype('int8[pyarrow]')  # 혈청지오티 SGOT_AST, SGPT_ALT
    smoke = (patient['SMK_STAT_TYPE_CD'] == 3).astype('int8[pyarrow]')  # 흡연 SMOKE
    drink = (patient['DRK_YN'] == 1).astype('int8[pyarrow]') # 음주 DRINK
    
    patient_score = np.sum([bmi, blds, chole, hmg, sg, smoke, drink], axis=0)
    return patient_score

In [25]:
%%time
%%memit
gj_np = pd.read_csv(GJ_FILEPATH, usecols=SCORING_COLS, dtype=GJ_DTYPES_NP)
gj_np = gj_np.replace(CODEBOOK).fillna(method="bfill")

peak memory: 24678.58 MiB, increment: 373.89 MiB
CPU times: user 16.1 s, sys: 1.89 s, total: 18 s
Wall time: 18.5 s


In [26]:
%%time
%%memit
gj_pa = pd.read_csv(GJ_FILEPATH, usecols=SCORING_COLS, dtype=GJ_DTYPES_PA, dtype_backend="pyarrow")
gj_pa = gj_pa.replace(CODEBOOK).fillna(method="bfill")

peak memory: 26468.77 MiB, increment: 5008.12 MiB
CPU times: user 1min 15s, sys: 3.89 s, total: 1min 19s
Wall time: 1min 19s


#### Iteration using `iterrows()`

In [27]:
%%time
scores_itterrows = []
for _, patient in gj_np.iterrows():
    patient_score = scoring_health_pd(patient)
    scores_itterrows.append(patient_score)

CPU times: user 12min 13s, sys: 4.79 s, total: 12min 18s
Wall time: 12min 18s


#### Iteration using `.apply()`

In [28]:
%%time
%%memit
scores_apply = gj_np.apply(scoring_health_pd, axis=1)

peak memory: 38070.17 MiB, increment: 8916.32 MiB
CPU times: user 6min 32s, sys: 4.77 s, total: 6min 37s
Wall time: 6min 38s


#### Vectorization with `pd.Series`

In [29]:
%%time
%%memit
scores_pd = scoring_health_pd(gj_np)

peak memory: 30202.95 MiB, increment: 1005.60 MiB
CPU times: user 501 ms, sys: 655 ms, total: 1.16 s
Wall time: 1.54 s


#### Vectorization with `np.array`

In [30]:
%%time
%%memit
scores_np = scoring_health_np(gj_np)

peak memory: 30205.04 MiB, increment: 1007.82 MiB
CPU times: user 493 ms, sys: 619 ms, total: 1.11 s
Wall time: 1.5 s


#### Vectorization with `pyarrow`

In [31]:
%%time
%%memit
scores_pa = scoring_health_pa(gj_pa)

peak memory: 29337.27 MiB, increment: 139.57 MiB
CPU times: user 661 ms, sys: 224 ms, total: 886 ms
Wall time: 1.29 s


#### Python List, NumPy Array, PyArrow Array

In [32]:
%%time
%%memit
# Python List
weight_py = gj_np["WEIGHT"].tolist()
height_py = gj_np["HEIGHT"].tolist()

bmi_py = list(map(lambda a, b: a / (b/100)**2, weight_py, height_py))

peak memory: 30735.50 MiB, increment: 1537.67 MiB
CPU times: user 3.54 s, sys: 505 ms, total: 4.05 s
Wall time: 4.44 s


In [33]:
%%time
%%memit
# Numpy Array
weight_np = gj_np["WEIGHT"].values
height_np = gj_np["HEIGHT"].values

bmi_np = np.divide(weight_np, np.power(np.divide(height_np, 100), 2))

peak memory: 30704.66 MiB, increment: 0.01 MiB
CPU times: user 652 ms, sys: 155 ms, total: 807 ms
Wall time: 1.21 s


In [34]:
%%time
%%memit
# PyArrow Array
weight_pa = pa.array(gj_np["WEIGHT"])
height_pa = pa.array(gj_np["HEIGHT"])

bmi_pa = pa.compute.divide(weight_pa, pa.compute.power(pa.compute.divide(height_pa, 100), 2))

peak memory: 30833.55 MiB, increment: 128.94 MiB
CPU times: user 768 ms, sys: 79.3 ms, total: 848 ms
Wall time: 1.24 s


### 2.2 병원 비용이 높은 최상위 진료내역서 5개

In [35]:
T20_DTYPES_NP = {
    "STND_Y": "int16",
    "IDV_ID": "int32",
    "KEY_SEQ": "int32",
    "SEX": "category",
    "AGE_GROUP": "category",
    "SIDO": "category",
    "RECU_FR_DT": "string",
    "MAIN_SICK": "category",
    "SUB_SICK": "category",
    "VSCN": "int16",
    "EDEC_ADD_RT": "float32",
    "EDEC_TRAMT": "int32",
    "EDEC_SBRDN_AMT": "int32",
    "EDEC_JBRDN_AMT": "int32",
}

T20_DTYPES_PA = {k: v + "[pyarrow]" if v.startswith(("int", "float", "string")) else v for k, v in T20_DTYPES_NP.items()}

In [36]:
%%time
%%memit
t20_np = pd.read_csv(T20_FILEPATH, dtype=T20_DTYPES_NP)
t20_np = t20_np.replace(CODEBOOK)

peak memory: 37664.20 MiB, increment: 6887.55 MiB
CPU times: user 2min 36s, sys: 20.5 s, total: 2min 56s
Wall time: 3min 1s


In [37]:
%%time
%%memit
t20_pa = pd.read_csv(T20_FILEPATH, dtype=T20_DTYPES_PA, dtype_backend="pyarrow", engine='pyarrow')
t20_pa = t20_pa.replace(CODEBOOK)

peak memory: 35717.24 MiB, increment: 11490.71 MiB
CPU times: user 1min 42s, sys: 58.6 s, total: 2min 41s
Wall time: 1min 28s


#### `sorting -> head`

In [38]:
%%time
%%memit
result_np = t20_np["EDEC_TRAMT"].sort_values(ascending=False).head(5)

peak memory: 16402.54 MiB, increment: 1312.99 MiB
CPU times: user 18.3 s, sys: 6.42 s, total: 24.7 s
Wall time: 28.2 s


In [39]:
%%time
%%memit
result_pa = t20_pa["EDEC_TRAMT"].sort_values(ascending=False).head(5)

peak memory: 18741.06 MiB, increment: 5124.00 MiB
CPU times: user 20.7 s, sys: 2.52 s, total: 23.2 s
Wall time: 23.6 s


#### `nlargest`

In [40]:
%%time
%%memit
result_np = t20_np["EDEC_TRAMT"].nlargest(5)

peak memory: 19467.38 MiB, increment: 2118.30 MiB
CPU times: user 1.44 s, sys: 1.14 s, total: 2.57 s
Wall time: 3.07 s


In [41]:
%%time
%%memit
result_pa = t20_pa["EDEC_TRAMT"].nlargest(5)

peak memory: 18805.21 MiB, increment: 3575.41 MiB
CPU times: user 2.09 s, sys: 1.47 s, total: 3.56 s
Wall time: 3.97 s


### 2.3 건강위험지수 5 이상인 사람들의 진료내역 추출
filtering operation의 성능
- list comprehension
- apply
- pd isin
- pd query
- np isin
- pd merge
- pa ???

(+) pyarrow 데이터 타입을 지정했을 때 변화

In [42]:
gj_np.head()

Unnamed: 0,IDV_ID,SEX,AGE_GROUP,HEIGHT,WEIGHT,BLDS,TOT_CHOLE,HMG,OLIG_PROTE_CD,SGOT_AST,SGPT_ALT,SMK_STAT_TYPE_CD,DRK_YN
0,725578,2,1,140.0,45.0,91.0,216.0,13.4,1.0,20.0,11.0,1,1
1,118183,2,1,145.0,35.0,75.0,162.0,13.0,1.0,26.0,12.0,1,1
2,667818,2,1,145.0,35.0,85.0,149.0,12.7,1.0,16.0,15.0,1,1
3,877233,2,1,145.0,40.0,88.0,170.0,12.6,1.0,17.0,10.0,1,1
4,917332,2,1,145.0,40.0,85.0,242.0,12.2,1.0,9.0,10.0,1,1


In [43]:
PATIENTS = gj_np[scores_np >= 5]["IDV_ID"].unique().tolist()
print(f"Number of patients scoring above 5 : {len(PATIENTS):,}")

Number of patients scoring above 5 : 940


#### List Comprehension

In [44]:
%%time
%%memit
tmp = t20_np[[x in PATIENTS for x in t20_np["IDV_ID"]]][["IDV_ID"]]

peak memory: 18570.79 MiB, increment: 5208.49 MiB
CPU times: user 26min 6s, sys: 9.79 s, total: 26min 16s
Wall time: 26min 29s


#### `pd.DataFrame.apply()`

In [45]:
%%time
%%memit
tmp = t20_np[t20_np["IDV_ID"].apply(lambda x: x in PATIENTS)][["IDV_ID"]]

peak memory: 18561.05 MiB, increment: 1739.88 MiB
CPU times: user 26min 15s, sys: 10 s, total: 26min 25s
Wall time: 26min 27s


#### `pd.DataFrame.isin()`

In [46]:
%%time
%%memit
tmp = t20_np[t20_np.isin({"IDV_ID": PATIENTS})["IDV_ID"]][["IDV_ID"]]

peak memory: 20025.65 MiB, increment: 8401.74 MiB
CPU times: user 29.4 s, sys: 7.05 s, total: 36.4 s
Wall time: 37.9 s


#### `pd.DataFrame.query()`

In [47]:
%%time
%%memit
tmp = t20_np[["IDV_ID"]].query("IDV_ID in @PATIENTS")

peak memory: 20615.93 MiB, increment: 589.33 MiB
CPU times: user 3.08 s, sys: 1.76 s, total: 4.84 s
Wall time: 6.02 s


#### `pd.DataFrame.merge()`


In [48]:
%%time
%%memit
tmp = t20_np.merge(pd.Series(PATIENTS, name='IDV_ID'), how='inner', on='IDV_ID')

peak memory: 20064.92 MiB, increment: 4348.41 MiB
CPU times: user 10.1 s, sys: 2.3 s, total: 12.4 s
Wall time: 12.9 s


#### `np.isin()`

In [49]:
%%time
%%memit
tmp = t20_np[np.isin(t20_np["IDV_ID"].values, PATIENTS)][["IDV_ID"]]

peak memory: 17171.03 MiB, increment: 822.85 MiB
CPU times: user 1.17 s, sys: 389 ms, total: 1.56 s
Wall time: 1.96 s


#### `pa.compute.is_in()`

In [50]:
%%time
%%memit
tmp = t20_np[pa.compute.is_in(pa.array(t20_pa["IDV_ID"]), pa.array(PATIENTS)).to_pandas()][["IDV_ID"]]

peak memory: 17388.33 MiB, increment: 913.15 MiB
CPU times: user 2.19 s, sys: 314 ms, total: 2.51 s
Wall time: 2.91 s


### PyArrow datatype 데이터프레임에서의 시행

#### `np.isin()`

In [51]:
%%time
%%memit
tmp = t20_pa[np.isin(t20_pa["IDV_ID"].array, np.array(PATIENTS))][["IDV_ID"]]

peak memory: 20621.08 MiB, increment: 3397.46 MiB
CPU times: user 1.39 s, sys: 1.61 s, total: 2.99 s
Wall time: 3.39 s


#### `pa.compute.is_in()`

In [52]:
%%time
%%memit
tmp = t20_pa[pa.compute.is_in(pa.array(t20_pa["IDV_ID"]), pa.array(PATIENTS)).to_pandas()][["IDV_ID"]]

peak memory: 20797.61 MiB, increment: 175.41 MiB
CPU times: user 2.17 s, sys: 193 ms, total: 2.36 s
Wall time: 2.76 s


In [53]:
%%time
%%memit
tmp = t20_pa[["IDV_ID"]].query("IDV_ID in @PATIENTS")

peak memory: 22006.80 MiB, increment: 1383.84 MiB
CPU times: user 2.49 s, sys: 897 ms, total: 3.39 s
Wall time: 3.87 s


In [54]:
%%time
%%memit
tmp = t20_pa.merge(pd.Series(PATIENTS, name='IDV_ID', dtype='int32[pyarrow]'), how='inner', on='IDV_ID')

peak memory: 21859.88 MiB, increment: 3090.37 MiB
CPU times: user 7.07 s, sys: 2.14 s, total: 9.21 s
Wall time: 9.84 s
