In [4]:
!pip install joblib
!pip install diskcache


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting diskcache
  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: diskcache
Successfully installed diskcache-5.6.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [9]:
import pandas as pd
import numpy as np
import time
import os
import psutil
import gc
from functools import lru_cache
from joblib import Memory
from diskcache import Cache

# テストデータの生成（既に生成済みの場合はスキップ可能）
import pandas as pd
import numpy as np
import time

def generate_test_data(n_rows=20000, n_cols=50, output_path='test_data_large.pkl'):
    """Build a synthetic DataFrame, pickle it to disk and print it.

    The frame has a sequential ``id`` column, an hourly ``timestamp`` column
    starting at 2023-01-01, and ``value_2`` .. ``value_{n_cols - 1}`` columns
    of uniform random floats. The global NumPy RNG is seeded with 42, so the
    generated values are reproducible.

    Args:
        n_rows: Number of rows to generate.
        n_cols: Total number of columns, counting ``id`` and ``timestamp``.
            Values below 2 still produce those two base columns.
        output_path: Destination of the pickle file. Defaults to the file
            name the benchmark code reads.

    Returns:
        None. The frame is written to ``output_path`` and printed.
    """
    np.random.seed(42)
    data = {
        'id': range(n_rows),
        # An explicit Timedelta step is used instead of the 'H' alias, which
        # pandas deprecated in 2.2 (it emits a FutureWarning there).
        'timestamp': pd.date_range(
            start='2023-01-01', periods=n_rows, freq=pd.Timedelta(hours=1)
        ),
    }

    # Additional numeric columns
    for i in range(2, n_cols):
        data[f'value_{i}'] = np.random.rand(n_rows)

    df = pd.DataFrame(data)
    df.to_pickle(output_path)
    print(f"Generated test data: {n_rows} rows, {n_cols} columns")
    print(df)

# Write the sample pickle to disk so the loaders below have a file to read.
generate_test_data()

from functools import lru_cache
from joblib import Memory
from diskcache import Cache
import os
import psutil

# キャッシュ実装
@lru_cache(maxsize=1)
def load_data_lru_cache(file_path):
    """Unpickle ``file_path``, memoising the most recent result in memory.

    With ``maxsize=1`` only the result for the last distinct path is kept;
    repeated calls with that path return the very same object.
    """
    frame = pd.read_pickle(file_path)
    return frame

# On-disk memoisation store rooted at ./joblib_cache (quiet mode).
joblib_memory = Memory('./joblib_cache', verbose=0)


@joblib_memory.cache
def load_data_joblib(file_path):
    """Unpickle ``file_path``; results are memoised through ``joblib_memory``."""
    frame = pd.read_pickle(file_path)
    return frame

# Cache instance stored under ./diskcache_cache.
diskcache = Cache('./diskcache_cache')


def load_data_diskcache(file_path):
    """Return the object for ``file_path`` from ``diskcache``, loading on a miss.

    Entries are stored under the key ``data:<file_path>``. When the cache has
    nothing for that key, the pickle is read and stored before returning.
    """
    cache_key = f'data:{file_path}'
    cached_frame = diskcache.get(cache_key)
    if cached_frame is not None:
        return cached_frame

    loaded_frame = pd.read_pickle(file_path)
    diskcache.set(cache_key, loaded_frame)
    return loaded_frame

def load_data_no_cache(file_path):
    """Baseline loader: unpickle ``file_path`` on every call, with no caching."""
    frame = pd.read_pickle(file_path)
    return frame

# 性能測定関数
def measure_performance(load_func, file_path, num_iterations=10):
    """Time repeated calls to a loader and report the change in process RSS.

    Args:
        load_func: Callable invoked as ``load_func(file_path)``.
        file_path: Path handed to ``load_func`` on every iteration.
        num_iterations: Number of calls to time; must be at least 1.

    Returns:
        Tuple ``(avg_time, mem_used)``: the mean seconds per call, and the
        difference in resident set size between the start and the end of the
        loop, in MB (bytes / 1024 / 1024).

    Raises:
        ValueError: If ``num_iterations`` is less than 1 (the average would
            otherwise divide by zero or be meaningless).
    """
    if num_iterations < 1:
        raise ValueError(
            f"num_iterations must be at least 1, got {num_iterations}"
        )

    gc.collect()  # run garbage collection before taking the memory baseline

    # Memory usage at the start
    process = psutil.Process(os.getpid())
    start_mem = process.memory_info().rss

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() follows the adjustable system clock.
    start_time = time.perf_counter()
    for _ in range(num_iterations):
        # The result stays bound to a local so the last loaded object is
        # still alive when the end-of-run RSS is sampled below.
        df = load_func(file_path)
    end_time = time.perf_counter()

    # Memory usage at the end
    end_mem = process.memory_info().rss

    avg_time = (end_time - start_time) / num_iterations
    mem_used = (end_mem - start_mem) / (1024 * 1024)  # in MB

    return avg_time, mem_used

# テスト実行
file_path = 'test_data_large.pkl'
implementations = [
    ("No Cache", load_data_no_cache),
    ("lru_cache", load_data_lru_cache),
    ("joblib", load_data_joblib),
    ("diskcache", load_data_diskcache),
]

# Label -> callable that empties that implementation's cache. Labels without
# an entry (the uncached baseline) have nothing to reset.
cache_resetters = {
    "lru_cache": load_data_lru_cache.cache_clear,
    "joblib": joblib_memory.clear,
    "diskcache": diskcache.clear,
}

print("Performance Test Results:")
print("-------------------------")
for name, func in implementations:
    # Clear the cache, then measure the first (cold) load
    reset_cache = cache_resetters.get(name)
    if reset_cache is not None:
        reset_cache()

    first_load_time, first_mem_usage = measure_performance(func, file_path, 1)
    print(f"{name}:")
    print(f"  First load time: {first_load_time:.4f} seconds")
    print(f"  First load memory usage: {first_mem_usage:.2f} MB")

    # Measure the subsequent loads with the default iteration count
    cached_load_time, cached_mem_usage = measure_performance(func, file_path)
    print(f"  Cached load time: {cached_load_time:.4f} seconds")
    print(f"  Cached load memory usage: {cached_mem_usage:.2f} MB")
    print()

# Report the size of the data file
file_size = os.path.getsize(file_path) / (1024 * 1024)  # in MB
print(f"Test data file size: {file_size:.2f} MB")

[Memory(location=./joblib_cache/joblib)]: Flushing completely the cache


Generated test data: 20000 rows, 50 columns
          id           timestamp   value_2   value_3   value_4   value_5  \
0          0 2023-01-01 00:00:00  0.374540  0.729998  0.298912  0.741555   
1          1 2023-01-01 01:00:00  0.950714  0.184512  0.094818  0.881102   
2          2 2023-01-01 02:00:00  0.731994  0.346640  0.126359  0.463180   
3          3 2023-01-01 03:00:00  0.598658  0.663281  0.180671  0.289179   
4          4 2023-01-01 04:00:00  0.156019  0.482089  0.203653  0.318847   
...      ...                 ...       ...       ...       ...       ...   
19995  19995 2025-04-13 03:00:00  0.877039  0.754034  0.966141  0.322129   
19996  19996 2025-04-13 04:00:00  0.046814  0.764527  0.373240  0.374626   
19997  19997 2025-04-13 05:00:00  0.303698  0.269569  0.304675  0.381702   
19998  19998 2025-04-13 06:00:00  0.443320  0.434320  0.407363  0.129632   
19999  19999 2025-04-13 07:00:00  0.172265  0.487424  0.522833  0.947287   

        value_6   value_7   value_8   value

In [12]:
import pandas as pd
import numpy as np
import time
import os
import psutil
import gc
from functools import lru_cache
from joblib import Memory
from diskcache import Cache

# テストデータの生成
def generate_large_test_data(n_rows=30000, n_cols=100, output_path='test_data_large.pkl'):
    """Build a synthetic DataFrame and pickle it to disk.

    The frame has an ``id`` column of random integers in ``[0, 1000)``, a
    ``timestamp`` column stepping one second from 2023-01-01, and
    ``value_2`` .. ``value_{n_cols - 1}`` columns of uniform random floats.
    The global NumPy RNG is seeded with 42, so the values are reproducible.

    Args:
        n_rows: Number of rows to generate.
        n_cols: Total number of columns, counting ``id`` and ``timestamp``.
            Values below 2 still produce those two base columns.
        output_path: Destination of the pickle file. Defaults to the file
            name the benchmark code reads.

    Returns:
        None. The frame is written to ``output_path``.
    """
    np.random.seed(42)
    data = {
        'id': np.random.randint(0, 1000, n_rows),
        # An explicit Timedelta step is used instead of the 'S' alias, which
        # pandas deprecated in 2.2 (it emits a FutureWarning there).
        'timestamp': pd.date_range(
            start='2023-01-01', periods=n_rows, freq=pd.Timedelta(seconds=1)
        ),
    }

    for i in range(2, n_cols):
        data[f'value_{i}'] = np.random.rand(n_rows)

    df = pd.DataFrame(data)
    df.to_pickle(output_path)
    print(f"Generated test data: {n_rows} rows, {n_cols} columns")

# キャッシュ実装
@lru_cache(maxsize=1)
def load_data_lru_cache(file_path):
    """Unpickle ``file_path``, memoising the most recent result in memory.

    With ``maxsize=1`` only the result for the last distinct path is kept;
    repeated calls with that path return the very same object.
    """
    frame = pd.read_pickle(file_path)
    return frame

# On-disk memoisation store rooted at ./joblib_cache (quiet mode).
joblib_memory = Memory('./joblib_cache', verbose=0)


@joblib_memory.cache
def load_data_joblib(file_path):
    """Unpickle ``file_path``; results are memoised through ``joblib_memory``."""
    frame = pd.read_pickle(file_path)
    return frame

# Cache instance stored under ./diskcache_cache.
diskcache = Cache('./diskcache_cache')


def load_data_diskcache(file_path):
    """Return the object for ``file_path`` from ``diskcache``, loading on a miss.

    Entries are stored under the key ``data:<file_path>``. When the cache has
    nothing for that key, the pickle is read and stored before returning.
    """
    cache_key = f'data:{file_path}'
    cached_frame = diskcache.get(cache_key)
    if cached_frame is not None:
        return cached_frame

    loaded_frame = pd.read_pickle(file_path)
    diskcache.set(cache_key, loaded_frame)
    return loaded_frame

def load_data_no_cache(file_path):
    """Baseline loader: unpickle ``file_path`` on every call, with no caching."""
    frame = pd.read_pickle(file_path)
    return frame

# 複雑な操作の例
def perform_operation(df):
    """Aggregate four value columns per ``id`` group.

    Args:
        df: DataFrame with an ``id`` column and ``value_2`` .. ``value_5``.

    Returns:
        DataFrame indexed by ``id`` holding the mean of ``value_2``, the sum
        of ``value_3``, the max of ``value_4`` and the min of ``value_5``.
    """
    aggregations = {
        'value_2': 'mean',
        'value_3': 'sum',
        'value_4': 'max',
        'value_5': 'min',
    }
    per_id = df.groupby('id')
    return per_id.agg(aggregations)

# パフォーマンス測定関数
def measure_performance(load_func, file_path, num_iterations=100):
    """Time repeated load-and-aggregate rounds and report the change in RSS.

    Each iteration calls ``load_func(file_path)`` and passes the result to
    ``perform_operation``.

    Args:
        load_func: Callable invoked as ``load_func(file_path)``.
        file_path: Path handed to ``load_func`` on every iteration.
        num_iterations: Number of rounds to time; must be at least 1.

    Returns:
        Tuple ``(avg_time, mem_used)``: the mean seconds per round, and the
        difference in resident set size between the start and the end of the
        loop, in MB (bytes / 1024 / 1024).

    Raises:
        ValueError: If ``num_iterations`` is less than 1 (the average would
            otherwise divide by zero or be meaningless).
    """
    if num_iterations < 1:
        raise ValueError(
            f"num_iterations must be at least 1, got {num_iterations}"
        )

    gc.collect()
    process = psutil.Process(os.getpid())
    start_mem = process.memory_info().rss

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() follows the adjustable system clock.
    start_time = time.perf_counter()
    for _ in range(num_iterations):
        # Both results stay bound to locals so the last objects are still
        # alive when the end-of-run RSS is sampled below.
        df = load_func(file_path)
        result = perform_operation(df)
    end_time = time.perf_counter()

    end_mem = process.memory_info().rss

    avg_time = (end_time - start_time) / num_iterations
    mem_used = (end_mem - start_mem) / (1024 * 1024)  # in MB

    return avg_time, mem_used

# メイン実行部分
if __name__ == "__main__":
    # Generate the test data (only needed on the first run)
    generate_large_test_data()

    file_path = 'test_data_large.pkl'
    implementations = [
        ("No Cache", load_data_no_cache),
        ("lru_cache", load_data_lru_cache),
        ("joblib", load_data_joblib),
        ("diskcache", load_data_diskcache),
    ]

    # Label -> callable that empties that implementation's cache. Labels
    # without an entry (the uncached baseline) have nothing to reset.
    cache_resetters = {
        "lru_cache": load_data_lru_cache.cache_clear,
        "joblib": joblib_memory.clear,
        "diskcache": diskcache.clear,
    }

    print("Performance Test Results:")
    print("-------------------------")
    for name, func in implementations:
        # Clear the cache, then measure the first (cold) load
        reset_cache = cache_resetters.get(name)
        if reset_cache is not None:
            reset_cache()

        first_load_time, first_mem_usage = measure_performance(func, file_path, 1)
        print(f"{name}:")
        print(f"  First load time: {first_load_time:.4f} seconds")
        print(f"  First load memory usage: {first_mem_usage:.2f} MB")

        # Measure the subsequent loads with the default iteration count
        cached_load_time, cached_mem_usage = measure_performance(func, file_path)
        print(f"  Cached load time: {cached_load_time:.4f} seconds")
        print(f"  Cached load memory usage: {cached_mem_usage:.2f} MB")
        print()

    # Report the size of the data file
    file_size = os.path.getsize(file_path) / (1024 * 1024)  # in MB
    print(f"Test data file size: {file_size:.2f} MB")

Generated test data: 30000 rows, 100 columns
Performance Test Results:
-------------------------
No Cache:
  First load time: 0.0058 seconds
  First load memory usage: 0.00 MB


[Memory(location=./joblib_cache/joblib)]: Flushing completely the cache


  Cached load time: 0.0039 seconds
  Cached load memory usage: 6.44 MB

lru_cache:
  First load time: 0.0122 seconds
  First load memory usage: 22.17 MB
  Cached load time: 0.0014 seconds
  Cached load memory usage: 0.00 MB

joblib:
  First load time: 0.0440 seconds
  First load memory usage: 38.41 MB
  Cached load time: 0.0054 seconds
  Cached load memory usage: 6.57 MB

diskcache:
  First load time: 0.0757 seconds
  First load memory usage: 22.30 MB
  Cached load time: 0.0038 seconds
  Cached load memory usage: 22.42 MB

Test data file size: 22.89 MB


In [13]:
%pip install cachetools

Collecting cachetools
  Downloading cachetools-5.4.0-py3-none-any.whl (9.5 kB)
Installing collected packages: cachetools
Successfully installed cachetools-5.4.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [15]:
import pandas as pd
import numpy as np
import time
import os
import psutil
import gc
from functools import lru_cache
from joblib import Memory
from diskcache import Cache
from cachetools import TTLCache, cached

# テストデータの生成
def generate_large_test_data(n_rows=1000000, n_cols=150, output_path='test_data_large.pkl'):
    """Build a synthetic DataFrame and pickle it to disk.

    The frame has an ``id`` column of random integers in ``[0, 1000)``, a
    ``timestamp`` column stepping one second from 2023-01-01, and
    ``value_2`` .. ``value_{n_cols - 1}`` columns of uniform random floats.
    The global NumPy RNG is seeded with 42, so the values are reproducible.

    Args:
        n_rows: Number of rows to generate.
        n_cols: Total number of columns, counting ``id`` and ``timestamp``.
            Values below 2 still produce those two base columns.
        output_path: Destination of the pickle file. Defaults to the file
            name the benchmark code reads.

    Returns:
        None. The frame is written to ``output_path``.
    """
    np.random.seed(42)
    data = {
        'id': np.random.randint(0, 1000, n_rows),
        # An explicit Timedelta step is used instead of the 'S' alias, which
        # pandas deprecated in 2.2 (it emits a FutureWarning there).
        'timestamp': pd.date_range(
            start='2023-01-01', periods=n_rows, freq=pd.Timedelta(seconds=1)
        ),
    }

    for i in range(2, n_cols):
        data[f'value_{i}'] = np.random.rand(n_rows)

    df = pd.DataFrame(data)
    df.to_pickle(output_path)
    print(f"Generated test data: {n_rows} rows, {n_cols} columns")

# キャッシュ実装
@lru_cache(maxsize=1)
def load_data_lru_cache(file_path):
    """Unpickle ``file_path``, memoising the most recent result in memory.

    With ``maxsize=1`` only the result for the last distinct path is kept;
    repeated calls with that path return the very same object.
    """
    frame = pd.read_pickle(file_path)
    return frame

# On-disk memoisation store rooted at ./joblib_cache (quiet mode).
joblib_memory = Memory('./joblib_cache', verbose=0)


@joblib_memory.cache
def load_data_joblib(file_path):
    """Unpickle ``file_path``; results are memoised through ``joblib_memory``."""
    frame = pd.read_pickle(file_path)
    return frame

# Cache instance stored under ./diskcache_cache.
diskcache = Cache('./diskcache_cache')


def load_data_diskcache(file_path):
    """Return the object for ``file_path`` from ``diskcache``, loading on a miss.

    Entries are stored under the key ``data:<file_path>``. When the cache has
    nothing for that key, the pickle is read and stored before returning.
    """
    cache_key = f'data:{file_path}'
    cached_frame = diskcache.get(cache_key)
    if cached_frame is not None:
        return cached_frame

    loaded_frame = pd.read_pickle(file_path)
    diskcache.set(cache_key, loaded_frame)
    return loaded_frame

def load_data_no_cache(file_path):
    """Baseline loader: unpickle ``file_path`` on every call, with no caching."""
    frame = pd.read_pickle(file_path)
    return frame

# cachetoolsの実装
# Single-entry cache whose entries expire after 3600 seconds (one hour).
ttl_cache = TTLCache(maxsize=1, ttl=3600)


@cached(cache=ttl_cache)
def load_data_cachetools(file_path):
    """Unpickle ``file_path``; results are memoised in ``ttl_cache``."""
    frame = pd.read_pickle(file_path)
    return frame

# 複雑な操作の例
def perform_operation(df):
    """Aggregate four value columns per ``id`` group.

    Args:
        df: DataFrame with an ``id`` column and ``value_2`` .. ``value_5``.

    Returns:
        DataFrame indexed by ``id`` holding the mean of ``value_2``, the sum
        of ``value_3``, the max of ``value_4`` and the min of ``value_5``.
    """
    aggregations = {
        'value_2': 'mean',
        'value_3': 'sum',
        'value_4': 'max',
        'value_5': 'min',
    }
    per_id = df.groupby('id')
    return per_id.agg(aggregations)

# パフォーマンス測定関数
def measure_performance(load_func, file_path, num_iterations=100):
    """Time repeated load-and-aggregate rounds and report the change in RSS.

    Each iteration calls ``load_func(file_path)`` and passes the result to
    ``perform_operation``.

    Args:
        load_func: Callable invoked as ``load_func(file_path)``.
        file_path: Path handed to ``load_func`` on every iteration.
        num_iterations: Number of rounds to time; must be at least 1.

    Returns:
        Tuple ``(avg_time, mem_used)``: the mean seconds per round, and the
        difference in resident set size between the start and the end of the
        loop, in MB (bytes / 1024 / 1024).

    Raises:
        ValueError: If ``num_iterations`` is less than 1 (the average would
            otherwise divide by zero or be meaningless).
    """
    if num_iterations < 1:
        raise ValueError(
            f"num_iterations must be at least 1, got {num_iterations}"
        )

    gc.collect()
    process = psutil.Process(os.getpid())
    start_mem = process.memory_info().rss

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() follows the adjustable system clock.
    start_time = time.perf_counter()
    for _ in range(num_iterations):
        # Both results stay bound to locals so the last objects are still
        # alive when the end-of-run RSS is sampled below.
        df = load_func(file_path)
        result = perform_operation(df)
    end_time = time.perf_counter()

    end_mem = process.memory_info().rss

    avg_time = (end_time - start_time) / num_iterations
    mem_used = (end_mem - start_mem) / (1024 * 1024)  # in MB

    return avg_time, mem_used

# メイン実行部分
if __name__ == "__main__":
    # Test data generation is disabled here, so the pickle must already
    # exist on disk (uncomment the call below to regenerate it).
    # generate_large_test_data()

    file_path = 'test_data_large.pkl'
    implementations = [
        ("No Cache", load_data_no_cache),
        ("lru_cache", load_data_lru_cache),
        ("joblib", load_data_joblib),
        ("diskcache", load_data_diskcache),
        ("cachetools", load_data_cachetools),
    ]

    # Label -> callable that empties that implementation's cache. Labels
    # without an entry (the uncached baseline) have nothing to reset.
    cache_resetters = {
        "lru_cache": load_data_lru_cache.cache_clear,
        "joblib": joblib_memory.clear,
        "diskcache": diskcache.clear,
        "cachetools": ttl_cache.clear,
    }

    print("Performance Test Results:")
    print("-------------------------")
    for name, func in implementations:
        # Clear the cache, then measure the first (cold) load
        reset_cache = cache_resetters.get(name)
        if reset_cache is not None:
            reset_cache()

        first_load_time, first_mem_usage = measure_performance(func, file_path, 1)
        print(f"{name}:")
        print(f"  First load time: {first_load_time:.4f} seconds")
        print(f"  First load memory usage: {first_mem_usage:.2f} MB")

        # Measure the subsequent loads with the default iteration count
        cached_load_time, cached_mem_usage = measure_performance(func, file_path)
        print(f"  Cached load time: {cached_load_time:.4f} seconds")
        print(f"  Cached load memory usage: {cached_mem_usage:.2f} MB")
        print()

    # Report the size of the data file
    file_size = os.path.getsize(file_path) / (1024 * 1024)  # in MB
    print(f"Test data file size: {file_size:.2f} MB")

Performance Test Results:
-------------------------
No Cache:
  First load time: 0.0139 seconds
  First load memory usage: 22.16 MB


[Memory(location=./joblib_cache/joblib)]: Flushing completely the cache


  Cached load time: 0.0040 seconds
  Cached load memory usage: 22.43 MB

lru_cache:
  First load time: 0.0052 seconds
  First load memory usage: 0.00 MB
  Cached load time: 0.0015 seconds
  Cached load memory usage: 0.00 MB

joblib:
  First load time: 0.0360 seconds
  First load memory usage: 15.98 MB
  Cached load time: 0.0057 seconds
  Cached load memory usage: 6.57 MB

diskcache:
  First load time: 0.0637 seconds
  First load memory usage: 0.02 MB
  Cached load time: 0.0042 seconds
  Cached load memory usage: 0.00 MB

cachetools:
  First load time: 0.0065 seconds
  First load memory usage: 0.00 MB
  Cached load time: 0.0015 seconds
  Cached load memory usage: 0.00 MB

Test data file size: 22.89 MB


In [17]:
import pandas as pd
import numpy as np
import time
import os
import psutil
import gc
from functools import lru_cache
from joblib import Memory
from diskcache import Cache
from cachetools import TTLCache, cached

# テストデータの生成
def generate_large_test_data(n_rows=1000000, n_cols=50, output_path='test_data_large.pkl'):
    """Build a synthetic DataFrame and pickle it to disk.

    The frame has an ``id`` column of random integers in ``[0, 1000)``, a
    ``timestamp`` column stepping one second from 2023-01-01, and
    ``value_2`` .. ``value_{n_cols - 1}`` columns of uniform random floats.
    The global NumPy RNG is seeded with 42, so the values are reproducible.

    Args:
        n_rows: Number of rows to generate.
        n_cols: Total number of columns, counting ``id`` and ``timestamp``.
            Values below 2 still produce those two base columns.
        output_path: Destination of the pickle file. Defaults to the file
            name the benchmark code reads.

    Returns:
        None. The frame is written to ``output_path``.
    """
    np.random.seed(42)
    data = {
        'id': np.random.randint(0, 1000, n_rows),
        # An explicit Timedelta step is used instead of the 'S' alias, which
        # pandas deprecated in 2.2 (it emits a FutureWarning there).
        'timestamp': pd.date_range(
            start='2023-01-01', periods=n_rows, freq=pd.Timedelta(seconds=1)
        ),
    }

    for i in range(2, n_cols):
        data[f'value_{i}'] = np.random.rand(n_rows)

    df = pd.DataFrame(data)
    df.to_pickle(output_path)
    print(f"Generated test data: {n_rows} rows, {n_cols} columns")

# キャッシュ実装
@lru_cache(maxsize=1)
def load_data_lru_cache(file_path):
    """Unpickle ``file_path``, memoising the most recent result in memory.

    With ``maxsize=1`` only the result for the last distinct path is kept;
    repeated calls with that path return the very same object.
    """
    frame = pd.read_pickle(file_path)
    return frame

# On-disk memoisation store rooted at ./joblib_cache (quiet mode).
joblib_memory = Memory('./joblib_cache', verbose=0)


@joblib_memory.cache
def load_data_joblib(file_path):
    """Unpickle ``file_path``; results are memoised through ``joblib_memory``."""
    frame = pd.read_pickle(file_path)
    return frame

# Cache instance stored under ./diskcache_cache.
diskcache = Cache('./diskcache_cache')


def load_data_diskcache(file_path):
    """Return the object for ``file_path`` from ``diskcache``, loading on a miss.

    Entries are stored under the key ``data:<file_path>``. When the cache has
    nothing for that key, the pickle is read and stored before returning.
    """
    cache_key = f'data:{file_path}'
    cached_frame = diskcache.get(cache_key)
    if cached_frame is not None:
        return cached_frame

    loaded_frame = pd.read_pickle(file_path)
    diskcache.set(cache_key, loaded_frame)
    return loaded_frame

def load_data_no_cache(file_path):
    """Baseline loader: unpickle ``file_path`` on every call, with no caching."""
    frame = pd.read_pickle(file_path)
    return frame

# cachetoolsの改善された実装
def get_file_mtime(file_path):
    """Return the last-modification time of ``file_path`` as a float timestamp."""
    file_stat = os.stat(file_path)
    return file_stat.st_mtime

# Single-entry cache whose entries expire after 3600 seconds (one hour).
ttl_cache = TTLCache(maxsize=1, ttl=3600)


def _path_and_mtime_key(file_path):
    """Cache key made of the path and its current modification time.

    Including the mtime means a changed file yields a different key, so the
    previously cached entry is not reused for it.
    """
    return (file_path, get_file_mtime(file_path))


@cached(cache=ttl_cache, key=_path_and_mtime_key)
def load_data_cachetools(file_path):
    """Unpickle ``file_path``; results are memoised in ``ttl_cache``."""
    frame = pd.read_pickle(file_path)
    return frame

# 複雑な操作の例
def perform_operation(df):
    """Aggregate four value columns per ``id`` group.

    Args:
        df: DataFrame with an ``id`` column and ``value_2`` .. ``value_5``.

    Returns:
        DataFrame indexed by ``id`` holding the mean of ``value_2``, the sum
        of ``value_3``, the max of ``value_4`` and the min of ``value_5``.
    """
    aggregations = {
        'value_2': 'mean',
        'value_3': 'sum',
        'value_4': 'max',
        'value_5': 'min',
    }
    per_id = df.groupby('id')
    return per_id.agg(aggregations)

# パフォーマンス測定関数
def measure_performance(load_func, file_path, num_iterations=100):
    """Time repeated load-and-aggregate rounds and report the change in RSS.

    Each iteration calls ``load_func(file_path)`` and passes the result to
    ``perform_operation``.

    Args:
        load_func: Callable invoked as ``load_func(file_path)``.
        file_path: Path handed to ``load_func`` on every iteration.
        num_iterations: Number of rounds to time; must be at least 1.

    Returns:
        Tuple ``(avg_time, mem_used)``: the mean seconds per round, and the
        difference in resident set size between the start and the end of the
        loop, in MB (bytes / 1024 / 1024).

    Raises:
        ValueError: If ``num_iterations`` is less than 1 (the average would
            otherwise divide by zero or be meaningless).
    """
    if num_iterations < 1:
        raise ValueError(
            f"num_iterations must be at least 1, got {num_iterations}"
        )

    gc.collect()
    process = psutil.Process(os.getpid())
    start_mem = process.memory_info().rss

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() follows the adjustable system clock.
    start_time = time.perf_counter()
    for _ in range(num_iterations):
        # Both results stay bound to locals so the last objects are still
        # alive when the end-of-run RSS is sampled below.
        df = load_func(file_path)
        result = perform_operation(df)
    end_time = time.perf_counter()

    end_mem = process.memory_info().rss

    avg_time = (end_time - start_time) / num_iterations
    mem_used = (end_mem - start_mem) / (1024 * 1024)  # in MB

    return avg_time, mem_used

# メイン実行部分
if __name__ == "__main__":
    # Test data generation is disabled here, so the pickle must already
    # exist on disk (uncomment the call below to regenerate it).
    # generate_large_test_data()

    file_path = 'test_data_large.pkl'
    implementations = [
        ("No Cache", load_data_no_cache),
        ("lru_cache", load_data_lru_cache),
        ("joblib", load_data_joblib),
        ("diskcache", load_data_diskcache),
        ("cachetools", load_data_cachetools),
    ]

    # Label -> callable that empties that implementation's cache. Labels
    # without an entry (the uncached baseline) have nothing to reset.
    cache_resetters = {
        "lru_cache": load_data_lru_cache.cache_clear,
        "joblib": joblib_memory.clear,
        "diskcache": diskcache.clear,
        "cachetools": ttl_cache.clear,
    }

    print("Performance Test Results:")
    print("-------------------------")
    for name, func in implementations:
        # Clear the cache, then measure the first (cold) load
        reset_cache = cache_resetters.get(name)
        if reset_cache is not None:
            reset_cache()

        first_load_time, first_mem_usage = measure_performance(func, file_path, 1)
        print(f"{name}:")
        print(f"  First load time: {first_load_time:.4f} seconds")
        print(f"  First load memory usage: {first_mem_usage:.2f} MB")

        # Measure the subsequent loads with the default iteration count
        cached_load_time, cached_mem_usage = measure_performance(func, file_path)
        print(f"  Cached load time: {cached_load_time:.4f} seconds")
        print(f"  Cached load memory usage: {cached_mem_usage:.2f} MB")
        print()

    # Simulate a file update by bumping the modification time only.
    # Appending text to the pickle (as was done before) opened a binary file
    # in text mode and made the data file grow on every run; os.utime changes
    # the mtime, which is what the mtime-keyed cache reacts to, and leaves
    # the contents intact.
    print("Simulating file update...")
    os.utime(file_path)

    print("Performance after file update:")
    print("-------------------------------")
    for name, func in implementations:
        update_load_time, update_mem_usage = measure_performance(func, file_path, 1)
        print(f"{name}:")
        print(f"  Load time after update: {update_load_time:.4f} seconds")
        print(f"  Memory usage after update: {update_mem_usage:.2f} MB")
        print()

    # Report the size of the data file
    file_size = os.path.getsize(file_path) / (1024 * 1024)  # in MB
    print(f"Test data file size: {file_size:.2f} MB")

    Performance Test Results:
    -------------------------
    No Cache:
      First load time: 0.0139 seconds
      First load memory usage: 22.17 MB
    [Memory(location=./joblib_cache/joblib)]: Flushing completely the cache
      Cached load time: 0.0039 seconds
      Cached load memory usage: 22.43 MB
    
    lru_cache:
      First load time: 0.0052 seconds
      First load memory usage: 0.00 MB
      Cached load time: 0.0015 seconds
      Cached load memory usage: 0.00 MB
    
    joblib:
      First load time: 0.0348 seconds
      First load memory usage: 15.98 MB
      Cached load time: 0.0057 seconds
      Cached load memory usage: 6.57 MB
    
    diskcache:
      First load time: 0.0688 seconds
      First load memory usage: 0.02 MB
      Cached load time: 0.0036 seconds
      Cached load memory usage: 0.00 MB
    
    cachetools:
      First load time: 0.0058 seconds
      First load memory usage: 0.00 MB
      Cached load time: 0.0015 seconds
      Cached load memory usage: 0.00 MB
    
    Simulating file update...
    Performance after file update:
    -------------------------------
    No Cache:
      Load time after update: 0.0056 seconds
      Memory usage after update: 0.00 MB
    
    lru_cache:
      Load time after update: 0.0016 seconds
      Memory usage after update: 0.00 MB
    
    joblib:
      Load time after update: 0.0071 seconds
      Memory usage after update: 0.00 MB
    
    diskcache:
      Load time after update: 0.0050 seconds
      Memory usage after update: 0.00 MB
    
    cachetools:
      Load time after update: 0.0052 seconds
      Memory usage after update: 0.00 MB
    
    Test data file size: 22.89 MB

Performance Test Results:
-------------------------
No Cache:
  First load time: 0.0139 seconds
  First load memory usage: 22.17 MB


[Memory(location=./joblib_cache/joblib)]: Flushing completely the cache


  Cached load time: 0.0039 seconds
  Cached load memory usage: 22.43 MB

lru_cache:
  First load time: 0.0052 seconds
  First load memory usage: 0.00 MB
  Cached load time: 0.0015 seconds
  Cached load memory usage: 0.00 MB

joblib:
  First load time: 0.0348 seconds
  First load memory usage: 15.98 MB
  Cached load time: 0.0057 seconds
  Cached load memory usage: 6.57 MB

diskcache:
  First load time: 0.0688 seconds
  First load memory usage: 0.02 MB
  Cached load time: 0.0036 seconds
  Cached load memory usage: 0.00 MB

cachetools:
  First load time: 0.0058 seconds
  First load memory usage: 0.00 MB
  Cached load time: 0.0015 seconds
  Cached load memory usage: 0.00 MB

Simulating file update...
Performance after file update:
-------------------------------
No Cache:
  Load time after update: 0.0056 seconds
  Memory usage after update: 0.00 MB

lru_cache:
  Load time after update: 0.0016 seconds
  Memory usage after update: 0.00 MB

joblib:
  Load time after update: 0.0071 seconds
  M