# Reduce memory usage of large CSV files

In [1]:
import pandas as pd
import numpy as np

In [2]:
adv = pd.read_csv("../data/advanced.csv")
prg = pd.read_csv("../data/per_game.csv")

In [3]:
def reduce_mem_usage(df):
    start_memory = df.memory_usage().sum() / 1024**2
    print("------------------------------------------------------------")
    print("Memory usage of dataframe is: " + str(start_memory) + " MB.")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            col_min = df[col].min()
            col_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if col_min > np.iinfo(np.int8).min and col_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif col_min > np.iinfo(np.int16).min and col_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif col_min > np.iinfo(np.int32).min and col_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif col_min > np.iinfo(np.int64).min and col_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if col_min > np.finfo(np.float16).min and col_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif col_min > np.finfo(np.float32).min and col_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")
    
    end_memory = df.memory_usage().sum() / 1024**2
    p_decrease = (start_memory - end_memory) / start_memory
    print("Memory usage after optimization is: " + str(end_memory) + " MB.")
    print("Decreased memory by " + str(p_decrease * 100) + "%")
    print("------------------------------------------------------------")

In [4]:
reduce_mem_usage(adv)
reduce_mem_usage(prg)

------------------------------------------------------------
Memory usage of dataframe is: 2.8660888671875 MB.
Memory usage after optimization is: 0.7522506713867188 MB.
Decreased memory by 73.75340730014055%
------------------------------------------------------------
------------------------------------------------------------
Memory usage of dataframe is: 3.17315673828125 MB.
Memory usage after optimization is: 0.81622314453125 MB.
Decreased memory by 74.27725095693319%
------------------------------------------------------------


In [5]:
adv.to_pickle("../data/advanced.pkl")
prg.to_pickle("../data/per_game.pkl")