# Reduce memory usage of large CSV files

In [1]:
import pandas as pd
import numpy as np

In [2]:
adv = pd.read_csv("../data/advanced.csv")
prg = pd.read_csv("../data/per_game.csv")

In [3]:
def reduce_mem_usage(df):
    start_memory = df.memory_usage().sum() / 1024**2
    print("------------------------------------------------------------")
    print("Memory usage of dataframe is: " + str(start_memory) + " MB.")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            col_min = df[col].min()
            col_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if col_min > np.iinfo(np.int8).min and col_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif col_min > np.iinfo(np.int16).min and col_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif col_min > np.iinfo(np.int32).min and col_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif col_min > np.iinfo(np.int64).min and col_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if col_min > np.finfo(np.float16).min and col_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif col_min > np.finfo(np.float32).min and col_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")
    
    end_memory = df.memory_usage().sum() / 1024**2
    p_decrease = (start_memory - end_memory) / start_memory
    print("Memory usage after optimization is: " + str(end_memory) + " MB.")
    print("Decreased memory by " + str(p_decrease * 100) + "%")
    print("------------------------------------------------------------")

In [4]:
reduce_mem_usage(adv)

------------------------------------------------------------
Memory usage of dataframe is: 2.8660888671875 MB.
Memory usage after optimization is: 0.7522506713867188 MB.
Decreased memory by 73.75340730014055%
------------------------------------------------------------


In [5]:
adv.head()

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Season,All_NBA
0,Mark Acres,C,27,ORL,80,1691,8.296875,0.536133,0.014,0.420898,...,1.099609,0.600098,1.599609,0.046997,-3.099609,-0.600098,-3.699219,-0.700195,1990,0
1,Michael Adams,PG,27,DEN,79,2690,15.398438,0.541992,0.437012,0.316895,...,4.398438,2.5,6.898438,0.124023,1.799805,-0.099976,1.799805,2.599609,1990,0
2,Mark Aguirre,SF,30,DET,78,2005,15.796875,0.543945,0.104004,0.282959,...,3.099609,2.5,5.699219,0.135986,1.0,0.0,1.0,1.5,1990,0
3,Danny Ainge,PG,30,SAC,75,2727,16.09375,0.527832,0.25,0.230957,...,2.699219,2.099609,4.800781,0.085022,1.200195,-0.099976,1.099609,2.099609,1990,0
4,Mark Alarie,PF,26,WSB,82,1893,14.101562,0.509766,0.062012,0.168945,...,1.5,1.599609,3.099609,0.078979,-0.600098,-0.700195,-1.299805,0.300049,1990,0


In [7]:
adv.to_csv("reduced_adv.csv")