# Reduce memory usage by pandas  
By default, pandas stores numeric data as int64 and float64, which is often wider than necessary. We can reduce memory usage by downcasting columns to smaller types such as int8 or int16. Note that converting float data to a narrower float type may lose some precision.  

In [12]:
import pandas as pd
import numpy as np

def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Each column is handled according to its dtype:

    * ``object`` columns are converted to ``category``.
    * Signed integer, unsigned integer and float columns are converted to
      the narrowest dtype of the same kind whose range contains the
      column's minimum and maximum (bounds inclusive).  A column is never
      widened.
    * Everything else (bool, datetime, timedelta, category, pandas
      extension dtypes, ...) is left untouched.

    Memory usage before and after, and the relative saving, are printed.

    Args:
        df: The DataFrame to optimise.  It is modified in place.
        allow_float16: When True (the default) float columns may be stored
            as float16, which keeps only about three significant decimal
            digits.  Pass False to use float32 as the narrowest float type.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target dtypes per numpy dtype kind, narrowest first.  The
    # 64-bit types are omitted because a column is never widened.
    candidates_by_kind = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
        'f': (np.float16, np.float32) if allow_float16 else (np.float32,),
    }

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy int / uint / float columns are downcast.  Going
        # by dtype kind keeps bool, datetime and extension dtypes out of
        # the numeric range checks below.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        # Empty or all-NaN columns have no min/max to base a decision on.
        if df[col].count() == 0:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates_by_kind[col_type.kind]:
            candidate = np.dtype(candidate)
            if candidate.itemsize >= col_type.itemsize:
                # Already at least this narrow; nothing to gain.
                break
            limits = np.finfo(candidate) if col_type.kind == 'f' else np.iinfo(candidate)
            # Inclusive bounds: the extreme values themselves are representable.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [29]:
# Build a 100000 x 100 frame of random integers. randint's upper bound is
# exclusive, so every value lies in 1..99.
array = np.random.randint(1,100,size=(100000,100))
data = pd.DataFrame(array)
# Total memory used by the frame, converted from bytes to MB.
data.memory_usage().sum()/1024**2

38.1470947265625

In [31]:
# Show the first five rows before optimisation, for later comparison.
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,12,74,37,13,71,61,93,95,8,55,...,79,93,53,35,77,28,43,94,86,9
1,27,51,84,9,47,88,36,33,14,66,...,73,86,33,54,78,62,12,64,82,97
2,63,83,87,67,71,78,34,2,54,49,...,57,13,13,32,65,4,83,4,28,54
3,15,52,9,57,39,91,23,87,43,87,...,27,69,84,33,31,84,88,87,20,82
4,41,57,99,83,48,91,38,23,36,45,...,78,77,96,60,95,7,12,34,28,89


In [32]:
# reduce_mem_usage changes the column dtypes of the frame it is given and
# returns that same frame; it also prints the before/after memory figures.
data = reduce_mem_usage(data)
# The integer values should be unchanged; only their storage dtype differs.
data.head(5)

Memory usage of dataframe is 38.15 MB
Memory usage after optimization is: 9.54 MB
Decreased by 75.0%


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,12,74,37,13,71,61,93,95,8,55,...,79,93,53,35,77,28,43,94,86,9
1,27,51,84,9,47,88,36,33,14,66,...,73,86,33,54,78,62,12,64,82,97
2,63,83,87,67,71,78,34,2,54,49,...,57,13,13,32,65,4,83,4,28,54
3,15,52,9,57,39,91,23,87,43,87,...,27,69,84,33,31,84,88,87,20,82
4,41,57,99,83,48,91,38,23,36,45,...,78,77,96,60,95,7,12,34,28,89


In [35]:
# Build a 100000 x 100 frame of samples from the standard normal distribution.
array = np.random.standard_normal(size=(100000,100))
data = pd.DataFrame(array)
# Total memory used by the frame, converted from bytes to MB.
data.memory_usage().sum()/1024**2

76.2940673828125

In [36]:
# Show the first five rows before optimisation, for later comparison.
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.077592,0.072872,1.291744,-0.608602,-0.844491,0.848183,-0.382064,-0.104247,1.552509,-0.30078,...,1.343131,2.042646,-0.855465,0.043209,1.084276,-0.113975,1.152395,-0.760854,-0.88599,0.836362
1,0.204343,1.247096,-1.313617,0.598853,-1.381371,0.671011,1.127554,0.706862,0.908093,1.930721,...,-1.018889,-0.195704,0.910329,-0.7999,-0.408897,-0.818052,-0.047788,0.852965,0.864803,0.864705
2,0.994113,-1.359532,-0.110589,0.583824,-1.403925,0.504861,-0.582632,-0.713782,0.31031,-0.808794,...,-0.806247,0.733014,0.188284,1.294415,-0.252979,-0.460128,0.348113,-0.320677,0.220596,0.556743
3,-0.382661,-0.669749,1.097721,1.353773,-0.438872,-0.243771,0.210782,1.029143,1.275212,-0.924238,...,0.668171,-0.739298,-1.659308,-1.182945,-2.813609,0.629367,-1.012866,-0.490169,-1.476727,-1.252422
4,-2.221255,1.718117,-0.475548,-1.146333,0.178759,-0.039649,0.681325,0.679493,0.400707,0.243066,...,0.450272,0.877788,0.259984,-0.125988,-0.540219,-0.991885,0.064818,2.06543,0.747034,-1.528125


In [37]:
# Downcast the float columns; the before/after memory figures are printed.
data = reduce_mem_usage(data)
# Compare with the earlier preview: the values differ slightly after the
# third or fourth significant digit because of the narrower float type.
data.head(5)

Memory usage of dataframe is 76.29 MB
Memory usage after optimization is: 19.07 MB
Decreased by 75.0%


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.077148,0.072876,1.291992,-0.608398,-0.844727,0.848145,-0.38208,-0.104248,1.552734,-0.300781,...,1.342773,2.042969,-0.855469,0.043213,1.083984,-0.113953,1.152344,-0.760742,-0.88623,0.836426
1,0.204346,1.24707,-1.313477,0.598633,-1.381836,0.670898,1.12793,0.707031,0.908203,1.930664,...,-1.018555,-0.195679,0.910156,-0.799805,-0.408936,-0.817871,-0.047791,0.853027,0.864746,0.864746
2,0.994141,-1.359375,-0.110596,0.583984,-1.404297,0.504883,-0.58252,-0.713867,0.310303,-0.808594,...,-0.806152,0.73291,0.188232,1.293945,-0.25293,-0.460205,0.348145,-0.320557,0.220581,0.556641
3,-0.382568,-0.669922,1.097656,1.353516,-0.438965,-0.243774,0.210815,1.029297,1.275391,-0.924316,...,0.667969,-0.739258,-1.65918,-1.182617,-2.814453,0.629395,-1.012695,-0.490234,-1.476562,-1.251953
4,-2.220703,1.717773,-0.475586,-1.146484,0.178711,-0.039642,0.681152,0.679688,0.400635,0.243042,...,0.450195,0.87793,0.26001,-0.125977,-0.540039,-0.991699,0.064819,2.066406,0.74707,-1.52832
