# Memory Optimization in Pandas by  Downcasting and changing the dtypes

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import cufflinks as cf

In [2]:
titanic_data=sns.load_dataset('titanic')

In [3]:
titanic_data[['deck','class']]=titanic_data[['deck','class']].astype('object')

In [4]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          714 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     889 non-null    object 
 8   class        891 non-null    object 
 9   who          891 non-null    object 
 10  adult_male   891 non-null    bool   
 11  deck         203 non-null    object 
 12  embark_town  889 non-null    object 
 13  alive        891 non-null    object 
 14  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB


### we can see the dataset's memory usage is ~ 92.4 KB


In [6]:
%%time
titanic_data[['who','deck']]

CPU times: user 1.02 ms, sys: 257 µs, total: 1.27 ms
Wall time: 1.07 ms


Unnamed: 0,who,deck
0,man,
1,woman,C
2,woman,
3,woman,C
4,man,
...,...,...
886,man,
887,woman,B
888,woman,
889,man,C


## Numeric dtype Downcasting

In [7]:
def downcaste_dtype(df):
    previous_memory_consumption=(df.memory_usage().sum())/1024**2 ## Usage in MB
    for col in df.columns:
        col_dtype=df[col].dtypes
        if(col_dtype!='object'):
            xmin=df[col].min()
            xmax=df[col].max()
            if(str(df[col].dtype)[0]=='i'): #for int downcasting
                if(np.iinfo('int8').min<xmin and np.iinfo('int8').max>xmax):
                    df[col]=df[col].astype('int8')
                    
                elif(np.iinfo('int16').min<xmin and np.iinfo('int16').max>xmax):
                    df[col]=df[col].astype('int16')
                    
                elif(np.iinfo('int32').min<xmin and np.iinfo('int32').max>xmax):
                    df[col]=df[col].astype('int32')
                    
                elif(np.iinfo('int64').min<xmin and np.iinfo('int64').max>xmax):
                    df[col]=df[col].astype('int64')
                    
            elif(str(df[col].dtype)[0]=='f'): #for float downcasting
                if(np.finfo('float16').min<xmin and np.finfo('float16').max>xmax):
                    df[col]=df[col].astype('float16')
                    
                elif(np.finfo('float32').min<xmin and np.finfo('float32').max>xmax):
                    df[col]=df[col].astype('float32')
                    
                elif(np.finfo('float64').min<xmin and np.finfo('float64').max>xmax):
                    df[col]=df[col].astype('float64')
            else:pass
    After_memory_consumption=(df.memory_usage().sum())/1024**2
    percentage_of_decrease=((previous_memory_consumption-After_memory_consumption)/previous_memory_consumption)*100
    print('Memory usage before downcasting: {:.2f}\n'.format(previous_memory_consumption))
    print('After Downcasting the memory usage decreased by: {:.2f}\n'.format(After_memory_consumption))
    print('Memory usage decreased by {:.3f}%'.format(percentage_of_decrease))    

In [8]:
downcaste_dtype(titanic_data)

Memory usage before downcasting: 0.09

After Downcasting the memory usage decreased by: 0.06

Memory usage decreased by 37.685%


# Object dtype Downcasting

## Changing the required objects datatypes into categorical datatypes

### In Titanic dataset all the objects are of categorical so instead of doing it explicitly we can automate it by defining a function

In [9]:
def titanic_downcast_obj(df):
    previous_memory_consumption=(df.memory_usage().sum())/1024**2
    for col in df.columns:
        col_dtype=df[col].dtypes
        if(col_dtype=='object'):
            df[col]=df[col].astype('category')
    After_memory_consumption=(df.memory_usage().sum())/1024**2
    percentage_of_decrease=((previous_memory_consumption-After_memory_consumption)/previous_memory_consumption)*100
    print('Memory usage before downcasting: {:.2f}\n'.format(previous_memory_consumption))
    print('After Downcasting the memory usage decreased by: {:.2f}\n'.format(After_memory_consumption))
    print('Memory usage decreased by: {:.3f}%'.format(percentage_of_decrease))

In [10]:
titanic_downcast_obj(titanic_data)

Memory usage before downcasting: 0.06

After Downcasting the memory usage decreased by: 0.02

Memory usage decreased by: 72.160%


In [11]:
## 72% of memory consumption decreases only by reducing object to category

In [12]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int8    
 1   pclass       891 non-null    int8    
 2   sex          891 non-null    category
 3   age          714 non-null    float16 
 4   sibsp        891 non-null    int8    
 5   parch        891 non-null    int8    
 6   fare         891 non-null    float16 
 7   embarked     889 non-null    category
 8   class        891 non-null    category
 9   who          891 non-null    category
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    category
 13  alive        891 non-null    category
 14  alone        891 non-null    bool    
dtypes: bool(2), category(7), float16(2), int8(4)
memory usage: 16.0 KB


# From 92.4+ KB to 16 KB

In [13]:
%%time
titanic_data[['who','deck']]

CPU times: user 887 µs, sys: 235 µs, total: 1.12 ms
Wall time: 923 µs


Unnamed: 0,who,deck
0,man,
1,woman,C
2,woman,
3,woman,C
4,man,
...,...,...
886,man,
887,woman,B
888,woman,
889,man,C


### This is an example of decrease in access time after optimization