# Do we gain anything by specifying the dtypes when reading the catalogues using Pandas?

In [1]:
import pandas as pd

In [2]:
# Columns whose values repeat heavily are read as "category"; the two date
# columns are read as int32 rather than the int64 pandas picks by default.
dtypes = {
    **dict.fromkeys(
        ["Experiment", "Var", "Model", "CMOR", "RunID",
         "Centre", "Frequency", "SubModel", "Version"],
        "category",
    ),
    "StartDate": "int32",
    "EndDate": "int32",
}

## Read files

In [3]:
# Baseline read: pandas infers every column type on its own.
df = pd.read_csv(
    '~/.baspy/cmip5_catalogue.csv',
)
# Same catalogue again, this time with the explicit `dtypes` mapping applied.
df_dtypes = pd.read_csv(
    '~/.baspy/cmip5_catalogue.csv',
    dtype=dtypes,
)

In [4]:
# Column types pandas inferred when no dtype mapping was given.
df.dtypes

Centre        object
Model         object
Experiment    object
Frequency     object
SubModel      object
CMOR          object
RunID         object
Version       object
Var           object
StartDate      int64
EndDate        int64
Path          object
DataFiles     object
dtype: object

In [5]:
# Column types obtained when the explicit dtype mapping is passed to read_csv.
df_dtypes.dtypes

Centre        category
Model         category
Experiment    category
Frequency     category
SubModel      category
CMOR          category
RunID         category
Version       category
Var           category
StartDate        int32
EndDate          int32
Path            object
DataFiles       object
dtype: object

## Time to read file
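We do not see a slowdown when specifying dtypes: both reads take about 4.5 s (4.53 s without dtypes vs 4.57 s with them), a difference that is within the run-to-run variation.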

In [6]:
# Baseline timing: read the catalogue with no dtype hints.
%timeit df = pd.read_csv('~/.baspy/cmip5_catalogue.csv')

4.53 s ± 145 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Timing for the same read with the explicit dtype mapping supplied.
%timeit df_dtypes = pd.read_csv('~/.baspy/cmip5_catalogue.csv', dtype=dtypes)

4.57 s ± 20 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Compare memory usage
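We can reduce the total memory footprint to around a third (80 MB → 28 MB). Note that `memory_usage()` is called here without `deep=True`, so each `object` column is counted at 8 bytes per row (the reference only) and the strings themselves are not included. The figures below are therefore shallow sizes, and the real saving from converting string columns to `category` is larger in absolute terms than they suggest.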

In [8]:
# Per-column size in bytes for the default-dtype frame.
# NOTE: deep=True is not passed, so object columns are reported at 8 bytes per
# row (the reference only); the string contents are not counted.
df.memory_usage()

Index              80
Centre        6215800
Model         6215800
Experiment    6215800
Frequency     6215800
SubModel      6215800
CMOR          6215800
RunID         6215800
Version       6215800
Var           6215800
StartDate     6215800
EndDate       6215800
Path          6215800
DataFiles     6215800
dtype: int64

In [9]:
# Per-column size in bytes for the frame read with explicit dtypes.
# NOTE: also a shallow measurement (deep=True is not passed), so the remaining
# object columns are still counted by reference only.
df_dtypes.memory_usage()

Index              80
Centre         778471
Model          780015
Experiment     782903
Frequency      777359
SubModel       777351
CMOR           777759
RunID         1566006
Version       1579070
Var           1579294
StartDate     3107900
EndDate       3107900
Path          6215800
DataFiles     6215800
dtype: int64

In [10]:
# Total shallow footprint of the default-dtype frame.
df.memory_usage().sum() * 0.000001 # convert bytes to MB (1 MB = 10**6 bytes)

80.80548

In [11]:
# Total shallow footprint of the explicit-dtype frame, in MB (bytes * 1e-6).
df_dtypes.memory_usage().sum() * 0.000001

28.045707999999998