
# Speed comparison of different pandas read options (plus Dask)
![](https://www.pngall.com/wp-content/uploads/2016/03/Kung-Fu-Panda-Fighting-PNG-180x180.png)

### This notebook compares how changing the column data types when reading a CSV into a pandas DataFrame impacts read speed (performance)

1. Pandas - `dtypes` int32 numbers, str string
2. Pandas - `dtypes` uint8 numbers, object string
3. Dask to Pandas - `dtypes` uint8 numbers, object string
4. Dask to Pandas - `dtypes`  uint8 numbers, object string, `usecols`

In [None]:

import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dask.dataframe as dd
import os

The idea of changing the data types for faster reading is taken from https://www.kaggle.com/columbia2131/speed-up-reading-csv-to-pickle/notebook

# 1. Pandas - `dtypes` int32 numbers, str string

In [None]:
%%time

dtypes = {
    "row_id": "str",
    "time_id": "int32",
    "investment_id": "int32",
    "target": "float32"    
}

for feature in [f'f_{i}' for i in range(300)]:
    dtypes[feature] = "float32"
    

train = pd.read_csv("/kaggle/input/ubiquant-market-prediction/train.csv", dtype=dtypes)

print("Train size:", train.shape)

In [None]:
# Unbind the DataFrame name so the notebook namespace no longer references the frame.
del train

In [None]:
# Run a full garbage-collection pass; the count it returns is shown as the cell output.
gc.collect()

# 2. Pandas - `dtypes` uint8 numbers, object string

In [None]:
%%time

dtypes = {
    "row_id": "object",
    "time_id": "uint8",
    "investment_id": "uint8",
    "target": "float32"    
}

for feature in [f'f_{i}' for i in range(300)]:
    dtypes[feature] = "float32"
    

train = pd.read_csv("/kaggle/input/ubiquant-market-prediction/train.csv", dtype=dtypes)

print("Train size:", train.shape)

In [None]:
# Remove the name bound to the frame loaded in the previous timing cell.
del train

In [None]:
# Explicit garbage-collection pass before the next timing run; its return value is displayed.
gc.collect()

# 3. Dask to Pandas - `dtypes` uint8 numbers, object string

In [None]:
%%time

dtypes = {
    "row_id": "object",
    "time_id": "uint8",
    "investment_id": "uint8",
    "target": "float32"    
}

for feature in [f'f_{i}' for i in range(300)]:
    dtypes[feature] = "float32"

df_train = dd.read_csv("/kaggle/input/ubiquant-market-prediction/train.csv", dtype=dtypes)

pd_df = df_train.compute()

print("Train size:", pd_df.shape)

In [None]:
# Unbind both the Dask dataframe and the computed pandas frame.
del df_train, pd_df

In [None]:
# Run a full garbage-collection pass; the returned count is displayed as the cell output.
gc.collect()

# 4. Dask to Pandas - `dtypes`  uint8 numbers, object string, `usecols`

In [None]:
%%time

first_cols = ['row_id','time_id','investment_id','target']
my_cols = first_cols.append([f'f_{i}' for i in range(300)])


dtypes = {
    "row_id": "object",
    "time_id": "uint8",
    "investment_id": "uint8",
    "target": "float32"    
}

for feature in [f'f_{i}' for i in range(300)]:
    dtypes[feature] = "float32"

df_train = dd.read_csv("/kaggle/input/ubiquant-market-prediction/train.csv", dtype=dtypes, usecols = my_cols)

pd_df = df_train.compute()

print("Train size:", pd_df.shape)

In [None]:
# Drop the names bound to the Dask dataframe and its computed pandas counterpart.
del df_train, pd_df

In [None]:
# Final garbage-collection pass; the count returned by gc.collect() is displayed.
gc.collect()