In [None]:
import os
import platform
import numpy as np
import pandas as pd

In [None]:
# Record the runtime environment next to the measurements in this notebook.
# CONDA_DEFAULT_ENV is looked up with .get() so the cell still runs (and says so)
# when the kernel is not started from an activated conda environment, instead of
# raising KeyError.
print(f"Conda env      = {os.environ.get('CONDA_DEFAULT_ENV', '<not set>')}")
print(f"Python version = {platform.python_version()}")
print(f"Pandas version = {pd.__version__}")

In [None]:
def profile_header():
    """Print the heading row of the per-column profile table.

    Column widths (21 / 15 / 8 / 8, first one left-aligned, the rest
    right-aligned) line up with the rows printed by ``profile_column``.
    """
    headings = (
        'column'.ljust(21),
        'type'.rjust(15),
        'unique'.rjust(8),
        'mem_mb'.rjust(8),
    )
    print(" ".join(headings))
    
def profile_column(col):
    """Print one profile-table row for a column: name, dtype, unique count, size.

    Parameters
    ----------
    col : pandas.Series
        The column to describe.

    Notes
    -----
    * The unique count comes from ``col.unique()``, so a missing value (NaN/NA)
      counts as one distinct value.
    * Size is measured with ``deep=True`` (object payloads such as strings are
      included) and ``index=False`` so the figure is the column's own data only;
      the index belongs to the frame, not to each column.
    * Size is floored to whole MiB, so a column smaller than 1 MiB prints ``0``.
    """
    # An unnamed Series has name None, and None rejects a width format spec.
    name = str(col.name)
    typ = str(col.dtype)
    n_unique = len(col.unique())
    mem_mb = col.memory_usage(index=False, deep=True) // (1024**2)
    print(f"{name:21} {typ:>15} {n_unique:8} {mem_mb:8}")

def df_mb(df):
    """Return the DataFrame's total memory footprint in whole MiB (floored).

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    int
        Bytes used by all columns plus the index, integer-divided by 1024**2.

    Notes
    -----
    ``DataFrame.memory_usage`` reports the index once. Summing
    ``df[col].memory_usage()`` per column would add the index again for every
    column and inflate the total for frames with a materialised index.
    ``deep=True`` includes the payload of object columns (e.g. strings).
    """
    total_bytes = int(df.memory_usage(index=True, deep=True).sum())
    return total_bytes // (1024**2)
        
        
def profile(df):
    """Print a size/cardinality report for a DataFrame.

    Output, in order: the row count, the total size from ``df_mb``, the table
    heading from ``profile_header``, then one ``profile_column`` row per column.
    """
    row_count = len(df)
    print(f"DF nrows  = {row_count:,}")
    total_mb = df_mb(df)
    print(f"DF memory = {total_mb:,} MB")
    profile_header()
    for label in df.columns:
        column = df[label]
        profile_column(column)

In [3]:
%%time

# Baseline: read the uncompressed CSV with pandas' default type inference and no
# date parsing. The two tpep_*_datetime columns therefore stay as object (string)
# columns; note how much larger they are than the numeric columns in the profile.
df0 = pd.read_csv("yellow_tripdata_2020-01-1M.csv")
profile(df0)

DF nrows  = 1,000,000
DF memory = 320 MB
column                           type   unique   mem_mb
VendorID                      float64        3        7
tpep_pickup_datetime           object   386921       75
tpep_dropoff_datetime          object   388189       75
passenger_count               float64       11        7
trip_distance                 float64     3412        7
RatecodeID                    float64        8        7
store_and_fwd_flag             object        3       55
PULocationID                    int64      251        7
DOLocationID                    int64      260        7
payment_type                  float64        5        7
fare_amount                   float64     1626        7
extra                         float64       23        7
mta_tax                       float64        5        7
tip_amount                    float64     2202        7
tolls_amount                  float64      447        7
improvement_surcharge         float64        3        7
total_a

In [4]:
%%time

# Parse the pickup/dropoff timestamps while reading, so they are stored as
# datetime64 instead of object strings (75MB -> 7MB per column).
# Cost: the conversion is slow - 2m28s here vs 1.8s for the plain read.
df1 = pd.read_csv(
    "yellow_tripdata_2020-01-1M.csv",
    parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
    infer_datetime_format=True,
)
profile(df1)

DF nrows  = 1,000,000
DF memory = 184 MB
column                           type   unique   mem_mb
VendorID                      float64        3        7
tpep_pickup_datetime   datetime64[ns]   386921        7
tpep_dropoff_datetime  datetime64[ns]   388189        7
passenger_count               float64       11        7
trip_distance                 float64     3412        7
RatecodeID                    float64        8        7
store_and_fwd_flag             object        3       55
PULocationID                    int64      251        7
DOLocationID                    int64      260        7
payment_type                  float64        5        7
fare_amount                   float64     1626        7
extra                         float64       23        7
mta_tax                       float64        5        7
tip_amount                    float64     2202        7
tolls_amount                  float64      447        7
improvement_surcharge         float64        3        7
total_a

In [5]:
%%time

# Read the xz-compressed copy of the same CSV, again without converting dates.
# No compression argument is passed; read_csv is left to detect it from the path.
# Timing: about 3s compressed vs 1.8s uncompressed; the in-memory size is the same.
df2 = pd.read_csv("yellow_tripdata_2020-01-1M.csv.xz")
profile(df2)

DF nrows  = 1,000,000
DF memory = 320 MB
column                           type   unique   mem_mb
VendorID                      float64        3        7
tpep_pickup_datetime           object   386921       75
tpep_dropoff_datetime          object   388189       75
passenger_count               float64       11        7
trip_distance                 float64     3412        7
RatecodeID                    float64        8        7
store_and_fwd_flag             object        3       55
PULocationID                    int64      251        7
DOLocationID                    int64      260        7
payment_type                  float64        5        7
fare_amount                   float64     1626        7
extra                         float64       23        7
mta_tax                       float64        5        7
tip_amount                    float64     2202        7
tolls_amount                  float64      447        7
improvement_surcharge         float64        3        7
total_a

In [6]:
%%time 

# Reduce size by converting from object to datetime (also available as a read_csv option)
# df3 = df2.copy(deep=True)
# df3["tpep_pickup_datetime"]  = pd.to_datetime(df3["tpep_pickup_datetime"])
# df3["tpep_dropoff_datetime"] = pd.to_datetime(df3["tpep_dropoff_datetime"])
# profile(df3)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.29 µs


In [7]:
# Reduce size by converting a low-cardinality float column (integers that became
# float only because of NaNs) to a nullable integer dtype.
# Work on a deep copy so df1 keeps its original dtypes if this cell is re-run.
df4 = df1.copy(deep=True)
profile_header()
profile_column(df4["payment_type"])
print(df4["payment_type"].unique())

print(f"df before = {df_mb(df4):,} MB")
# 'Int8' (capital I) is pandas' nullable 8-bit integer: missing values are kept
# as <NA> rather than forcing the column to float64.
df4["payment_type"] = df4["payment_type"].astype('Int8')
profile_column(df4["payment_type"])
print(f"df after = {df_mb(df4):,} MB")

column                           type   unique   mem_mb
payment_type                  float64        5        7
[ 1.  2.  4.  3. nan]
df before = 184 MB
payment_type                     Int8        5        1
df after = 179 MB


In [13]:
%%time

# Read parquet - 6.4M rows in about 4s. The datetime columns arrive already typed,
# so no date parsing step is needed.
# read_parquet has no `dtypes` parameter: unknown keywords are forwarded to the
# parquet engine (pyarrow's read_table does not accept it), so the per-column
# dtype is applied after loading instead.
# NOTE(review): use_nullable_dtypes was superseded by dtype_backend in pandas 2.0;
# kept as-is on the assumption that this notebook targets pandas 1.x - confirm.
df_pq = pd.read_parquet(
    "yellow_tripdata_2020-01.parquet",
    use_nullable_dtypes=True,
)
# payment_type has only a handful of distinct values, so store it as a category.
df_pq["payment_type"] = df_pq["payment_type"].astype('category')
profile(df_pq)

DF nrows  = 6,405,008
DF memory = 1,286 MB
column                           type   unique   mem_mb
VendorID                        int64        3       48
tpep_pickup_datetime   datetime64[ns]  2134342       48
tpep_dropoff_datetime  datetime64[ns]  2137286       48
passenger_count               float64       11       48
trip_distance                 float64     5606       48
RatecodeID                    float64        8       48
store_and_fwd_flag             object        3      352
PULocationID                    int64      261       48
DOLocationID                    int64      262       48
payment_type                 category        6        6
fare_amount                   float64     5283       48
extra                         float64       47       48
mta_tax                       float64       11       48
tip_amount                    float64     3626       48
tolls_amount                  float64     1035       48
improvement_surcharge         float64        3       48
total