In [None]:
import pandas as pd 
import numpy as np 
import os
# Relative path to the CSV that every loader below reads.
dir_data = "../input/tabular-playground-series-jun-2022/data.csv"
# On-disk size expressed in decimal gigabytes (bytes / 1e9).
data_size = os.stat(dir_data).st_size / 1e9
print(f"The file size is {data_size:.4f} GB")


In [None]:
# Load Data using Pandas
%time
data = pd.read_csv(dir_data)

In [None]:
# Summarise the pandas frame: column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Load Data using Datatable
%time
import datatable as dt 
dt_df = dt.fread(dir_data)
pd_df = dt_df.to_pandas()


In [None]:
import dask.dataframe as dd

In [None]:
# Load Data using Dask
%time
df_dask = dd.read_csv(dir_data)


In [None]:
# Make sure to activate your GPU on yur kenrnel when using cudf
import cudf
%time
df_cudf = cudf.read_csv(dir_data)

Voilà! You can use datatable or cuDF to load your data faster in your next competition; both are faster than loading the data with pandas. You can see the result of this experiment by comparing the Wall time reported for each cell. Another trick is to downcast each column to a smaller data type to reduce memory usage, as shown in the code below.

In [None]:
# https://www.kaggle.com/code/gemartin/load-data-reduce-memory-usage/notebook
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` to the narrowest dtype that holds their values.

    The frame is modified in place (columns are reassigned) and also returned,
    and the memory usage before and after is printed.

    Conversion rules, per column:

    * NumPy ``object`` and pandas string columns become ``category``.
    * Signed integers become the smallest of int8/int16/int32 whose inclusive
      range contains the column's min and max.
    * Unsigned integers become the smallest of uint8/uint16/uint32 likewise
      (they are no longer mistaken for floats).
    * Floats become float16 (only if ``allow_float16``) or float32 when the
      value range fits. A column is never widened.
    * Everything else (bool, datetime, timedelta, category, nullable
      extension dtypes, empty or all-NaN columns) is left untouched, so the
      function can safely be run on its own output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is mutated.
    allow_float16 : bool, default True
        Whether float16 may be chosen. Only the value *range* is checked, so
        float16/float32 can round values (float16 keeps roughly three
        significant decimal digits). Pass False when that precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with narrower column dtypes.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        is_numpy = isinstance(col_type, np.dtype)

        # Text columns: NumPy object dtype or a pandas string extension dtype.
        if (is_numpy and col_type.kind == 'O') or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, ...) are not downcast.
        if not is_numpy:
            continue

        target = _narrower_dtype(df[col], allow_float16)
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting footprint.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


def _narrower_dtype(series, allow_float16=True):
    """Return a strictly narrower NumPy dtype that can hold ``series``, or None."""
    current = series.dtype
    kind = current.kind
    if kind == 'i':
        pool = (np.int8, np.int16, np.int32)
    elif kind == 'u':
        pool = (np.uint8, np.uint16, np.uint32)
    elif kind == 'f':
        pool = (np.float16, np.float32) if allow_float16 else (np.float32,)
    else:
        # bool, datetime, timedelta, complex, ...: nothing sensible to downcast to.
        return None

    # Only strictly smaller candidates are interesting; skip the min/max scan
    # entirely when the column is already as narrow as it can get.
    pool = [cand for cand in pool if np.dtype(cand).itemsize < current.itemsize]
    if not pool:
        return None

    c_min, c_max = series.min(), series.max()
    if pd.isna(c_min) or pd.isna(c_max):
        # Empty or all-NaN column: no range to reason about.
        return None

    if kind == 'f':
        c_min, c_max = float(c_min), float(c_max)
        limits_of = np.finfo
    else:
        c_min, c_max = int(c_min), int(c_max)
        limits_of = np.iinfo

    for cand in pool:
        limits = limits_of(cand)
        # Inclusive bounds: the extreme values are themselves representable.
        if limits.min <= c_min and c_max <= limits.max:
            return cand
    return None


def import_data(file):
    """Read a CSV into a DataFrame and shrink its memory footprint.

    Parameters
    ----------
    file : str or path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after being passed through ``reduce_mem_usage``.
    """
    # `keep_date_col` is dropped: it only matters when `parse_dates` combines
    # several columns, which is not done here, and it has been deprecated
    # since pandas 2.2. `parse_dates=True` is kept; it asks pandas to try
    # parsing the index as dates.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

In [None]:
# Print a banner and a label, then load the CSV through the memory-reducing helper.
print('-' * 80, 'data', sep='\n')
train = import_data(dir_data)

You can see that we reduced the in-memory size of the data from 617+ MB to 132.56 MB, a decrease of around 78%. After that, you can load the data as usual using cuDF, Dask, or even pandas.