# This notebook aims to:
### 1. Evaluate the time to read data (csv, pickle, or dask)
#### Conclusion:
- pandas.read_csv : ~ 7 min
- pandas.read_pickle : ~ 10 sec (train.csv is converted to a pkl file in advance)
- dask.dataframe.read_csv: ~ 3 min (including the time to convert the dask.dataframe.DataFrame to a pd.DataFrame).

### 2. Optimize dtype of each column to reduce memory usage
#### Conclusion:  
- memory usage: 3.6 GB -> 1.8 GB 

Note: Some columns are stored as float16 after optimization, so overflow (or underflow) may occur when you calculate statistics such as the mean or std with a library like pandas.  


In [None]:
import os
import gc
import time

import numpy as np
import pickle
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Path to the training CSV that every benchmark below reads.
CSV_PATH = '../input/ubiquant-market-prediction/train.csv'
# Print the on-disk size in decimal gigabytes (bytes / 10**9).
print(f'train.csv: {os.stat(CSV_PATH).st_size/10**9:.1f} GB')

## Set reading dtypes
The size of train.csv (18.5 GB) is larger than the memory of the Kaggle kernel, so the dtypes must be specified when reading.

In [None]:
# Column name -> dtype used when parsing the CSV: the four id/target columns
# are listed explicitly, followed by the feature columns f_0 .. f_299.
dtypes_dict = {
    'row_id': 'str',
    'time_id': 'uint16',
    'investment_id': 'uint16',
    'target': 'float32',
    **{f'f_{feature_idx}': 'float32' for feature_idx in range(300)},
}

#  1. Evaluate the time to read data (pd.read_csv, pickle, or dask)

In [None]:
# Empty results table; each benchmark cell below adds one row holding the
# reader's name ('method') and its wall-clock read time ('read_time').
summary = pd.DataFrame(columns=['method', 'read_time'])

### Pandas (pd.read_csv)

In [None]:
%%time
s_time = time.time()
df = pd.read_csv(CSV_PATH,
                 usecols = dtypes_dict.keys(),
                 dtype = dtypes_dict)
read_time = round(time.time() - s_time, 1)
print(df.info())

summary = summary.append({'method': 'pd.read_csv',
                          'read_time': read_time},
                         ignore_index=True)

### Pickle (pd.read_pickle)

In [None]:
# Save the typed frame as train.pkl (the input of the pickle benchmark and of
# the dtype optimization below), then drop it and force a collection so the
# memory is released before the next read.
df.to_pickle('train.pkl')
del df
_ = gc.collect()

In [None]:
%%time
s_time = time.time()
df_pkl = pd.read_pickle('train.pkl')
read_time = round(time.time() - s_time, 1)
print(df_pkl.info())

summary = summary.append({'method':'Pickle',
                          'read_time': read_time},
                         ignore_index=True)

del df_pkl
_ = gc.collect()

### DASK

In [None]:
import dask.dataframe as dd
import dask.multiprocessing

In [None]:
%%time
s_time = time.time()
df_dask = dd.read_csv(CSV_PATH,
                      parse_dates = True,
                      dtype = dtypes_dict).compute()
read_time = round(time.time() - s_time, 1)
print(df_dask.info())

summary = summary.append({'method':'Dask',
                          'read_time': read_time},
                         ignore_index=True)

del df_dask
gc.collect()

In [None]:
# Show the collected read times, one row per reader.
display(summary)

### Reading speed: pickle > dask > pandas, provided the pickle file has been prepared in advance.

# 2. Optimize dtypes to reduce memory usage
Ref. https://www.kaggle.com/wangqihanginthesky/baseline-tabnet

In [None]:
# Reload the frame from train.pkl and record its dtypes now, before the
# optimization runs, for the before/after dtype comparison.
df_before = pd.read_pickle('train.pkl')
dtype_before = df_before.dtypes

In [None]:
def reduce_memory_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified IN PLACE and the same object is returned; pass
    ``df.copy()`` if the original must be kept.

    Per column:
      * object / pandas string columns are converted to ``category``;
      * integer columns (signed or unsigned) are cast to the smallest of
        int8/int16/int32/int64 that holds the column's min and max;
      * float columns are cast to the smallest of float16/float32/float64
        whose finite range holds the column's min and max;
      * every other dtype (bool, datetime, timedelta, categorical, nullable
        extension types, ...) is left untouched.

    A cast is applied only if the target type is not wider than the current
    one, so the function never increases a column's item size. Columns whose
    min/max is NaN or infinite fit no candidate and are left unchanged.

    Note: the float check looks at range only. float16 keeps roughly three
    significant decimal digits, so values lose precision, and aggregations
    computed in float16 can overflow.

    Args:
        df: pandas DataFrame to optimise.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2 # Unit: MB
    print('Memory usage of input dataframe: {:.2f} MB'.format(start_mem))

    # Candidate targets, ordered from narrowest to widest.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        # categorical data
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy numeric dtypes are downcast; extension dtypes
        # (categorical, nullable Int64, ...) are skipped.
        if not isinstance(col_type, np.dtype):
            continue
        if col_type.kind in 'iu':
            candidates, limits = int_candidates, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits = float_candidates, np.finfo
        else:
            # bool, datetime, timedelta, complex, ...: not range-checkable here.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the type's limit still fits.
            if c_min >= bounds.min and c_max <= bounds.max:
                # Never widen: e.g. uint16 data above int16's range would
                # otherwise be "reduced" to int32.
                if np.dtype(candidate).itemsize <= col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# reduce_memory_usage() rewrites the columns of the frame it receives and
# returns that same object, so pass a copy: df_before then keeps its original
# dtypes and values for the before/after comparison in the following cells.
# This costs one extra in-memory copy of the frame.
df_after = reduce_memory_usage(df_before.copy())

In [None]:
# Preview the first rows of df_before under a "before" label.
print('Before optimization')
display(df_before.head())

In [None]:
# Preview the first rows of the frame returned by reduce_memory_usage().
print('After optimization')
display(df_after.head())

In [None]:
# Side-by-side dtype comparison: one row per stage, one column per data column.
dtype_after = df_after.dtypes
dtypes = pd.DataFrame({
    'before optimization': dtype_before,
    'after optimization': dtype_after,
}).T
display(dtypes)

### Save optimized dataframe

In [None]:
# Persist the downcast frame; pickle keeps the per-column dtypes.
df_after.to_pickle('train_reduced.pkl')

In [None]:
# Delete the intermediate train.pkl from the working directory (shell command).
!rm train.pkl

## Please upvote if this was useful for you.