![kaggle-memory-limit](https://raw.githubusercontent.com/akmeghdad/data-science-note/master/src/images/kaggle-memory-limit.png)

# Imports

In [None]:
!pip install -q csv_diff

In [None]:
import time
import os
import pandas as pd
import numpy as np
import datatable as dt
import gc
import joblib
from csv_diff import load_csv, compare

# DATA_PATH

In [None]:
# Environment flags, derived purely from which environment variables are set.
# The notebook is meant to work on a PC, on Kaggle and on Colab.
IN_DOCKER = 'AK_KAGGLE_ENV' in os.environ
IN_COLAB = 'COLAB_GPU' in os.environ
IN_PC = 'VSCODE_CWD' in os.environ and 'AK_KAGGLE_ENV' not in os.environ

# Root data directory. The PC check takes precedence over the Colab check;
# when neither flag is set the Kaggle layout ("/kaggle") is assumed.
if not IN_PC:
    DATA_PATH = "/content/drive/MyDrive/Colab_Kaggle/data" if IN_COLAB else "/kaggle"
elif os.path.isdir("../../data"):
    DATA_PATH = os.path.abspath("../../data")
else:
    # Fall back to one directory level higher when "../../data" is missing.
    DATA_PATH = os.path.abspath("../../../data")

# Variables

In [None]:
# Competition identifiers.
competition = 'tps1221'
fullname_competition = 'tabular-playground-series-dec-2021'

# Column names and CV settings.
target_col = 'Cover_Type'  # name of target column
id_col = 'Id'
n_splits = 5

# Directory prefixes built from DATA_PATH. Each one ends with a slash because
# file names are appended to them by plain string concatenation.
WORKING_PATH = f'{DATA_PATH}/working/'
JOBLIB_PATH = f'{DATA_PATH}/input/tps1221data/'
ORGIN_CSV_PATH = f'{DATA_PATH}/input/{fullname_competition}/'

# Columns that are cast to int32 after loading: the id, the continuous
# terrain measurements, then the numbered Wilderness_Area1..4 and
# Soil_Type1..40 columns (generated instead of being typed out by hand).
int_colums = (
    [
        'Id',
        'Elevation',
        'Aspect',
        'Slope',
        'Horizontal_Distance_To_Hydrology',
        'Vertical_Distance_To_Hydrology',
        'Horizontal_Distance_To_Roadways',
        'Hillshade_9am',
        'Hillshade_Noon',
        'Hillshade_3pm',
        'Horizontal_Distance_To_Fire_Points',
    ]
    + [f'Wilderness_Area{area}' for area in range(1, 5)]
    + [f'Soil_Type{soil}' for soil in range(1, 41)]
)

# Load data with the traditional method (pandas) with reduced memory

In [None]:
%%time
time_s = time.time()

# Parse train and test entirely as float32; the sample submission keeps
# pandas' default dtypes.
train, test = (
    pd.read_csv(ORGIN_CSV_PATH + csv_name, dtype='float32')
    for csv_name in ('train.csv', 'test.csv')
)
submission = pd.read_csv(ORGIN_CSV_PATH + 'sample_submission.csv')

# Cast the integer-valued columns column by column.
# NOTE(review): float32 represents integers exactly only up to 2**24, so this
# float32 -> int32 round trip assumes no integer column (e.g. Id) exceeds
# that - confirm against the data.
for col in int_colums:
    for frame in (train, test):
        frame[col] = frame[col].astype(np.int32)
# Drop the extra loop reference so a later `del test` can really free the frame.
del frame

train[target_col] = train[target_col].astype(np.int8)

time_e = time.time()

# Elapsed wall-clock seconds, formatted for the report cell.
time__ = f'{time_e - time_s:.1f}'
# Combined footprint of the three frames (index included), in units of 1e9 bytes.
memory_ = sum(
    loaded.memory_usage(index=True).sum() for loaded in (train, test, submission)
) / 1e9
memory_str = f'{memory_:.2f}'
# Number of train columns per dtype.
types_ = train.dtypes.value_counts()

In [None]:
# Report memory, load time and dtype counts for the pandas-loaded frames.
banner = f'{3*"="} traditional method with reduce memory {18*"="}'
print(f'{banner}\nMemory:\t{memory_str} gb\nTime:\t{time__} sec\n\n{types_}')

# Load data with joblib (all in one)

In [None]:
%%time
time_s_j = time.time()

# Load the pre-built frames from their joblib pickles.
train_job = joblib.load(f'{JOBLIB_PATH}train-32.pkl')
# NOTE(review): the id column is dropped only from this test frame, while the
# pandas-loaded `test` keeps it, so the memory comparison is not strictly
# like-for-like - confirm this is intended.
test_job = joblib.load(f'{JOBLIB_PATH}test-32.pkl')
test_job = test_job.drop(columns=[id_col])
submission_job = joblib.load(f'{JOBLIB_PATH}submission-32.pkl')

time_e_j = time.time()

# Elapsed wall-clock seconds, formatted for the report cell.
time__j = f'{time_e_j - time_s_j:.1f}'
# Combined footprint of the three frames (index included), in units of 1e9 bytes.
memory_j = sum(
    loaded.memory_usage(index=True).sum()
    for loaded in (train_job, test_job, submission_job)
) / 1e9
memory_j_str = f'{memory_j:.2f}'
# Number of train columns per dtype.
types_j = train_job.dtypes.value_counts()

In [None]:
# Report the joblib measurements, then the relative saving versus the pandas run.
banner_j = f'{3*"="} joblib method (all in one){18*"="}'
print(f'{banner_j}\nMemory:\t{memory_j_str} gb\nTime:\t{time__j} sec\n\n{types_j}')
print()

# Both percentages are relative to the traditional (pandas) measurements.
time_saving_pct = 100 * (time_e-time_s-time_e_j+time_s_j) / (time_e-time_s)
memory_saving_pct = 100 * (memory_ - memory_j) / memory_

rule = 25*'==='
print(rule)
print('With JOBLIB method,     Data load {:.2f}% faster and {:.2f}% less memory'.format(time_saving_pct, memory_saving_pct))
print(rule)


# Compare traditional method vs joblib

In [None]:
# The CSV comparison further down works on CSV files, so each pickled frame is
# exported to WORKING_PATH once; an export is skipped when its file already exists.
# float_format='%g' writes float values with at most 6 significant digits.
train_csv = WORKING_PATH + 'train-32.csv'
test_csv = WORKING_PATH + 'test-32.csv'
submission_csv = WORKING_PATH + 'submission-32.csv'

if not os.path.exists(train_csv):
    train_tmp = joblib.load(JOBLIB_PATH + 'train-32.pkl')
    train_tmp.to_csv(train_csv, index=False, float_format='%g')

if not os.path.exists(test_csv):
    test_tmp = joblib.load(JOBLIB_PATH + 'test-32.pkl')
    test_tmp.to_csv(test_csv, index=False, float_format='%g')

if not os.path.exists(submission_csv):
    submission_tmp = joblib.load(JOBLIB_PATH + 'submission-32.pkl')
    submission_tmp.to_csv(submission_csv, index=False, float_format='%g')

In [None]:
def release_names(names, namespace):
    """Remove every name in *names* from *namespace*, ignoring missing ones.

    A plain ``del name`` raises NameError when the name was never bound. That
    happens for the ``*_tmp`` frames whenever their CSV export was skipped
    because the file already existed, and for every name when this cell is
    run a second time.
    """
    for name in names:
        namespace.pop(name, None)


# Drop all references to the loaded frames so their memory can be reclaimed
# before the CSV comparison.
release_names(
    (
        'train', 'train_job', 'train_tmp',
        'test', 'test_tmp', 'test_job',
        'submission', 'submission_job', 'submission_tmp',
    ),
    globals(),
)
# The trailing semicolon keeps the notebook from echoing collect()'s return value.
gc.collect();

In [None]:
# Compare the original test CSV with the CSV re-exported from the joblib
# pickle, matching rows by the id column. The files are opened in a `with`
# block so both handles are closed once the rows have been read (the bare
# open() calls used before were never closed).
with open(ORGIN_CSV_PATH + 'test.csv') as original_fp, \
        open(WORKING_PATH + 'test-32.csv') as exported_fp:
    diff_test = compare(
        load_csv(original_fp, key=id_col),
        load_csv(exported_fp, key=id_col),
    )
diff_test