In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time

from fast_ml.eda import eda_summary
from fast_ml.eda import eda_numerical_variable, eda_numerical_plots, eda_numerical_plots_with_target 
from fast_ml.eda import eda_categorical_variable, eda_categorical_plots, eda_categorical_plots_with_target
from fast_ml.missing_data_analysis import MissingDataAnalysis
from fast_ml.utilities import display_all, reduce_memory_usage
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

#### Load Data - Transaction file in CSV format

In [2]:
%time trans = pd.read_csv('train_transaction.csv')

df_size = trans.memory_usage().sum() / 1024**2
print(f'Memory usage of dataframe is {df_size} MB')

print (f'Shape of dataframe is {trans.shape}')

CPU times: user 23.2 s, sys: 7.87 s, total: 31 s
Wall time: 32.5 s
Memory usage of dataframe is 1775.1524047851562 MB
Shape of dataframe is (590540, 394)


#### We will use a function from fast_ml to reduce the memory usage

In [3]:
from fast_ml.utilities import reduce_memory_usage

%time trans = reduce_memory_usage(trans, convert_to_category=False)

Memory usage of dataframe is 1775.15 MB
Memory usage after optimization is: 542.35 MB
Decreased by 69.4%
CPU times: user 2min 25s, sys: 2min 57s, total: 5min 23s
Wall time: 5min 56s


#### Create a sample dataset of 200k records from the original dataset

In [4]:
# Take a sample of 200k records
%time trans = trans.sample(n=200000)

df_size = trans.memory_usage().sum() / 1024**2
print(f'Memory usage of sample dataframe is {df_size} MB')

CPU times: user 1.39 s, sys: 776 ms, total: 2.16 s
Wall time: 2.43 s
Memory usage of sample dataframe is 185.20355224609375 MB


#### Save the sample dataset in local drive - CSV format

In [5]:
import os

os.makedirs('tmp', exist_ok=True)
%time trans.to_csv('tmp/train_transaction_sample.csv', index = False)

CPU times: user 1min 9s, sys: 1.32 s, total: 1min 10s
Wall time: 1min 12s


#### Save the sample dataset in local drive - Feather format

In [9]:
# The sampled rows still carry their row labels from the original dataset,
# in random order. Feather requires a default index, so renumber the rows
# 0..n-1 (dropping the old labels) before saving.
trans = trans.reset_index(drop=True)

In [10]:
import os 

os.makedirs('tmp', exist_ok=True)
%time trans.to_feather('tmp/train_transaction_sample')

CPU times: user 1.61 s, sys: 235 ms, total: 1.84 s
Wall time: 904 ms


##### Load the saved sample data - CSV Format
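Note: the in-memory dataframe was about 185 MB before saving. Observe how much larger it is after loading it back from CSV: CSV stores no dtype information, so pandas re-infers default (wider) dtypes on load.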

In [12]:
%time tmp1 = pd.read_csv('tmp/train_transaction_sample.csv')

df_size = tmp1.memory_usage().sum() / 1024**2
print(f'Memory usage of dataframe is {df_size} MB')
print (f'Shape of dataframe is {tmp1.shape}')

CPU times: user 7.37 s, sys: 1.06 s, total: 8.42 s
Wall time: 8.5 s
Memory usage of dataframe is 601.1964111328125 MB
Shape of dataframe is (200000, 394)


##### Load the saved sample data - Feather Format
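Note: the in-memory dataframe was about 185 MB before saving. Observe that loading it back from Feather keeps the size essentially unchanged, because Feather preserves the column dtypes (the small drop comes from the index, which was reset before saving).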

In [2]:
%time trans = pd.read_feather('tmp/train_transaction_sample')

df_size = trans.memory_usage().sum() / 1024**2
print(f'Memory usage of dataframe is {df_size} MB')
print (f'Shape of dataframe is {trans.shape}')

CPU times: user 1.32 s, sys: 930 ms, total: 2.25 s
Wall time: 1.92 s
Memory usage of dataframe is 183.67779541015625 MB
Shape of dataframe is (200000, 394)


### Split Data - Train, Valid, Test

In [8]:
from sklearn.model_selection import train_test_split

# Create a train, validation and test set.
# Step 1 holds out 10% of all rows for validation; step 2 holds out 10% of
# the remaining rows for test, i.e. roughly 81% / 10% / 9% overall.
# random_state pins both shuffles so the splits are identical on every run.
train, valid = train_test_split(trans, test_size=.10, random_state=42)
train, test = train_test_split(train, test_size=.10, random_state=42)

# Sanity check: the three parts must account for every row exactly once
assert len(train) + len(valid) + len(test) == len(trans)

print(f'train shape : {train.shape} \t valid shape : {valid.shape} \t test shape : {test.shape}')

train shape : (162000, 394) 	 valid shape : (20000, 394) 	 test shape : (18000, 394)
