# Dataset Minification

### The goal of this notebook is to offer a first preprocessing step so that you can manipulate this huge dataset easily. The final train and test dataframes are saved as .pkl files, which allows you to load them quickly!

#### Imports

In [None]:
import json

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from time import time

#### Loading

In [None]:
%%time
N_ROWS = int(1e6)                     # number of rows = 11M
train = pd.read_csv("/kaggle/input/data-science-bowl-2019/train.csv", nrows=N_ROWS)
test = pd.read_csv("/kaggle/input/data-science-bowl-2019/test.csv", nrows=N_ROWS)

In [None]:
# Report the in-memory footprint (in MB) of each freshly loaded frame.
usage_reports = (
    (train, "Memory usage of the train is : {:.1f} MB for now"),
    (test, "Memory usage of the test is : {:.1f} MB for now"),
)
for frame, template in usage_reports:
    start_mem_usg = frame.memory_usage().sum() / 1024 ** 2
    print(template.format(start_mem_usg))

In [None]:
# Preview the first five rows (head() defaults to n=5).
train.head()

## Exploring event_data column

### `event_data` seems interesting. I think it is the main source of information.
### The data is given in json format, so we'll parse it to be able to create columns

In [None]:
def _parse_event_data(raw):
    """Decode one `event_data` cell from its JSON text.

    Values that are not strings (for example cells already decoded by a
    previous run of this cell) are returned unchanged, so the cell can be
    re-executed without raising.
    """
    return json.loads(raw) if isinstance(raw, str) else raw


# Decode the JSON payload of both frames in place.
df_to_minify = [train, test]
for df in df_to_minify:
    df['event_data'] = df['event_data'].apply(_parse_event_data)

In [None]:
# Collect every key that appears in at least one train event_data payload.
event_data = train['event_data'].tolist()

# Accumulate straight into a set: memory stays proportional to the number of
# distinct keys instead of the number of (row, key) pairs.
seen_keys = set()
for my_json in event_data:
    seen_keys.update(my_json.keys())

# Sort so that the order of the columns created from these keys is the same
# on every run (plain set iteration order is arbitrary for strings).
unique_keys = sorted(seen_keys)
print('event_data contains {} new columns'.format(len(unique_keys)))
print('Some new columns are:', unique_keys[:5])

In [None]:
def _key_getter(key):
    """Build a function that reads `key` from a payload, or NaN if absent."""
    def getter(payload):
        try:
            value = payload[key]
        except KeyError:
            return np.nan
        else:
            return value
    return getter


# Materialise one column per discovered key, on both frames.
for ky in tqdm(unique_keys):
    extract = _key_getter(ky)
    train[ky] = train['event_data'].apply(extract)
    test[ky] = test['event_data'].apply(extract)


print('Train shape is:', train.shape)
print('Test shape is:', test.shape)
start_mem_usg = train.memory_usage().sum() / 1024 ** 2
print("Memory usage of the train dataframe is : {:.1f} MB for now".format(start_mem_usg))

### Remove columns whose variance is very low or that have too many missing values

Please modify these two thresholds to fit your needs

In [None]:
# Toggle these filters to drop columns with low spread or with many NaNs.
VAR_FILTER = True
NAN_FILTER = True

VAR_THRESH = .1    # compared against the column standard deviation
NAN_THRESH = .99   # compared against the fraction of missing values

flagged_cols = set()

if VAR_FILTER:
    # numeric_only=True is required: the frame still holds non-numeric columns,
    # and recent pandas versions raise a TypeError instead of silently
    # skipping them.
    low_spread = train.std(numeric_only=True) <= VAR_THRESH
    flagged_cols.update(low_spread.index[low_spread])

if NAN_FILTER:
    mostly_missing = train.isna().mean() >= NAN_THRESH
    flagged_cols.update(mostly_missing.index[mostly_missing])

cols_to_drop = list(flagged_cols)
train.drop(cols_to_drop, axis=1, inplace=True)
test.drop(cols_to_drop, axis=1, inplace=True)

print('We dropped {} columns'.format(len(cols_to_drop)))
print('Train shape is: ', train.shape)
print('Test shape is: ', test.shape)

In [None]:
# The information held in event_data now lives in dedicated columns, so the
# raw payload can go. errors='ignore' makes each drop a no-op when the column
# is already absent (e.g. when this cell is re-run), and, unlike a bare
# try/except around both calls, a missing column in train can no longer
# prevent the drop on test.
train.drop(columns='event_data', errors='ignore', inplace=True)
test.drop(columns='event_data', errors='ignore', inplace=True)

train.head()

## Now we will label-encode some variables to store them as small ints (instead of objects)

In [None]:
# A column is a label-encoding candidate when it has fewer distinct values
# than this (NaN counts as one value).
MAX_UNIQUE_FOR_ENCODING = 10

col_to_label_encode = list()
for col in train.columns:
    try:
        n_unique = len(train[col].unique())
    except TypeError:
        # unique() hashes the values; columns holding unhashable objects
        # (e.g. nested lists or dicts) cannot be counted and are skipped.
        continue
    if n_unique < MAX_UNIQUE_FOR_ENCODING:
        col_to_label_encode.append(col)

In [None]:
# Maps each encoded column to its {original value: integer code} dictionary.
correspondance_dict = dict()
skipped_cols = list()

for col in col_to_label_encode:
    le = LabelEncoder()
    try:
        # Compute both encodings BEFORE touching either frame, so a failure on
        # test (e.g. a label never seen in train) cannot leave train encoded
        # while test keeps its raw values.
        train_codes = le.fit_transform(train[col])
        test_codes = le.transform(test[col])
    except (KeyError, TypeError, ValueError):
        # The variable is not label encodable: leave it untouched in both frames.
        skipped_cols.append(col)
        continue

    train[col] = train_codes
    test[col] = test_codes
    correspondance_dict[col] = dict(zip(le.classes_, le.transform(le.classes_)))

if skipped_cols:
    print('Columns left unencoded:', skipped_cols)

correspondance_dict

In [None]:
def _smallest_int_dtype(mn, mx):
    """Return the narrowest integer dtype able to hold the range [mn, mx].

    Unsigned types are used when `mn` is non-negative, signed types otherwise.
    Bounds are strict: a value equal to a dtype's limit is promoted to the next
    wider type. Returns None when no candidate fits (for instance when the
    bounds are NaN because the column is empty).
    """
    # Work with plain Python scalars so comparisons against dtype limits never
    # depend on numpy's scalar promotion rules.
    mn = mn.item() if hasattr(mn, "item") else mn
    mx = mx.item() if hasattr(mx, "item") else mx

    if mn >= 0:
        for candidate in (np.uint8, np.uint16, np.uint32):
            if mx < np.iinfo(candidate).max:
                return candidate
        return np.uint64

    for candidate in (np.int8, np.int16, np.int32, np.int64):
        limits = np.iinfo(candidate)
        if mn > limits.min and mx < limits.max:
            return candidate
    return None


def reduce_mem_usage(props, log=False):
    """Downcast the numeric columns of `props` to smaller dtypes, in place.

    Only columns with a numpy bool, integer or float dtype are processed;
    every other column (strings, datetimes, categoricals, extension dtypes) is
    left untouched.

    * Bool / integer columns are cast to the narrowest integer dtype that
      holds their range.
    * Float columns that contain NaN have those NaN replaced by
      ``min - 1`` (a sentinel below every real value). WARNING: this changes
      the data; the missing values are no longer NaN afterwards.
    * Float columns whose values are all whole numbers are then cast to the
      narrowest integer dtype; the others are cast to float32.
    * Float columns that are entirely NaN, or that contain +/-inf, are only
      cast to float32 (no fill, no integer conversion).

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place.
    log : bool, default False
        When True, print the dtype of each processed column before and after.

    Returns
    -------
    pandas.DataFrame
        The same `props` object, after modification.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :", round(start_mem_usg, 2), " MB")
    na_list = []  # Columns whose missing values were replaced by a sentinel.

    for col in props.columns:
        series = props[col]
        dtype = series.dtype

        # Restrict to plain numpy numeric dtypes; anything else is skipped.
        if not isinstance(dtype, np.dtype) or dtype.kind not in "biuf":
            continue

        if log:
            print("******************************")
            print("Column: ", col)
            print("dtype before: ", dtype)

        target = np.float32
        if dtype.kind != "f":
            # Already integral (or bool): only the width can be reduced.
            target = _smallest_int_dtype(series.min(), series.max())
        elif series.notna().any() and not np.isinf(series.to_numpy()).any():
            if series.isna().any():
                na_list.append(col)
                # Assign the result back instead of a chained inplace fillna,
                # which does not reach the frame under copy-on-write.
                series = series.fillna(series.min() - 1)

            # Bounds are taken AFTER the fill so the sentinel is accounted for
            # when choosing between signed and unsigned types.
            mn, mx = series.min(), series.max()

            # Element-wise check: a sum of fractional parts could cancel out
            # (e.g. +0.5 and -0.5) and wrongly flag the column as integral.
            whole = bool((series == np.floor(series)).all())
            if whole and -(2.0 ** 63) < mn and mx < 2.0 ** 63:
                target = _smallest_int_dtype(mn, mx)

        if target is not None:
            props[col] = series.astype(target)

        if log:
            print("dtype after: ", props[col].dtype)
            print("******************************")

    if log and na_list:
        print("Columns with missing values filled: ", na_list)

    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is now: ", round(mem_usg, 2), " MB")
    print("This is ", round(100 * mem_usg / start_mem_usg, 2), "% of the initial size")
    return props

In [None]:
# Shrink both frames, train first and then test, and rebind the names to the
# returned frames.
train, test = [reduce_mem_usage(frame, log=False) for frame in (train, test)]

In [None]:
# Persist the minified frames as pickles in the working directory.
for frame, pickle_path in ((train, 'train.pkl'), (test, 'test.pkl')):
    frame.to_pickle(pickle_path)