# Notebook to create smaller, more manageable train and test data for future experimentation

In this notebook, I use the Pandas and Pickle libraries to reduce the size of the training and test data, so that the community can use the smaller files to work on this competition.

**If you like my work, please upvote the kernel !!**

In [None]:
# All imports go here
import numpy as np 
import pandas as pd 
import os
import pickle
import glob

In [None]:
# Relative path of the training CSV that the cells below read with pd.read_csv.
fn = "../input/amex-default-prediction/train_data.csv"

In [None]:
%%time

# Read only the first 100 rows: enough to inspect the columns without loading the whole file.
# The column list of this preview is used in the next cell to build the dtype mapping;
# train_df itself is overwritten later with the full, reassembled data.
train_df = pd.read_csv(fn,nrows=100)
train_df.head()

In [None]:
# Column inventory, taken from the 100-row preview loaded above.
all_cols = train_df.columns.to_list()

# Columns that must not be read as floats.
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
str_cols = ["customer_ID"]
date_cols = ["S_2"]

# Every remaining column is read as float16 to reduce the size of the loaded data.
df_dtype = dict.fromkeys(
    (name for name in all_cols if name not in cat_cols + str_cols + date_cols),
    "float16",
)

# Categorical columns use the pandas "category" dtype; the customer ID stays a string.
# The date column gets no entry here: the read_csv calls below pass parse_dates=[1]
# (presumably the position of S_2 -- confirm against the CSV header).
df_dtype.update(dict.fromkeys(cat_cols, "category"))
df_dtype["customer_ID"] = "str"

I got the idea and the code from the link below:

https://stackoverflow.com/questions/25962114/how-do-i-read-a-large-csv-file-with-pandas

In [None]:
%%time

out_path = "/kaggle/working" #Path to save the pickle files to
chunk_size = 400000 #size of chunks relies on your available memory


reader = pd.read_csv(fn,chunksize=chunk_size, low_memory=False,dtype=df_dtype,parse_dates=[1])    


for i, chunk in enumerate(reader):
    out_file = out_path + "/data_{}.pkl".format(i+1)
    with open(out_file, "wb") as f:
        pickle.dump(chunk,f,pickle.HIGHEST_PROTOCOL)
        print(f"Written chunk {i+1}")

In [None]:
pickle_path = "/kaggle/working" #Same Path as out_path i.e. where the pickle files are

# Collect the training chunk files written above.
# - "[0-9]" restricts the match to numbered chunks, so data_test*.pkl is never picked up.
# - glob.glob returns matches in arbitrary order; sorting by (length, name) restores the
#   numeric chunk order (data_2 before data_10) so the row order is reproducible.
data_p_files = sorted(
    glob.glob(pickle_path + "/data_[0-9]*.pkl"),
    key=lambda path: (len(path), path),
)

# Load all chunks and concatenate them once. DataFrame.append was removed in pandas 2.0
# and re-copied the whole accumulated frame on every iteration.
# NOTE: read_pickle executes pickled code; only use it on files this notebook wrote itself.
chunk_frames = [pd.read_pickle(path) for path in data_p_files]

# pd.concat raises on an empty list, so keep the original "empty frame" result in that case.
train_df = pd.concat(chunk_frames, ignore_index=True) if chunk_frames else pd.DataFrame([])

# Drop the extra references to the per-chunk frames so their memory can be reclaimed.
del chunk_frames

In [None]:
# Sanity check: (rows, columns) of the training frame reassembled from the chunk pickles.
train_df.shape

In [None]:
# Load the training labels; a later cell joins them onto train_df via "customer_ID".
train_labels = pd.read_csv("../input/amex-default-prediction/train_labels.csv")
train_labels.head()

In [None]:
# (rows, columns) of the label table.
train_labels.shape

In [None]:
%%time

train_df = train_df.merge(train_labels,on="customer_ID",how="left")
train_df.shape

In [None]:
# Persist the merged training frame as a single pickle.
# The relative path resolves against the current working directory.
train_df.to_pickle("amex_train_data.pkl")

In [None]:
#Clean up the output area

# Delete every intermediate training chunk (data_<n>.pkl) instead of assuming a fixed
# number of chunks: the count depends on chunk_size and the size of the CSV.
# "[0-9]" keeps the match away from data_test*.pkl and from the final amex_*.pkl files.
# os.remove raises on failure, whereas the previous shell "rm" call failed silently.
for chunk_file in glob.glob("/kaggle/working/data_[0-9]*.pkl"):
    os.remove(chunk_file)

In [None]:
%%time

# Relative path of the test CSV; read only the first 100 rows as a quick preview.
# test_df is overwritten later with the full, reassembled data.
fn_test = "../input/amex-default-prediction/test_data.csv"
test_df = pd.read_csv(fn_test,nrows=100)
print(test_df.shape)
test_df.head()


In [None]:
%%time

out_path = "/kaggle/working" #Path to save the pickle files to
chunk_size = 400000 #size of chunks relies on your available memory


reader = pd.read_csv(fn_test,chunksize=chunk_size, low_memory=False,dtype=df_dtype,parse_dates=[1])    


for i, chunk in enumerate(reader):
    out_file = out_path + "/data_test{}.pkl".format(i+1)
    with open(out_file, "wb") as f:
        pickle.dump(chunk,f,pickle.HIGHEST_PROTOCOL)
        print(f"Written chunk {i+1}")

In [None]:
pickle_path = "/kaggle/working" #Same Path as out_path i.e. where the pickle files are

# Collect the test chunk files written above.
# glob.glob returns matches in arbitrary order; sorting by (length, name) restores the
# numeric chunk order (data_test2 before data_test10) so the row order is reproducible.
data_p_files = sorted(
    glob.glob(pickle_path + "/data_test[0-9]*.pkl"),
    key=lambda path: (len(path), path),
)

# Load all chunks and concatenate them once. DataFrame.append was removed in pandas 2.0
# and re-copied the whole accumulated frame on every iteration.
# NOTE: read_pickle executes pickled code; only use it on files this notebook wrote itself.
chunk_frames = [pd.read_pickle(path) for path in data_p_files]

# pd.concat raises on an empty list, so keep the original "empty frame" result in that case.
test_df = pd.concat(chunk_frames, ignore_index=True) if chunk_frames else pd.DataFrame([])

# Drop the extra references to the per-chunk frames so their memory can be reclaimed.
del chunk_frames

In [None]:
# Sanity check: (rows, columns) of the test frame reassembled from the chunk pickles.
test_df.shape

In [None]:
# Persist the reassembled test frame as a single pickle.
# The relative path resolves against the current working directory.
test_df.to_pickle("amex_test_data.pkl")

In [None]:
#Clean up the output area

# Delete every intermediate test chunk (data_test<n>.pkl) instead of assuming a fixed
# number of chunks: the count depends on chunk_size and the size of the CSV.
# os.remove raises on failure, whereas the previous shell "rm" call failed silently.
for chunk_file in glob.glob("/kaggle/working/data_test[0-9]*.pkl"):
    os.remove(chunk_file)