# Loading unique customer data 
I use the fact that the data is sorted by `customer_ID` to load only the rows belonging to each customer during training.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Folder holding the competition CSV files. The trailing slash matters: the
# read_csv calls below build paths by plain concatenation (data_dir + file name).
data_dir = "/kaggle/input/amex-default-prediction/"

In [None]:
# Feature columns treated as categorical in this notebook.
cat_cols = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]

# A five-row sample is enough to discover the column names of the training file.
data_temp = pd.read_csv(
    data_dir + "train_data.csv",
    nrows=5,
)

# Every column that is not listed as categorical, in file order.
# NOTE(review): this list still contains "customer_ID" and "S_2", which
# TrainCustomerData leaves out of its continuous columns - confirm that is intended.
num_cols = [
    column_name
    for column_name in data_temp.columns
    if column_name not in cat_cols
]

## Read the customer IDs from train and test data

In [None]:
# Load just the customer_ID column of each file (train first, then test);
# the feature columns are not needed to work out which rows belong to whom.
train_customers, test_customers = (
    pd.read_csv(data_dir + csv_name, usecols=["customer_ID"])
    for csv_name in ("train_data.csv", "test_data.csv")
)

# Get the indices related to each customer
Since the data is sorted by `customer_ID`, each customer's rows are contiguous, so one can use the `skiprows` and `nrows` arguments of `pd.read_csv` to read a single customer's data without loading the whole dataset.

In [None]:
# Map each customer_ID to the array of 0-based row positions it occupies in the
# corresponding CSV. `GroupBy.indices` produces this dictionary directly
# (keys in sorted order, positions ascending within each customer), which
# replaces the per-group Python lambda that was previously run once per customer.
train_customer_indices = train_customers.groupby("customer_ID").indices
test_customer_indices = test_customers.groupby("customer_ID").indices

I use the `Dataset` class from PyTorch to form batches of customers by loading each customer's data during training.

In [None]:
import torch 
from torch.utils.data import Dataset 

In [None]:
class TrainCustomerData(Dataset):
    """Map-style dataset that reads one customer's rows from ``train_data.csv`` on demand.

    Every item is fetched with a targeted ``pd.read_csv`` call (``skiprows`` /
    ``nrows``), so the full training file is never held in memory. This relies
    on each customer's rows being contiguous in the file; note that every
    lookup still has to scan the file up to the customer's first row.

    Args:
        customer_indices: Mapping ``customer_ID -> array of 0-based data-row
            positions`` (header excluded) of that customer in ``train_data.csv``.
        data_dir: Folder containing ``train_data.csv`` and ``train_labels.csv``,
            with or without a trailing path separator.
        cat_cols: Names of the categorical feature columns. All remaining
            columns except ``customer_ID`` and ``S_2`` are treated as continuous.

    Raises:
        TypeError: If ``data_dir`` is not provided.
    """

    def __init__(self, customer_indices, data_dir=None, cat_cols=['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']):
        if data_dir is None:
            raise TypeError("data_dir must be the folder containing train_data.csv and train_labels.csv")

        self.customer_ids = tuple(customer_indices)
        self.customer_indices = tuple(customer_indices.values())

        # Keep a private copy: __getitem__ must use the columns this instance was
        # built with, not whatever a notebook-level `cat_cols` happens to contain.
        self.cat_cols = list(cat_cols)

        self.data_dir = data_dir
        # Despite its name this is the path of the training CSV file; the
        # attribute name is kept for backward compatibility.
        self.train_data_dir = os.path.join(data_dir, "train_data.csv")
        self.train_labels = pd.read_csv(os.path.join(data_dir, "train_labels.csv")).set_index("customer_ID")

        # nrows=0 parses only the header line, which is all that is needed here.
        self.data_columns = pd.read_csv(self.train_data_dir, nrows=0).columns.to_list()
        excluded = set(self.cat_cols) | {"customer_ID", "S_2"}
        self.cont_cols = [col for col in self.data_columns if col not in excluded]

    def __len__(self):
        """Return the number of customers."""
        return len(self.customer_indices)

    def __getitem__(self, index):
        """Load the data of the ``index``-th customer.

        Returns:
            A tuple ``(cont, cat, label, customer_id)`` where ``cont`` is a
            float32 tensor of shape ``(n_rows, n_continuous_columns)``, ``cat``
            is a NumPy array with the categorical columns of the same rows,
            ``label`` is an int32 tensor with the values of the customer's row
            in ``train_labels.csv`` and ``customer_id`` is the ID read from the file.
        """
        row_positions = self.customer_indices[index]
        first_row, last_row = int(row_positions[0]), int(row_positions[-1])

        # File line 0 is the header, so data row k lives on line k + 1. Skipping
        # lines 1..first_row keeps the header and starts reading at the
        # customer's first row.
        customer_data = pd.read_csv(
            self.train_data_dir,
            skiprows=range(1, first_row + 1),
            nrows=last_row - first_row + 1,
            header=0,
        )
        customer_id = customer_data["customer_ID"].iloc[0]

        # Selecting columns by name already leaves out customer_ID and S_2,
        # so no in-place drop is required.
        customer_cont_tensor_data = torch.as_tensor(
            customer_data[self.cont_cols].values, dtype=torch.float32
        )
        customer_cat_data = customer_data[self.cat_cols].values

        customer_label = torch.as_tensor(
            self.train_labels.loc[customer_id].values, dtype=torch.int32
        )

        return customer_cont_tensor_data, customer_cat_data, customer_label, customer_id



In [None]:
# Training labels; the path is built from data_dir so the dataset location is
# defined in one place instead of being repeated as an absolute path.
label = pd.read_csv(data_dir + "train_labels.csv")