## Read all rows of the train and test data, but only a subset of columns. This way we can train a model using all rows!
## All generated files contain customer_ID and S_2 (date), which together uniquely identify a row

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc # garbage collector to free memory

### Let's first get the columns of train in a list. For that we will just read 1 line !

In [None]:
# Read only the first data row: that is enough to obtain the column names
# without loading the whole file.
train_1_line = pd.read_csv("/kaggle/input/amex-default-prediction/train_data.csv", nrows = 1)
# Bare expression so that the notebook displays the one-row frame.
train_1_line

### Store all columns, and split them by prefix:
* D_* = Delinquency variables
* S_* = Spend variables
* P_* = Payment variables
* B_* = Balance variables
* R_* = Risk variables

### in different lists

In [None]:
# Every column name of the train file, in file order
all_cols = train_1_line.columns.tolist()

# Delinquency variables (prefix D_)
delinquency_cols = [name for name in all_cols if name.startswith('D_')]

# Spend variables (prefix S_)
spend_cols = [name for name in all_cols if name.startswith('S_')]
# S_2 is a date, so it is taken out of the spend features
spend_cols.remove('S_2')

# Payment variables (prefix P_)
payment_cols = [name for name in all_cols if name.startswith('P_')]

# Balance variables (prefix B_)
balance_cols = [name for name in all_cols if name.startswith('B_')]

# Risk variables (prefix R_)
risk_cols = [name for name in all_cols if name.startswith('R_')]

# Columns identifying a row: customer_ID and S_2
identification_cols = ['customer_ID', 'S_2']

### Check: the total length of all lists must equal the length of all_cols

In [None]:
# Sanity check: displays True when the six lists together contain as many
# names as all_cols.
len( identification_cols +  delinquency_cols + spend_cols + payment_cols + balance_cols + risk_cols) == len(all_cols)

### We will also need the indices of these columns

In [None]:
# Positions (0-based, within all_cols) of each family of columns.
# These are the integer indices later passed as cols_indices to the readers.

# Delinquency variables
delinquency_cols_indices = [all_cols.index(c) for c in delinquency_cols]

# Spend variables
spend_cols_indices = [all_cols.index(c) for c in spend_cols]

# Payment variables
payment_cols_indices = [all_cols.index(c) for c in payment_cols]

# Balance variables
balance_cols_indices = [all_cols.index(c) for c in balance_cols]

# Risk variables
# (built from risk_cols; it was previously built from delinquency_cols,
# which made it a copy of delinquency_cols_indices)
risk_cols_indices = [all_cols.index(c) for c in risk_cols]

# Identification cols
identification_cols_indices = [all_cols.index(c) for c in identification_cols]

### Check for payment_cols

In [None]:
# Display the list of payment column names for a visual check.
payment_cols

In [None]:
# Display the identification columns next to the payment columns, the latter
# being looked up through their indices to check that the indices are right.
train_1_line[identification_cols + [all_cols[i] for i in payment_cols_indices]]

### Let's create functions for reading all rows of train but only a subset of columns
### We will need the number of rows of each file, but I have already calculated them in [that notebook](https://www.kaggle.com/code/amineteffal/group-split-data-by-customer)
### These numbers are used for iterating over the original files

In [None]:
# Line counts of the two input files, header line included
train_data_rows_count = 5_531_452
test_data_rows_count = 11_363_763

# Number of columns in the files
n_cols = 190

### Create a function that reads a chunk of the file but only a subset of columns

In [None]:
def read_a_chunk_cols(csv_file, chunk_size, chunk_order, cols_indices) :
    '''
        Read one chunk of csv_file, keeping only a subset of its columns.

        csv_file     : path of the CSV file. It is opened more than once, so
                       pass a path rather than an already-open stream.
        chunk_size   : number of data rows in a chunk
        chunk_order  : 0-based rank of the chunk to read; the data rows of the
                       chunk_order previous chunks are skipped
        cols_indices : list of column positions to keep. The returned columns
                       follow the order of this list (repeats are kept);
                       negative positions count from the last column.

        Returns a DataFrame with at most chunk_size rows.
        Raises IndexError when a position is outside the file's columns.

        Only the requested columns are parsed (usecols), instead of parsing
        every column of the chunk and dropping most of them afterwards.
    '''

    # Parsing zero rows gives the header only, which is cheap even on a huge file.
    header = pd.read_csv(csv_file, nrows=0).columns

    # Indexing a range normalises negative positions and raises IndexError on
    # out-of-range ones, exactly as indexing the header itself would.
    positions = [range(len(header))[i] for i in cols_indices]

    # Data rows belonging to the previous chunks; line 0 (the header) is kept.
    rows_to_skip = range(1, chunk_order * chunk_size + 1)

    if not positions:
        # Nothing requested: return the rows of the chunk with zero columns.
        return pd.read_csv(csv_file, skiprows=rows_to_skip, nrows=chunk_size)[[]]

    wanted = sorted(set(positions))
    chunk_data = pd.read_csv(csv_file, skiprows=rows_to_skip, nrows=chunk_size, usecols=wanted)

    # read_csv returns the usecols columns in file order, whatever the order
    # they were asked in, so map every requested position to its slot in the
    # parsed frame to restore the caller's order.
    slot = {position: k for k, position in enumerate(wanted)}

    return chunk_data.iloc[:, [slot[position] for position in positions]]

In [None]:
def read_all_chunk_cols(csv_file, chunk_size, cols_indices):

    '''
        Read every row of csv_file, one chunk of chunk_size rows at a time,
        keeping only the columns whose positions are listed in cols_indices.

        Chunks are requested until one comes back with fewer than chunk_size
        rows, so the whole file is read whatever its length. The loop is not
        bounded by the row count of one particular file (it used to rely on
        train_data_rows_count, which cut any longer file short).

        Returns the concatenation of all chunks. The row index is not reset:
        pd.concat keeps the index of every chunk, so it restarts at 0 in each
        of them.

        Raises ValueError if chunk_size is smaller than 1.
    '''

    if chunk_size < 1:
        # A chunk of zero rows could never come back "short", so the loop
        # below would not end.
        raise ValueError("chunk_size must be at least 1")

    pieces = []
    chunk_order = 0
    while True:
        piece = read_a_chunk_cols(csv_file, chunk_size, chunk_order, cols_indices)

        # Keep an empty chunk only when it is the first one (file without
        # data rows), so that the result still carries the column names.
        if len(piece) or not pieces:
            pieces.append(piece)

        # A short (possibly empty) chunk means the end of the file was reached.
        if len(piece) < chunk_size:
            break
        chunk_order += 1

    # Concatenate once at the end instead of copying the growing frame at
    # every iteration.
    all_rows = pd.concat(pieces)

    # free memory and call garbage collector
    del pieces
    gc.collect()

    return all_rows

### Define chunk size

In [None]:
# Number of data rows read per chunk
chunk_size = 1_000_000

### Test the functions on train data: read the first 10 columns (+ customer_ID and S_2)

In [None]:
# How many feature columns to load for this trial run
n_cols_to_read = 10

# Identification columns first, then the feature columns that follow them
# (the range starts at 2 because of customer_ID and S_2)
cols_indices = [*identification_cols_indices, *range(2, n_cols_to_read + 2)]

train_10_first_cols = read_all_chunk_cols(
    "/kaggle/input/amex-default-prediction/train_data.csv",
    chunk_size,
    cols_indices,
)

In [None]:
# Preview the first 10 rows of the frame that was just read.
train_10_first_cols.head(10)

In [None]:
# Display (number of rows, number of columns) of the frame that was just read.
train_10_first_cols.shape

### Read all rows of spend columns (+ customerID and S_2)

In [None]:
# Release the trial frame before loading the next one
del train_10_first_cols
gc.collect()

# Identification columns first, then every spend column
cols_indices = [*identification_cols_indices, *spend_cols_indices]

train_spend_cols = read_all_chunk_cols(
    "/kaggle/input/amex-default-prediction/train_data.csv",
    chunk_size,
    cols_indices,
)


In [None]:
# Print the summary produced by DataFrame.info() for the spend subset.
train_spend_cols.info()

In [None]:
# Preview the first 5 rows of the spend subset.
train_spend_cols.head(5)

In [None]:
# Display (number of rows, number of columns) of the spend subset.
train_spend_cols.shape

### Get labels

In [None]:
# Labels, indexed by customer so that they can be joined on customer_ID
train_labels = pd.read_csv(
    "/kaggle/input/amex-default-prediction/train_labels.csv"
).set_index('customer_ID')

# Right join: every customer of the labels file is kept, then customer_ID is
# turned back into an ordinary column
train_spend_cols = (
    train_spend_cols.set_index('customer_ID')
    .join(train_labels, lsuffix='_caller', rsuffix='_other', how='right')
    .reset_index()
)

In [None]:
# Preview the first rows after the labels were joined.
train_spend_cols.head()

In [None]:
# Display (number of rows, number of columns) after the labels were joined.
train_spend_cols.shape

In [None]:
# Save the labelled spend subset to csv in the working folder;
# index = False leaves the row index out of the file.
train_spend_cols.to_csv("/kaggle/working/train_spend_cols.csv", index = False)

### Do same thing for test data

In [None]:
# Release the train frame before loading the test data
del train_spend_cols
gc.collect()

# Same selection as for train: identification columns, then the spend columns
cols_indices = [*identification_cols_indices, *spend_cols_indices]

test_spend_cols = read_all_chunk_cols(
    "/kaggle/input/amex-default-prediction/test_data.csv",
    chunk_size,
    cols_indices,
)


In [None]:
# Preview the first rows of the test spend subset.
test_spend_cols.head()

In [None]:
# Display (number of rows, number of columns) of the test spend subset.
test_spend_cols.shape

In [None]:
# Save the test spend subset to csv in the working folder;
# index = False leaves the row index out of the file.
test_spend_cols.to_csv("/kaggle/working/test_spend_cols.csv", index = False)

### If you want to split all rows of train data into several smaller files (each holding a subset of columns) and store them as output, uncomment this code!

In [None]:
# # clean memory
# del train_spend_cols
# del test_spend_cols
# gc.collect()

In [None]:
# n_cols_to_read = 20 # number of columns to read at each iteration
# n_chunks = int(n_cols/n_cols_to_read)

# for i in range(n_chunks + 1):
#     start_col = i*n_cols_to_read + 2  # + 2 because of customer_ID and S_2
#     end_col = min(n_cols, (i+1)*n_cols_to_read + 2) # we can't have indice bigger than n_cols
#     if start_col < n_cols:
#         # customer_ID and S_2 columns will be always present --> [0, 1]
#         cols_indices = identification_cols_indices + list(range(start_col, end_col)) 
#         chunk = read_all_chunk_cols("/kaggle/input/amex-default-prediction/train_data.csv", chunk_size, cols_indices)
#         chunk.to_csv("/kaggle/working/train_" + str(i) + ".csv", index=False)
#         del chunk
#         gc.collect()
    

## We can do the same thing for test data, but unfortunately we can't store the files because the output folder is limited to 20 GB!
## So I commented out the code.
## We can solve this by running this code in different notebooks (changing the range of the i variable each time) to generate all the test data, and then importing the files into another notebook (together with the split train data) in order to train different models and aggregate their predictions into one prediction.

In [None]:
# n_cols_to_read = 10 # as test twice huge, read by 10 cols
# n_chunks = int(n_cols/n_cols_to_read)

# for i in range(n_chunks + 1):
#     start_col = i*n_cols_to_read + 2  # + 2 because of customer_ID and S_2
#     end_col = min(n_cols, (i+1)*n_cols_to_read + 2) # we can't have indice bigger than n_cols
#     if start_col < n_cols:
#         # customer_ID and S_2 columns will be always present --> [0, 1]
#         cols_indices = identification_cols_indices + list(range(start_col, end_col)) 
#         chunk = read_all_chunk_cols("/kaggle/input/amex-default-prediction/test_data.csv", chunk_size, cols_indices)
#         chunk.to_csv("/kaggle/working/test_" + str(i) + ".csv", index=False)
#         del chunk
#         gc.collect()