### This notebook groups the data by customer in order to produce smaller files.
### The aggregation functions used are min, mean and max (numeric variables only).

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc # garbage collector to free memory

### As the files are huge, let's first count the number of lines in each of them

In [None]:
###################### code adapted from : https://www.geeksforgeeks.org/how-to-count-the-number-of-lines-in-a-csv-file-in-python/ ###############
def count_rows(csv_file):
    """Return the number of lines in ``csv_file``.

    The file is streamed line by line, so it is never loaded into memory as
    a whole. Every line is counted, which means a header line (if the file
    has one) is part of the result.

    Args:
        csv_file: Path of the file to scan.

    Returns:
        int: Number of lines in the file (0 for an empty file).
    """
    # The context manager closes the handle as soon as counting is done,
    # instead of leaving it open until the garbage collector reclaims it.
    with open(csv_file) as csv_handle:
        return sum(1 for _ in csv_handle)
    

In [None]:
# Count the lines of the four input CSV files. count_rows() counts every
# line, so each figure includes the file's first (header) line.
train_data_rows_count = count_rows("/kaggle/input/amex-default-prediction/train_data.csv")
test_data_rows_count = count_rows("/kaggle/input/amex-default-prediction/test_data.csv")
train_labels_rows_count = count_rows("/kaggle/input/amex-default-prediction/train_labels.csv")
sample_submission_rows_count = count_rows("/kaggle/input/amex-default-prediction/sample_submission.csv")

# Report each count next to its label.
for label, rows_count in (
    ("train_data_rows_count : ", train_data_rows_count),
    ("test_data_rows_count : ", test_data_rows_count),
    ("train_labels_rows_count : ", train_labels_rows_count),
    ("sample_submission_rows_count : ", sample_submission_rows_count),
):
    print(label, rows_count)

### Create a function that reads a chunk of the file and aggregates it by customer_ID (keeping numeric columns only)

In [None]:
def read_a_chunk(csv_file, chunk_size, chunk_order):
    '''
        Load one chunk of csv_file and aggregate it per customer_ID.

        The chunk is selected by its zero-based position chunk_order and
        holds at most chunk_size data rows. The S_2, D_63 and D_64 columns
        are dropped, missing values are replaced by the mean of their column
        computed over this chunk only, and every remaining column is reduced
        to its min, mean and max for each customer_ID.

        Returns a dataframe with one row per customer_ID present in the
        chunk; aggregated columns are named <column><statistic>
        (suffixes 'min', 'mean', 'max').
    '''
    # Data rows belonging to earlier chunks are skipped; row 0 (the header)
    # is always kept so that column names are available.
    rows_before_chunk = chunk_order * chunk_size
    frame = pd.read_csv(
        csv_file,
        skiprows=range(1, rows_before_chunk + 1),
        nrows=chunk_size,
    )

    # Remove the string columns (presumably the only non-numeric ones
    # besides customer_ID -- verify against the data).
    frame = frame.drop(columns=['S_2', 'D_63', 'D_64'])

    # Impute missing values with the per-column mean of this chunk.
    column_means = frame.drop(columns='customer_ID').mean()
    frame = frame.fillna(column_means)

    # One row per customer, three statistics per column.
    per_customer = frame.groupby('customer_ID').agg(['min', 'mean', 'max'])
    per_customer = per_customer.reset_index()

    # Flatten the two-level column labels: (column, statistic) -> column+statistic.
    # customer_ID has an empty second level, so its name is unchanged.
    per_customer.columns = [name + stat for name, stat in per_customer.columns]

    return per_customer

### Define chunk size

In [None]:
# Maximum number of CSV data rows loaded per chunk.
chunk_size = 1_000_000

### Train data

In [None]:
# Aggregate the train data chunk by chunk.
train_data_path = "/kaggle/input/amex-default-prediction/train_data.csv"

# count_rows() counts every line, header included, so the number of data
# rows is one less. Ceiling division yields exactly the number of chunks
# that contain data, which avoids requesting an extra, empty chunk when the
# row count is a multiple of chunk_size. At least one chunk is always read.
train_chunks_count = max(1, -(-(train_data_rows_count - 1) // chunk_size))

# NOTE(review): read_a_chunk() aggregates within a single chunk only, so a
# customer whose rows straddle a chunk boundary ends up with two partial
# rows in the result. This is not resolved here.

# Gather the per-chunk aggregates and concatenate them once: calling
# pd.concat inside the loop would re-copy the accumulated frame every time.
train_chunks = [
    read_a_chunk(train_data_path, chunk_size, i)
    for i in range(train_chunks_count)
]
train_data_groupped = pd.concat(train_chunks)

# Drop the intermediate list so the per-chunk frames can be reclaimed.
del train_chunks
gc.collect()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# it is called on its own line; wrapping it in print() would append a stray
# "None" after the label.
print('train_data_groupped info : ')
train_data_groupped.info()
print('shape of train_data_groupped : ', train_data_groupped.shape)

# Persist the aggregated data, then release the memory it occupies.
train_data_groupped.to_csv("/kaggle/working/train_data_groupped.csv", index=False)
del train_data_groupped
gc.collect()

### Do the same thing for test data

In [None]:
# Aggregate the test data chunk by chunk.
test_data_path = "/kaggle/input/amex-default-prediction/test_data.csv"

# count_rows() counts every line, header included, so the number of data
# rows is one less. Ceiling division yields exactly the number of chunks
# that contain data, which avoids requesting an extra, empty chunk when the
# row count is a multiple of chunk_size. At least one chunk is always read.
test_chunks_count = max(1, -(-(test_data_rows_count - 1) // chunk_size))

# NOTE(review): read_a_chunk() aggregates within a single chunk only, so a
# customer whose rows straddle a chunk boundary ends up with two partial
# rows in the result. This is not resolved here.

# Gather the per-chunk aggregates and concatenate them once: calling
# pd.concat inside the loop would re-copy the accumulated frame every time.
test_chunks = [
    read_a_chunk(test_data_path, chunk_size, i)
    for i in range(test_chunks_count)
]
test_data_groupped = pd.concat(test_chunks)

# Drop the intermediate list so the per-chunk frames can be reclaimed.
del test_chunks
gc.collect()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# it is called on its own line; wrapping it in print() would append a stray
# "None" after the label.
print('test_data_groupped info : ')
test_data_groupped.info()
print('shape of test_data_groupped : ', test_data_groupped.shape)

# Persist the aggregated data, then release the memory it occupies.
test_data_groupped.to_csv("/kaggle/working/test_data_groupped.csv", index=False)
del test_data_groupped
gc.collect()