<a href="https://colab.research.google.com/github/sankardevisharath/amex-default-prediction/blob/master/notebooks/split_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Split Dataset into Multiple Files



## Load Data From Google Drive

In [1]:
# Mount Google Drive at /content/drive; the competition archive is copied from there later.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%mkdir data
%cd data
%mkdir raw
%cd raw

/content/data
/content/data/raw


In [3]:
# Copy the competition archive from the mounted Drive into the current directory.
!cp /content/drive/MyDrive/amex-default-prediction/data/raw/amex-default-prediction.zip .

In [4]:
# Extract only train_data.csv from the archive.
!unzip amex-default-prediction.zip train_data.csv

Archive:  amex-default-prediction.zip
  inflating: train_data.csv          


In [5]:
# Extract only train_labels.csv from the archive.
!unzip amex-default-prediction.zip train_labels.csv

Archive:  amex-default-prediction.zip
  inflating: train_labels.csv        


## Setup Environment

In [9]:
!pip install dask[dataframe]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
import gc

import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

import dask
import dask.dataframe as dd

In [11]:
# Absolute locations of the two CSV files extracted from the competition archive.
TRAIN_DATA_PATH = '/content/data/raw/train_data.csv'
TRAIN_LABELS_PATH = '/content/data/raw/train_labels.csv'

In [12]:
def read_cols(cols):
  """Load selected columns of the training CSV and attach the labels.

  Args:
    cols: Column selection forwarded to `pd.read_csv` as `usecols`.

  Returns:
    The frame produced by `append_label` for the loaded columns.
  """
  return append_label(pd.read_csv(TRAIN_DATA_PATH, usecols=cols))

def append_label(source_df):
  """Inner-join `source_df` with the module-level `train_labels` frame.

  No join key is passed, so pandas joins on the columns both frames share.

  Args:
    source_df: Frame to merge the labels into.

  Returns:
    The merged DataFrame.
  """
  labelled_df = pd.merge(source_df, train_labels, how='inner')
  return labelled_df

## Load Train Label

In [13]:
# Load the labels CSV; `append_label` reads this frame as a global.
train_labels = pd.read_csv(TRAIN_LABELS_PATH)

In [14]:
# Show column names, non-null counts, dtypes and memory usage of the labels frame.
train_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_ID  458913 non-null  object
 1   target       458913 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 7.0+ MB


## Load Minimal Columns

In [16]:
# Read just the customer key and the statement date column from the training CSV.
cust_id_stmt_date_df = pd.read_csv(
    TRAIN_DATA_PATH,
    usecols=['customer_ID', 'S_2'],
)
print(f'Total number of rows in the dataset is {len(cust_id_stmt_date_df)}')

In [18]:
# Distinct customer IDs as a plain Python list; later cells slice it to pick customer chunks.
customers = (
    cust_id_stmt_date_df['customer_ID']
    .unique()
    .tolist()
)
print(f'Total number of unique customers is {len(customers)}')

Total number of unique customers is 458913


In [21]:
# Convert the statement date column to datetimes so the `.dt` accessor can be used on it later.
cust_id_stmt_date_df = cust_id_stmt_date_df.assign(
    S_2=pd.to_datetime(cust_id_stmt_date_df['S_2'])
)

## Split Data Customer Wise

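Read the data customer-wise, one Dask partition at a time, and save the result in Parquet format to Google Drive. The cells below write the rows of the first 100,000 customers to `data_1.parquet`.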

In [20]:
# Open the full training CSV as a Dask dataframe; the loops below materialise it
# one partition at a time via `df.partitions[i].compute()`.
df = dd.read_csv(TRAIN_DATA_PATH)
print(f'Total number of partitions in the Dask dataframe is {df.npartitions}')

Total number of partitions in the Dask dataframe is 257


In [None]:
# Collect the rows of the first 100,000 customers, one Dask partition at a time.
# Matching rows are gathered in a list and concatenated once at the end:
# `DataFrame.append` was removed in pandas 2.0, and it re-copied every row
# collected so far on each iteration.
first_chunk_customers = customers[0:100000]  # loop-invariant, so sliced only once
matched_parts = []
rows_so_far = 0
for i in range(df.npartitions):
  print(i)
  ddf1 = df.partitions[i].compute()
  matched = ddf1[ddf1.customer_ID.isin(first_chunk_customers)]
  matched_parts.append(matched)
  rows_so_far += len(matched)
  # Same progress output as before: the shape the combined frame has at this point.
  print((rows_so_far, matched.shape[1]))

# `pd.concat` rejects an empty list, so fall back to an empty frame when there were no partitions.
rdf = pd.concat(matched_parts, ignore_index=True) if matched_parts else pd.DataFrame()
del matched_parts  # release the per-partition pieces now that `rdf` holds the rows

In [None]:
# Summarise the customer-wise subset: row count, column count, dtypes and memory usage.
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1326261 entries, 0 to 1326260
Columns: 190 entries, customer_ID to D_145
dtypes: float64(185), int64(1), object(4)
memory usage: 1.9+ GB


In [None]:
# Write the subset to a parquet file; the relative path resolves against the current working directory.
rdf.to_parquet(path='data_1.parquet')

In [None]:
# Copy the parquet file to the mounted Drive.
!cp data_1.parquet /content/drive/MyDrive/amex-default-prediction/data/raw/

## Split Data Month Wise

In [30]:
# Distinct statement months as strings; requires `S_2` to have been converted to datetimes already.
periods = list(
    cust_id_stmt_date_df['S_2']
    .dt.to_period('M')
    .unique()
    .astype(str)
)

In [None]:
# Write one parquet file per statement month, named `data_<period>.parquet`.
# Matching rows are collected in a list and concatenated once per month:
# `DataFrame.append` was removed in pandas 2.0, and it re-copied the
# accumulated frame on every iteration.
# NOTE: each period still scans every partition of the CSV, so only one month
# of rows is held in memory at a time, at the cost of re-reading the file per period.
for period in periods:
  print(period)
  month_parts = []
  rows_so_far = 0
  for i in range(df.npartitions):
    ddf1 = df.partitions[i].compute()
    ddf1['S_2'] = pd.to_datetime(ddf1["S_2"])
    matched = ddf1[ddf1.S_2.dt.to_period('M') == period]
    month_parts.append(matched)
    rows_so_far += len(matched)
    if i % 30 == 0:
      print(i)
      # Same progress output as before: the shape the combined frame has at this point.
      print((rows_so_far, matched.shape[1]))
  # `pd.concat` rejects an empty list, so fall back to an empty frame when there were no partitions.
  rdf = pd.concat(month_parts, ignore_index=True) if month_parts else pd.DataFrame()
  rdf.to_parquet(path='data_' + period + '.parquet')
  # Drop this month's rows before starting the next period to keep peak memory low.
  del month_parts, rdf
  gc.collect()


In [36]:
# Copy every data_*.parquet file in the raw data directory to the mounted Drive.
!cp /content/data/raw/data_*.parquet /content/drive/MyDrive/amex-default-prediction/data/raw/