# Preparing for Colab

In [1]:
# Colab-only setup: install the third-party packages this notebook relies on.
# `%pip` (instead of `!pip`) installs into the environment of the running kernel,
# which is not guaranteed when pip is launched through a subshell.
# The recorded run resolved catboost to 1.0.6; pin it (`catboost==1.0.6`) if you
# need to reproduce that environment exactly.
%pip install catboost
%pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 132 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from pathlib import Path

from google.colab import files

# Ask the user for their Kaggle API token; `uploaded` maps file name -> raw bytes.
uploaded = files.upload()

for fn, payload in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(payload)))

# Work out which upload is the token. The file is not always called exactly
# "kaggle.json" (a repeated upload may be saved under a different name such as
# "kaggle (1).json"), so fall back to the single uploaded file when there is only one.
if 'kaggle.json' in uploaded:
    token_name = 'kaggle.json'
elif len(uploaded) == 1:
    token_name = next(iter(uploaded))
else:
    raise RuntimeError(
        'Could not identify the Kaggle API token: upload a single kaggle.json file.')

# Install the token where the Kaggle CLI expects it, readable by the owner only.
kaggle_dir = Path.home() / '.kaggle'
kaggle_dir.mkdir(parents=True, exist_ok=True)
token_path = kaggle_dir / 'kaggle.json'
token_path.write_bytes(uploaded[token_name])
token_path.chmod(0o600)

# Do not leave a second copy of the credentials in the working directory.
local_copy = Path(token_name)
if local_copy.exists():
    local_copy.unlink()

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 62 bytes


In [3]:
# Fetch the sample submission from the competition and unpack it.
# `-o` overwrites an existing extract so re-running the cell never stops at
# unzip's interactive "replace?" prompt.
!kaggle competitions download -c amex-default-prediction -f sample_submission.csv
!unzip -o /content/sample_submission.csv.zip

Downloading sample_submission.csv.zip to /content
 77% 25.0M/32.4M [00:01<00:00, 17.6MB/s]
100% 32.4M/32.4M [00:01<00:00, 28.4MB/s]
Archive:  /content/sample_submission.csv.zip
  inflating: sample_submission.csv   


In [4]:
# Fetch the training labels from the competition and unpack them.
# `-o` overwrites an existing extract so re-running the cell never stops at
# unzip's interactive "replace?" prompt.
!kaggle competitions download -c amex-default-prediction -f train_labels.csv
!unzip -o /content/train_labels.csv.zip

Downloading train_labels.csv.zip to /content
 49% 8.00M/16.2M [00:00<00:00, 76.5MB/s]
100% 16.2M/16.2M [00:00<00:00, 113MB/s] 
Archive:  /content/train_labels.csv.zip
  inflating: train_labels.csv        


In [5]:
# Fetch the parquet version of the competition data (train.parquet / test.parquet)
# and unpack it. `-o` overwrites an existing extract so re-running the cell never
# stops at unzip's interactive "replace?" prompt.
!kaggle datasets download -d raddar/amex-data-integer-dtypes-parquet-format
!unzip -o /content/amex-data-integer-dtypes-parquet-format.zip

Downloading amex-data-integer-dtypes-parquet-format.zip to /content
100% 4.06G/4.07G [00:21<00:00, 230MB/s]
100% 4.07G/4.07G [00:21<00:00, 204MB/s]
Archive:  /content/amex-data-integer-dtypes-parquet-format.zip
  inflating: test.parquet            
  inflating: train.parquet           


# Create Dataset

In [10]:
# ====================================================
# Library
# ====================================================
import gc; gc.enable()
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

# ====================================================
# Read & preprocess data and save it to disk
# ====================================================
def _aggregate_by_customer(raw, num_features, cat_features):
    """Collapse the per-row frame ``raw`` into one feature row per ``customer_ID``.

    Parameters
    ----------
    raw : pd.DataFrame
        Frame containing a ``customer_ID`` column plus every column named in
        ``num_features`` and ``cat_features``.
    num_features : list of str
        Columns aggregated with first/mean/std/min/max/last.
    cat_features : list of str
        Columns aggregated with count/first/last/nunique.

    Returns
    -------
    pd.DataFrame
        ``customer_ID``, the ``<feature>_<stat>`` aggregates, and four "lag"
        columns per numeric feature (max vs. min and last vs. first, each as a
        difference ``_lag_sub`` and a ratio ``_lag_div``).
    """
    num_agg = raw.groupby("customer_ID")[num_features].agg(['first', 'mean', 'std', 'min', 'max', 'last'])
    # .agg with a list yields (feature, stat) MultiIndex columns; flatten to 'feature_stat'.
    num_agg.columns = ['_'.join(pair) for pair in num_agg.columns]
    num_agg = num_agg.reset_index()

    # Lag features. Column pairs are built from the feature names instead of
    # substring-matching 'last'/'max' against every column name, so the result
    # does not depend on what the raw features happen to be called.
    # Per feature the max/min pair comes before the last/first pair, which keeps
    # the column order of the saved files unchanged.
    # A zero in the denominator column yields inf/NaN in `_lag_div`; that is left as is.
    # NOTE(review): if the source parquet stores narrow integer dtypes, `_lag_sub`
    # is computed in that dtype -- confirm the differences cannot overflow.
    lag_columns = {}
    for feature in num_features:
        for later_stat, earlier_stat in (('max', 'min'), ('last', 'first')):
            later_col = f'{feature}_{later_stat}'
            earlier_col = f'{feature}_{earlier_stat}'
            lag_columns[later_col + '_lag_sub'] = num_agg[later_col] - num_agg[earlier_col]
            lag_columns[later_col + '_lag_div'] = num_agg[later_col] / num_agg[earlier_col]
    # One concat instead of hundreds of single-column inserts (avoids frame fragmentation).
    num_agg = pd.concat([num_agg, pd.DataFrame(lag_columns)], axis=1)

    cat_agg = raw.groupby("customer_ID")[cat_features].agg(['count', 'first', 'last', 'nunique'])
    cat_agg.columns = ['_'.join(pair) for pair in cat_agg.columns]
    cat_agg = cat_agg.reset_index()

    return num_agg.merge(cat_agg, how='inner', on='customer_ID')


def read_preprocess_data(train_path='/content/train.parquet',
                         test_path='/content/test.parquet',
                         labels_path='/content/train_labels.csv',
                         train_out_path='train_fe_plus_plus.parquet',
                         test_out_path='test_fe_plus_plus.parquet'):
    """Build per-customer aggregate features for train and test and save them as parquet.

    The train frame is read from ``train_path``, aggregated per ``customer_ID``,
    joined with the labels read from ``labels_path`` and written to
    ``train_out_path``. The test frame goes through the same aggregation
    (without labels) and is written to ``test_out_path``. Train and test are
    processed one after the other so that only one of them is held in memory
    at a time.

    All parameters default to the paths used by this notebook, so calling the
    function without arguments behaves as before. Returns ``None``; the results
    are the two files written to disk and the shapes printed along the way.
    """
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]

    # Train FE
    train_raw = pd.read_parquet(train_path)
    # Every column except the customer key and the S_2 column is a feature;
    # read the names off the column index rather than copying the frame with .drop().
    features = [col for col in train_raw.columns if col not in ('customer_ID', 'S_2')]
    num_features = [col for col in features if col not in cat_features]

    print('Starting train feature extraction')
    train = _aggregate_by_customer(train_raw, num_features, cat_features)
    del train_raw
    gc.collect()

    train_labels = pd.read_csv(labels_path)
    train = train.merge(train_labels, how='inner', on='customer_ID')
    print('Train shape: ', train.shape)

    train.to_parquet(train_out_path)
    del train, train_labels
    gc.collect()

    # Test FE (same feature lists as train, so both files share their feature columns)
    test_raw = pd.read_parquet(test_path)
    print('Starting test feature extraction')
    test = _aggregate_by_customer(test_raw, num_features, cat_features)
    del test_raw
    gc.collect()
    print('Test shape: ', test.shape)

    # Save files to disk
    test.to_parquet(test_out_path)
    del test
    gc.collect()
    
# Run the whole pipeline: read the raw train/test data, build the per-customer
# features, and write train_fe_plus_plus.parquet and test_fe_plus_plus.parquet
# to the current working directory.
read_preprocess_data()

Starting train feature extraction
Train shape:  (458913, 1816)
Starting test feature extraction
Test shape:  (924621, 1815)


In [11]:
# Create the folder that will be uploaded as a Kaggle dataset.
# `-p` makes the cell safe to re-run, and the absolute path matches the
# /content/amex-fe-plus path used by the cells that follow.
!mkdir -p /content/amex-fe-plus

In [12]:
# Move both engineered feature files into the dataset folder so they are
# included in the Kaggle upload.
!mv /content/train_fe_plus_plus.parquet /content/amex-fe-plus/
!mv /content/test_fe_plus_plus.parquet /content/amex-fe-plus/

In [13]:
# Write a template dataset-metadata.json into the dataset folder; the next cell
# overwrites it with the real title, id and license.
!kaggle datasets init -p /content/amex-fe-plus

Data package template written to: /content/amex-fe-plus/dataset-metadata.json


In [16]:
%%writefile /content/amex-fe-plus/dataset-metadata.json
{
  "title": "Amex-FE-Plus",
  "id": "ryuina/amex-fe-plus",
  "licenses": [
    {
      "name": "CC0-1.0"
    }
  ]
}

Overwriting /content/amex-fe-plus/dataset-metadata.json


In [17]:
# Upload the folder contents as a new version of the dataset identified in
# dataset-metadata.json, using the given version message.
# NOTE(review): `datasets version` presumably requires the dataset to exist already;
# a first-time upload would need `kaggle datasets create` instead -- confirm.
!kaggle datasets version -p /content/amex-fe-plus -m "added max and min lags"

Starting upload for file train_fe_plus_plus.parquet
100% 1.84G/1.84G [00:28<00:00, 70.2MB/s]
Upload successful: train_fe_plus_plus.parquet (2GB)
Starting upload for file test_fe_plus_plus.parquet
100% 3.36G/3.36G [00:45<00:00, 78.9MB/s]
Upload successful: test_fe_plus_plus.parquet (3GB)
Dataset version is being created. Please check progress at https://www.kaggle.com/ryuina/amex-fe-plus
