In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


In [2]:
from catboost import CatBoostClassifier, Pool, cv, to_classifier
from tqdm.notebook import tqdm

In [3]:
batch_size = 100_000

def add_dt_columns(df):
    
    datetime_ = pd.to_datetime(df['S_2']).dt
    
    df['date'] = datetime_.year.values * 10_000 + datetime_.month.values * 100 + datetime_.day.values
    df['month'] = datetime_.month.values
    df['year'] = datetime_.year.values
    df['day'] = datetime_.day.values
    
    df['date_diff'] = df.groupby('customer_ID')['date'].transform(np.max) - df.groupby('customer_ID')['date'].transform(np.min)

    df.drop('S_2', axis=1, inplace=True)

def preprocess(df, labels = {}, csv_name='preprocessed.csv', isFirst=True):
    
    add_dt_columns(df)
    df['count'] = df.groupby('customer_ID')['date'].transform('count')
    
    df = df.drop([k for k, v in df.dtypes.to_dict().items() if v == 'object'], axis = 1).round(4).astype(np.float32)

    if len(labels) > 0:
        df['label'] = df.reset_index().customer_ID.map(labels).fillna(0).values
        df['label'] = df['label'].astype(int)
        df = df.reset_index().drop('customer_ID', axis = 1)

    #     df = df.reset_index().groupby('customer_ID').agg([np.min, np.max, np.std, np.mean])
    
        df = df[['label'] + [c for c in df.columns if c != 'label']]
    else:
        return df
    
    if isFirst:
        df.to_csv(csv_name, header=False, index=False)
    else:
        df.to_csv('preprocessed.csv', mode='a', header=False, index=False)

def load_preprocess():
    train = pd.read_csv(
        "/kaggle/input/amex-default-prediction/train_data.csv", 
        chunksize=batch_size,
#         nrows=10_000,
        index_col='customer_ID'
    )
    train_labels = pd.read_csv(
        "/kaggle/input/amex-default-prediction/train_labels.csv", 
        index_col='customer_ID'
    )
    
    train_labels = {k:v for k, v in train_labels['target'].to_dict().items() if v > 0}
    
    result = []
    
    isFirst = True
    for train_ in tqdm(train, total = int(5.53 * 1e6) // batch_size + 1):
        preprocess(train_, train_labels, isFirst=isFirst)
        isFirst=False
        

In [4]:

class_weights = {0.0: 20.0, 1.0: 1.0}


def get_train_val():

    pool = Pool('preprocessed.csv', delimiter=',')
    
    return pool



In [None]:
params = {"iterations": 4_000,
          "depth": 8,
          "loss_function": "Logloss",
          "verbose": 100,
          "learning_rate": 0.1,
          "scale_pos_weight": 10.0,
#           "task_type": "GPU"
         }




def produce_cat():
    
    load_preprocess()
    print("load done")
    cv_dataset = get_train_val()
    print("split done")
    
    
    scores, models = cv(cv_dataset,
            params,
            fold_count=3, 
            return_models=True
    )

    
    return scores, [to_classifier(m) for m in models]

scores, models = produce_cat()

  0%|          | 0/56 [00:00<?, ?it/s]

load done
split done
Training on fold [0/3]
0:	learn: 0.5910838	test: 0.5910592	best: 0.5910592 (0)	total: 4.78s	remaining: 5h 18m 40s
100:	learn: 0.2368418	test: 0.2373635	best: 0.2373635 (100)	total: 5m 36s	remaining: 3h 36m 12s
200:	learn: 0.2302953	test: 0.2316399	best: 0.2316399 (200)	total: 10m 41s	remaining: 3h 21m 58s
300:	learn: 0.2255541	test: 0.2278982	best: 0.2278982 (300)	total: 15m 57s	remaining: 3h 16m 5s
400:	learn: 0.2218921	test: 0.2251894	best: 0.2251894 (400)	total: 21m 9s	remaining: 3h 9m 51s


In [None]:
result = []


n_iter = 11363761 // batch_size + 1
test_iterator = pd.read_csv("/kaggle/input/amex-default-prediction/test_data.csv", chunksize=batch_size, index_col='customer_ID')

for test in tqdm(test_iterator, total=n_iter):
    
    test_full = preprocess(test)
    test_full['prediction'] = 0
    
    for cat in models:
        test_full['prediction'] += cat.predict_proba(test_full.values)[:,1]
    
    result.append(test_full['prediction'] / len(models))


In [None]:
sub = pd.DataFrame(pd.concat(result, copy=False)).reset_index()

In [None]:
sub.info()

In [None]:
sub = sub.groupby('customer_ID').max().reset_index()

In [None]:
sub.info()

In [None]:
sub.head()

In [None]:
sub.to_csv('submission.csv', index=False)