In [1]:
import os, tqdm, json, pickle, gc, zipfile, itertools, time, collections, sys, requests, schedule
import pandas as pd
import numpy as np
from dateutil import parser
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
from collections import defaultdict
from datetime import datetime
from sklearn.model_selection import train_test_split
import catboost as cb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error
import catboost as cb
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.inspection import permutation_importance

import matplotlib.pyplot as plt
import seaborn as sns

def amex_metric_official(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

def get_metrics(model, X_eval, y_eval):

    """ calulate metrics """

    pred = model.predict(X_eval)
    pred_prob = model.predict_proba(X_eval)[:, 1]

    d= {}

    d['accuracy'] = accuracy_score(y_eval, pred)
    d['f1'] = f1_score(y_eval, pred)
    d['auc'] = roc_auc_score(y_eval, pred_prob)

    y_true, y_predprob = y_eval.to_frame('target'), pd.Series(pred_prob, index = y_eval.index).to_frame('prediction')
    d['amex_metric_official'] = amex_metric_official(y_true, y_predprob)

    d['tp'] = ((y_eval==1)&(pred==1)).sum()
    d['tn'] = ((y_eval==0)&(pred==0)).sum()
    d['fp'] = ((y_eval==0)&(pred==1)).sum()
    d['fn'] = ((y_eval==1)&(pred==0)).sum()

    d['importances'] = ser_imp = pd.Series(dict(zip(X_eval.columns, model.feature_importances_))).sort_values(ascending = False)

    return d

def eval_catboost(X_train, y_train, X_eval, y_eval, verbose):

    """ evaluate model """

    cat_features = np.where(X_train.dtypes=='category')[0]

    params_c = {}
    params_c['cat_features'] = cat_features
    params_c['od_type'] = 'Iter'
    params_c['od_wait'] = 20   
    params_c['verbose'] = verbose

    model = cb.CatBoostClassifier(**params_c)
    model.fit(X_train, y_train, eval_set = (X_eval, y_eval))    
    best_iter = model.best_iteration_
    return model, best_iter, get_metrics(model, X_eval, y_eval) 

def get_catboost_eval_results(X_train, y_train, X_eval, y_eval):

    """ feature selection and eval results """

    i=1
    while True:

        print('> iter#{}. n_features: {}'.format(i, X_train.shape[1]))

        model, best_iter, d_eval_results = eval_catboost(X_train, y_train, X_eval, y_eval,verbose=1)

        mask = model.feature_importances_>0

        if np.all(mask):
            break
        else:
            best_features = X_train.columns[mask]
            X_train, X_eval = X_train[best_features], X_eval[best_features]
            i+=1
    i = 1       
    while True:

        L_perm_imp = []
        for _ in tqdm.tqdm(range(10)):
            d_perm_imp = permutation_importance(model, X_eval, y_eval, scoring='roc_auc', n_jobs=-1, n_repeats = 1, random_state = _)    
            L_perm_imp.append(d_perm_imp['importances_mean'])
        mean_perm_imp = np.c_[L_perm_imp].mean(0)

        mask = mean_perm_imp>0

        if np.all(mask):
            break
        else:
            best_features = X_train.columns[mask]
            X_train, X_eval = X_train[best_features], X_eval[best_features]
            model, best_iter, d_eval_results = eval_catboost(X_train, y_train, X_eval, y_eval,verbose=0)
            i+=1

    return {
        'iterations':best_iter,
        'X_train':X_train, 'X_eval':X_eval,
        'y_train':y_train, 'y_eval':y_eval,
        'eval_results':d_eval_results
    }

def aggregate_customer(subdf):

    """ 
    aggregates customer:
        1. write every activity
        2. aggregate all activity
        3. aggregate activity history
    """
    
    subdf = subdf.sort_values(DATE_KEY, ascending = False)

    # features
    d = {}
    d[ID_KEY] = customer_ID
    d['count'] = len(subdf)
    d.update(subdf[CAT_FEATURES].iloc[0].to_dict())
    d.update(subdf.add_suffix(f'__nan_rate').isna().mean())

    # every activity
    for i in range(len(subdf)):
        d.update(subdf[DATE_KEYS].add_suffix(f'__last_{i+1}').iloc[i].to_dict())
        d.update(subdf[NUM_FEATURES].add_suffix(f'__last_{i+1}').iloc[i].to_dict())

    # aggregate all activities
    subdf = subdf.sort_values(DATE_KEY)
    d.update(subdf[NUM_FEATURES].add_suffix('__mean').mean().to_dict())
    d.update(subdf[NUM_FEATURES].add_suffix('__sum').mean().to_dict())
    d.update(subdf[NUM_FEATURES].fillna(0).add_suffix('__diff_mean').diff().mean().to_dict())    

    # for by_key in DATE_KEYS[1:]:
    #     for by_value, subsubdf in subdf.groupby(by_key):
    #         d.update(subsubdf[NUM_FEATURES].add_suffix(f'{by_key}_{by_value}__mean').mean().to_dict())
    #         d.update(subsubdf[NUM_FEATURES].add_suffix(f'{by_key}_{by_value}__sum').mean().to_dict())
    #         d.update(subsubdf[NUM_FEATURES].fillna(0).add_suffix(f'{by_key}_{by_value}__diff_mean').diff().mean().to_dict())    


    # aggregate history for every activity
    d['duraion'] = (subdf['timestamp'].max() - subdf['timestamp'].min())
    for i in range(len(subdf)):
        curr = subdf.iloc[i]
        hist = subdf.iloc[:i]
        try:
            d.update(hist[NUM_FEATURES].add_suffix(f'__history_{i+1}__mean').mean().to_dict())
            d.update(hist[NUM_FEATURES].add_suffix(f'__history_{i+1}__sum').mean().to_dict())
            d.update(hist[NUM_FEATURES].add_suffix(f'__history_{i+1}__diff_mean').diff().mean().to_dict())
        except:
            pass
    
    return d

def run_ml_pipeline(df_data_train):

    # features
    X = df_data_train.set_index(ID_KEY).drop(TARGET_KEY, 1)
    obj_keys = X.select_dtypes('object').columns
    X[obj_keys] = X[obj_keys].fillna('default').astype('category')

    # target
    y = df_data_train.set_index(ID_KEY)[TARGET_KEY].astype('int')

    # split
    X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size = TEST_SIZE, shuffle = True, random_state = SEED)
    del X, y
    gc.collect()

    # scale
    num_features= X_train.select_dtypes('number').columns
    scaler = MinMaxScaler()
    scaler.fit(X_train[num_features])
    
    X_train[num_features] = scaler.transform(X_train[num_features])
    X_eval[num_features] = scaler.transform(X_eval[num_features])

    # evaluate
    d_eval_results = get_catboost_eval_results(X_train, y_train, X_eval, y_eval)

    return d_eval_results

In [2]:
# dir with data
PATH_TO_DATA= 'data'

# seed
SEED = 13

# target key
TARGET_KEY='target'

# customer id key
ID_KEY = 'customer_ID'

# split
TEST_SIZE = .1

# date key
DATE_KEY = 'S_2'
DATE_KEYS = ['timestamp', 'year','month','day','weekday']

# categories
CAT_FEATURES = [
    'D_63', 'D_64', 'D_66', 'D_68', 'B_30', 'B_38', 'D_114', 'D_116',
    'D_117', 'D_120', 'D_126'
]

# numbers
NUM_FEATURES = [
    'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42', 'D_43', 'D_44', 'B_4', 'D_45', 'B_5',
    'R_2', 'D_46', 'D_47', 'D_48', 'D_49', 'B_6', 'B_7', 'B_8', 'D_50', 'D_51', 'B_9', 'R_3', 'D_52', 'P_3',
    'B_10', 'D_53', 'S_5', 'B_11', 'S_6', 'D_54', 'R_4', 'S_7', 'B_12', 'S_8', 'D_55', 'D_56', 'B_13', 'R_5',
    'D_58', 'S_9', 'B_14', 'D_59', 'D_60', 'D_61', 'B_15', 'S_11', 'D_62', 'D_65', 'B_16', 'B_17', 'B_18', 'B_19',
    'B_20', 'S_12', 'R_6', 'S_13', 'B_21', 'D_69', 'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'B_23', 'D_73', 'P_4',
    'D_74', 'D_75', 'D_76', 'B_24', 'R_7', 'D_77', 'B_25', 'B_26', 'D_78', 'D_79', 'R_8', 'R_9', 'S_16', 'D_80', 
    'R_10', 'R_11', 'B_27', 'D_81', 'D_82', 'S_17', 'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15', 'D_84', 'R_16',
    'B_29', 'S_18', 'D_86', 'D_87', 'R_17', 'R_18', 'D_88', 'B_31', 'S_19', 'R_19', 'B_32', 'S_20', 'R_20', 'R_21',
    'B_33', 'D_89', 'R_22', 'R_23', 'D_91', 'D_92', 'D_93', 'D_94', 'R_24', 'R_25', 'D_96', 'S_22', 'S_23', 'S_24',
    'S_25', 'S_26', 'D_102', 'D_103', 'D_104', 'D_105', 'D_106', 'D_107', 'B_36', 'B_37', 'R_26', 'R_27', 'D_108',
    'D_109', 'D_110', 'D_111', 'B_39', 'D_112', 'B_40', 'S_27', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122',
    'D_123', 'D_124', 'D_125', 'D_127', 'D_128', 'D_129', 'B_41', 'B_42', 'D_130', 'D_131', 'D_132', 'D_133', 'R_28',
    'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143', 'D_144', 'D_145'
    
]

# percentiles 
PS = np.linspace(2.5, 97.5, 10)

In [3]:
df_train = pd.read_feather(os.path.join(PATH_TO_DATA, 'train_data.ftr'))

In [4]:
# date
df_train['year'] = df_train[DATE_KEY].dt.year
df_train['month'] =df_train[DATE_KEY].dt.month
df_train['day'] =df_train[DATE_KEY].dt.day
df_train['weekday'] =df_train[DATE_KEY].dt.weekday

# timestamp
df_train['timestamp'] = df_train[DATE_KEY].map(lambda row: row.timestamp())

In [5]:
# unique customers
UNIQUE_ID_TRAIN = df_train[ID_KEY].unique()

In [18]:
# number of repeats
N_ITERATIONS = 5
# amount of samples per repeat
SAMPLE_SIZE = 1000

# results
L_SAMPLE_RESULTS = []

# for iterations
for _ in range(N_ITERATIONS):

    print('> ITER #{}/{}'.format(_+1, N_ITERATIONS))

    # sample
    np.random.seed(SEED+_)
    id2use= np.random.choice(UNIQUE_ID_TRAIN, SAMPLE_SIZE)

    # data table
    df_data_train = pd.DataFrame()

    # add features for every customer
    for customer_ID in tqdm.tqdm(id2use):

        # customer table
        subdf = df_train.query('customer_ID==@customer_ID')
        
        # aggregations
        d = aggregate_customer(subdf)      

        # target
        ser_target = df_train.query('customer_ID==@customer_ID')['target']
        assert ser_target.nunique()==1
        d['target'] = ser_target.iloc[0]
        
        # add
        df_data_train=df_data_train.append(d, ignore_index = True)  
        del d

    # ml pipeline
    d_eval_results = run_ml_pipeline(df_data_train)

    # collect result
    L_SAMPLE_RESULTS.append((id2use, d_eval_results))

    del d_eval_results
    gc.collect()

with open('L_SAMPLE_RESULTS.pickle', 'wb') as f:
    pickle.dump(L_SAMPLE_RESULTS, f)

In [11]:
metrics, imps = {}, []
for i, l in enumerate(L_SAMPLE_RESULTS):
    r = l[1]['eval_results']
    metric, imp = r['amex_metric_official'], r['importances']
    imps.append(imp.to_frame(f'v{i+1}'))
    metrics[f'v{i+1}'] = metric

df_importances = pd.concat(imps, 1)


In [16]:
pd.Series(metrics).to_frame('metric').T

Unnamed: 0,v1,v2,v3,v4
metric,0.897061,0.84613,0.784509,0.927039


In [17]:
df_importances

Unnamed: 0,v1,v2,v3,v4
P_2__last_1,13.427694,27.111435,16.283873,14.217815
B_37__last_2,7.227109,,,
P_2__last_2,6.478304,,,
B_1__last_1,6.322912,12.954312,12.659972,
D_61__last_6,6.30375,,,
R_3__last_2,5.638317,,,
P_2__sum,5.468804,,,
D_41__diff_mean,5.266093,,5.149191,
D_65__mean,4.001682,,,
S_3__mean,3.881935,,,


In [42]:
PATH_TO_TRAIN_FEATURES = 'features_train'

in_filenames = os.listdir(PATH_TO_TRAIN_FEATURES)
all_filenames = ['{}.pickle'.format(x) for x in UNIQUE_ID_TRAIN]
new_filenames = list(set(all_filenames)-set(in_filenames))

assert len(in_filenames)+len(new_filenames) == len(all_filenames)

NEW_ID_TRAIN = [fnm.split('.')[0] for fnm in new_filenames]

# add features for every customer
for customer_ID in tqdm.tqdm_notebook(NEW_ID_TRAIN):

    # filename
    fnm= '{}.pickle'.format(customer_ID)

    # path to out file
    pth = os.path.join(PATH_TO_TRAIN_FEATURES, fnm)

    # customer table
    subdf = df_train.query('customer_ID==@customer_ID')
    
    # aggregations
    d = aggregate_customer(subdf)      

    # target
    ser_target = df_train.query('customer_ID==@customer_ID')['target']
    assert ser_target.nunique()==1
    d['target'] = ser_target.iloc[0]
    
    with open(pth, 'wb') as f:
        pickle.dump(d, f)
    del d
    gc.collect()

  0%|          | 0/456997 [00:00<?, ?it/s]

In [20]:
with open(pth, 'rb') as f:
    d = pickle.load(f)

In [21]:
d

{'customer_ID': '00f818189762974961f3789949feb68a2254132d8f63e88c9e2d6d62b4094c24',
 'count': 13,
 'D_63': 'CO',
 'D_64': 'U',
 'D_66': nan,
 'D_68': 5.0,
 'B_30': 1.0,
 'B_38': 7.0,
 'D_114': 1.0,
 'D_116': 0.0,
 'D_117': -1.0,
 'D_120': 0.0,
 'D_126': 1.0,
 'customer_ID__nan_rate': 0.0,
 'S_2__nan_rate': 0.0,
 'P_2__nan_rate': 0.0,
 'D_39__nan_rate': 0.0,
 'B_1__nan_rate': 0.0,
 'B_2__nan_rate': 0.0,
 'R_1__nan_rate': 0.0,
 'S_3__nan_rate': 0.3076923076923077,
 'D_41__nan_rate': 0.0,
 'B_3__nan_rate': 0.0,
 'D_42__nan_rate': 1.0,
 'D_43__nan_rate': 0.0,
 'D_44__nan_rate': 0.0,
 'B_4__nan_rate': 0.0,
 'D_45__nan_rate': 0.0,
 'B_5__nan_rate': 0.0,
 'R_2__nan_rate': 0.0,
 'D_46__nan_rate': 0.0,
 'D_47__nan_rate': 0.0,
 'D_48__nan_rate': 0.0,
 'D_49__nan_rate': 1.0,
 'B_6__nan_rate': 0.0,
 'B_7__nan_rate': 0.0,
 'B_8__nan_rate': 0.0,
 'D_50__nan_rate': 0.0,
 'D_51__nan_rate': 0.0,
 'B_9__nan_rate': 0.0,
 'R_3__nan_rate': 0.0,
 'D_52__nan_rate': 0.0,
 'P_3__nan_rate': 0.0,
 'B_10__nan_rat