# https://www.kaggle.com/lonnieqin/ubiquant-market-prediction-with-dnn/notebook

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import gc
import random
import copy
import gc
from collections import Counter, defaultdict
from scipy import stats

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import mean_squared_error

from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Input files as laid out in the Kaggle runtime.
TRAIN_CSV_PATH = '../input/ubiquant-market-prediction/train.csv'
TRAIN_PKL_PATH = '../input/ump-train-picklefile/train.pkl'
EXP_TEST_PATH = '../input/ubiquant-market-prediction/example_test.csv'
EXP_SUB_PATH = '../input/ubiquant-market-prediction/example_sample_submission.csv'

# Column groups: the id column followed by the 300 anonymised features f_0..f_299.
CAT_COLS = ['investment_id']
USE_COLS = ['investment_id', *(f'f_{i}' for i in range(300))]
TARGET_COLS = 'target'  # a single column name (string), despite the plural

In [None]:
def transform_csv2pickle(csv_path, pkl_path):
    '''Read the training CSV with compact dtypes and save it as a pickle.

    Only row_id, time_id, investment_id, target and f_0..f_299 are read;
    any other column in the CSV is ignored.

    params:
        csv_path: path of the CSV file to read
        pkl_path: path of the pickle file to write
    '''
    feature_names = [f'f_{i}' for i in range(300)]

    # One mapping serves both as the column selection and as the dtype spec.
    column_dtypes = {
        'row_id': 'str',
        'time_id': 'uint16',
        'investment_id': 'uint16',
        'target': 'float32',
        **{name: 'float32' for name in feature_names},
    }

    frame = pd.read_csv(csv_path, usecols=list(column_dtypes), dtype=column_dtypes)
    frame.to_pickle(pkl_path)


def reduce_mem_usage(df):
    """Downcast the columns of a dataframe in place to reduce memory usage.

    - object columns are converted to 'category';
    - signed integer columns become the smallest signed integer type that
      holds their [min, max] range; unsigned columns stay unsigned and become
      the smallest unsigned type. Integers are never converted to floats, so
      identifier columns keep their exact values;
    - float columns become float16 if their range fits, else float32, else
      float64;
    - every other dtype (bool, datetime, categorical, pandas extension
      dtypes, ...) is left untouched.

    NOTE: float16 keeps only about three significant decimal digits, so the
    float downcast trades precision for memory.

    Prints the memory usage before and after, and returns the same dataframe
    object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue
        if not isinstance(col_type, np.dtype):
            # pandas extension dtypes (category, nullable ints, strings, ...)
            continue

        if col_type.kind in ('i', 'u'):
            if df[col].empty:
                continue  # no values, so no range to fit
            # Plain Python ints compare safely against every iinfo bound.
            c_min = int(df[col].min())
            c_max = int(df[col].max())
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            # NaN/inf extremes fail both range checks and fall back to float64.
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
    
def fix_seed(seed = 2022):
    '''Seed Python's `random`, NumPy's global RNG, and export PYTHONHASHSEED.

    params:
        seed: value handed to both seeders and written, as a string, to the
              PYTHONHASHSEED environment variable
    '''
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # NOTE(review): only these two RNGs are seeded here; nothing in this
    # function touches TensorFlow's random state.
    os.environ["PYTHONHASHSEED"] = str(seed)

In [None]:
def stratified_group_k_fold(y, groups, k, seed=None):
    '''
    Yield (train_indices, test_indices) for k folds that keep groups intact.

    Every group (here: a time_id) is assigned as a whole to the test set of
    exactly one fold. Groups are placed greedily so that each label (here:
    the target bin) ends up spread as evenly as possible over the k test
    sets, i.e. the folds get similar label distributions.

    params:
        y: integer label of every sample, values in 0..max(y)
        groups: group key of every sample, same length as y
        k: number of folds
        seed: seed of the shuffle that decides the order of equally spread
              groups

    yields:
        (train_indices, test_indices): two lists of positional sample indices
    '''
    labels_num = np.max(y) + 1 # number of distinct label slots
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1 # samples per (group, label)
        y_distr[label] += 1 # samples per label over the whole data set

    # Labels in 0..max(y) that never occur (e.g. an empty bin) must be left
    # out of the score: dividing by their zero total gives NaN, a NaN score
    # never compares as smaller, and every group would then land in fold 0.
    present_labels = [label for label in range(labels_num) if y_distr[label] > 0]

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        # Score the assignment "put this group into `fold`": tentatively add
        # its counts, measure how unevenly each label is spread over the
        # folds, then undo the tentative add.
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in present_labels:
            # share of this label's samples held by each fold -> std across folds
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label) # lower means a more even spread

    # [(group, per-label counts), ...], shuffled so that ties in the sort
    # below are broken by `seed` (sorted() is stable)
    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    # Place the groups with the most uneven label counts first.
    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        # a fold's test set is its own groups; its training set is all others
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        train_indices = [idx for idx, g in enumerate(groups) if g in train_groups]
        test_indices = [idx for idx, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [None]:
def pearson_coef(data):
    """Return the correlation between the 'target' and 'preds' columns of `data`.

    The value is read from the full correlation matrix produced by
    `data.corr()`.
    """
    return data.corr().at['preds', 'target']

def comp_metric(valid_df):
    """Return the mean over time_ids of the target/preds correlation.

    `valid_df` must contain the columns 'time_id', 'target' and 'preds';
    `pearson_coef` is applied to each time_id group and the results are
    averaged.
    """
    scored = valid_df[['time_id', 'target', 'preds']]
    per_time_id = scored.groupby('time_id').apply(pearson_coef)
    return np.mean(per_time_id)

In [None]:
# Load the pickled training set, downcast its columns with reduce_mem_usage,
# then show a summary and the first rows.
trn_df = pd.read_pickle(TRAIN_PKL_PATH)
trn_df = reduce_mem_usage(trn_df)
# NOTE(review): DataFrame.info() prints its report itself and returns None,
# so display() presumably just shows None here — confirm.
display(trn_df.info())
display(trn_df.head())

In [None]:
def init_intlookup_layer(ts_id_df):
    '''Build an IntegerLookup layer adapted to a series of identifier values.

    params:
        ts_id_df: identifier values; must provide `.unique()` and be
                  accepted by `IntegerLookup.adapt`

    returns:
        the adapted layer, whose `max_tokens` is the number of distinct
        identifiers plus one
    '''
    vocab_size = len(ts_id_df.unique()) + 1
    lookup = layers.IntegerLookup(max_tokens=vocab_size)
    lookup.adapt(ts_id_df)
    return lookup

def preprocess(X, y):
    '''Per-sample preprocessing hook: currently returns (X, y) unchanged.'''
    sample = (X, y)
    return sample

def make_dataset(feature, investment_id, y, batch_size=800, mode="train"):
    '''Build a batched tf.data pipeline of ((investment_id, feature), y) samples.

    params:
        feature: per-sample feature rows
        investment_id: per-sample investment ids
        y: per-sample targets
        batch_size: number of samples per batch
        mode: "train" shuffles the samples; any other value keeps their order

    returns:
        a tf.data.Dataset yielding batches of ((investment_id, feature), y)
    '''
    # Slice along the first axis: one dataset element per sample.
    ds = tf.data.Dataset.from_tensor_slices(((investment_id, feature), y))
    ds = ds.map(preprocess)
    if mode == "train":
        # Cache BEFORE shuffling. Caching after shuffle/batch would store the
        # first epoch's batches and replay them in that same order on every
        # later epoch, so the data would effectively be shuffled only once.
        ds = ds.cache()
        # Shuffle within a sliding buffer rather than over the whole data set,
        # to bound memory use.
        ds = ds.shuffle(buffer_size=4096, seed=2020)
        ds = ds.batch(batch_size)
    else:
        # Order is fixed here, so the ready-made batches can be cached.
        ds = ds.batch(batch_size).cache()
    # Prefetch so the next batches are prepared while the current one is consumed.
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds

def get_model(id_size, id_df, feat_num=300):
    '''Build and compile the two-branch DNN.

    One branch maps the id through an IntegerLookup and a 32-wide Embedding
    followed by three Dense(64) layers; the other sends the features through
    three Dense(256) layers. The branches are concatenated and passed through
    L2-regularised Dense layers (512, 128, 32) to a single linear output.

    params:
        id_size(int): used as `max_tokens` of the lookup layer and as the
                      input dimension of the embedding
        id_df(pd.DataFrame): id values the lookup layer is adapted on
        feat_num(int): number of feature columns

    returns:
        a compiled Keras model taking [id, features] (Adam with learning rate
        0.001, MSE loss; metrics mse, mae, mape, rmse)
    '''
    id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    features_inputs = tf.keras.Input((feat_num, ), dtype=tf.float16)

    # Id branch: raw id -> vocabulary index -> embedding -> dense stack.
    lookup = layers.IntegerLookup(max_tokens=id_size)
    lookup.adapt(id_df)

    id_branch = lookup(id_inputs)
    id_branch = layers.Embedding(id_size, 32, input_length=1)(id_branch)
    id_branch = layers.Reshape((-1, ))(id_branch)
    for _ in range(3):
        id_branch = layers.Dense(64, activation='swish')(id_branch)

    # Feature branch: plain dense stack.
    feature_branch = features_inputs
    for _ in range(3):
        feature_branch = layers.Dense(256, activation='swish')(feature_branch)

    # Head: merge both branches and narrow down to one output.
    merged = layers.Concatenate(axis=1)([id_branch, feature_branch])
    for units in (512, 128, 32):
        merged = layers.Dense(units, activation='swish', kernel_regularizer="l2")(merged)
    output = layers.Dense(1)(merged)

    rmse = keras.metrics.RootMeanSquaredError(name="rmse")
    model = tf.keras.Model(inputs=[id_inputs, features_inputs], outputs=[output])
    model.compile(
        optimizer=tf.optimizers.Adam(0.001),
        loss='mse',
        metrics=['mse', "mae", "mape", rmse],
    )
    return model

def pearson_coef(data):
    """Return the 'target' vs 'preds' entry of the correlation matrix of `data`."""
    corr_matrix = data.corr()
    target_column = corr_matrix['target']
    return target_column['preds']

def comp_metric(valid_df):
    """Average `pearson_coef` over the time_id groups of `valid_df`.

    Only the columns 'time_id', 'target' and 'preds' are used.
    """
    columns = ['time_id', 'target', 'preds']
    grouped = valid_df[columns].groupby('time_id')
    return np.mean(grouped.apply(pearson_coef))

In [None]:
# One-column frame holding every distinct investment_id found in the training data
investment_id_df = pd.DataFrame({'investment_ids':list(trn_df['investment_id'].unique())})
feat_num = 300 # number of feature columns
id_size = len(investment_id_df) + 1 # number of distinct ids plus one (presumably the lookup layer's out-of-vocabulary slot — confirm)
USE_COLS = [f'f_{i}' for i in range(300)] # rebinds USE_COLS to the feature columns only
CAT_COLS = ['investment_id']

# Build one throwaway model just to print and plot its architecture
model = get_model(id_size, investment_id_df, feat_num)
model.summary()
keras.utils.plot_model(model, show_shapes=True)
del model

In [None]:
# Inputs for stratified_group_k_fold: the target cut into equal-width bins
# serves as the label, time_id as the group key.
num_bins = 12
trn_df['bins'] = pd.cut(trn_df['target'], bins=num_bins, labels=False) # integer bin index per row
trn_y = trn_df[["target"]] # keep the target in its own single-column frame
trn_df = trn_df.drop(['target'], axis=1)
groups = np.array(trn_df['time_id'].values)

In [None]:
# Train and validate one model per fold, collecting out-of-fold predictions
fold_num = 0

trn_df['preds'] = np.zeros(len(trn_df))
pred_valid_scores = []
# stratified_group_k_fold is a generator and keeps its own reference to
# `groups`, so dropping the global name below is safe.
sgkf = stratified_group_k_fold(trn_df['bins'].values, groups, k=5, seed=2020)
del groups

for trn_ind, val_ind in sgkf:
    fold_num += 1
    print('-'*20, 'Fold '+str(fold_num), '-'*20)
    
    # Build the training / validation datasets of this fold
    X_train, X_val = trn_df[USE_COLS].iloc[trn_ind,:], trn_df[USE_COLS].iloc[val_ind,:]
    y_train, y_val = trn_y[TARGET_COLS].iloc[trn_ind], trn_y[TARGET_COLS].iloc[val_ind]
    investment_id_train, investment_id_val = trn_df[['investment_id']].iloc[trn_ind,:], trn_df[['investment_id']].iloc[val_ind,:]
    train_ds = make_dataset(X_train, investment_id_train, y_train)
    valid_ds = make_dataset(X_val, investment_id_val, y_val, mode="valid")
    del X_train, y_train, investment_id_train, investment_id_val
    
    # Fit a fresh model, then reload the checkpoint saved by ModelCheckpoint
    model = get_model(id_size, investment_id_df, feat_num)
    checkpoint = keras.callbacks.ModelCheckpoint(f"model_{fold_num}", save_best_only=True)
    early_stop = keras.callbacks.EarlyStopping(patience=10)
    history = model.fit(train_ds, epochs=30, validation_data=valid_ds, callbacks=[checkpoint, early_stop])
    model = keras.models.load_model(f"model_{fold_num}")
    
    
    # Out-of-fold predictions. valid_ds is not shuffled, so the predictions
    # line up with val_ind. Write through a single .iloc on the frame itself:
    # the chained form trn_df['preds'].iloc[...] = ... may assign into a
    # temporary copy and leave trn_df unchanged.
    pred_valid = model.predict(valid_ds).ravel()
    trn_df.iloc[val_ind, trn_df.columns.get_loc('preds')] = pred_valid
    pred_valid_scores.append(np.min(history.history["val_mse"]))
    pd.DataFrame(history.history, columns=["mse", "val_mse"]).plot()
    plt.title("MSE")
    plt.show()
    pd.DataFrame(history.history, columns=["mae", "val_mae"]).plot()
    plt.title("MAE")
    plt.show()
    pd.DataFrame(history.history, columns=["rmse", "val_rmse"]).plot()
    plt.title("RMSE")
    plt.show()
    
    del X_val, y_val, train_ds, valid_ds, model, history
    gc.collect()

trn_df['target'] = trn_y
valid_pearson_coef = comp_metric(trn_df)
print('='*20)
# The collected scores are each fold's minimum val_mse, so the metric is named
# directly (the former lookup used an undefined `model_params` and raised
# NameError after all folds had been trained).
print('The valid dataset | %s is %0.4f and pearson coef. is %.4f' % ('mse',
                                                                     np.mean(pred_valid_scores),
                                                                     valid_pearson_coef))

In [None]:
def preprocess_test(investment_id, feature):
    '''Pair the test inputs with a constant 0 in the label position.

    returns:
        ((investment_id, feature), 0)
    '''
    inputs = (investment_id, feature)
    placeholder_label = 0
    return inputs, placeholder_label
            
def make_test_dataset(feature, investment_id, batch_size=800):
    '''Build a batched, cached and prefetched tf.data pipeline for inference.

    params:
        feature: per-sample feature rows
        investment_id: per-sample investment ids
        batch_size: number of samples per batch

    returns:
        a tf.data.Dataset whose elements are the batched output of
        `preprocess_test`
    '''
    # Slice along the first axis: one dataset element per sample.
    slices = tf.data.Dataset.from_tensor_slices((investment_id, feature))
    with_labels = slices.map(preprocess_test)
    batched = with_labels.batch(batch_size)
    cached = batched.cache()
    # Prefetch so the next batches are prepared while the current one is consumed.
    return cached.prefetch(tf.data.experimental.AUTOTUNE)

def inference(models, ds):
    '''Average the predictions of several models on one dataset.

    params:
        models: iterable of already loaded models, each exposing
                `predict(ds)`. If it is empty or None, the five checkpoints
                "model_1" .. "model_5" are loaded from disk instead.
        ds: dataset handed to every model's `predict`

    returns:
        element-wise mean of the individual predictions
    '''
    if not models:
        # Fallback only: loading checkpoints on every call is slow, so
        # callers that predict repeatedly should load once and pass them in.
        models = [keras.models.load_model(f"model_{fold_num}") for fold_num in range(1, 6)]
    y_preds = [model.predict(ds) for model in models]
    return np.mean(y_preds, axis=0)
    

In [None]:
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test() 

# Feature columns fed to the model, and the five fold checkpoints loaded once
# up front (both names were previously used below without being defined).
features = [f'f_{i}' for i in range(300)]
models = [keras.models.load_model(f"model_{fold_num}") for fold_num in range(1, 6)]

for (test_df, sample_prediction_df) in iter_test:
    # Predict each test batch with the fold models and submit it
    ds = make_test_dataset(test_df[features], test_df["investment_id"])
    sample_prediction_df['target'] = inference(models, ds)
    env.predict(sample_prediction_df) 