In [None]:
# api是否能本地调用 A：不可以
# 测试集数据能否拉出来 A：不可以
# 看下测试集每个investment预测长度H A：不可以

# 每个investment的匿名特征是否为单一属性值，还是时序变化？A：随时序变化,但有个别特征比较奇怪，例如很多都为0，待深入
# 选择递归预测还是直接预测？
# 使用lgb
# 采用不同的交叉验证方式，看何种方法跟线上更基本保持一致

# 加入统计特征
# 声明categorical features
# 采用mlp
# 借鉴deepar方式，引入协变量

# 处理time_id不等距问题

# 在线学习，随着测试集的引入，不断训练再预测

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import gc
import random
import copy
import joblib
from collections import Counter, defaultdict

import lightgbm as lgb
from sklearn.metrics import mean_squared_error

from tqdm import tqdm
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Input locations. Both example files and the raw CSV live in the same
# competition directory; the pickle is a separately mounted dataset.
_COMPETITION_DIR = '../input/ubiquant-market-prediction'
TRAIN_CSV_PATH = f'{_COMPETITION_DIR}/train.csv'
TRAIN_PKL_PATH = '../input/ump-train-picklefile/train.pkl'
EXP_TEST_PATH = f'{_COMPETITION_DIR}/example_test.csv'
EXP_SUB_PATH = f'{_COMPETITION_DIR}/example_sample_submission.csv'

# Model inputs: the investment id followed by the 300 anonymous features f_0..f_299.
USE_COLS = ['investment_id'] + ['f_%d' % i for i in range(300)]
# Columns declared as categorical when building LightGBM datasets.
CAT_COLS = ['investment_id']
# Name of the label column (a single string, despite the plural name).
TARGET_COLS = 'target'

In [None]:
def transform_csv2pickle(csv_path, pkl_path):
    '''Convert the training CSV into a pickle file.

    Only the id/label columns and the 300 feature columns ``f_0``..``f_299``
    are read, each with an explicit compact dtype, and the resulting frame is
    written to ``pkl_path`` with ``DataFrame.to_pickle``.

    Args:
        csv_path: path of the CSV file to read.
        pkl_path: path of the pickle file to write.
    '''
    feature_names = [f'f_{i}' for i in range(300)]

    # One mapping drives both the column selection and the dtypes; dict
    # insertion order keeps the id/label columns ahead of the features.
    column_dtypes = {
        'row_id': 'str',
        'time_id': 'uint16',
        'investment_id': 'uint16',
        'target': 'float32',
        **{name: 'float32' for name in feature_names},
    }
    wanted_columns = list(column_dtypes)

    frame = pd.read_csv(csv_path, usecols=wanted_columns, dtype=column_dtypes)
    frame.to_pickle(pkl_path)


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned. Memory usage before and
    after, and the percentage saved, are printed.

    Per-column behaviour:
      * ``object`` columns are converted to ``category``.
      * Signed integer columns are cast to the smallest of int8/16/32/64 that
        holds the column's min and max.
      * Unsigned integer columns are cast to the smallest of uint8/16/32/64
        that holds the column's min and max. They are never routed through a
        float type, so integer ids keep their exact values.
      * Float columns are cast to float16 if min/max lie strictly inside the
        float16 range, else float32, else float64.
      * Every other dtype (bool, datetime, category, extension dtypes, ...)
        is left untouched.

    NOTE: the float16 downcast only checks the value *range*; it is lossy in
    precision (float16 has about 3 significant decimal digits).

    Args:
        df: pandas DataFrame to shrink.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast. Checking the
        # dtype kind (rather than the dtype name prefix) keeps uint*/bool
        # columns out of the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_candidates if col_type.kind == 'i' else unsigned_candidates
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                # For an empty column min/max are NaN, every comparison is
                # False and the column is left as is.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df
    
def fix_seed(seed = 2022):
    '''Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED``.

    Args:
        seed: integer seed applied to all three.
    '''
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


In [None]:
def stratified_group_k_fold(y, groups, k, seed=None):
    '''Greedy stratified group k-fold split.

    Whole groups are assigned to folds so that each fold receives a similar
    share of every label. A group is never split across folds.

    Groups are shuffled with ``seed``, then processed in order of decreasing
    standard deviation of their per-label counts. Each group goes to the fold
    that minimises the mean (over labels) of the standard deviation (over
    folds) of "fraction of that label's samples held by the fold".

    Args:
        y: integer label per sample (here: the bin index of the target);
            labels are used as array indices, so they must be non-negative ints.
        groups: group key per sample (here: time_id), same length as ``y``.
        k: number of folds.
        seed: seed for the initial shuffle of the groups.

    Yields:
        ``(train_indices, test_indices)`` for each of the ``k`` folds, as lists
        of positional sample indices. The test part of fold ``i`` contains the
        samples whose group was assigned to fold ``i``.
    '''
    labels_num = int(np.max(y)) + 1  # number of distinct label slots (0..max)
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1  # samples of this label inside group g
        y_distr[label] += 1  # total samples of this label over all groups

    # A label slot with no samples at all (e.g. an empty bin) would give
    # 0 / 0 = NaN below; a NaN score never compares smaller, so every group
    # would end up in fold 0. Such labels carry no information: skip them.
    present_labels = [label for label in range(labels_num) if y_distr[label] > 0]

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        # Tentatively add the group's counts to `fold`, score the resulting
        # balance across all folds, then undo the tentative addition.
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in present_labels:
            # Share of this label's samples held by each fold; the spread
            # (std) of these shares is small when folds are balanced.
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    # [(group, per-label counts), ...]
    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    # Most uneven groups first (sorted() is stable, so ties keep shuffled order).
    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for fold in range(k):
        train_groups = all_groups - groups_per_fold[fold]
        test_groups = groups_per_fold[fold]

        train_indices = [idx for idx, g in enumerate(groups) if g in train_groups]
        test_indices = [idx for idx, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [None]:
def split_train_valid(data, method=1, valid_ratio=0.2):
    '''Split ``data`` into a training and a validation frame.

    With ``method == 1`` the rows are sorted by ``investment_id`` and
    ``time_id``, and for every investment_id the most recent
    ``int(valid_ratio * n_rows)`` rows become validation data; the earlier rows
    become training data. An investment_id with too few rows to yield a single
    validation row stays entirely in the training set.

    Args:
        data: DataFrame with at least ``investment_id`` and ``time_id`` columns.
        method: split strategy; only ``1`` is implemented.
        valid_ratio: fraction of each investment_id's rows used for validation.

    Returns:
        ``(train, valid)`` DataFrames, both in (investment_id, time_id) order and
        keeping the original index labels.

    Raises:
        ValueError: if ``method`` is not a supported strategy.
    '''
    if method != 1:
        raise ValueError(f'Unsupported split method: {method!r}')

    data = data.sort_values(by=['investment_id', 'time_id'], ascending=(True, True))

    per_id = data.groupby('investment_id', sort=False)
    position = per_id.cumcount()  # 0-based rank of the row inside its series
    series_len = per_id['time_id'].transform('size')  # rows in the row's series
    valid_num = (valid_ratio * series_len).astype(int)  # truncates like int()

    # Positional boolean mask: avoids mixing index labels with positions and
    # naturally selects nothing when valid_num is 0.
    is_valid = (position >= series_len - valid_num).to_numpy()

    train = data[~is_valid]
    valid = data[is_valid]
    return train, valid

# # 划分数据集
# train_df, valid_df = split_train_valid(trn_df, method=1)
# print(trn_df.shape, train_df.shape, valid_df.shape)
# del trn_df # 释放内存，否则会报错

In [None]:
def pearson_coef(data):
    '''Return the correlation between the ``target`` and ``preds`` columns.

    The value is read from the pairwise correlation matrix produced by
    ``DataFrame.corr()`` (Pearson by default).
    '''
    corr_matrix = data.corr()
    return corr_matrix.loc['preds', 'target']

def comp_metric(valid_df):
    '''Competition-style score: mean over time_id of the per-time_id correlation.

    For every ``time_id`` the correlation between ``target`` and ``preds`` is
    computed with ``pearson_coef``; the mean of those values is returned.

    Args:
        valid_df: DataFrame with ``time_id``, ``target`` and ``preds`` columns.
    '''
    scored = valid_df[['time_id', 'target', 'preds']]
    per_time_corr = scored.groupby('time_id').apply(pearson_coef)
    return np.mean(per_time_corr)

In [None]:
# LightGBM training parameters.
# Each key appears exactly once, carrying the value that was in effect
# (subsample and colsample_bytree use the tuned values).
model_params = dict(
    metric='rmse',
    objective='regression',
    learning_rate=0.1,
    seed=2020,
    boosting_type='gbdt',  # other types are possible, but dart does not support early stopping
    early_stopping_round=30,
    subsample=0.8999680059621733,
    colsample_bytree=0.5658895874313851,
    lambda_l1=0.014647978740193088,
    lambda_l2=1.0739045659323434e-05,
    num_leaves=88,
    bagging_freq=3,
    max_depth=13,
    max_bin=442,
    min_data_in_leaf=316,
    n_jobs=-1,
    verbose=-1,
)

In [None]:
# Sanity check: print the number of distinct investment_ids before and after
# reduce_mem_usage so that a change caused by the dtype downcast is visible.
trn_df = pd.read_pickle(TRAIN_PKL_PATH)
print(len(pd.unique(trn_df['investment_id'])))
trn_df = reduce_mem_usage(trn_df)
print(len(pd.unique(trn_df['investment_id'])))

In [None]:
# One-off conversion of the raw CSV into the pickle read below:
# transform_csv2pickle(TRAIN_CSV_PATH, TRAIN_PKL_PATH)

# Load the training set and downcast its dtypes.
trn_df = pd.read_pickle(TRAIN_PKL_PATH)
trn_df = reduce_mem_usage(trn_df)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
trn_df.info()
display(trn_df.head())
gc.collect()

In [None]:
# Inputs for stratified_group_k_fold: the target is cut into equal-width bins
# that serve as stratification labels, and time_id is the grouping key.
num_bins = 12
trn_y = trn_df[['target']]
trn_df = trn_df.drop(columns=['target'])
trn_df['bins'] = pd.cut(trn_y['target'], bins=num_bins, labels=False)
groups = np.array(trn_df['time_id'].values)

In [None]:
# There are too many investment_ids to fit one model per investment, so a
# single model is trained per CV fold on all investments.

# Out-of-fold predictions are collected in a plain NumPy array and attached to
# the frame once after the loop. Writing through `trn_df['preds'].iloc[...]`
# is chained assignment: it is not guaranteed to update the frame and is a
# silent no-op under pandas Copy-on-Write.
oof_preds = np.zeros(len(trn_df))
fold_num = 0
pred_valid_scores = []

for trn_ind, val_ind in stratified_group_k_fold(trn_df['bins'].values, groups, k=5, seed=2020):
    fold_num += 1
    print('-'*20, 'Fold '+str(fold_num), '-'*20)

    # Build the LightGBM datasets for this fold.
    train_matrix = lgb.Dataset(trn_df[USE_COLS].iloc[trn_ind,:], label=trn_y[TARGET_COLS].iloc[trn_ind], categorical_feature=CAT_COLS)
    valid_matrix = lgb.Dataset(trn_df[USE_COLS].iloc[val_ind,:], label=trn_y[TARGET_COLS].iloc[val_ind], categorical_feature=CAT_COLS)

    # Train; early stopping is configured through model_params.
    # NOTE(review): the Datasets declare CAT_COLS as categorical while an empty
    # categorical_feature list is passed here - confirm which one is intended
    # and how the installed LightGBM version resolves the conflict.
    lgb_model = lgb.train(model_params,
                            train_matrix,
                            num_boost_round=1000,
                            valid_sets=[valid_matrix],
                            categorical_feature=[],
                            verbose_eval=250)

    # Predict the held-out part of this fold and record its RMSE.
    pred_valid = lgb_model.predict(trn_df[USE_COLS].iloc[val_ind,:], num_iteration =  lgb_model.best_iteration)
    pred_valid_score = lgb_model.best_score['valid_0']['rmse']
    pred_valid_scores.append(pred_valid_score)
    oof_preds[val_ind] = pred_valid

    # Persist the fold model (truncated at its best iteration).
    lgb_model.save_model(f'lgbm_{fold_num}.txt', num_iteration=lgb_model.best_iteration)
#     joblib.dump(lgb_model, f'lgbm_{fold_num}.pkl')

    # Free per-fold objects.
    del train_matrix, valid_matrix, lgb_model
    gc.collect()


# Re-attach predictions and labels so the competition metric can be computed.
trn_df['preds'] = oof_preds
trn_df['target'] = trn_y[TARGET_COLS]
valid_pearson_coef = comp_metric(trn_df)
print('='*20)
print('The valid dataset | %s is %0.4f and pearson coef. is %.4f' % (model_params['metric'],
                                                                     np.mean(pred_valid_scores),
                                                                     valid_pearson_coef))
del trn_df, trn_y, oof_preds
gc.collect()

In [None]:
# Load the per-fold models written by the training loop (lgbm_1.txt .. lgbm_5.txt).
# models = [joblib.load(f'lgbm_{fold}.pkl') for fold in range(1,6)]
models = [lgb.Booster(model_file=f'lgbm_{fold}.txt') for fold in range(1,6)]

# Competition inference loop: each batch is scored with the average of the
# fold models' predictions and handed back through env.predict.
import ubiquant
env = ubiquant.make_env()
for test_df, sample_prediction_df in env.iter_test():
    fold_preds = (booster.predict(test_df[USE_COLS]) for booster in models)
    sample_prediction_df['target'] = sum(fold_preds) / len(models)
    env.predict(sample_prediction_df)