In [None]:
import os
import gc
import joblib
import random
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from argparse import Namespace
from collections import defaultdict

import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, train_test_split


import lightgbm as lgb

import warnings

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')
# Use the fully qualified option name: the bare 'max_columns' pattern also
# matches 'styler.render.max_columns' on pandas >= 1.4 and then raises
# OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 64)

def seed_everything(seed: int = 42) -> None:
    """Seed the global random generators used by the notebook.

    Args:
        seed: Value applied to Python's ``random`` module and NumPy's global
            generator, and written (as a string) to the ``PYTHONHASHSEED``
            environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)  # exported for reproducibility
    np.random.seed(seed)  # NumPy global generator
    random.seed(seed)  # Python stdlib generator
    
# Memory footprint grows from int8 to int64, and likewise from float16 to float64.
# Inspect each column's minimum and maximum and assign the narrowest dtype that
# can hold them, reducing memory usage.
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    - ``object`` columns are converted to ``category``.
    - Signed / unsigned integer columns are cast to the narrowest signed /
      unsigned integer dtype whose (inclusive) range contains the column's
      min and max, so integer values are always preserved exactly.
    - Float columns are cast to float16 / float32 / float64 based on value
      range only; the float16 / float32 casts can therefore lose precision.
    - Every other dtype (bool, datetime, category, extension dtypes, ...) is
      left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are range-checked; routing
        # unsigned ints or bools through the float branch would corrupt or
        # inflate them, and min()/max() may fail on other dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # An empty column yields NaN bounds; nothing to downcast then.
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            lo, hi = int(c_min), int(c_max)
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                info = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if info.min <= lo and hi <= info.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:  # avoid dividing by zero for a frame that reports no memory
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Experiment configuration read by the cells below.
args = Namespace(
    data_path=Path("../input/ubiquant-parquet/"),
    debug=False,
    seed=21,
    folds=5,
    workers=4,
    holdout=True,
    num_bins=16,
    min_time_id=None,
)

# Debug runs keep only rows with time_id >= 1100 (applied when the data is loaded).
if args.debug:
    args.min_time_id = 1100

seed_everything(args.seed)

In [None]:
#cell代码运行一次所花费的时间
%%time
train = pd.read_parquet(args.data_path.joinpath("train_low_mem.parquet"))
assert train.isnull().any().sum() == 0, "null exists."
assert train.row_id.str.extract(r"(?P<time_id>\d+)_(?P<investment_id>\d+)").astype(train.time_id.dtype).equals(train[["time_id", "investment_id"]]), "row_id!=time_id_investment_id"

if args.min_time_id is not None:
    train = train.query("time_id>=@args.min_time_id").reset_index(drop=True)
    gc.collect()
train.shape

# StratifiedKFold by time_span: [discussion](https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302429)

In [None]:
# First and last time_id per investment, taken from the non-feature columns.
time_id_df = (
    train.filter(regex=r"^(?!f_).*")  # keep columns that do not start with "f_"
    .groupby("investment_id")
    .agg({"time_id": ["min", "max"]})
    .reset_index()
)
# Column-wise diff over (min, max): the "max" column then holds max - min.
_min_max_diff = time_id_df["time_id"].diff(axis=1)
time_id_df["time_span"] = _min_max_diff["max"]
del _min_max_diff
time_id_df.head(6)  # preview the first six rows

In [None]:
# Attach each investment's time_span to its training rows: drop both time_id
# aggregate columns (min and max), flatten the two-level column index, then
# merge on investment_id.
_span_lookup = time_id_df.drop(columns="time_id").droplevel(level=1, axis=1)
train = train.merge(_span_lookup, on="investment_id")
# Histogram of time_span over the training rows (args.num_bins bars, 16x8 figure).
train.time_span.hist(bins=args.num_bins, figsize=(16,8))
# Free the helper frames.
del time_id_df, _span_lookup
gc.collect()

In [None]:
if args.holdout:
    # Discretize time_span into args.num_bins bins; labels=False returns the bin index per row.
    _target = pd.cut(train.time_span, args.num_bins, labels=False)
    # Stratify on the bin index so both parts keep the time_span distribution.
    # random_state is passed explicitly so the split does not depend on how
    # much of the global NumPy random state earlier code has consumed.
    _train, _valid = train_test_split(_target, stratify=_target, random_state=args.seed)
    print(f"train length: {len(_train)}", f"holdout length: {len(_valid)}")
    # _valid / _train carry train's index labels, so rows are selected by label (.loc).
    valid = train.loc[_valid.index].sort_values(by=["investment_id", "time_id"]).reset_index(drop=True)
    train = train.loc[_train.index].sort_values(by=["investment_id", "time_id"]).reset_index(drop=True)
    # Overlaid, semi-transparent histograms of time_span: training part, then holdout part.
    train.time_span.hist(bins=args.num_bins, figsize=(16,8), alpha=0.8)
    valid.time_span.hist(bins=args.num_bins, figsize=(16,8), alpha=0.8)
    # Persist the holdout rows (without the helper column) for later evaluation.
    valid.drop(columns="time_span").to_parquet("valid.parquet")
    del valid, _train, _valid, _target
    gc.collect()

In [None]:
# Assign every training row to a validation fold, stratified on binned time_span.
train["fold"] = -1  # -1 marks rows that have not been assigned yet
_target = pd.cut(train.time_span, bins=args.num_bins, labels=False)  # bin index per row
skf = StratifiedKFold(n_splits=args.folds)  # keeps the per-bin proportions in every fold
fold_splits = skf.split(_target, _target)
for fold, (train_index, valid_index) in enumerate(fold_splits):
    # Rows in this split's validation part belong to fold `fold`.
    train.loc[valid_index, 'fold'] = fold

# One histogram of time_span per fold, stacked vertically on a shared x axis.
fig, axs = plt.subplots(
    nrows=args.folds,
    ncols=1,
    sharex=True,
    figsize=(16,8),
    tight_layout=True,
)
for ax, (fold, df) in zip(axs, train[["fold", "time_span"]].groupby("fold")):
    ax.hist(df["time_span"], bins=args.num_bins)
    ax.text(0, 40000, f"fold: {fold}, count: {len(df)}", fontsize=16)
plt.show()
del _target, train_index, valid_index, fold_splits
_ = gc.collect()

In [None]:
# Model inputs: every "f_" column, then the categorical investment_id, then time_id.
cat_features = ["investment_id"]
num_features = train.filter(like="f_").columns.tolist()
features = [*num_features, *cat_features, "time_id"]

# Drop the helper column and downcast the remaining columns to save memory.
train = reduce_mem_usage(train.drop(columns="time_span"))
# Store the id columns as 16-bit and the fold index as 8-bit unsigned integers.
train["investment_id"] = train["investment_id"].astype(np.uint16)
train["time_id"] = train["time_id"].astype(np.uint16)
train["fold"] = train["fold"].astype(np.uint8)
gc.collect()
len(features)

In [None]:
# corr_matrix = train.filter(like="f_").corr().abs()
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# # Find features with correlation greater than 0.97
# to_drop = [column for column in upper.columns if any(upper[column] >= 0.97)]
# sorted(to_drop)

In [None]:
# Root mean squared error
def rmse(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Custom evaluation metric: RMSE
def feval_rmse(y_pred, lgb_train):
    """Evaluation callback with a ``(y_pred, dataset)`` signature.

    Args:
        y_pred: Predictions for the rows of ``lgb_train``.
        lgb_train: Object exposing ``get_label()`` with the true targets.

    Returns:
        Tuple ``('rmse', value, False)``; the last item flags that a higher
        value is not better.
    """
    labels = lgb_train.get_label()
    score = rmse(labels, y_pred)
    return 'rmse', score, False

# Pearson correlation coefficient
def feval_pearsonr(y_pred, lgb_train):
    """Evaluation callback with a ``(y_pred, dataset)`` signature.

    Args:
        y_pred: Predictions for the rows of ``lgb_train``.
        lgb_train: Object exposing ``get_label()`` with the true targets.

    Returns:
        Tuple ``('pearsonr', value, True)``; the last item flags that a higher
        value is better.
    """
    labels = lgb_train.get_label()
    correlation = pearsonr(labels, y_pred)[0]
    return 'pearsonr', correlation, True

def run():
    """Train one LightGBM model per fold and collect out-of-fold predictions.

    Reads the notebook-level ``train``, ``args``, ``features`` and
    ``cat_features``. For every fold in ``range(args.folds)`` it trains on the
    rows whose ``fold`` differs, validates on the rows whose ``fold`` matches,
    dumps the model to ``lgbm_seed{seed}_{fold}.pkl`` and stores the
    validation predictions in ``train['preds']``.

    Side effects:
        Adds / overwrites the ``preds`` column of ``train`` (rows never
        predicted keep the sentinel ``-1000``), writes one model file per fold
        and writes the non-feature columns of ``train`` to ``preds.csv``.

    Returns:
        DataFrame with columns ``feature``, ``importance`` and ``fold`` holding
        every fold's feature importances (empty when no fold was run).
    """
    params = {
        'learning_rate':0.05,
        "objective": "regression",
        "metric": "rmse",
        'boosting_type': "gbdt",
        'verbosity': -1,
        'n_jobs': -1, 
        'seed': args.seed,
        'lambda_l1': 2.7223413643193285e-08, 
        'lambda_l2': 0.009462714717237544, 
        'num_leaves': 108, 
        'feature_fraction': 0.5298125662824026, 
        'bagging_fraction': 0.7279540797730281, 
        'bagging_freq': 6, 
        'max_depth': 10, 
        'max_bin': 487, 
        'min_data_in_leaf': 158,
        'n_estimators': 1000, 
    }
    
    y = train['target']
    # Float sentinel: the column later receives float predictions, and writing
    # floats into an integer column is deprecated in recent pandas.
    train['preds'] = -1000.0
    scores = defaultdict(list)  # metric name -> list of per-fold scores
    fold_importances = []  # one importance frame per fold, concatenated once at the end
    
    # feval_pearsonr is evaluated on the train and validation sets during
    # training, in addition to the built-in "rmse" metric.
    for fold in range(args.folds):
        print(f"=====================fold: {fold}=====================")
        trn_ind, val_ind = train.fold!=fold, train.fold==fold
        print(f"train length: {trn_ind.sum()}, valid length: {val_ind.sum()}")
        train_dataset = lgb.Dataset(train.loc[trn_ind, features], y.loc[trn_ind], categorical_feature=cat_features)
        valid_dataset = lgb.Dataset(train.loc[val_ind, features], y.loc[val_ind], categorical_feature=cat_features)

        # Early stopping and periodic logging are passed as callbacks; the
        # early_stopping_rounds / verbose_eval keyword arguments were removed
        # from lgb.train in LightGBM 4.
        model = lgb.train(
            params,
            train_set = train_dataset, 
            valid_sets = [train_dataset, valid_dataset], 
            feval = feval_pearsonr,
            callbacks = [
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=100),
            ],
        )
        joblib.dump(model, f'lgbm_seed{args.seed}_{fold}.pkl')  # serialize the fold model

        preds = model.predict(train.loc[val_ind, features])
        train.loc[val_ind, "preds"] = preds
        
        scores["rmse"].append(rmse(y.loc[val_ind], preds))
        scores["pearsonr"].append(pearsonr(y.loc[val_ind], preds)[0])
        
        fold_importances.append(
            pd.DataFrame({'feature': features, 'importance': model.feature_importance(), 'fold': fold})
        )
        
        del train_dataset, valid_dataset, model
        gc.collect()
    print(f"lgbm {args.folds} folds mean rmse: {np.mean(scores['rmse'])}, mean pearsonr: {np.mean(scores['pearsonr'])}")
    train.filter(regex=r"^(?!f_).*").to_csv("preds.csv", index=False)
    # Stack the per-fold frames row-wise in a single concat.
    if fold_importances:
        features_importance = pd.concat(fold_importances, axis=0)
    else:
        features_importance = pd.DataFrame()
    return features_importance

In [None]:
# Train all folds, then score the pooled out-of-fold predictions.
features_importance = run()
# Keep only rows that received a prediction (the sentinel -1000 marks the rest).
df = train.loc[train["preds"] != -1000, ["target", "preds"]]
_oof_rmse = rmse(df.target, df.preds)
_oof_pearsonr = pearsonr(df.target, df.preds)[0]
print(f"lgbm {args.folds} folds mean rmse: {_oof_rmse}, mean pearsonr: {_oof_pearsonr}")
del df, train, _oof_rmse, _oof_pearsonr
gc.collect()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Average each feature's importance over the folds, most important first.
_mean_by_feature = features_importance.groupby("feature").importance.mean()
folds_mean_importance = _mean_by_feature.reset_index().sort_values(
    by="importance", ascending=False
)
del _mean_by_feature
features_importance.to_csv("features_importance.csv", index=False)
folds_mean_importance.to_csv("folds_mean_feature_importance.csv", index=False)

# Side-by-side bar charts: the 50 highest-ranked and the 50 lowest-ranked features.
plt.figure(figsize=(16, 10))
_panels = (
    (1, folds_mean_importance.head(50), f'Head LightGBM Features (avg over {args.folds} folds)'),
    (2, folds_mean_importance.tail(50), f'Tail LightGBM Features (avg over {args.folds} folds)'),
)
for _position, _subset, _title in _panels:
    plt.subplot(1, 2, _position)  # 1 row, 2 columns, draw into panel `_position`
    sns.barplot(x="importance", y="feature", data=_subset)
    plt.title(_title)
del _panels, _position, _subset, _title
plt.tight_layout()
plt.show()

In [None]:
import ubiquant
env = ubiquant.make_env()  
iter_test = env.iter_test()

# Reload the per-fold models saved during training.
models = [joblib.load(f'lgbm_seed{args.seed}_{fold}.pkl') for fold in range(args.folds)]


def _predict_ensemble(frame):
    """Return the element-wise mean of every fold model's prediction on ``frame[features]``."""
    per_fold = [fold_model.predict(frame[features]) for fold_model in models]
    return np.mean(np.stack(per_fold), axis=0)


# Score the averaged models on the holdout rows written earlier.
if args.holdout:
    valid = pd.read_parquet("valid.parquet")
    valid_pred = _predict_ensemble(valid)
    print(f"lgbm {args.folds} folds holdout rmse: {rmse(valid.target, valid_pred)}, holdout pearsonr: {pearsonr(valid.target, valid_pred)[0]}")
    del valid, valid_pred
    gc.collect()

# Predict every test batch with the fold average and submit it.
for (test_df, sample_prediction_df) in iter_test:
    # time_id is the leading number of row_id ("<time_id>_...").
    test_df["time_id"] = test_df.row_id.str.extract(r"(\d+)_.*").astype(np.uint16)
    sample_prediction_df['target'] = _predict_ensemble(test_df)
    env.predict(sample_prediction_df) 
    display(sample_prediction_df)