In [1]:
import sys
import numpy as np
import pandas as pd
import os
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
import math
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)

def q1(x):
    return x.quantile(0.25)

def q2(x):
    return x.quantile(0.75)

def get_stat_feature(df_):
    df = df_.copy()
    stat_feat = []
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt']

    #df1 = df.groupby(['model_adcode'])['salesVolume'].agg(['median', 'std', q1,q2])
    #df=pd.merge(df,df1,on=['model_adcode'],how='left')
    #stat_feat+=['median','std','q1','q2']


    #for col in tqdm(['label','popularity','area_sales_volume','pop_dist']):
    for col in tqdm(['label', 'popularity', 'area_sales_volume']):
        # shift
        for i in [1,2,3,4,5,6,7,8,9]:
            stat_feat.append('shift_model_adcode_mt_{}_{}'.format(col,i))
            df['model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'] + i
            df_last = df[~df[col].isnull()].set_index('model_adcode_mt_{}_{}'.format(col,i))
            df['shift_model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'].map(df_last[col])

    return df, stat_feat

def score(data, pred='pred_label', label='label', group='model'):
    data['pred_label'] = data['pred_label'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    data_agg = data.groupby('model').agg({
        pred:  list,
        label: [list, 'mean']
    }).reset_index()
    data_agg.columns = ['_'.join(col).strip() for col in data_agg.columns]
    nrmse_score = []
    for raw in data_agg[['{0}_list'.format(pred), '{0}_list'.format(label), '{0}_mean'.format(label)]].values:
        nrmse_score.append(
            mse(raw[0], raw[1]) ** 0.5 / raw[2]
        )
    print(1 - np.mean(nrmse_score))
    return 1 - np.mean(nrmse_score)

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):
    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2333,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              categorical_feature=cate_feat,
              early_stopping_rounds=100, verbose=100)
    elif m_type == 'xgb':
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000,
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9,
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse'
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              early_stopping_rounds=100, verbose=100)
    return model

def get_train_model(df_, m, m_type='lgb'):
    df = df_.copy()
    # 数据集划分
    st = 13
    all_idx   = (df['mt'].between(st , m-1))
    train_idx = (df['mt'].between(st , m-5))
    valid_idx = (df['mt'].between(m-4, m-4))
    test_idx  = (df['mt'].between(m  , m  ))
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )
    # 最终确认
    train_x = df[train_idx][features]
    train_y = df[train_idx]['label']
    valid_x = df[valid_idx][features]
    valid_y = df[valid_idx]['label']
    # get model
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)
    # offline
    df['pred_label'] = model.predict(df[features])
    best_score = score(df[valid_idx])
    # online
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df[all_idx][features], df[all_idx]['label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df[all_idx][features], df[all_idx]['label'])
    df['forecastVolum'] = model.predict(df[features])
    print('valid mean:',df[valid_idx]['pred_label'].mean())
    print('true  mean:',df[valid_idx]['label'].mean())
    print('test  mean:',df[test_idx]['forecastVolum'].mean())
    # 阶段结果
    sub = df[test_idx][['id']]
    sub['forecastVolum'] = df[test_idx]['forecastVolum'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    return sub,df[valid_idx]['pred_label']
def quantile_clip(group):
    #group.plot()
    group[group < group.quantile(.05)] = group.quantile(.05)
    group[group > group.quantile(.95)] = group.quantile(.95)
    #group.plot()
    #plt.show()
    return group

In [2]:
if __name__ == '__main__':
    path = '../ccf_car/'
    train_sales = pd.read_csv(path + 'train_sales_data.csv')
    train_search = pd.read_csv(path + 'train_search_data.csv')
    train_user = pd.read_csv(path + 'train_user_reply_data.csv')
    evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
    submit_example = pd.read_csv(path + 'submit_example.csv')
    data = pd.concat([train_sales, evaluation_public], ignore_index=True)
    data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
    data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
    data['salesVolume'] = data.groupby(['adcode', 'model'])['salesVolume'].transform(quantile_clip)
    data['label'] = data['salesVolume']
    data['id'] = data['id'].fillna(0).astype(int)
    data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
    # LabelEncoder
    for i in ['bodyType', 'model']:
        data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
    data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

    area_sales = {}
    for raw in data[['province', 'salesVolume','regMonth','regYear']].values:
        province = raw[0]
        sales = raw[1]
        if pd.isna(sales):
            continue
        regMonth = raw[2]
        regYear = raw[3]
        key = province + "_" + str(regYear) + "_" + str(regMonth)
        if area_sales.__contains__(key):
            area_sales[key] += sales
        else:
            area_sales[key] = sales
    new_column = []
    new_column1 = []
    for raw in data[['province', 'salesVolume','regMonth','regYear']].values:
        province = raw[0]
        sales = raw[1]
        if pd.isna(sales):
            new_column.append(None)
            new_column1.append(None)
            continue
        regMonth = raw[2]
        regYear = raw[3]
        key = province + "_" + str(regYear) + "_" + str(regMonth)
        new_column.append(area_sales[key])
        new_column1.append(sales/area_sales[key])
    data['area_sales_volume'] = new_column

    for month in [25, 26, 27, 28]:
        m_type = 'xgb'

        data_df, stat_feat = get_stat_feature(data)

        num_feat = ['regYear'] + stat_feat
        cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']
        if m_type == 'lgb':
            for i in cate_feat:
                data_df[i] = data_df[i].astype('category')
        elif m_type == 'xgb':
            lbl = LabelEncoder()
            for i in tqdm(cate_feat):
                data_df[i] = lbl.fit_transform(data_df[i].astype(str))

        features = num_feat + cate_feat
        print(len(features), len(set(features)))

        #data_df.to_csv('middle_rst.csv',index=False)
        #break
        sub, val_pred = get_train_model(data_df, month, m_type)
        data.loc[(data.regMonth == (month - 24)) & (data.regYear == 2018), 'salesVolume'] = sub['forecastVolum'].values
        data.loc[(data.regMonth == (month - 24)) & (data.regYear == 2018), 'label'] = sub['forecastVolum'].values
    sub = data.loc[(data.regMonth >= 1) & (data.regYear == 2018), ['id', 'salesVolume']]
    sub.columns = ['id', 'forecastVolum']
    sub[['id', 'forecastVolum']].round().astype(int).to_csv('../rst/myx_xgb_quantile_abnormal.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.33it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 18.57it/s]


32 32
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
[0]	validation_0-rmse:757.617	validation_1-rmse:931.125
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:718.08	validation_1-rmse:891.171
[200]	validation_0-rmse:229.523	validation_1-rmse:365.122
[300]	validation_0-rmse:124.108	validation_1-rmse:252.668
[400]	validation_0-rmse:107.776	validation_1-rmse:243.108
[500]	validation_0-rmse:100.375	validation_1-rmse:238.056
[600]	validation_0-rmse:94.4662	validation_1-rmse:234.209
[700]	validation_0-rmse:88.8747	validation_1-rmse:231.041
[800]	validation_0-rmse:83.6664	validation_1-rmse:228.863
[900]	validation_0-rmse:79.7014	validation_1-rmse:225.99
[1000]	validation_0-rmse:75.7866	validation_1-rmse:224.489
[1100]	validation_0-rmse:71.9663	validation_1-rmse:223.368
[1200]	validation_0-rmse:68.4655	validation_1-rmse:223.076
Stopping. Bes

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.29it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 20.83it/s]


32 32
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
[0]	validation_0-rmse:778.807	validation_1-rmse:898.617
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:739.264	validation_1-rmse:859.909
[200]	validation_0-rmse:235.811	validation_1-rmse:423.555
[300]	validation_0-rmse:123.719	validation_1-rmse:357.222
[400]	validation_0-rmse:112.581	validation_1-rmse:351.103
[500]	validation_0-rmse:105.382	validation_1-rmse:350.471
Stopping. Best iteration:
[436]	validation_0-rmse:109.313	validation_1-rmse:349.817

0.5938173711118979
valid mean: 380.1831970214844
true  mean: 531.6218680709541
test  mean: 356.3283386230469


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.27it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 20.05it/s]


32 32
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
[0]	validation_0-rmse:791.604	validation_1-rmse:956.125
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:752.093	validation_1-rmse:916.346
[200]	validation_0-rmse:244.98	validation_1-rmse:427.263
[300]	validation_0-rmse:130.936	validation_1-rmse:334.986
[400]	validation_0-rmse:115.848	validation_1-rmse:329.047
[500]	validation_0-rmse:107.751	validation_1-rmse:325.475
[600]	validation_0-rmse:101.715	validation_1-rmse:323.231
[700]	validation_0-rmse:98.2076	validation_1-rmse:321.05
[800]	validation_0-rmse:93.8661	validation_1-rmse:320.444
[900]	validation_0-rmse:89.82	validation_1-rmse:320.186
[1000]	validation_0-rmse:86.3518	validation_1-rmse:320.719
Stopping. Best iteration:
[913]	validation_0-rmse:89.2528	validation_1-rmse:319.882

0.6508045630795574
valid mean: 443.1093444824219

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.43it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 21.33it/s]


32 32
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
[0]	validation_0-rmse:807.946	validation_1-rmse:1189.55
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:768.364	validation_1-rmse:1148.69
[200]	validation_0-rmse:252.186	validation_1-rmse:606.758
[300]	validation_0-rmse:134.437	validation_1-rmse:536.715
Stopping. Best iteration:
[274]	validation_0-rmse:141.189	validation_1-rmse:532.994

0.40847847763405454
valid mean: 395.0877990722656
true  mean: 719.7307926829268
test  mean: 285.6610107421875


In [None]:
data_df.sample(10)

In [None]:
data_df[features].sample(10)

In [None]:
data.sample(10)

In [3]:
data_pred=data.copy()
if __name__ == '__main__':
    path = '../ccf_car/'
    train_sales = pd.read_csv(path + 'train_sales_data.csv')
    train_search = pd.read_csv(path + 'train_search_data.csv')
    train_user = pd.read_csv(path + 'train_user_reply_data.csv')
    evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
    submit_example = pd.read_csv(path + 'submit_example.csv')
    data = pd.concat([train_sales, evaluation_public], ignore_index=True)
    data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
    data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
    data['salesVolume'] = data.groupby(['adcode', 'model'])['salesVolume'].transform(quantile_clip)
    data['label'] = data['salesVolume']
    data['id'] = data['id'].fillna(0).astype(int)
    data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
    # LabelEncoder
    for i in ['bodyType', 'model']:
        data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
    data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']
data_true=data.copy()

In [4]:
data_true['label_true']=data_true['label']
data_true.drop('label',axis=1,inplace=True)
data_true=data_true[['adcode','model','regMonth','regYear','label_true']]

In [8]:
data_final=data_pred.merge(data_true,on=['adcode','model','regMonth','regYear'],how='left')
print('毒瘤')

毒瘤


In [10]:
data_final.sample(10)

Unnamed: 0,adcode,bodyType,forecastVolum,id,model,province,regMonth,regYear,salesVolume,popularity,carCommentVolum,newsReplyVolum,label,mt,area_sales_volume,label_true
17823,110000,1,,0,30,北京,2,2017,313.55,5994.0,1078.0,3703.0,313.55,14,16101.15,313.55
45802,610000,0,,2551,53,陕西,2,2018,158.0,,,,158.0,26,,
10450,310000,0,,0,55,上海,8,2016,484.0,3392.0,284.0,328.0,484.0,8,28539.65,484.0
42893,150000,3,,0,65,内蒙古,6,2016,64.0,9366.0,0.0,1006.0,64.0,6,13592.05,64.0
26514,510000,0,,0,5,四川,9,2017,1101.0,2910.0,482.0,2960.0,1101.0,21,57785.95,1101.0
10471,230000,0,,0,55,黑龙江,8,2016,271.0,2829.0,284.0,328.0,271.0,8,23859.2,271.0
25768,370000,2,,0,31,山东,8,2017,1912.0,9253.0,253.0,1513.0,1912.0,20,80442.7,1912.0
43372,320000,1,,77,3,江苏,1,2018,973.0,,,,973.0,25,,
33530,330000,1,,0,71,浙江,3,2016,293.0,109165.0,2.0,11372.0,293.0,3,59911.75,293.0
15334,310000,1,,0,37,上海,12,2016,556.6,610.0,190.0,1743.0,556.6,12,39870.1,556.6


In [7]:
data_true.head()

Unnamed: 0,adcode,model,regMonth,regYear,label_true
0,310000,0,1,2016,292.0
1,530000,0,1,2016,452.7
2,150000,0,1,2016,233.45
3,110000,0,1,2016,408.0
4,510000,0,1,2016,604.0


In [1]:
#定义lgboost模型
def Lgb_To_Pred(Xtrain,label,val,Xtest,params):
    Dtrain = lgb.Dataset(np.array(Xtrain),label);
    best_round=params['nrounds'];
    clf = lgb.train(params,Dtrain,best_round);
      
    return clf.predict(Xtest),clf.predict(val),clf.feature_importance()
#定义xgboost模型
def Xgb_To_Pred(Xtrain,label,val,Xtest,params):
    DMtrain = xgb.DMatrix(np.array(Xtrain),label);
    DMtest = xgb.DMatrix(np.array(Xtest));
    DMval = xgb.DMatrix(np.array(val));
    best_round=params['nrounds'];
    clf = xgb.train(params,DMtrain,best_round);
    
    return clf.predict(DMtest),clf.predict(DMval)

In [None]:
path = '../ccf_car/'
train_sales = pd.read_csv(path + 'train_sales_data.csv')
train_search = pd.read_csv(path + 'train_search_data.csv')
train_user = pd.read_csv(path + 'train_user_reply_data.csv')
evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
submit_example = pd.read_csv(path + 'submit_example.csv')
data = pd.concat([train_sales, evaluation_public], ignore_index=True)
data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
data['salesVolume'] = data.groupby(['adcode', 'model'])['salesVolume'].transform(quantile_clip)
data['label'] = data['salesVolume']
data['id'] = data['id'].fillna(0).astype(int)
data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
# LabelEncoder
for i in ['bodyType', 'model']:
    data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']


In [23]:
df = data_df.copy()
# 数据集划分
st = 13
m=25
all_idx   = (df['mt'].between(st , m-1))
train_idx = (df['mt'].between(st , m-5))
valid_idx = (df['mt'].between(m-4, m-4))
test_idx  = (df['mt'].between(m  , m+4))
print('all_idx  :',st ,m-1)
print('train_idx:',st ,m-5)
print('valid_idx:',m-4,m-4)
print('test_idx :',m  ,m  )
# 最终确认
train_x = df[train_idx][features]
train_y = df[train_idx]['label']
valid_x = df[valid_idx][features]
valid_y = df[valid_idx]['label']
test_x  = df[test_idx][features]
#xgb预测
xgb_params = {
    #'tree_method':"gpu_hist",
    'objective': 'reg:linear',
    'learning_rate': 0.3,
    'max_depth': 1,
    'subsample':1,
    'colsample_bytree':0.06,
    'alpha':50,
    'lambda':5,
    'nrounds':2100
}
xgby,xgbval = Xgb_To_Pred(train_x,train_y,valid_x,test_x,xgb_params)
#lgb预测
lgb_params = {
   # 'device':'gpu',
    'application':'regression_l1',
    'metric':'mae',
    'seed': 0,
    'learning_rate':0.04,
    'max_depth':1,
    'feature_fraction':0.5,
    'lambda_l1':1,
    'nrounds':900
}
lgby,lgbval,q = Lgb_To_Pred(train_x,train_y,valid_x,test_x,lgb_params)
## 融合第一层预测结果和第二层特征，生成第二层训练、测试集
final_train_x=pd.DataFrame()
final_train_y=pd.DataFrame()
final_test_x=pd.DataFrame()
final_train_x['xgbval']=xgbval
final_train_x['lgbval']=lgbval
final_train_y=valid_y
final_test_x['xgby']=xgby
final_test_x['lgby']=lgby

# # 开始第二层训练、预测
lgb2_params = {
   # 'device':'gpu',
    'application':'regression_l1',
    'seed':0,
    'learning_rate': 0.02,
    'max_depth':1,
    'feature_fraction':0.8,
    'nrounds':1400
}


y_pred,yval,q = Lgb_To_Pred(final_train_x,final_train_y,final_train_x,final_test_x,lgb2_params)

all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25


In [25]:
y_pred[:15]

array([ 259.436715  ,  283.07301227,  137.14633132,  249.32370713,
        399.72283201,  137.67707059,  392.17984225,  137.14633132,
       2826.68827283,  350.97233976,  633.81060282,  137.10812464,
        278.80952085,  279.34026012,  586.69971917])

In [27]:
rst=submit_example.copy()
rst['forecastVolum']=y_pred.round().astype(int)
rst['forecastVolum']=rst['forecastVolum'].apply(lambda x: 0 if x < 0 else x)
rst.to_csv('../rst/real_stacking.csv',index=False)
# 0.49281591000

In [12]:
xgby[:5]

array([214.59834, 259.77722, 159.94762, 239.67267, 425.10458],
      dtype=float32)

In [13]:
df.head()

Unnamed: 0,adcode,bodyType,forecastVolum,id,model,province,regMonth,regYear,salesVolume,popularity,carCommentVolum,newsReplyVolum,label,mt,area_sales_volume,model_adcode,model_adcode_mt,model_adcode_mt_label_1,shift_model_adcode_mt_label_1,model_adcode_mt_label_2,shift_model_adcode_mt_label_2,model_adcode_mt_label_3,shift_model_adcode_mt_label_3,model_adcode_mt_label_4,shift_model_adcode_mt_label_4,model_adcode_mt_label_5,shift_model_adcode_mt_label_5,model_adcode_mt_label_6,shift_model_adcode_mt_label_6,model_adcode_mt_label_7,shift_model_adcode_mt_label_7,model_adcode_mt_label_8,shift_model_adcode_mt_label_8,model_adcode_mt_label_9,shift_model_adcode_mt_label_9,model_adcode_mt_popularity_1,shift_model_adcode_mt_popularity_1,model_adcode_mt_popularity_2,shift_model_adcode_mt_popularity_2,model_adcode_mt_popularity_3,shift_model_adcode_mt_popularity_3,model_adcode_mt_popularity_4,shift_model_adcode_mt_popularity_4,model_adcode_mt_popularity_5,shift_model_adcode_mt_popularity_5,model_adcode_mt_popularity_6,shift_model_adcode_mt_popularity_6,model_adcode_mt_popularity_7,shift_model_adcode_mt_popularity_7,model_adcode_mt_popularity_8,shift_model_adcode_mt_popularity_8,model_adcode_mt_popularity_9,shift_model_adcode_mt_popularity_9,model_adcode_mt_area_sales_volume_1,shift_model_adcode_mt_area_sales_volume_1,model_adcode_mt_area_sales_volume_2,shift_model_adcode_mt_area_sales_volume_2,model_adcode_mt_area_sales_volume_3,shift_model_adcode_mt_area_sales_volume_3,model_adcode_mt_area_sales_volume_4,shift_model_adcode_mt_area_sales_volume_4,model_adcode_mt_area_sales_volume_5,shift_model_adcode_mt_area_sales_volume_5,model_adcode_mt_area_sales_volume_6,shift_model_adcode_mt_area_sales_volume_6,model_adcode_mt_area_sales_volume_7,shift_model_adcode_mt_area_sales_volume_7,model_adcode_mt_area_sales_volume_8,shift_model_adcode_mt_area_sales_volume_8,model_adcode_mt_area_sales_volume_9,shift_model_adcode_mt_area_sales_volume_9
0,6,0,,0,0,上海,0,2016,292.0,1479.0,11.0,106.0,292.0,1,37142.6,310000,31000001,31000002,,31000003,,31000004,,31000005,,31000006,,31000007,,31000008,,31000009,,31000010,,31000002,,31000003,,31000004,,31000005,,31000006,,31000007,,31000008,,31000009,,31000010,,31000002,,31000003,,31000004,,31000005,,31000006,,31000007,,31000008,,31000009,,31000010,
1,20,0,,0,0,云南,0,2016,452.7,1594.0,11.0,106.0,452.7,1,34749.05,530000,53000001,53000002,,53000003,,53000004,,53000005,,53000006,,53000007,,53000008,,53000009,,53000010,,53000002,,53000003,,53000004,,53000005,,53000006,,53000007,,53000008,,53000009,,53000010,,53000002,,53000003,,53000004,,53000005,,53000006,,53000007,,53000008,,53000009,,53000010,
2,3,0,,0,0,内蒙古,0,2016,233.45,1479.0,11.0,106.0,233.45,1,23645.4,150000,15000001,15000002,,15000003,,15000004,,15000005,,15000006,,15000007,,15000008,,15000009,,15000010,,15000002,,15000003,,15000004,,15000005,,15000006,,15000007,,15000008,,15000009,,15000010,,15000002,,15000003,,15000004,,15000005,,15000006,,15000007,,15000008,,15000009,,15000010,
3,0,0,,0,0,北京,0,2016,408.0,2370.0,11.0,106.0,408.0,1,40538.85,110000,11000001,11000002,,11000003,,11000004,,11000005,,11000006,,11000007,,11000008,,11000009,,11000010,,11000002,,11000003,,11000004,,11000005,,11000006,,11000007,,11000008,,11000009,,11000010,,11000002,,11000003,,11000004,,11000005,,11000006,,11000007,,11000008,,11000009,,11000010,
4,19,0,,0,0,四川,0,2016,604.0,3562.0,11.0,106.0,604.0,1,68720.1,510000,51000001,51000002,,51000003,,51000004,,51000005,,51000006,,51000007,,51000008,,51000009,,51000010,,51000002,,51000003,,51000004,,51000005,,51000006,,51000007,,51000008,,51000009,,51000010,,51000002,,51000003,,51000004,,51000005,,51000006,,51000007,,51000008,,51000009,,51000010,


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm
from time import time
BeginTime = time()

#读取数据
path = '../ccf_car'

test_correlation = pd.read_csv(path+'/test_correlation.csv')
train_correlation = pd.read_csv(path+'/train_correlation.csv')
all_correlation = pd.merge(train_correlation,test_correlation,how='left')
TargetID = all_correlation['Unnamed: 0']


test_fund_return =  pd.read_csv(path+'/test_fund_return.csv')
train_fund_return =  pd.read_csv(path+'/train_fund_return.csv')
all_fund_return = pd.merge(train_fund_return,test_fund_return,how='left')


test_fund_benchmark_return =  pd.read_csv(path+'/test_fund_benchmark_return.csv')
train_fund_benchmark_return =  pd.read_csv(path+'/train_fund_benchmark_return.csv')
all_fund_benchmark_return = pd.merge(train_fund_benchmark_return,test_fund_benchmark_return,how='left')


test_index_return = pd.read_csv(path+'/test_index_return.csv',encoding='GBK',index_col=0)
train_index_return =  pd.read_csv(path+'/train_index_return.csv',encoding='GBK',index_col=0)
index_return = pd.concat([train_index_return,test_index_return],axis=1)

#根据TargetID把基金对拆分为两列ID，分别为基金1和基金2 
Target1 = TargetID.map(lambda x:x.split('-')[0])
Target2 = TargetID.map(lambda x:x.split('-')[1])
SplitID = pd.concat([Target1,Target2],axis=1)
SplitID.columns = ['Target1','Target2']


#根据评分规则，定义验证函数
from sklearn.metrics import mean_absolute_error  
def model_metrics(ypred,ytrue):
    msum = 0;
    mcount = 0;
    for i in range(len(ypred)):
        msum += abs((ypred[i]-ytrue[i]) / (1.5-ytrue[i]));
        mcount +=1;
    mae = mean_absolute_error(ytrue,ypred);
    metrics_result = ((2/(2+mae+msum/mcount))**2);
    return metrics_result

#定义xgboost模型
def Xgb_To_Pred(Xtrain,label,val,Xtest,params):

    DMtrain = xgb.DMatrix(np.array(Xtrain),label);
    DMtest = xgb.DMatrix(np.array(Xtest));
    DMval = xgb.DMatrix(np.array(val));
    
    best_round=params['nrounds'];
    clf = xgb.train(params,DMtrain,best_round);
    
    return clf.predict(DMtest),clf.predict(DMval)


#定义lgboost模型
def Lgb_To_Pred(Xtrain,label,val,Xtest,params):
    
    Dtrain = lgb.Dataset(np.array(Xtrain),label);
    
    best_round=params['nrounds'];
    clf = lgb.train(params,Dtrain,best_round);
      
    return clf.predict( Xtest ),clf.predict( val ),clf.feature_importance()

#定义IdData函数：根据输入的数据集和起止时间，提取基金1和基金2的数据作为特征

def IdData(DataSet,StartTime,EndTime):
    
    DataID = DataSet[DataSet.columns[0]]
    Data   = DataSet[DataSet.columns[StartTime:EndTime]]
    
    FundData = pd.concat((DataID,Data),axis=1)
    FundData.rename(columns={FundData.columns[0]:"Target1"},inplace=True)
    Target1  = pd.merge(SplitID,FundData,how = 'left')      
    FundData.rename(columns={FundData.columns[0]:"Target2"},inplace=True)
    Target2 = pd.merge(SplitID,FundData,on = 'Target2',how = 'left')
    
    Target1 = Target1[Target1.columns[2:]]
    Target2 = Target2[Target2.columns[2:]]
    Target1.columns=range(0,Target1.shape[1])
    Target2.columns=range(0,Target2.shape[1])
    return Target1,Target2


#从相关性计算结果表中提取与TargetID相对应的数据作为特征
#因为相关性计算结果表是n*n的矩阵，我们按顺序取对角线左下区域的相关性数据。
def GetCorr(q):
    for j in range(test_fund_return.shape[0]):
        if j ==0:
            trainr = q[j][j+1:];
        else:
            x = q[j][j+1:];
            trainr = np.hstack([trainr,x]);
    return trainr


#计算各基金对Index的相关性，并计算基金对之间的曼哈顿距离之和作为特征
def GetIndexCorr(Data,StartTime,EndTime):
    a = pd.concat([Data[Data.columns[StartTime:EndTime]].T,index_return[index_return.columns[StartTime:EndTime]].T],axis=1)
    b = a.corr()[-35:]
    c = b[b.columns[:-35]].T
    d = c.rank(axis=1,ascending=False)
    e = pd.concat([all_fund_return['Unnamed: 0'],c],axis=1)
    A,B = IdData(e,1,None)
    return abs(A-B).sum(axis=1)

#计算数据集的平均值，25%、50%、75%分位值，作为特征之一
def Describe(data,StartTime,EndTime):
    a = data[data.columns[StartTime:EndTime]].T
    b= a.mean()
    c = a.quantile(0.25)
    d = a.quantile(0.5)
    e = a.quantile(0.75)
    return np.vstack([b,c,d,e]).T


#提取第一层训练集特征共5组特征:
#1、特征分别为基金对的fund_return相关性\benchmark_return相关性\fund_return累计值的相关性\fund_return累计值的曼哈顿距离\fund_return相关性

def GetFeature(StartTime,EndTime): 
    
    Date = all_fund_return.columns[StartTime:EndTime]
    FRData = all_fund_return[Date].T ;
    FRCorr = GetCorr(FRData.corr()) ;#计算并提取各基金对的fund_return相关性
    FRCumCor = GetCorr(FRData.cumsum(axis=1).corr())#计算并提取各基金对的fund_return累计值的相关性
    
    BRData = all_fund_benchmark_return[Date].T ;
    BRData = BRData.corr() ;
    BRCorr = GetCorr(BRData) ;#计算并提取各基金对的benchmark_return相关性
    
    Target1FR,Target2FR = IdData(all_fund_return,StartTime,EndTime)
    A,B = Target1FR.cumsum(axis=1), Target2FR.cumsum(axis=1)
    FRCum = abs(A[A.columns[-1]]-B[B.columns[-1]])#计算并提取各基金对fund_return累计值的曼哈顿距离
    TargetCor = (Target1FR.T).corrwith(Target2FR.T)#计算并提取各基金对fund_return相关性
    
    return np.vstack([FRCorr,FRCumCor,BRCorr,FRCum,TargetCor]).T
    

#第二层训练集特征：
#第二层特征为基金对的fund_return的曼哈顿距离求和
#定义函数：融合第一层预测结果和第二次训练集特征

Feature2date = [5,30,60,90]  #第二层训练集的统计时间段，分别为5天、30天、60天、90天

def StackFeature2(date,StackData,StartTime,EndTime):
    for i in tqdm(date):
        
        Target1FR,Target2FR = IdData(all_fund_return,-i+StartTime,EndTime)
        
        MDTargetFR = abs(Target1FR-Target2FR).sum(axis=1)     #计算基金1、2 fund_return的曼哈顿距离并求和   
        
        StackData = np.vstack([StackData,MDTargetFR])
        
    return StackData.T


#定义函数：根据给定时间间隔和次数，叠加特征集，并增加一组特征：计算基金对相关性的平均值，25%、50%、75%分位值。

def StackFeature(StartTime,EndTime,times):
    for i in tqdm(range(times)):        
        if i ==0:
            xtrain = GetFeature(StartTime,EndTime) ;
            TCorrDes = Describe(all_correlation,1,None)#计算基金对相关性的 平均值，25%、50%、75%分位值
            xtrain = np.hstack([TCorrDes,xtrain])
        else:
            DayF = StartTime-day*(i+1)
            StackTrain = GetFeature(DayF,EndTime) ;
            
            xtrain = np.hstack([xtrain,StackTrain]) ;

    return xtrain


#根据给的的时间段和叠加次数，叠加训练集以增加训练集的数据量

def StackTrain(EndTime,Time,long):
    for i in range(Time):
        
        Stacktrain = StackFeature(-day+EndTime,EndTime,times) #生成训练集
        StackTarget = all_correlation[all_correlation.columns[EndTime+60-i]] #生成训练集对应的目标集
                                      
        if i == 0 :
            TrainData = Stacktrain
            TrainTarget = StackTarget
        else:
            TrainData = np.vstack([TrainData,Stacktrain])  #叠加训练集
            TrainTarget = np.hstack([TrainTarget,StackTarget])  #叠加训练集对应的目标集
        
    return TrainData,TrainTarget


# # 生成第一层训练、预测数据

#1、定义训练目标和验证集目标
trainday=-62#训练集日期
valday=-61#验证集日期
testday=-61  #用于线下测试集，用于模型验证，
ytrain = all_correlation[all_correlation.columns[trainday+60]] ;
test_val1 = all_correlation[all_correlation.columns[valday+60]]
test_val2 = all_correlation[all_correlation.columns[testday+60]]#用于线下测试集，用于模型验证，

#设定:间隔每20天提取一次FRCorr,FRCumCor,BRCorr,FRCum,FRCorr特征，即0-20，0-40……0-400天的数据，生成训练、验证、测试数据集
#加上基金对相关性的 平均值，25%、50%、75%分位值共1004列特征
day=20
times=20
#xtrain = StackFeature(-day+trainday,trainday,times) ;
xval1 = StackFeature(-day+valday,valday,times) ;
xtest = StackFeature(-day,None,times) ;



#叠加训练集以增加训练集的数据量
xtrain,ytrain=StackTrain(trainday,10,1)


# # 开始第一层训练、预测


#xgb预测
xgb_params = {
    #'tree_method':"gpu_hist",
    'objective': 'reg:linear',
    'learning_rate': 0.3,
    'max_depth': 1,
    'subsample':1,
    'colsample_bytree':0.06,
    'alpha':50,
    'lambda':5,
    'nrounds':2100
}

xgby,xgbval = Xgb_To_Pred(xtrain,ytrain,xval1,xtest,xgb_params)

model_metrics(xgbval,test_val1),model_metrics(xgby,test_val2)

#lgb预测
lgb_params = {
   # 'device':'gpu',
    'application':'regression_l1',
    'metric':'mae',
    'seed': 0,
    'learning_rate':0.04,
    'max_depth':1,
    'feature_fraction':0.5,
    'lambda_l1':1,
    'nrounds':900
}
lgby,lgbval,q = Lgb_To_Pred(xtrain,ytrain,xval1,xtest,lgb_params)

model_metrics(lgbval,test_val1),model_metrics(lgby,test_val2)


# # 融合第一层预测结果和第二层特征，生成第二层训练、测试集

#第一层预测结果融合
strain=np.vstack([xgbval,lgbval]);
stest=np.vstack([lgby,lgby]);

#第一次预测结果和第二层特征融合
strain = StackFeature2(Feature2date,strain,valday,valday)
stest = StackFeature2(Feature2date,stest,0,None)


# # 开始第二层训练、预测

lgbs_params = {
   # 'device':'gpu',
    'application':'regression_l1',
    'seed':0,
    'learning_rate': 0.02,
    'max_depth':1,
    'feature_fraction':0.8,
    'nrounds':1400
}


y_pred,yval,q = Lgb_To_Pred(strain,test_val1,strain,stest,lgbs_params,)
print("The prediction had almost complited and It takes about " + str(time()-BeginTime) + 'second')

model_metrics(yval,test_val1),model_metrics(y_pred,test_val2)

df = pd.DataFrame({'ID':TargetID,'value':y_pred})


# In[ ]:


df.to_csv('For The Dream.csv',index=None)