# 模块导入

In [31]:
import sys
import numpy as np
import pandas as pd
import os 
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

# 数据处理

In [32]:
path  = '../ccf_car/'
train_sales  = pd.read_csv(path+'train_sales_data.csv')
train_search = pd.read_csv(path+'train_search_data.csv')
train_user   = pd.read_csv(path+'train_user_reply_data.csv')
evaluation_public = pd.read_csv(path+'evaluation_public.csv')
submit_example    = pd.read_csv(path+'submit_example.csv')
data = pd.concat([train_sales, evaluation_public], ignore_index=True)
data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
data['label'] = data['salesVolume']
data['id'] = data['id'].fillna(0).astype(int)
data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
#LabelEncoder
for i in ['bodyType', 'model']:
    data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

In [33]:
# 0.58973169000
def quantile_clip(group):
    #group.plot()
    group[group < group.quantile(.05)] = group.quantile(.05)
    group[group > group.quantile(.9)] = group.quantile(.9)
    #group.plot()
    #plt.show()
    return group
data['salesVolume'] = data.groupby(['adcode', 'model'])['salesVolume'].transform(quantile_clip)
def quantile_clip(group):
    #group.plot()
    group[group < group.quantile(.05)] = group.quantile(.05)
    group[group > group.quantile(.9)] = group.quantile(.95)
    #group.plot()
    #plt.show()
    return group
data['popularity'] = data.groupby(['adcode', 'model'])['popularity'].transform(quantile_clip)
data['carCommentVolum'] = data.groupby(['adcode', 'model'])['popularity'].transform(quantile_clip)
data['newsReplyVolum'] = data.groupby(['adcode', 'model'])['popularity'].transform(quantile_clip)

# 提取特征

In [34]:
def get_stat_feature(df_):   
    df = df_.copy()
    stat_feat = []
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt']
    for col in tqdm(['label','popularity']):
        # shift
        for i in [1,2,3,4,5,6,7,8,9,10,11,12]:
            stat_feat.append('shift_model_adcode_mt_{}_{}'.format(col,i))
            df['model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'] + i
            df_last = df[~df[col].isnull()].set_index('model_adcode_mt_{}_{}'.format(col,i))
            df['shift_model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'].map(df_last[col])    
    return df,stat_feat

# 评价指标

In [35]:
def score(data, pred='pred_label', label='label', group='model'):
    data['pred_label'] = data['pred_label'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    data_agg = data.groupby('model').agg({
        pred:  list,
        label: [list, 'mean']
    }).reset_index()
    data_agg.columns = ['_'.join(col).strip() for col in data_agg.columns]
    nrmse_score = []
    for raw in data_agg[['{0}_list'.format(pred), '{0}_list'.format(label), '{0}_mean'.format(label)]].values:
        nrmse_score.append(
            mse(raw[0], raw[1]) ** 0.5 / raw[2]
        )
    print(1 - np.mean(nrmse_score))
    return 1 - np.mean(nrmse_score)	

# 模型选择

In [36]:
def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):   
    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2019,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y, 
              eval_set=[(train_x, train_y),(valid_x, valid_y)], 
              categorical_feature=cate_feat, 
              early_stopping_rounds=100, verbose=100)      
    elif m_type == 'xgb':
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000, 
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9, 
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse' 
                                )
        model.fit(train_x, train_y, 
              eval_set=[(train_x, train_y),(valid_x, valid_y)], 
              early_stopping_rounds=100, verbose=100)   
    return model

# 模型训练

In [37]:
def get_train_model(df_, m, m_type='lgb'):
    df = df_.copy()
    # 数据集划分
    st = 13
    all_idx   = (df['mt'].between(st , m-1))
    train_idx = (df['mt'].between(st , m-5))
    valid_idx = (df['mt'].between(m-4, m-4))
    test_idx  = (df['mt'].between(m  , m  ))
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )  
    # 最终确认
    train_x = df[train_idx][features]
    train_y = df[train_idx]['label']
    valid_x = df[valid_idx][features]
    valid_y = df[valid_idx]['label']   
    # get model
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)  
    # offline
    df['pred_label'] = model.predict(df[features])
    best_score = score(df[valid_idx]) 
    # online
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df[all_idx][features], df[all_idx]['label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df[all_idx][features], df[all_idx]['label'])
    df['forecastVolum'] = model.predict(df[features]) 
    print('valid mean:',df[valid_idx]['pred_label'].mean())
    print('true  mean:',df[valid_idx]['label'].mean())
    print('test  mean:',df[test_idx]['forecastVolum'].mean())
    # 阶段结果
    sub = df[test_idx][['id']]
    sub['forecastVolum'] = df[test_idx]['forecastVolum'].apply(lambda x: 0 if x < 0 else x).round().astype(int)  
    return sub,df[valid_idx]['pred_label']

# 逐步预测

In [38]:
for month in [25,26,27,28]: 
    m_type = 'lgb' 
    
    data_df, stat_feat = get_stat_feature(data)
    
    num_feat = ['regYear'] + stat_feat
    cate_feat = ['adcode','bodyType','model','regMonth']
    
    if m_type == 'lgb':
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
    elif m_type == 'xgb':
        lbl = LabelEncoder()  
        for i in tqdm(cate_feat):
            data_df[i] = lbl.fit_transform(data_df[i].astype(str))
           
    features = num_feat + cate_feat
    print(len(features), len(set(features)))   
    
    sub,val_pred = get_train_model(data_df, month, m_type)   
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'salesVolume'] = sub['forecastVolum'].values
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'label'      ] = sub['forecastVolum'].values	
sub = data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','salesVolume']]
sub.columns = ['id','forecastVolum']
sub[['id','forecastVolum']].round().astype(int).to_csv('../rst/yulao_original.csv', index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.96it/s]


29 29
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 8308.15	valid_1's l2: 34245
[200]	training's l2: 4322.09	valid_1's l2: 31409.9
[300]	training's l2: 2896.51	valid_1's l2: 30772.2
[400]	training's l2: 2035.28	valid_1's l2: 30354.8
[500]	training's l2: 1518.59	valid_1's l2: 30108.8
[600]	training's l2: 1163.05	valid_1's l2: 29944.5
[700]	training's l2: 926.6	valid_1's l2: 29905.8
[800]	training's l2: 748.193	valid_1's l2: 29825.8
[900]	training's l2: 605.816	valid_1's l2: 29831.7
[1000]	training's l2: 491.775	valid_1's l2: 29795.5
[1100]	training's l2: 404.655	valid_1's l2: 29729.2
[1200]	training's l2: 337.881	valid_1's l2: 29698.4
[1300]	training's l2: 285.208	valid_1's l2: 29695
[1400]	training's l2: 239.801	valid_1's l2: 29672.6
[1500]	training's l2: 203.022	valid_1's l2: 29676.6
Early stopping, best iteration is:
[1402]	training's l2: 239.108	valid_1's l2: 29672
0.759097560761

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.87it/s]


29 29
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 8825.53	valid_1's l2: 44247.9
[200]	training's l2: 4920.57	valid_1's l2: 44070.1
[300]	training's l2: 3315.36	valid_1's l2: 43857.1
[400]	training's l2: 2402.55	valid_1's l2: 43561.5
[500]	training's l2: 1822.84	valid_1's l2: 43445.9
[600]	training's l2: 1441.56	valid_1's l2: 43346.2
Early stopping, best iteration is:
[531]	training's l2: 1697.73	valid_1's l2: 43310.6
0.7378242786091564
valid mean: 621.9037097903743
true  mean: 616.5537878787878
test  mean: 334.36317043215075


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.87it/s]


29 29
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 10047.7	valid_1's l2: 32061.3
[200]	training's l2: 5706.69	valid_1's l2: 31838.9
[300]	training's l2: 3924	valid_1's l2: 31792
Early stopping, best iteration is:
[228]	training's l2: 5092.85	valid_1's l2: 31538.3
0.7812724457688527
valid mean: 643.8194847673321
true  mean: 673.0143939393939
test  mean: 518.6337449981689


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.31it/s]


29 29
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 10612.8	valid_1's l2: 336780
[200]	training's l2: 6139.32	valid_1's l2: 329476
Early stopping, best iteration is:
[187]	training's l2: 6485.59	valid_1's l2: 328740
0.5988395843124066
valid mean: 640.9268782741995
true  mean: 899.8204545454546
test  mean: 506.2733406612413


# 将0.62的四月与鱼佬融合

In [39]:
best_062=pd.read_csv('../rst/057_lqbz_quanguize.csv')
yulao_original=pd.read_csv('../rst/yulao_original.csv')
mask=list(data.loc[(data.regMonth==4)&(data.regYear==2018),'id'])
mask=[x in mask for x in yulao_original.id.values]
yulao_original.loc[mask,'forecastVolum']=best_062.loc[mask,'forecastVolum']
yulao_original.to_csv('../rst/yulao_original_4.csv',index=False)

In [40]:
yulao_original.tail()

Unnamed: 0,id,forecastVolum
5275,5364,90
5276,5365,88
5277,5366,97
5278,5367,189
5279,5368,63


In [27]:
best_062.tail()

Unnamed: 0,id,forecastVolum
5275,5364,90
5276,5365,88
5277,5366,97
5278,5367,189
5279,5368,63


In [28]:
yulao_original.head()

Unnamed: 0,id,forecastVolum
0,1,265
1,2,336
2,3,167
3,4,317
4,5,420


In [29]:
best_062.head()

Unnamed: 0,id,forecastVolum
0,1,253
1,2,304
2,3,200
3,4,267
4,5,434
