# Solution - 分月预测

### 导入相关库

In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = 'all' 

np.random.seed(2019)

### 读取数据

In [2]:
data_path = './data/' 
train_user_reply_data = pd.read_csv(data_path + 'train_user_reply_data.csv', encoding='utf-8')
train_search_data = pd.read_csv(data_path + 'train_search_data.csv', encoding='utf-8')
train_sales_data = pd.read_csv(data_path + 'train_sales_data.csv', encoding='utf-8')
evaluation_public = pd.read_csv(data_path + 'evaluation_public.csv', encoding='utf-8')

train_sales_data = train_sales_data.merge(train_search_data, on=['province', 'adcode', 'model', 'regYear', 'regMonth'])

### 固定车型和省份顺序-样本顺序

In [3]:
cars = ['f8a6975573af1b33', '2a2ab41f8f6ff1cb', 'd4efbebb087fd03f', '3e21824be728cbec', 'ea489c253676aafc', '6155b214590c66e6', 'fc32b1a017b34efe', '9c1c7ee8ebdda299', 'fde95ea242abd896', '7a7885e2d7c00bcf', '7245e0ee27b195cd', 'b25c4e2e3856af22', '7aab7fca2470987e', 'feabbf46658382b9', '04e66e578f653ab9', '5d7fb682edd0f937', 'b4be3a4917289c82', '54fc07138d70374c', 'ef76a85c4b39f693', 'bb9fbec9a2833839', '3c974920a76ac9c1', '212083a9246d2fd3', '4f79773e600518a6', 'af6f4f548684e14d', '936168bd4850913d', 'cd5841d44fd7625e', '0797526c057dcf5b', 'a207df29ec9583f0', '3d7554f1f56dd664', '7023efdab9cedc03', 'da457d15788fe8ee', '12f8b7e14947c34d', '28e29f2c03dcd84c', '63065128401bb3ff', 'a432c483b5beb856', '37aa9169b575ef79', '17bc272c93f19d56', '61e73e32ad101892', '4a103c30d593fbbe', '2d0d2c3403909fdb', '6858d6dfe680bdf7', '17363f08d683d52b', '346393c2c6305fb1', '5b1c11c3efed5312', '97f15de12cfabbd5', 'a9a43d1a7ecbe75d', '7cf283430b3b5e38', 'c6833cb891626c17', 'a28bb927b6fcb33c', 'dff803b4024d261d', '02aab221aabc03b9', 'f5d69960089c3614', '06880909932890ca', '79de4e4b24c35b04', 'd0f245b8781e3631', 'c06a2a387c0ee510', 'cc21c7e91a3b5a0c', 'f270f6a489c6a9d7', '8c915fe4632fb9fa', 'c6cd4e0e073f5ac2']
provinces = ['浙江', '福建', '四川', '陕西', '安徽', '湖南', '广东', '云南', '上海', '山东', '湖北', '黑龙江', '江苏', '广西', '内蒙古', '辽宁', '北京', '重庆', '河北', '山西', '江西', '河南']

### 评测函数

In [4]:
from sklearn.metrics import mean_squared_error

def metrics(y_true, y_pred, model):
    data = pd.DataFrame({'model': model, 'salesVolume': y_true, 'label': y_pred})
    data['label'] = data['label'].map(lambda index: -index if index < 0 else index)
    res, count = 0, 0
    for index, cars in data.groupby('model'):
        a = np.array(cars['salesVolume'])
        b = np.array(cars['label'])
        temp = np.sqrt(np.sum((a - b) ** 2) / len(a)) / np.mean(a)
        res += temp
        count += 1
        print(temp)
    return 1 - (res / count)

### 获取训练/测试数据索引下标

In [5]:
def get_train_feature(windows_size, before):
    features = pd.DataFrame()
    for car in cars:
        for province in provinces:
            car_province_part = train_sales_data[(train_sales_data['model'] == car) & (train_sales_data['province'] == province)]
            car_province_part['label'] = car_province_part['salesVolume'].shift(-windows_size)
            car_province_part = car_province_part[before:24 - windows_size]
            features = pd.concat([features, car_province_part], axis=0)
    features.index = range(len(features))
    return features

def get_test_feature(windows_size, before):
    features = pd.DataFrame()
    for car in cars:
        for province in provinces:
            car_province_part = train_sales_data[(train_sales_data['model'] == car) & (train_sales_data['province'] == province)]
            car_province_part['label'] = car_province_part['salesVolume'].shift(-windows_size)
            car_province_part = car_province_part[-1:]
            features = pd.concat([features, car_province_part], axis=0)
    features.index = range(len(features))
    return features

### 特征提取

In [6]:
def get_basic_feature(windows_size, before, data_set_name):
    features = pd.DataFrame()
    for car in cars:
        for province in provinces:
            car_province_part = train_sales_data[(train_sales_data['model'] == car) & (train_sales_data['province'] == province)].copy()
            car_province_part['popularity'] = car_province_part['popularity'].apply(lambda index: np.log(index))     ###
            car_province_part['salesVolume'] = car_province_part['salesVolume'].apply(lambda index: np.log(index))   ###
            
            # 春节标记特征
            car_province_part['is_pring_festival'] = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            car_province_part['distance_spring_festival'] = [1, 0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 6, 5, 4, 3, 2]
            
            # 一阶差分
            for index in range(1, before, 1):
                car_province_part['salesVolume_' + str(index)] = car_province_part['salesVolume'].shift(index)
                car_province_part['salesVolume_diff_' + str(index)] = car_province_part['salesVolume'].diff(index)
                car_province_part['salesVolume_qoq_' + str(index)] = car_province_part['salesVolume'] / car_province_part['salesVolume_' + str(index)]

                car_province_part['popularity_' + str(index)] = car_province_part['popularity'].shift(index)
                car_province_part['popularity_diff_' + str(index)] = car_province_part['popularity'].diff(index)
                car_province_part['popularity_hb_' + str(index)] = car_province_part['popularity'] / car_province_part['popularity_' + str(index)]

            # 二阶差分
            for index in range(1, before - 1, 1):   
                car_province_part['salesVolume_diff2_{}'.format(str(index))] = car_province_part['salesVolume_diff_' + str(index)].diff(1)

            # 历史统计特征
            salesVolume = list(car_province_part['salesVolume'])
            popularity = list(car_province_part['popularity'])
            car_province_part['index'] = 1
            car_province_part['index'] = car_province_part['index'].cumsum()
            car_province_part['salesVolume_his'] = car_province_part['index'].map(lambda index: salesVolume[index - 7:index])
            car_province_part['popularity_his'] = car_province_part['index'].map(lambda index: popularity[index - 7:index])

            car_province_part['salesVolume_his_diff'] = car_province_part['salesVolume_his'].map(lambda index: np.diff(index))
            car_province_part['popularity_his_diff'] = car_province_part['popularity_his'].map(lambda index: np.diff(index))

            def pth(array):
                return np.max(array) - np.min(array)

            fea_name = ['max', 'min', 'aver', 'var', 'pth']
            fun_name = [np.max, np.min, np.average, np.var, pth]
            for i in range(len(fun_name)):
                car_province_part['salesVolume_his_' + fea_name[i]] = car_province_part['salesVolume_his'].apply(lambda index: 0 if len(index) == 0 else fun_name[i](index))                
                car_province_part['salesVolume_his_diff_' + fea_name[i]] = car_province_part['salesVolume_his_diff'].apply(lambda index: 0 if len(index) == 0 else fun_name[i](index))
            
            car_province_part.drop(['index', 'salesVolume_his', 'popularity_his', 'salesVolume_his_diff', 'popularity_his_diff'], axis=1, inplace=True)
            
            # 数据集划分
            if data_set_name == 'train':
                car_province_part = car_province_part[before : 24 - windows_size]
            else:
                car_province_part = car_province_part[-1:]

            car_province_part.drop(['popularity'], axis=1, inplace=True)    ###  , 'day_count', 'day_salesVolume', 'popularity'
            features = pd.concat([features, car_province_part], axis=0, ignore_index=True)

    print(features)
    return features

### Begain

In [7]:
test_prob_collection = pd.DataFrame()

### Model-LightGBM - 一月

In [8]:
size, pre = 1, 10  # 4
train_feature  = get_train_feature(size, pre)
test_feature  = get_test_feature(size, pre)

cols = ['province', 'adcode', 'model', 'regYear', 'regMonth', 'bodyType']   #  , 'salesVolume'

categorial_name = [0, 1, 2, 3, 4, 6, 7]
drop_cols = ['salesVolume', 'popularity']

temp_train = get_basic_feature(size, pre, 'train')
train_feature = train_feature.drop(drop_cols, axis=1).merge(temp_train, on=cols, how='left')
train_feature

temp_test = get_basic_feature(size, pre, 'test')
test_feature = test_feature.drop(drop_cols, axis=1).merge(temp_test, on=cols, how='left')
test_feature
train_feature.isnull().sum()
test_feature

submit = test_feature[['province', 'adcode', 'model']]
submit['regYear'] = 2018
submit['regMonth'] = 1
###############################

test_index = list(train_feature[(train_feature['regYear'] == 2017) & (train_feature['regMonth'] == 11)].index)

def drop_duplicate(n):
    return n not in test_index

train_index = list(filter(drop_duplicate, list(range(len(train_feature)))))

train_model = train_feature['model'].values[train_index]   # model
val_model = train_feature['model'].values[test_index]

model_set = dict()
for index in range(len(cars)):
    model_set[cars[index]] = index
train_feature['bodyType'] = train_feature['bodyType'].map({'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3})
train_feature['model'] = train_feature['model'].map(model_set)
test_feature['bodyType'] = test_feature['bodyType'].map({'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3})
test_feature['model'] = test_feature['model'].map(model_set)

train_label = train_feature[['label']]
train_feature.drop(['province', 'label'], axis=1, inplace=True)
test_feature.drop(['province', 'label'], axis=1, inplace=True)

train_label['log'] = train_label['label'].apply(lambda index: np.log2(index) + 1)
x_train = train_feature.values[train_index]
y_train = train_label['log'].values[train_index]
x_test = train_feature.values[test_index]
y_test = train_label['log'].values[test_index]

      province  adcode             model bodyType  regYear  regMonth  \
0           浙江  330000  f8a6975573af1b33    Sedan     2016        11   
1           浙江  330000  f8a6975573af1b33    Sedan     2016        12   
2           浙江  330000  f8a6975573af1b33    Sedan     2017         1   
3           浙江  330000  f8a6975573af1b33    Sedan     2017         2   
4           浙江  330000  f8a6975573af1b33    Sedan     2017         3   
...        ...     ...               ...      ...      ...       ...   
17155       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         7   
17156       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         8   
17157       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         9   
17158       河南  410000  c6cd4e0e073f5ac2    Sedan     2017        10   
17159       河南  410000  c6cd4e0e073f5ac2    Sedan     2017        11   

       salesVolume  is_pring_festival  distance_spring_festival  \
0         6.543912                  0                         2   
1

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,label,salesVolume,is_pring_festival,distance_spring_festival,...,salesVolume_his_max,salesVolume_his_diff_max,salesVolume_his_min,salesVolume_his_diff_min,salesVolume_his_aver,salesVolume_his_diff_aver,salesVolume_his_var,salesVolume_his_diff_var,salesVolume_his_pth,salesVolume_his_diff_pth
0,浙江,330000,f8a6975573af1b33,Sedan,2016,11,1454.0,6.543912,0,2,...,6.752270,0.190701,6.419995,-0.208359,6.593033,-0.016429,0.012979,0.020524,0.332275,0.399060
1,浙江,330000,f8a6975573af1b33,Sedan,2016,12,752.0,7.282074,0,1,...,7.282074,0.738162,6.419995,-0.208359,6.684402,0.135729,0.072106,0.088094,0.862079,0.946520
2,浙江,330000,f8a6975573af1b33,Sedan,2017,1,503.0,6.622736,1,0,...,7.282074,0.738162,6.419995,-0.659337,6.706551,0.033790,0.065451,0.177449,0.862079,1.397499
3,浙江,330000,f8a6975573af1b33,Sedan,2017,2,780.0,6.220590,0,1,...,7.282074,0.738162,6.220590,-0.659337,6.678064,-0.065018,0.086645,0.195256,1.061483,1.397499
4,浙江,330000,f8a6975573af1b33,Sedan,2017,3,679.0,6.659294,0,2,...,7.282074,0.738162,6.220590,-0.659337,6.685007,-0.009146,0.085999,0.229692,1.061483,1.397499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17155,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,7,991.0,6.577861,0,6,...,6.899723,0.470004,5.768321,-1.131402,6.411382,-0.053644,0.107535,0.273930,1.131402,1.601406
17156,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,8,1189.0,6.898715,0,6,...,6.898715,0.470004,5.768321,-0.169675,6.411238,0.188399,0.107394,0.045126,1.130394,0.639678
17157,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,9,1206.0,7.080868,0,5,...,7.080868,0.320853,6.238325,-0.169675,6.598745,0.140424,0.077244,0.029614,0.842543,0.490528
17158,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,10,1267.0,7.095064,0,4,...,7.095064,0.320853,6.322565,-0.169675,6.721136,0.100471,0.078898,0.028527,0.772499,0.490528


     province  adcode             model bodyType  regYear  regMonth  \
0          浙江  330000  f8a6975573af1b33    Sedan     2017        12   
1          福建  350000  f8a6975573af1b33    Sedan     2017        12   
2          四川  510000  f8a6975573af1b33    Sedan     2017        12   
3          陕西  610000  f8a6975573af1b33    Sedan     2017        12   
4          安徽  340000  f8a6975573af1b33    Sedan     2017        12   
...       ...     ...               ...      ...      ...       ...   
1315       重庆  500000  c6cd4e0e073f5ac2    Sedan     2017        12   
1316       河北  130000  c6cd4e0e073f5ac2    Sedan     2017        12   
1317       山西  140000  c6cd4e0e073f5ac2    Sedan     2017        12   
1318       江西  360000  c6cd4e0e073f5ac2    Sedan     2017        12   
1319       河南  410000  c6cd4e0e073f5ac2    Sedan     2017        12   

      salesVolume  is_pring_festival  distance_spring_festival  salesVolume_1  \
0        7.357556                  0                         2    

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,label,salesVolume,is_pring_festival,distance_spring_festival,...,salesVolume_his_max,salesVolume_his_diff_max,salesVolume_his_min,salesVolume_his_diff_min,salesVolume_his_aver,salesVolume_his_diff_aver,salesVolume_his_var,salesVolume_his_diff_var,salesVolume_his_pth,salesVolume_his_diff_pth
0,浙江,330000,f8a6975573af1b33,Sedan,2017,12,,7.357556,0,2,...,7.357556,0.363114,6.632002,-0.125718,7.001934,0.120926,0.053544,0.026056,0.725554,0.488831
1,福建,350000,f8a6975573af1b33,Sedan,2017,12,,6.605298,0,2,...,6.605298,0.190729,6.006353,-0.220184,6.296365,0.063127,0.032470,0.021035,0.598945,0.410912
2,四川,510000,f8a6975573af1b33,Sedan,2017,12,,6.864848,0,2,...,6.864848,0.329607,6.165418,-0.247408,6.437167,0.109376,0.051337,0.038224,0.699430,0.577015
3,陕西,610000,f8a6975573af1b33,Sedan,2017,12,,5.480639,0,2,...,5.480639,0.293253,5.105945,-0.135624,5.264980,0.062449,0.018698,0.024172,0.374693,0.428877
4,安徽,340000,f8a6975573af1b33,Sedan,2017,12,,6.834109,0,2,...,6.834109,0.340355,6.212606,-0.120674,6.451389,0.083472,0.033873,0.021090,0.621503,0.461028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,重庆,500000,c6cd4e0e073f5ac2,Sedan,2017,12,,4.812184,0,2,...,4.812184,0.356675,3.828641,-0.287682,4.287344,0.080242,0.095821,0.061696,0.983543,0.644357
1316,河北,130000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.361302,0,2,...,6.361302,0.367341,5.652489,-0.143569,5.949899,0.094207,0.041561,0.034474,0.708813,0.510910
1317,山西,140000,c6cd4e0e073f5ac2,Sedan,2017,12,,5.645447,0,2,...,6.324359,1.287406,5.036953,-0.647605,5.537708,0.101416,0.148402,0.379318,1.287406,1.935012
1318,江西,360000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.327937,0,2,...,6.327937,0.341258,5.521461,0.021931,5.950425,0.134413,0.074671,0.011275,0.806476,0.319328


province                     0
adcode                       0
model                        0
bodyType                     0
regYear                      0
                            ..
salesVolume_his_diff_aver    0
salesVolume_his_var          0
salesVolume_his_diff_var     0
salesVolume_his_pth          0
salesVolume_his_diff_pth     0
Length: 82, dtype: int64

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,label,salesVolume,is_pring_festival,distance_spring_festival,...,salesVolume_his_max,salesVolume_his_diff_max,salesVolume_his_min,salesVolume_his_diff_min,salesVolume_his_aver,salesVolume_his_diff_aver,salesVolume_his_var,salesVolume_his_diff_var,salesVolume_his_pth,salesVolume_his_diff_pth
0,浙江,330000,f8a6975573af1b33,Sedan,2017,12,,7.357556,0,2,...,7.357556,0.363114,6.632002,-0.125718,7.001934,0.120926,0.053544,0.026056,0.725554,0.488831
1,福建,350000,f8a6975573af1b33,Sedan,2017,12,,6.605298,0,2,...,6.605298,0.190729,6.006353,-0.220184,6.296365,0.063127,0.032470,0.021035,0.598945,0.410912
2,四川,510000,f8a6975573af1b33,Sedan,2017,12,,6.864848,0,2,...,6.864848,0.329607,6.165418,-0.247408,6.437167,0.109376,0.051337,0.038224,0.699430,0.577015
3,陕西,610000,f8a6975573af1b33,Sedan,2017,12,,5.480639,0,2,...,5.480639,0.293253,5.105945,-0.135624,5.264980,0.062449,0.018698,0.024172,0.374693,0.428877
4,安徽,340000,f8a6975573af1b33,Sedan,2017,12,,6.834109,0,2,...,6.834109,0.340355,6.212606,-0.120674,6.451389,0.083472,0.033873,0.021090,0.621503,0.461028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,重庆,500000,c6cd4e0e073f5ac2,Sedan,2017,12,,4.812184,0,2,...,4.812184,0.356675,3.828641,-0.287682,4.287344,0.080242,0.095821,0.061696,0.983543,0.644357
1316,河北,130000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.361302,0,2,...,6.361302,0.367341,5.652489,-0.143569,5.949899,0.094207,0.041561,0.034474,0.708813,0.510910
1317,山西,140000,c6cd4e0e073f5ac2,Sedan,2017,12,,5.645447,0,2,...,6.324359,1.287406,5.036953,-0.647605,5.537708,0.101416,0.148402,0.379318,1.287406,1.935012
1318,江西,360000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.327937,0,2,...,6.327937,0.341258,5.521461,0.021931,5.950425,0.134413,0.074671,0.011275,0.806476,0.319328


In [9]:
# LightGBM model
params = {
      'boosting_type': 'gbdt',
      'objective': 'rmse',
      'metric': ['rmse'],   # 'l2', 'binary_logloss',
      'learning_rate': 0.03,
      'num_leaves': 2 ** 5 - 1,    # 2 ** 5 - 1
      # 'min_child_samples': 100,
      'max_depth': 6,    # 6
      'subsample': 0.8,   # 0.8
      'subsample_freq': 5,
      'colsample_bytree': 0.8,
      'seed': 2019,
      'nthread': -1,
      'verbose': 1,
}

lgb_train = lgb.Dataset(x_train, y_train.ravel())
lgb_eval = lgb.Dataset(x_test, y_test.ravel(), reference=lgb_train)
# num_boost_round: 5000   early_stopping_rounds:100
module = lgb.train(params, lgb_train, num_boost_round=5000, valid_sets=lgb_eval, early_stopping_rounds=100, categorical_feature=categorial_name)

# feature importance
importance = module.feature_importance()
print(importance)

val = module.predict(x_test, num_iteration=module.best_iteration)
val = 2 ** (val - 1)
y_true = 2 ** (y_test.reshape(1, -1)[0] - 1)
nrmse = metrics(y_true, val, val_model.reshape(1, -1)[0])

iters = module.best_iteration + 100
train_all = np.vstack((x_train, x_test))
label_all = np.hstack((y_train, y_test))
lgb_data = lgb.Dataset(train_all, label_all.ravel())
model = lgb.train(params, lgb_data, num_boost_round=iters, categorical_feature=categorial_name)

predict = model.predict(test_feature)
predict = 2 ** (predict - 1)
print(predict)

print('model train over, rmse:', nrmse)   
submit['forecastVolum'] = predict
test_prob_collection = pd.concat([test_prob_collection, submit], axis=0, ignore_index=True)
print(train_feature.shape)

[1]	valid_0's rmse: 1.67476
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 1.63239
[3]	valid_0's rmse: 1.59259
[4]	valid_0's rmse: 1.55287
[5]	valid_0's rmse: 1.51429
[6]	valid_0's rmse: 1.47701
[7]	valid_0's rmse: 1.44037
[8]	valid_0's rmse: 1.40546
[9]	valid_0's rmse: 1.37129
[10]	valid_0's rmse: 1.33709
[11]	valid_0's rmse: 1.30434
[12]	valid_0's rmse: 1.27327
[13]	valid_0's rmse: 1.24497
[14]	valid_0's rmse: 1.21766
[15]	valid_0's rmse: 1.19072
[16]	valid_0's rmse: 1.16434
[17]	valid_0's rmse: 1.13661
[18]	valid_0's rmse: 1.1117
[19]	valid_0's rmse: 1.08678
[20]	valid_0's rmse: 1.06251
[21]	valid_0's rmse: 1.0408
[22]	valid_0's rmse: 1.01939
[23]	valid_0's rmse: 0.998044
[24]	valid_0's rmse: 0.97876
[25]	valid_0's rmse: 0.959951
[26]	valid_0's rmse: 0.941889
[27]	valid_0's rmse: 0.92493
[28]	valid_0's rmse: 0.907888
[29]	valid_0's rmse: 0.89043
[30]	valid_0's rmse: 0.87381
[31]	valid_0's rmse: 0.859602
[32]	valid_0's rmse: 0.844244
[33]	valid_0's

```
model train over, rmse: 0.6966820323740932
```

### Model-LightGBM - 二月

In [10]:
size, pre = 2, 9  # 4
train_feature  = get_train_feature(size, pre)
test_feature  = get_test_feature(size, pre)

cols = ['province', 'adcode', 'model', 'regYear', 'regMonth', 'bodyType']   #  , 'salesVolume'

temp_train = get_basic_feature(size, pre, 'train')
train_feature = train_feature.drop(drop_cols, axis=1).merge(temp_train, on=cols, how='left')
train_feature

temp_test = get_basic_feature(size, pre, 'test')
test_feature = test_feature.drop(drop_cols, axis=1).merge(temp_test, on=cols, how='left')
test_feature
train_feature.isnull().sum()
test_feature

submit = test_feature[['province', 'adcode', 'model']]
submit['regYear'] = 2018
submit['regMonth'] = 2
######################################

test_index = list(train_feature[(train_feature['regYear'] == 2017) & (train_feature['regMonth'] == 10)].index)

def drop_duplicate(n):
    return n not in test_index

train_index = list(filter(drop_duplicate, list(range(len(train_feature)))))

train_model = train_feature['model'].values[train_index]   # model
val_model = train_feature['model'].values[test_index]

model_set = dict()
for index in range(len(cars)):
    model_set[cars[index]] = index
train_feature['bodyType'] = train_feature['bodyType'].map({'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3})
train_feature['model'] = train_feature['model'].map(model_set)
test_feature['bodyType'] = test_feature['bodyType'].map({'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3})
test_feature['model'] = test_feature['model'].map(model_set)

train_label = train_feature[['label']]
train_feature.drop(['province', 'label'], axis=1, inplace=True)
test_feature.drop(['province', 'label'], axis=1, inplace=True)

train_label['log'] = train_label['label'].apply(lambda index: np.log2(index) + 1)
x_train = train_feature.values[train_index]
y_train = train_label['log'].values[train_index]
x_test = train_feature.values[test_index]
y_test = train_label['log'].values[test_index]

      province  adcode             model bodyType  regYear  regMonth  \
0           浙江  330000  f8a6975573af1b33    Sedan     2016        10   
1           浙江  330000  f8a6975573af1b33    Sedan     2016        11   
2           浙江  330000  f8a6975573af1b33    Sedan     2016        12   
3           浙江  330000  f8a6975573af1b33    Sedan     2017         1   
4           浙江  330000  f8a6975573af1b33    Sedan     2017         2   
...        ...     ...               ...      ...      ...       ...   
17155       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         6   
17156       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         7   
17157       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         8   
17158       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         9   
17159       河南  410000  c6cd4e0e073f5ac2    Sedan     2017        10   

       salesVolume  is_pring_festival  distance_spring_festival  \
0         6.752270                  0                         3   
1

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,label,salesVolume,is_pring_festival,distance_spring_festival,...,salesVolume_his_max,salesVolume_his_diff_max,salesVolume_his_min,salesVolume_his_diff_min,salesVolume_his_aver,salesVolume_his_diff_aver,salesVolume_his_var,salesVolume_his_diff_var,salesVolume_his_pth,salesVolume_his_diff_pth
0,浙江,330000,f8a6975573af1b33,Sedan,2016,10,1454.0,6.752270,0,3,...,6.752270,0.190701,6.419995,-0.174788,6.598675,0.028144,0.012616,0.013348,0.332275,0.365489
1,浙江,330000,f8a6975573af1b33,Sedan,2016,11,752.0,6.543912,0,2,...,6.752270,0.190701,6.419995,-0.208359,6.593033,-0.016429,0.012979,0.020524,0.332275,0.399060
2,浙江,330000,f8a6975573af1b33,Sedan,2016,12,503.0,7.282074,0,1,...,7.282074,0.738162,6.419995,-0.208359,6.684402,0.135729,0.072106,0.088094,0.862079,0.946520
3,浙江,330000,f8a6975573af1b33,Sedan,2017,1,780.0,6.622736,1,0,...,7.282074,0.738162,6.419995,-0.659337,6.706551,0.033790,0.065451,0.177449,0.862079,1.397499
4,浙江,330000,f8a6975573af1b33,Sedan,2017,2,679.0,6.220590,0,1,...,7.282074,0.738162,6.220590,-0.659337,6.678064,-0.065018,0.086645,0.195256,1.061483,1.397499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17155,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,6,991.0,6.580639,0,5,...,7.367709,0.470004,5.768321,-1.131402,6.524217,-0.131178,0.221495,0.296100,1.599388,1.601406
17156,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,7,1189.0,6.577861,0,6,...,6.899723,0.470004,5.768321,-1.131402,6.411382,-0.053644,0.107535,0.273930,1.131402,1.601406
17157,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,8,1206.0,6.898715,0,6,...,6.898715,0.470004,5.768321,-0.169675,6.411238,0.188399,0.107394,0.045126,1.130394,0.639678
17158,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,9,1267.0,7.080868,0,5,...,7.080868,0.320853,6.238325,-0.169675,6.598745,0.140424,0.077244,0.029614,0.842543,0.490528


     province  adcode             model bodyType  regYear  regMonth  \
0          浙江  330000  f8a6975573af1b33    Sedan     2017        12   
1          福建  350000  f8a6975573af1b33    Sedan     2017        12   
2          四川  510000  f8a6975573af1b33    Sedan     2017        12   
3          陕西  610000  f8a6975573af1b33    Sedan     2017        12   
4          安徽  340000  f8a6975573af1b33    Sedan     2017        12   
...       ...     ...               ...      ...      ...       ...   
1315       重庆  500000  c6cd4e0e073f5ac2    Sedan     2017        12   
1316       河北  130000  c6cd4e0e073f5ac2    Sedan     2017        12   
1317       山西  140000  c6cd4e0e073f5ac2    Sedan     2017        12   
1318       江西  360000  c6cd4e0e073f5ac2    Sedan     2017        12   
1319       河南  410000  c6cd4e0e073f5ac2    Sedan     2017        12   

      salesVolume  is_pring_festival  distance_spring_festival  salesVolume_1  \
0        7.357556                  0                         2    

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,label,salesVolume,is_pring_festival,distance_spring_festival,...,salesVolume_his_max,salesVolume_his_diff_max,salesVolume_his_min,salesVolume_his_diff_min,salesVolume_his_aver,salesVolume_his_diff_aver,salesVolume_his_var,salesVolume_his_diff_var,salesVolume_his_pth,salesVolume_his_diff_pth
0,浙江,330000,f8a6975573af1b33,Sedan,2017,12,,7.357556,0,2,...,7.357556,0.363114,6.632002,-0.125718,7.001934,0.120926,0.053544,0.026056,0.725554,0.488831
1,福建,350000,f8a6975573af1b33,Sedan,2017,12,,6.605298,0,2,...,6.605298,0.190729,6.006353,-0.220184,6.296365,0.063127,0.032470,0.021035,0.598945,0.410912
2,四川,510000,f8a6975573af1b33,Sedan,2017,12,,6.864848,0,2,...,6.864848,0.329607,6.165418,-0.247408,6.437167,0.109376,0.051337,0.038224,0.699430,0.577015
3,陕西,610000,f8a6975573af1b33,Sedan,2017,12,,5.480639,0,2,...,5.480639,0.293253,5.105945,-0.135624,5.264980,0.062449,0.018698,0.024172,0.374693,0.428877
4,安徽,340000,f8a6975573af1b33,Sedan,2017,12,,6.834109,0,2,...,6.834109,0.340355,6.212606,-0.120674,6.451389,0.083472,0.033873,0.021090,0.621503,0.461028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,重庆,500000,c6cd4e0e073f5ac2,Sedan,2017,12,,4.812184,0,2,...,4.812184,0.356675,3.828641,-0.287682,4.287344,0.080242,0.095821,0.061696,0.983543,0.644357
1316,河北,130000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.361302,0,2,...,6.361302,0.367341,5.652489,-0.143569,5.949899,0.094207,0.041561,0.034474,0.708813,0.510910
1317,山西,140000,c6cd4e0e073f5ac2,Sedan,2017,12,,5.645447,0,2,...,6.324359,1.287406,5.036953,-0.647605,5.537708,0.101416,0.148402,0.379318,1.287406,1.935012
1318,江西,360000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.327937,0,2,...,6.327937,0.341258,5.521461,0.021931,5.950425,0.134413,0.074671,0.011275,0.806476,0.319328


province                     0
adcode                       0
model                        0
bodyType                     0
regYear                      0
                            ..
salesVolume_his_diff_aver    0
salesVolume_his_var          0
salesVolume_his_diff_var     0
salesVolume_his_pth          0
salesVolume_his_diff_pth     0
Length: 75, dtype: int64

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,label,salesVolume,is_pring_festival,distance_spring_festival,...,salesVolume_his_max,salesVolume_his_diff_max,salesVolume_his_min,salesVolume_his_diff_min,salesVolume_his_aver,salesVolume_his_diff_aver,salesVolume_his_var,salesVolume_his_diff_var,salesVolume_his_pth,salesVolume_his_diff_pth
0,浙江,330000,f8a6975573af1b33,Sedan,2017,12,,7.357556,0,2,...,7.357556,0.363114,6.632002,-0.125718,7.001934,0.120926,0.053544,0.026056,0.725554,0.488831
1,福建,350000,f8a6975573af1b33,Sedan,2017,12,,6.605298,0,2,...,6.605298,0.190729,6.006353,-0.220184,6.296365,0.063127,0.032470,0.021035,0.598945,0.410912
2,四川,510000,f8a6975573af1b33,Sedan,2017,12,,6.864848,0,2,...,6.864848,0.329607,6.165418,-0.247408,6.437167,0.109376,0.051337,0.038224,0.699430,0.577015
3,陕西,610000,f8a6975573af1b33,Sedan,2017,12,,5.480639,0,2,...,5.480639,0.293253,5.105945,-0.135624,5.264980,0.062449,0.018698,0.024172,0.374693,0.428877
4,安徽,340000,f8a6975573af1b33,Sedan,2017,12,,6.834109,0,2,...,6.834109,0.340355,6.212606,-0.120674,6.451389,0.083472,0.033873,0.021090,0.621503,0.461028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,重庆,500000,c6cd4e0e073f5ac2,Sedan,2017,12,,4.812184,0,2,...,4.812184,0.356675,3.828641,-0.287682,4.287344,0.080242,0.095821,0.061696,0.983543,0.644357
1316,河北,130000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.361302,0,2,...,6.361302,0.367341,5.652489,-0.143569,5.949899,0.094207,0.041561,0.034474,0.708813,0.510910
1317,山西,140000,c6cd4e0e073f5ac2,Sedan,2017,12,,5.645447,0,2,...,6.324359,1.287406,5.036953,-0.647605,5.537708,0.101416,0.148402,0.379318,1.287406,1.935012
1318,江西,360000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.327937,0,2,...,6.327937,0.341258,5.521461,0.021931,5.950425,0.134413,0.074671,0.011275,0.806476,0.319328


In [11]:
# LightGBM model
params = {
      'boosting_type': 'gbdt',
      'objective': 'rmse',
      'metric': ['rmse'],   # 'l2', 'binary_logloss',
      'learning_rate': 0.03,
      'num_leaves': 2 ** 5 - 1,
      # 'min_child_samples': 100,
      'max_depth': 6,
      'subsample': 0.8,
      'subsample_freq': 5,
      'colsample_bytree': 0.8,
      'seed': 2019,
      'nthread': -1,
      'verbose': 1,
}

lgb_train = lgb.Dataset(x_train, y_train.ravel())
lgb_eval = lgb.Dataset(x_test, y_test.ravel(), reference=lgb_train)
# categorial_name = ['adcode', 'model', 'bodyType', 'regYear', 'regMonth']

module = lgb.train(params, lgb_train, num_boost_round=5000, valid_sets=lgb_eval, early_stopping_rounds=100, categorical_feature=categorial_name)

val = module.predict(x_test, num_iteration=module.best_iteration)
val = 2 ** (val - 1)
y_true = 2 ** (y_test.reshape(1, -1)[0] - 1)
nrmse = metrics(y_true, val, val_model.reshape(1, -1)[0])

iters = module.best_iteration + 100
train_all = np.vstack((x_train, x_test))
label_all = np.hstack((y_train, y_test))
lgb_data = lgb.Dataset(train_all, label_all.ravel())
model = lgb.train(params, lgb_data, num_boost_round=iters, categorical_feature=categorial_name)

predict = model.predict(test_feature)
predict = 2 ** (predict - 1)
print(predict)

print('model train over, rmse:', nrmse)   
submit['forecastVolum'] = predict
test_prob_collection = pd.concat([test_prob_collection, submit], axis=0, ignore_index=True)

[1]	valid_0's rmse: 1.6755
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 1.63889
[3]	valid_0's rmse: 1.59946
[4]	valid_0's rmse: 1.56084
[5]	valid_0's rmse: 1.52297
[6]	valid_0's rmse: 1.49016
[7]	valid_0's rmse: 1.45433
[8]	valid_0's rmse: 1.42036
[9]	valid_0's rmse: 1.38844
[10]	valid_0's rmse: 1.35654
[11]	valid_0's rmse: 1.32748
[12]	valid_0's rmse: 1.29708
[13]	valid_0's rmse: 1.26861
[14]	valid_0's rmse: 1.2422
[15]	valid_0's rmse: 1.21711
[16]	valid_0's rmse: 1.1929
[17]	valid_0's rmse: 1.1697
[18]	valid_0's rmse: 1.14586
[19]	valid_0's rmse: 1.12288
[20]	valid_0's rmse: 1.09985
[21]	valid_0's rmse: 1.07949
[22]	valid_0's rmse: 1.05776
[23]	valid_0's rmse: 1.0373
[24]	valid_0's rmse: 1.01853
[25]	valid_0's rmse: 0.999943
[26]	valid_0's rmse: 0.983757
[27]	valid_0's rmse: 0.968678
[28]	valid_0's rmse: 0.952805
[29]	valid_0's rmse: 0.939959
[30]	valid_0's rmse: 0.925434
[31]	valid_0's rmse: 0.911317
[32]	valid_0's rmse: 0.8987
[33]	valid_0's rm

```
model train over, rmse: 0.6583423626150228
```

### Model-LightGBM - 三月

In [12]:
size, pre = 3, 8   # 5
train_feature  = get_train_feature(size, pre)
test_feature  = get_test_feature(size, pre)

cols = ['province', 'adcode', 'model', 'regYear', 'regMonth', 'bodyType']   #  , 'salesVolume'

temp_train = get_basic_feature(size, pre, 'train')
train_feature = train_feature.drop(drop_cols, axis=1).merge(temp_train, on=cols, how='left')
train_feature

temp_test = get_basic_feature(size, pre, 'test')
test_feature = test_feature.drop(drop_cols, axis=1).merge(temp_test, on=cols, how='left')
test_feature
train_feature.isnull().sum()
test_feature

submit = test_feature[['province', 'adcode', 'model']]
submit['regYear'] = 2018
submit['regMonth'] = 3
##############################

test_index = list(train_feature[(train_feature['regYear'] == 2017) & (train_feature['regMonth'] == 9)].index)

def drop_duplicate(n):
    return n not in test_index

train_index = list(filter(drop_duplicate, list(range(len(train_feature)))))

train_model = train_feature['model'].values[train_index]   # model
val_model = train_feature['model'].values[test_index]

model_set = dict()
for index in range(len(cars)):
    model_set[cars[index]] = index
train_feature['bodyType'] = train_feature['bodyType'].map({'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3})
train_feature['model'] = train_feature['model'].map(model_set)
test_feature['bodyType'] = test_feature['bodyType'].map({'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3})
test_feature['model'] = test_feature['model'].map(model_set)

train_label = train_feature[['label']]
train_feature.drop(['province', 'label'], axis=1, inplace=True)
test_feature.drop(['province', 'label'], axis=1, inplace=True)

train_label['log'] = train_label['label'].apply(lambda index: np.log2(index) + 1)
x_train = train_feature.values[train_index]
y_train = train_label['log'].values[train_index]
x_test = train_feature.values[test_index]
y_test = train_label['log'].values[test_index]

      province  adcode             model bodyType  regYear  regMonth  \
0           浙江  330000  f8a6975573af1b33    Sedan     2016         9   
1           浙江  330000  f8a6975573af1b33    Sedan     2016        10   
2           浙江  330000  f8a6975573af1b33    Sedan     2016        11   
3           浙江  330000  f8a6975573af1b33    Sedan     2016        12   
4           浙江  330000  f8a6975573af1b33    Sedan     2017         1   
...        ...     ...               ...      ...      ...       ...   
17155       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         5   
17156       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         6   
17157       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         7   
17158       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         8   
17159       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         9   

       salesVolume  is_pring_festival  distance_spring_festival  \
0         6.714171                  0                         4   
1

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,label,salesVolume,is_pring_festival,distance_spring_festival,...,salesVolume_his_max,salesVolume_his_diff_max,salesVolume_his_min,salesVolume_his_diff_min,salesVolume_his_aver,salesVolume_his_diff_aver,salesVolume_his_var,salesVolume_his_diff_var,salesVolume_his_pth,salesVolume_his_diff_pth
0,浙江,330000,f8a6975573af1b33,Sedan,2016,9,1454.0,6.714171,0,4,...,6.900731,0.190701,6.419995,-0.317321,6.619884,-0.031093,0.021830,0.029714,0.480736,0.508023
1,浙江,330000,f8a6975573af1b33,Sedan,2016,10,752.0,6.752270,0,3,...,6.752270,0.190701,6.419995,-0.174788,6.598675,0.028144,0.012616,0.013348,0.332275,0.365489
2,浙江,330000,f8a6975573af1b33,Sedan,2016,11,503.0,6.543912,0,2,...,6.752270,0.190701,6.419995,-0.208359,6.593033,-0.016429,0.012979,0.020524,0.332275,0.399060
3,浙江,330000,f8a6975573af1b33,Sedan,2016,12,780.0,7.282074,0,1,...,7.282074,0.738162,6.419995,-0.208359,6.684402,0.135729,0.072106,0.088094,0.862079,0.946520
4,浙江,330000,f8a6975573af1b33,Sedan,2017,1,679.0,6.622736,1,0,...,7.282074,0.738162,6.419995,-0.659337,6.706551,0.033790,0.065451,0.177449,0.862079,1.397499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17155,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,5,991.0,6.322565,0,4,...,7.367709,0.470004,5.768321,-1.131402,6.573917,-0.100995,0.241924,0.324153,1.599388,1.601406
17156,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,6,1189.0,6.580639,0,5,...,7.367709,0.470004,5.768321,-1.131402,6.524217,-0.131178,0.221495,0.296100,1.599388,1.601406
17157,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,7,1206.0,6.577861,0,6,...,6.899723,0.470004,5.768321,-1.131402,6.411382,-0.053644,0.107535,0.273930,1.131402,1.601406
17158,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,8,1267.0,6.898715,0,6,...,6.898715,0.470004,5.768321,-0.169675,6.411238,0.188399,0.107394,0.045126,1.130394,0.639678


     province  adcode             model bodyType  regYear  regMonth  \
0          浙江  330000  f8a6975573af1b33    Sedan     2017        12   
1          福建  350000  f8a6975573af1b33    Sedan     2017        12   
2          四川  510000  f8a6975573af1b33    Sedan     2017        12   
3          陕西  610000  f8a6975573af1b33    Sedan     2017        12   
4          安徽  340000  f8a6975573af1b33    Sedan     2017        12   
...       ...     ...               ...      ...      ...       ...   
1315       重庆  500000  c6cd4e0e073f5ac2    Sedan     2017        12   
1316       河北  130000  c6cd4e0e073f5ac2    Sedan     2017        12   
1317       山西  140000  c6cd4e0e073f5ac2    Sedan     2017        12   
1318       江西  360000  c6cd4e0e073f5ac2    Sedan     2017        12   
1319       河南  410000  c6cd4e0e073f5ac2    Sedan     2017        12   

      salesVolume  is_pring_festival  distance_spring_festival  salesVolume_1  \
0        7.357556                  0                         2    

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,label,salesVolume,is_pring_festival,distance_spring_festival,...,salesVolume_his_max,salesVolume_his_diff_max,salesVolume_his_min,salesVolume_his_diff_min,salesVolume_his_aver,salesVolume_his_diff_aver,salesVolume_his_var,salesVolume_his_diff_var,salesVolume_his_pth,salesVolume_his_diff_pth
0,浙江,330000,f8a6975573af1b33,Sedan,2017,12,,7.357556,0,2,...,7.357556,0.363114,6.632002,-0.125718,7.001934,0.120926,0.053544,0.026056,0.725554,0.488831
1,福建,350000,f8a6975573af1b33,Sedan,2017,12,,6.605298,0,2,...,6.605298,0.190729,6.006353,-0.220184,6.296365,0.063127,0.032470,0.021035,0.598945,0.410912
2,四川,510000,f8a6975573af1b33,Sedan,2017,12,,6.864848,0,2,...,6.864848,0.329607,6.165418,-0.247408,6.437167,0.109376,0.051337,0.038224,0.699430,0.577015
3,陕西,610000,f8a6975573af1b33,Sedan,2017,12,,5.480639,0,2,...,5.480639,0.293253,5.105945,-0.135624,5.264980,0.062449,0.018698,0.024172,0.374693,0.428877
4,安徽,340000,f8a6975573af1b33,Sedan,2017,12,,6.834109,0,2,...,6.834109,0.340355,6.212606,-0.120674,6.451389,0.083472,0.033873,0.021090,0.621503,0.461028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,重庆,500000,c6cd4e0e073f5ac2,Sedan,2017,12,,4.812184,0,2,...,4.812184,0.356675,3.828641,-0.287682,4.287344,0.080242,0.095821,0.061696,0.983543,0.644357
1316,河北,130000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.361302,0,2,...,6.361302,0.367341,5.652489,-0.143569,5.949899,0.094207,0.041561,0.034474,0.708813,0.510910
1317,山西,140000,c6cd4e0e073f5ac2,Sedan,2017,12,,5.645447,0,2,...,6.324359,1.287406,5.036953,-0.647605,5.537708,0.101416,0.148402,0.379318,1.287406,1.935012
1318,江西,360000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.327937,0,2,...,6.327937,0.341258,5.521461,0.021931,5.950425,0.134413,0.074671,0.011275,0.806476,0.319328


province                     0
adcode                       0
model                        0
bodyType                     0
regYear                      0
                            ..
salesVolume_his_diff_aver    0
salesVolume_his_var          0
salesVolume_his_diff_var     0
salesVolume_his_pth          0
salesVolume_his_diff_pth     0
Length: 68, dtype: int64

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,label,salesVolume,is_pring_festival,distance_spring_festival,...,salesVolume_his_max,salesVolume_his_diff_max,salesVolume_his_min,salesVolume_his_diff_min,salesVolume_his_aver,salesVolume_his_diff_aver,salesVolume_his_var,salesVolume_his_diff_var,salesVolume_his_pth,salesVolume_his_diff_pth
0,浙江,330000,f8a6975573af1b33,Sedan,2017,12,,7.357556,0,2,...,7.357556,0.363114,6.632002,-0.125718,7.001934,0.120926,0.053544,0.026056,0.725554,0.488831
1,福建,350000,f8a6975573af1b33,Sedan,2017,12,,6.605298,0,2,...,6.605298,0.190729,6.006353,-0.220184,6.296365,0.063127,0.032470,0.021035,0.598945,0.410912
2,四川,510000,f8a6975573af1b33,Sedan,2017,12,,6.864848,0,2,...,6.864848,0.329607,6.165418,-0.247408,6.437167,0.109376,0.051337,0.038224,0.699430,0.577015
3,陕西,610000,f8a6975573af1b33,Sedan,2017,12,,5.480639,0,2,...,5.480639,0.293253,5.105945,-0.135624,5.264980,0.062449,0.018698,0.024172,0.374693,0.428877
4,安徽,340000,f8a6975573af1b33,Sedan,2017,12,,6.834109,0,2,...,6.834109,0.340355,6.212606,-0.120674,6.451389,0.083472,0.033873,0.021090,0.621503,0.461028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,重庆,500000,c6cd4e0e073f5ac2,Sedan,2017,12,,4.812184,0,2,...,4.812184,0.356675,3.828641,-0.287682,4.287344,0.080242,0.095821,0.061696,0.983543,0.644357
1316,河北,130000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.361302,0,2,...,6.361302,0.367341,5.652489,-0.143569,5.949899,0.094207,0.041561,0.034474,0.708813,0.510910
1317,山西,140000,c6cd4e0e073f5ac2,Sedan,2017,12,,5.645447,0,2,...,6.324359,1.287406,5.036953,-0.647605,5.537708,0.101416,0.148402,0.379318,1.287406,1.935012
1318,江西,360000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.327937,0,2,...,6.327937,0.341258,5.521461,0.021931,5.950425,0.134413,0.074671,0.011275,0.806476,0.319328


In [13]:
# LightGBM model
params = {
      'boosting_type': 'gbdt',
      'objective': 'rmse',
      'metric': ['rmse'],   # 'l2', 'binary_logloss',
      'learning_rate': 0.03,
      'num_leaves': 2 ** 5 - 1,
      # 'min_child_samples': 100,
      'max_depth': 6,
      'subsample': 0.8,
      'subsample_freq': 5,
      'colsample_bytree': 0.8,
      'seed': 2019,
      'nthread': -1,
      'verbose': 1,
}

lgb_train = lgb.Dataset(x_train, y_train.ravel())
lgb_eval = lgb.Dataset(x_test, y_test.ravel(), reference=lgb_train)
# categorial_name = ['adcode', 'model', 'bodyType', 'regYear', 'regMonth']

module = lgb.train(params, lgb_train, num_boost_round=5000, valid_sets=lgb_eval, early_stopping_rounds=100, categorical_feature=categorial_name)

val = module.predict(x_test, num_iteration=module.best_iteration)
val = 2 ** (val - 1)
y_true = 2 ** (y_test.reshape(1, -1)[0] - 1)
nrmse = metrics(y_true, val, val_model.reshape(1, -1)[0])

iters = module.best_iteration + 100
train_all = np.vstack((x_train, x_test))
label_all = np.hstack((y_train, y_test))
lgb_data = lgb.Dataset(train_all, label_all.ravel())
model = lgb.train(params, lgb_data, num_boost_round=iters, categorical_feature=categorial_name)

predict = model.predict(test_feature)
predict = 2 ** (predict - 1)
print(predict)

print('model train over, rmse:', nrmse)
submit['forecastVolum'] = predict
test_prob_collection = pd.concat([test_prob_collection, submit], axis=0, ignore_index=True)

[1]	valid_0's rmse: 1.67651
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 1.63556
[3]	valid_0's rmse: 1.60274
[4]	valid_0's rmse: 1.56524
[5]	valid_0's rmse: 1.52906
[6]	valid_0's rmse: 1.49246
[7]	valid_0's rmse: 1.45878
[8]	valid_0's rmse: 1.42526
[9]	valid_0's rmse: 1.39313
[10]	valid_0's rmse: 1.36356
[11]	valid_0's rmse: 1.33438
[12]	valid_0's rmse: 1.30693
[13]	valid_0's rmse: 1.28003
[14]	valid_0's rmse: 1.25353
[15]	valid_0's rmse: 1.22805
[16]	valid_0's rmse: 1.20463
[17]	valid_0's rmse: 1.18135
[18]	valid_0's rmse: 1.15869
[19]	valid_0's rmse: 1.13921
[20]	valid_0's rmse: 1.11803
[21]	valid_0's rmse: 1.09802
[22]	valid_0's rmse: 1.0797
[23]	valid_0's rmse: 1.06829
[24]	valid_0's rmse: 1.05158
[25]	valid_0's rmse: 1.03417
[26]	valid_0's rmse: 1.0173
[27]	valid_0's rmse: 1.00016
[28]	valid_0's rmse: 0.985802
[29]	valid_0's rmse: 0.970048
[30]	valid_0's rmse: 0.954406
[31]	valid_0's rmse: 0.941495
[32]	valid_0's rmse: 0.927126
[33]	valid_0's 

```
model train over, rmse: 0.6151369735179166
```

### Model-LightGBM - 四月

In [14]:
size, pre = 4, 7   # 5
train_feature  = get_train_feature(size, pre)
test_feature  = get_test_feature(size, pre)

cols = ['province', 'adcode', 'model', 'regYear', 'regMonth', 'bodyType']   #  , 'salesVolume'

temp_train = get_basic_feature(size, pre, 'train')
train_feature = train_feature.drop(drop_cols, axis=1).merge(temp_train, on=cols, how='left')
train_feature

temp_test = get_basic_feature(size, pre, 'test')
test_feature = test_feature.drop(drop_cols, axis=1).merge(temp_test, on=cols, how='left')
test_feature
train_feature.isnull().sum()
test_feature

submit = test_feature[['province', 'adcode', 'model']]
submit['regYear'] = 2018
submit['regMonth'] = 4
###############################

test_index = list(train_feature[(train_feature['regYear'] == 2017) & (train_feature['regMonth'] == 8)].index)

def drop_duplicate(n):
    return n not in test_index

train_index = list(filter(drop_duplicate, list(range(len(train_feature)))))

train_model = train_feature['model'].values[train_index]   # model
val_model = train_feature['model'].values[test_index]

model_set = dict()
for index in range(len(cars)):
    model_set[cars[index]] = index
train_feature['bodyType'] = train_feature['bodyType'].map({'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3})
train_feature['model'] = train_feature['model'].map(model_set)
test_feature['bodyType'] = test_feature['bodyType'].map({'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3})
test_feature['model'] = test_feature['model'].map(model_set)

train_label = train_feature[['label']]
train_feature.drop(['province', 'label'], axis=1, inplace=True)
test_feature.drop(['province', 'label'], axis=1, inplace=True)

train_label['log'] = train_label['label'].apply(lambda index: np.log2(index) + 1)
x_train = train_feature.values[train_index]
y_train = train_label['log'].values[train_index]
x_test = train_feature.values[test_index]
y_test = train_label['log'].values[test_index]

      province  adcode             model bodyType  regYear  regMonth  \
0           浙江  330000  f8a6975573af1b33    Sedan     2016         8   
1           浙江  330000  f8a6975573af1b33    Sedan     2016         9   
2           浙江  330000  f8a6975573af1b33    Sedan     2016        10   
3           浙江  330000  f8a6975573af1b33    Sedan     2016        11   
4           浙江  330000  f8a6975573af1b33    Sedan     2016        12   
...        ...     ...               ...      ...      ...       ...   
17155       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         4   
17156       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         5   
17157       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         6   
17158       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         7   
17159       河南  410000  c6cd4e0e073f5ac2    Sedan     2017         8   

       salesVolume  is_pring_festival  distance_spring_festival  \
0         6.610696                  0                         5   
1

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,label,salesVolume,is_pring_festival,distance_spring_festival,...,salesVolume_his_max,salesVolume_his_diff_max,salesVolume_his_min,salesVolume_his_diff_min,salesVolume_his_aver,salesVolume_his_diff_aver,salesVolume_his_var,salesVolume_his_diff_var,salesVolume_his_pth,salesVolume_his_diff_pth
0,浙江,330000,f8a6975573af1b33,Sedan,2016,8,1454.0,6.610696,0,5,...,6.900731,0.377168,6.419995,-0.317321,6.592654,0.014522,0.021143,0.052395,0.480736,0.694490
1,浙江,330000,f8a6975573af1b33,Sedan,2016,9,752.0,6.714171,0,4,...,6.900731,0.190701,6.419995,-0.317321,6.619884,-0.031093,0.021830,0.029714,0.480736,0.508023
2,浙江,330000,f8a6975573af1b33,Sedan,2016,10,503.0,6.752270,0,3,...,6.752270,0.190701,6.419995,-0.174788,6.598675,0.028144,0.012616,0.013348,0.332275,0.365489
3,浙江,330000,f8a6975573af1b33,Sedan,2016,11,780.0,6.543912,0,2,...,6.752270,0.190701,6.419995,-0.208359,6.593033,-0.016429,0.012979,0.020524,0.332275,0.399060
4,浙江,330000,f8a6975573af1b33,Sedan,2016,12,679.0,7.282074,0,1,...,7.282074,0.738162,6.419995,-0.208359,6.684402,0.135729,0.072106,0.088094,0.862079,0.946520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17155,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,4,991.0,6.492240,0,3,...,7.367709,0.470004,5.768321,-1.131402,6.651535,-0.062275,0.239052,0.326331,1.599388,1.601406
17156,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,5,1189.0,6.322565,0,4,...,7.367709,0.470004,5.768321,-1.131402,6.573917,-0.100995,0.241924,0.324153,1.599388,1.601406
17157,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,6,1206.0,6.580639,0,5,...,7.367709,0.470004,5.768321,-1.131402,6.524217,-0.131178,0.221495,0.296100,1.599388,1.601406
17158,河南,410000,c6cd4e0e073f5ac2,Sedan,2017,7,1267.0,6.577861,0,6,...,6.899723,0.470004,5.768321,-1.131402,6.411382,-0.053644,0.107535,0.273930,1.131402,1.601406


     province  adcode             model bodyType  regYear  regMonth  \
0          浙江  330000  f8a6975573af1b33    Sedan     2017        12   
1          福建  350000  f8a6975573af1b33    Sedan     2017        12   
2          四川  510000  f8a6975573af1b33    Sedan     2017        12   
3          陕西  610000  f8a6975573af1b33    Sedan     2017        12   
4          安徽  340000  f8a6975573af1b33    Sedan     2017        12   
...       ...     ...               ...      ...      ...       ...   
1315       重庆  500000  c6cd4e0e073f5ac2    Sedan     2017        12   
1316       河北  130000  c6cd4e0e073f5ac2    Sedan     2017        12   
1317       山西  140000  c6cd4e0e073f5ac2    Sedan     2017        12   
1318       江西  360000  c6cd4e0e073f5ac2    Sedan     2017        12   
1319       河南  410000  c6cd4e0e073f5ac2    Sedan     2017        12   

      salesVolume  is_pring_festival  distance_spring_festival  salesVolume_1  \
0        7.357556                  0                         2    

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,label,salesVolume,is_pring_festival,distance_spring_festival,...,salesVolume_his_max,salesVolume_his_diff_max,salesVolume_his_min,salesVolume_his_diff_min,salesVolume_his_aver,salesVolume_his_diff_aver,salesVolume_his_var,salesVolume_his_diff_var,salesVolume_his_pth,salesVolume_his_diff_pth
0,浙江,330000,f8a6975573af1b33,Sedan,2017,12,,7.357556,0,2,...,7.357556,0.363114,6.632002,-0.125718,7.001934,0.120926,0.053544,0.026056,0.725554,0.488831
1,福建,350000,f8a6975573af1b33,Sedan,2017,12,,6.605298,0,2,...,6.605298,0.190729,6.006353,-0.220184,6.296365,0.063127,0.032470,0.021035,0.598945,0.410912
2,四川,510000,f8a6975573af1b33,Sedan,2017,12,,6.864848,0,2,...,6.864848,0.329607,6.165418,-0.247408,6.437167,0.109376,0.051337,0.038224,0.699430,0.577015
3,陕西,610000,f8a6975573af1b33,Sedan,2017,12,,5.480639,0,2,...,5.480639,0.293253,5.105945,-0.135624,5.264980,0.062449,0.018698,0.024172,0.374693,0.428877
4,安徽,340000,f8a6975573af1b33,Sedan,2017,12,,6.834109,0,2,...,6.834109,0.340355,6.212606,-0.120674,6.451389,0.083472,0.033873,0.021090,0.621503,0.461028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,重庆,500000,c6cd4e0e073f5ac2,Sedan,2017,12,,4.812184,0,2,...,4.812184,0.356675,3.828641,-0.287682,4.287344,0.080242,0.095821,0.061696,0.983543,0.644357
1316,河北,130000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.361302,0,2,...,6.361302,0.367341,5.652489,-0.143569,5.949899,0.094207,0.041561,0.034474,0.708813,0.510910
1317,山西,140000,c6cd4e0e073f5ac2,Sedan,2017,12,,5.645447,0,2,...,6.324359,1.287406,5.036953,-0.647605,5.537708,0.101416,0.148402,0.379318,1.287406,1.935012
1318,江西,360000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.327937,0,2,...,6.327937,0.341258,5.521461,0.021931,5.950425,0.134413,0.074671,0.011275,0.806476,0.319328


province                     0
adcode                       0
model                        0
bodyType                     0
regYear                      0
                            ..
salesVolume_his_diff_aver    0
salesVolume_his_var          0
salesVolume_his_diff_var     0
salesVolume_his_pth          0
salesVolume_his_diff_pth     0
Length: 61, dtype: int64

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,label,salesVolume,is_pring_festival,distance_spring_festival,...,salesVolume_his_max,salesVolume_his_diff_max,salesVolume_his_min,salesVolume_his_diff_min,salesVolume_his_aver,salesVolume_his_diff_aver,salesVolume_his_var,salesVolume_his_diff_var,salesVolume_his_pth,salesVolume_his_diff_pth
0,浙江,330000,f8a6975573af1b33,Sedan,2017,12,,7.357556,0,2,...,7.357556,0.363114,6.632002,-0.125718,7.001934,0.120926,0.053544,0.026056,0.725554,0.488831
1,福建,350000,f8a6975573af1b33,Sedan,2017,12,,6.605298,0,2,...,6.605298,0.190729,6.006353,-0.220184,6.296365,0.063127,0.032470,0.021035,0.598945,0.410912
2,四川,510000,f8a6975573af1b33,Sedan,2017,12,,6.864848,0,2,...,6.864848,0.329607,6.165418,-0.247408,6.437167,0.109376,0.051337,0.038224,0.699430,0.577015
3,陕西,610000,f8a6975573af1b33,Sedan,2017,12,,5.480639,0,2,...,5.480639,0.293253,5.105945,-0.135624,5.264980,0.062449,0.018698,0.024172,0.374693,0.428877
4,安徽,340000,f8a6975573af1b33,Sedan,2017,12,,6.834109,0,2,...,6.834109,0.340355,6.212606,-0.120674,6.451389,0.083472,0.033873,0.021090,0.621503,0.461028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,重庆,500000,c6cd4e0e073f5ac2,Sedan,2017,12,,4.812184,0,2,...,4.812184,0.356675,3.828641,-0.287682,4.287344,0.080242,0.095821,0.061696,0.983543,0.644357
1316,河北,130000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.361302,0,2,...,6.361302,0.367341,5.652489,-0.143569,5.949899,0.094207,0.041561,0.034474,0.708813,0.510910
1317,山西,140000,c6cd4e0e073f5ac2,Sedan,2017,12,,5.645447,0,2,...,6.324359,1.287406,5.036953,-0.647605,5.537708,0.101416,0.148402,0.379318,1.287406,1.935012
1318,江西,360000,c6cd4e0e073f5ac2,Sedan,2017,12,,6.327937,0,2,...,6.327937,0.341258,5.521461,0.021931,5.950425,0.134413,0.074671,0.011275,0.806476,0.319328


In [15]:
# LightGBM model
params = {
      'boosting_type': 'gbdt',
      'objective': 'rmse',
      'metric': ['rmse'],   # 'l2', 'binary_logloss',
      'learning_rate': 0.03,
      'num_leaves': 2 ** 5 - 1,
      # 'min_child_samples': 100,
      'max_depth': 6,
      'subsample': 0.8,
      'subsample_freq': 5,
      'colsample_bytree': 0.8,
      'seed': 2019,
      'nthread': -1,
      'verbose': 1,
}

lgb_train = lgb.Dataset(x_train, y_train.ravel())
lgb_eval = lgb.Dataset(x_test, y_test.ravel(), reference=lgb_train)
# categorial_name = ['adcode', 'model', 'bodyType', 'regYear', 'regMonth']

module = lgb.train(params, lgb_train, num_boost_round=5000, valid_sets=lgb_eval, early_stopping_rounds=100, categorical_feature=categorial_name)

val = module.predict(x_test, num_iteration=module.best_iteration)
val = 2 ** (val - 1)
y_true = 2 ** (y_test.reshape(1, -1)[0] - 1)
nrmse = metrics(y_true, val, val_model.reshape(1, -1)[0])

iters = module.best_iteration + 100
train_all = np.vstack((x_train, x_test))
label_all = np.hstack((y_train, y_test))
lgb_data = lgb.Dataset(train_all, label_all.ravel())
model = lgb.train(params, lgb_data, num_boost_round=iters, categorical_feature=categorial_name)

predict = model.predict(test_feature)
predict = 2 ** (predict - 1)
print(predict)

print('model train over, rmse:', nrmse)
submit['forecastVolum'] = predict
test_prob_collection = pd.concat([test_prob_collection, submit], axis=0, ignore_index=True)

[1]	valid_0's rmse: 1.67869
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 1.64097
[3]	valid_0's rmse: 1.60314
[4]	valid_0's rmse: 1.56881
[5]	valid_0's rmse: 1.53494
[6]	valid_0's rmse: 1.50036
[7]	valid_0's rmse: 1.46534
[8]	valid_0's rmse: 1.43114
[9]	valid_0's rmse: 1.40004
[10]	valid_0's rmse: 1.37074
[11]	valid_0's rmse: 1.3434
[12]	valid_0's rmse: 1.31375
[13]	valid_0's rmse: 1.28759
[14]	valid_0's rmse: 1.26039
[15]	valid_0's rmse: 1.23489
[16]	valid_0's rmse: 1.21427
[17]	valid_0's rmse: 1.19194
[18]	valid_0's rmse: 1.17058
[19]	valid_0's rmse: 1.14827
[20]	valid_0's rmse: 1.12862
[21]	valid_0's rmse: 1.10935
[22]	valid_0's rmse: 1.09189
[23]	valid_0's rmse: 1.07487
[24]	valid_0's rmse: 1.05868
[25]	valid_0's rmse: 1.04174
[26]	valid_0's rmse: 1.0237
[27]	valid_0's rmse: 1.00797
[28]	valid_0's rmse: 0.991184
[29]	valid_0's rmse: 0.975118
[30]	valid_0's rmse: 0.962787
[31]	valid_0's rmse: 0.949398
[32]	valid_0's rmse: 0.936719
[33]	valid_0's 

```
model train over, rmse: 0.5989938881637746
```

In [16]:
train_feature.shape

(17160, 59)

In [17]:
test_prob_collection

Unnamed: 0,province,adcode,model,regYear,regMonth,forecastVolum
0,浙江,330000,f8a6975573af1b33,2018,1,959.553651
1,福建,350000,f8a6975573af1b33,2018,1,463.297235
2,四川,510000,f8a6975573af1b33,2018,1,661.884054
3,陕西,610000,f8a6975573af1b33,2018,1,169.590516
4,安徽,340000,f8a6975573af1b33,2018,1,639.350203
...,...,...,...,...,...,...
5275,重庆,500000,c6cd4e0e073f5ac2,2018,4,33.751382
5276,河北,130000,c6cd4e0e073f5ac2,2018,4,156.643194
5277,山西,140000,c6cd4e0e073f5ac2,2018,4,92.214727
5278,江西,360000,c6cd4e0e073f5ac2,2018,4,139.670105


In [18]:
test_prob_collection.index = range(len(test_prob_collection))
evaluation_public = evaluation_public.merge(test_prob_collection, on=['province', 'adcode', 'model', 'regYear', 'regMonth'], how='left')
evaluation_public['forecastVolum'] = evaluation_public['forecastVolum_y']
evaluation_public['forecastVolum'] = evaluation_public['forecastVolum'].apply(lambda index: int(np.round(index)))
evaluation_public['forecastVolum'].mean()

468.8373106060606

In [19]:
evaluation_public[['id', 'forecastVolum']].to_csv('./submit/sub_method_one.csv', encoding='utf-8', index=None)

In [20]:
evaluation_public.describe()
evaluation_public.groupby(['regMonth'], as_index=False)['forecastVolum'].mean()

Unnamed: 0,id,adcode,regYear,regMonth,forecastVolum_x,forecastVolum_y,forecastVolum
count,5280.0,5280.0,5280.0,5280.0,0.0,5280.0,5280.0
mean,2677.166667,347727.272727,2018.0,2.5,,468.833732,468.837311
std,1549.503211,136292.426601,0.0,1.11814,,583.428974,583.427264
min,1.0,110000.0,2018.0,1.0,,7.050601,7.0
25%,1342.75,230000.0,2018.0,1.75,,125.338305,125.0
50%,2684.5,355000.0,2018.0,2.5,,281.487677,281.5
75%,4026.25,440000.0,2018.0,3.25,,562.791071,563.0
max,5368.0,610000.0,2018.0,4.0,,6636.718831,6637.0


Unnamed: 0,regMonth,forecastVolum
0,1,554.086364
1,2,369.732576
2,3,471.607576
3,4,479.922727
