In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error as mse
import lightgbm as lgb
from time import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

# Seed Python's built-in RNG and NumPy's global RNG with the same fixed value.
random.seed(2019)
np.random.seed(2019)

# 读取数据\整合数据

In [2]:
# Directory holding the input CSV files (relative to the working directory).
data_path = './data/'

train_sales_data = pd.read_csv(data_path + 'train_sales_data.csv', encoding='utf-8')
train_search_data = pd.read_csv(data_path + 'train_search_data.csv', encoding='utf-8')
test_data = pd.read_csv(data_path + 'evaluation_public.csv', encoding='utf-8')

# Stack the sales rows and the evaluation rows into one frame, then attach the
# search data on the shared province/adcode/model/year/month keys.
data = pd.concat([train_sales_data, test_data], ignore_index=True).merge(
    train_search_data,
    on=['province', 'adcode', 'model', 'regYear', 'regMonth'],
    how='left'
)

# The regression target is kept under 'label'; the raw sales column and the
# forecast placeholder column are removed.
data['label'] = data.pop('salesVolume')
del data['forecastVolum']

# Rows without an id get id 0.
data['id'] = data['id'].fillna(0).astype(int)

# bodyType is (re)derived for every row from the first training row of each model.
model_to_body_type = train_sales_data.drop_duplicates('model').set_index('model')['bodyType']
data['bodyType'] = data['model'].map(model_to_body_type)

# Replace bodyType and model values by integer codes in order of first appearance.
for column in ('bodyType', 'model'):
    codes = dict(zip(data[column].unique(), range(data[column].nunique())))
    data[column] = data[column].map(codes)

# Running month index: January 2016 -> 1, January 2017 -> 13, ...
data['seq'] = 12 * (data['regYear'] - 2016) + data['regMonth']

# Composite integer keys. Multiplying by 100 / 10000 leaves the low decimal
# digits free for seq, so "key + j" addresses the same group j months later.
# NOTE(review): model_adcode is a plain sum of adcode and the model code; this
# is only collision-free while model codes are smaller than the gap between
# distinct adcodes -- confirm against the data.
data['model_adcode'] = data['adcode'] + data['model']
data['model_adcode_seq'] = 100 * data['model_adcode'] + data['seq']

data['adcode_seq'] = 100 * data['adcode'] + data['seq']
data['model_seq'] = 10000 * data['model'] + data['seq']

# The model is trained on log1p(sales); predictions are mapped back with expm1 later.
data['label'] = np.log1p(data['label'])

# 评分函数

In [3]:
from sklearn.metrics import mean_squared_error

def metrics(y_true, y_pred, model):
    """Return 1 minus the average per-model normalised RMSE.

    For each distinct value of ``model`` the RMSE between ``y_true`` and the
    absolute value of ``y_pred`` is divided by the mean of ``y_true`` for that
    group; each of these ratios is printed as it is computed. The result is
    ``1 - mean(ratios)``.

    Parameters
    ----------
    y_true : sequence of true values.
    y_pred : sequence of predictions (negative values are replaced by their magnitude).
    model  : sequence of group identifiers, one per observation.
    """
    frame = pd.DataFrame({'model': model, 'salesVolume': y_true, 'label': y_pred})
    # Negative predictions are flipped to positive before scoring.
    frame['label'] = frame['label'].abs()

    ratios = []
    for _, group in frame.groupby('model'):
        actual = np.array(group['salesVolume'])
        predicted = np.array(group['label'])
        squared_error = (actual - predicted) ** 2
        nrmse = np.sqrt(np.sum(squared_error) / len(actual)) / np.mean(actual)
        ratios.append(nrmse)
        print(nrmse)
    return 1 - (sum(ratios) / len(ratios))


# 特征工程

###### 历史特征统计函数

In [4]:
def calculate_sum_mean(feature, month):
    """Add difference, window-sum and mean columns for one family of lag features.

    Operates in place on the notebook-level ``data`` frame.

    Parameters
    ----------
    feature : format template with a single ``{0}`` placeholder (for example
        ``'shift_label_{0}'``); ``feature.format(k)`` must name the existing
        lag-``k`` column.
    month : number of lags (1..month) that are added up.

    Columns written
    ---------------
    ``feature.format('_diff_1')``        lag-2 column minus lag-1 column
    ``feature.format('sum_<month>')``    lag-1 + ... + lag-<month>
    ``feature.format('mean')``           the sum above divided by ``month``

    NOTE(review): the '_diff_1' and 'mean' column names do not contain
    ``month``, so a later call with another window overwrites them; only the
    mean of the most recent call survives.
    """
    diff_column = feature.format('_diff_1')
    sum_column = feature.format('sum_{0}'.format(month))
    mean_column = feature.format('mean')

    data[diff_column] = data[feature.format(2)] - data[feature.format(1)]

    # Plain "+" keeps NaN propagation: a missing lag makes the whole sum NaN.
    running_total = 0
    for lag in range(1, month + 1):
        running_total = running_total + data[feature.format(lag)]
    data[sum_column] = running_total

    data[mean_column] = data[sum_column] / month
        
    

###### 获取时移特征

In [5]:
def get_time_shift_feature(Data, month):
    """Return the base columns of ``Data`` plus lag features per ``model_adcode_seq`` key.

    For every lag ``j`` in 1..12 a column ``shift_label_<j>`` is added holding
    the ``label`` of the row whose ``model_adcode_seq`` equals this row's key
    minus ``j``; rows with a missing label are never used as a source. When
    ``month == 1`` the same is done for ``popularity``
    (``shift_popularity_<j>``).

    Parameters
    ----------
    Data : DataFrame containing at least the base columns listed below. It is
        not modified; any other columns (including lag features from an
        earlier call) are dropped from the result.
    month : forecast step; only the value 1 enables the popularity lags.
        NOTE(review): presumably popularity is unavailable for later forecast
        months -- confirm.

    Returns
    -------
    A new DataFrame with the base columns followed by the lag columns.
    """
    base_columns = ['adcode', 'bodyType', 'id', 'model', 'regMonth', 'regYear', 'label', 'seq',
                    'model_adcode', 'model_adcode_seq', 'adcode_seq', 'model_seq', 'popularity']
    # Explicit copy: columns are added below, and assigning into a bare column
    # selection is chained assignment (SettingWithCopyWarning).
    data = Data[base_columns].copy()

    # Only rows with a known label can serve as a lag source; this does not
    # change inside the loop, so it is computed once.
    known = data[data['label'].notnull()]

    for j in range(1, 13):
        # Re-key the known rows by "key + j": a row j steps later then finds
        # its predecessor under its own key.
        shifted = known.set_index(known['model_adcode_seq'] + j)
        data['shift_label_{0}'.format(j)] = data['model_adcode_seq'].map(shifted['label'])
        if month == 1:
            data['shift_popularity_{0}'.format(j)] = data['model_adcode_seq'].map(shifted['popularity'])
    return data

###### 获取组合时移特征

In [6]:
def get_group_shift_feature(data, group_feature):
    """Append lagged group totals of ``label`` for lags 1..12.

    ``label`` is summed per distinct value of ``group_feature``. A group that
    contains any missing label gets a missing total and is never used as a lag
    source. For every lag ``j`` a column ``<group_feature>_shift_<j>`` is
    added holding the total of the group whose key equals this row's key
    minus ``j``.

    Parameters
    ----------
    data : DataFrame with the columns ``group_feature`` (numeric key) and ``label``.
    group_feature : name of the numeric key column, e.g. 'adcode_seq'.

    Returns
    -------
    A new DataFrame: ``data`` left-merged with the 12 lag columns (row order
    of ``data`` is kept, the index is renumbered by the merge).
    """
    # NaN-propagating sum per group. Spelled out per group because the
    # ``skipna`` keyword of GroupBy.sum is not accepted by every pandas
    # release (several raise TypeError on it).
    g_data = (
        data.groupby(by=[group_feature])['label']
        .agg(lambda labels: labels.sum(skipna=False))
        .reset_index()
    )

    # Groups with a known total; invariant across lags, so computed once.
    known = g_data[g_data['label'].notnull()]

    for j in range(1, 13):
        # Re-key known totals by "key + j" so a group j steps later finds
        # its predecessor under its own key.
        shifted_totals = known.set_index(known[group_feature] + j)['label']
        g_data[group_feature + '_shift_{0}'.format(j)] = g_data[group_feature].map(shifted_totals)

    # The group total itself is not a feature; only its lags are merged back.
    del g_data['label']
    return pd.merge(data, g_data, on=[group_feature], how='left')

###### 获取历史销量特征

In [7]:
def get_history_label_feature(month):
    """Run ``calculate_sum_mean`` for every lag family over windows 2, 3, 4, 6 and 12.

    The families are, in call order for each window: ``shift_label``,
    ``shift_popularity`` (only when ``month == 1``), ``adcode_seq_shift`` and
    ``model_seq_shift``.
    """
    templates = ['shift_label_{0}']
    if month == 1:
        templates.append('shift_popularity_{0}')
    templates.extend(['adcode_seq_shift_{0}', 'model_seq_shift_{0}'])

    for window in (2, 3, 4, 6, 12):
        for template in templates:
            calculate_sum_mean(template, window)

###### 定义lgb模型

In [8]:
# Single LightGBM regressor instance; the forecasting loop refits it for every forecast month.
lgb_model = lgb.LGBMRegressor(
    # objective and boosting schedule
    objective='mse',
    n_estimators=2100,
    learning_rate=0.05,
    # tree shape
    num_leaves=31,
    max_depth=-1,
    min_child_samples=5,
    # regularisation
    reg_alpha=0.25,
    reg_lambda=0.25,
    # row / column sampling
    # NOTE(review): the printed estimator shows subsample_freq=0; check the
    # LightGBM docs whether subsample has any effect with that setting.
    subsample=0.9,
    colsample_bytree=0.7,
    # reproducibility (both keywords are passed with the same value)
    seed=2019,
    random_state=2019
)

# 预测单月销量，再预测下月

In [9]:
# Forecast the four future months one at a time: month i is predicted, its
# predictions are written into data['label'], and the next iteration rebuilds
# all lag features (and its training window) on top of them.
for i in range(1, 5):
    print('=================predict month {0}=================='.format(i))

    # Rebuild every lag feature from the current labels.
    data = get_time_shift_feature(data, i)
    data = get_group_shift_feature(data, 'adcode_seq')
    data = get_group_shift_feature(data, 'model_seq')
    get_history_label_feature(i)  # works in place on the global `data`

    data_columns = list(data.columns)
    # Identifier / target columns that are not numeric features.
    dels = ['regMonth', 'regYear', 'adcode', 'bodyType', 'id', 'model', 'province', 'label', 'seq', 'model_adcode',
                'model_adcode_seq', 'adcode_seq', 'model_seq', 'popularity']
    number_feature = [column for column in data_columns if column not in dels]
    category_feature = ['regMonth', 'regYear', 'adcode', 'bodyType', 'model', 'model_adcode_seq', 'model_adcode']
    features = number_feature + category_feature
    print(features)

    # Rows of the month being forecast. Copied because a prediction column is
    # assigned below; writing into a filtered slice is chained assignment.
    predict_data = data[data['seq'] == 24 + i].copy()
    # Training window starts at seq 13 and ends at the month before the one
    # being forecast; for i > 1 it therefore includes earlier predictions.
    train_idx = (data['seq'].between(13, 23 + i))

    train_y = data[train_idx]['label']
    train_x = data[train_idx][features]

    print("train LGB model...\n")
    lgb_model.fit(train_x, train_y, categorical_feature=category_feature)
    predict_data['lgb_pred_label'] = lgb_model.predict(predict_data[features])
    print('month {} train ending!\n'.format(i))

    # sort_values replaces DataFrame.sort_index(by=...), which no longer
    # exists in pandas >= 1.0.
    predict_data = predict_data.sort_values(by=['id'])
    # Fill only the missing labels (the month just predicted) via the row id.
    data['transform_label'] = data['id'].map(predict_data.set_index('id')['lgb_pred_label'])
    data['label'] = data['label'].fillna(data['transform_label'])
    del data['transform_label']


['shift_label_1', 'shift_popularity_1', 'shift_label_2', 'shift_popularity_2', 'shift_label_3', 'shift_popularity_3', 'shift_label_4', 'shift_popularity_4', 'shift_label_5', 'shift_popularity_5', 'shift_label_6', 'shift_popularity_6', 'shift_label_7', 'shift_popularity_7', 'shift_label_8', 'shift_popularity_8', 'shift_label_9', 'shift_popularity_9', 'shift_label_10', 'shift_popularity_10', 'shift_label_11', 'shift_popularity_11', 'shift_label_12', 'shift_popularity_12', 'adcode_seq_shift_1', 'adcode_seq_shift_2', 'adcode_seq_shift_3', 'adcode_seq_shift_4', 'adcode_seq_shift_5', 'adcode_seq_shift_6', 'adcode_seq_shift_7', 'adcode_seq_shift_8', 'adcode_seq_shift_9', 'adcode_seq_shift_10', 'adcode_seq_shift_11', 'adcode_seq_shift_12', 'model_seq_shift_1', 'model_seq_shift_2', 'model_seq_shift_3', 'model_seq_shift_4', 'model_seq_shift_5', 'model_seq_shift_6', 'model_seq_shift_7', 'model_seq_shift_8', 'model_seq_shift_9', 'model_seq_shift_10', 'model_seq_shift_11', 'model_seq_shift_12', 'sh

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
              importance_type='split', learning_rate=0.05, max_depth=-1,
              min_child_samples=5, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=2100, n_jobs=-1, num_leaves=31, objective='mse',
              random_state=2019, reg_alpha=0.25, reg_lambda=0.25, seed=2019,
              silent=True, subsample=0.9, subsample_for_bin=200000,
              subsample_freq=0)

month 1 train ending!

['shift_label_1', 'shift_label_2', 'shift_label_3', 'shift_label_4', 'shift_label_5', 'shift_label_6', 'shift_label_7', 'shift_label_8', 'shift_label_9', 'shift_label_10', 'shift_label_11', 'shift_label_12', 'adcode_seq_shift_1', 'adcode_seq_shift_2', 'adcode_seq_shift_3', 'adcode_seq_shift_4', 'adcode_seq_shift_5', 'adcode_seq_shift_6', 'adcode_seq_shift_7', 'adcode_seq_shift_8', 'adcode_seq_shift_9', 'adcode_seq_shift_10', 'adcode_seq_shift_11', 'adcode_seq_shift_12', 'model_seq_shift_1', 'model_seq_shift_2', 'model_seq_shift_3', 'model_seq_shift_4', 'model_seq_shift_5', 'model_seq_shift_6', 'model_seq_shift_7', 'model_seq_shift_8', 'model_seq_shift_9', 'model_seq_shift_10', 'model_seq_shift_11', 'model_seq_shift_12', 'shift_label__diff_1', 'shift_label_sum_2', 'shift_label_mean', 'adcode_seq_shift__diff_1', 'adcode_seq_shift_sum_2', 'adcode_seq_shift_mean', 'model_seq_shift__diff_1', 'model_seq_shift_sum_2', 'model_seq_shift_mean', 'shift_label_sum_3', 'adcode

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
              importance_type='split', learning_rate=0.05, max_depth=-1,
              min_child_samples=5, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=2100, n_jobs=-1, num_leaves=31, objective='mse',
              random_state=2019, reg_alpha=0.25, reg_lambda=0.25, seed=2019,
              silent=True, subsample=0.9, subsample_for_bin=200000,
              subsample_freq=0)

month 2 train ending!

['shift_label_1', 'shift_label_2', 'shift_label_3', 'shift_label_4', 'shift_label_5', 'shift_label_6', 'shift_label_7', 'shift_label_8', 'shift_label_9', 'shift_label_10', 'shift_label_11', 'shift_label_12', 'adcode_seq_shift_1', 'adcode_seq_shift_2', 'adcode_seq_shift_3', 'adcode_seq_shift_4', 'adcode_seq_shift_5', 'adcode_seq_shift_6', 'adcode_seq_shift_7', 'adcode_seq_shift_8', 'adcode_seq_shift_9', 'adcode_seq_shift_10', 'adcode_seq_shift_11', 'adcode_seq_shift_12', 'model_seq_shift_1', 'model_seq_shift_2', 'model_seq_shift_3', 'model_seq_shift_4', 'model_seq_shift_5', 'model_seq_shift_6', 'model_seq_shift_7', 'model_seq_shift_8', 'model_seq_shift_9', 'model_seq_shift_10', 'model_seq_shift_11', 'model_seq_shift_12', 'shift_label__diff_1', 'shift_label_sum_2', 'shift_label_mean', 'adcode_seq_shift__diff_1', 'adcode_seq_shift_sum_2', 'adcode_seq_shift_mean', 'model_seq_shift__diff_1', 'model_seq_shift_sum_2', 'model_seq_shift_mean', 'shift_label_sum_3', 'adcode

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
              importance_type='split', learning_rate=0.05, max_depth=-1,
              min_child_samples=5, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=2100, n_jobs=-1, num_leaves=31, objective='mse',
              random_state=2019, reg_alpha=0.25, reg_lambda=0.25, seed=2019,
              silent=True, subsample=0.9, subsample_for_bin=200000,
              subsample_freq=0)

month 3 train ending!

['shift_label_1', 'shift_label_2', 'shift_label_3', 'shift_label_4', 'shift_label_5', 'shift_label_6', 'shift_label_7', 'shift_label_8', 'shift_label_9', 'shift_label_10', 'shift_label_11', 'shift_label_12', 'adcode_seq_shift_1', 'adcode_seq_shift_2', 'adcode_seq_shift_3', 'adcode_seq_shift_4', 'adcode_seq_shift_5', 'adcode_seq_shift_6', 'adcode_seq_shift_7', 'adcode_seq_shift_8', 'adcode_seq_shift_9', 'adcode_seq_shift_10', 'adcode_seq_shift_11', 'adcode_seq_shift_12', 'model_seq_shift_1', 'model_seq_shift_2', 'model_seq_shift_3', 'model_seq_shift_4', 'model_seq_shift_5', 'model_seq_shift_6', 'model_seq_shift_7', 'model_seq_shift_8', 'model_seq_shift_9', 'model_seq_shift_10', 'model_seq_shift_11', 'model_seq_shift_12', 'shift_label__diff_1', 'shift_label_sum_2', 'shift_label_mean', 'adcode_seq_shift__diff_1', 'adcode_seq_shift_sum_2', 'adcode_seq_shift_mean', 'model_seq_shift__diff_1', 'model_seq_shift_sum_2', 'model_seq_shift_mean', 'shift_label_sum_3', 'adcode

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
              importance_type='split', learning_rate=0.05, max_depth=-1,
              min_child_samples=5, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=2100, n_jobs=-1, num_leaves=31, objective='mse',
              random_state=2019, reg_alpha=0.25, reg_lambda=0.25, seed=2019,
              silent=True, subsample=0.9, subsample_for_bin=200000,
              subsample_freq=0)

month 4 train ending!



# submission 

In [10]:
# data['label'] = np.expm1(data['label'])
# predict_data_idx = (data['seq'] > 24)
# data['forecastVolum'] = data['label'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
# data[predict_data_idx][['id', 'forecastVolum']].to_csv('./submit/new_5936_1.csv', index=False)

In [11]:
# data['label'] = np.expm1(data['label'])
# data[data['seq'] > 24]

In [12]:
# Map labels back from log1p space to the original scale.
# NOTE: this overwrites data['label'] in place, so running the cell a second
# time applies expm1 twice.
data['label'] = data['label'].pipe(np.expm1)
# Show the rows with seq > 24.
data.loc[data['seq'] > 24]

Unnamed: 0,adcode,bodyType,id,model,regMonth,regYear,label,seq,model_adcode,model_adcode_seq,...,model_seq_shift_sum_3,shift_label_sum_4,adcode_seq_shift_sum_4,model_seq_shift_sum_4,shift_label_sum_6,adcode_seq_shift_sum_6,model_seq_shift_sum_6,shift_label_sum_12,adcode_seq_shift_sum_12,model_seq_shift_sum_12
31680,310000,0,1,0,1,2018,241.013009,25,310000,31000025,...,383.60897,22.741146,1346.381966,513.022751,33.926386,1995.751828,769.849710,67.252498,3915.599018,1533.018977
31681,530000,0,2,0,1,2018,297.024708,25,530000,53000025,...,383.60897,23.043567,1353.678295,513.022751,34.718714,2012.802126,769.849710,68.494930,3933.112371,1533.018977
31682,150000,0,3,0,1,2018,131.512196,25,150000,15000025,...,383.60897,20.438977,1269.401474,513.022751,30.878958,1884.326271,769.849710,61.734231,3691.910677,1533.018977
31683,110000,0,4,0,1,2018,252.791720,25,110000,11000025,...,383.60897,22.426024,1282.827694,513.022751,33.879550,1915.261139,769.849710,69.855708,3825.479327,1533.018977
31684,510000,0,5,0,1,2018,387.521496,25,510000,51000025,...,383.60897,23.759101,1529.360305,513.022751,35.281531,2271.537683,769.849710,70.421272,4462.000215,1533.018977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36955,350000,0,5364,59,4,2018,81.006999,28,350059,35005928,...,313.42121,17.954366,1304.951209,424.063298,27.329373,1991.106727,642.025512,54.622168,3981.025715,1289.093882
36956,210000,0,5365,59,4,2018,84.966207,28,210059,21005928,...,313.42121,17.363225,1286.912342,424.063298,26.755887,1966.101087,642.025512,54.611152,3970.755649,1289.093882
36957,500000,0,5366,59,4,2018,102.900300,28,500059,50005928,...,313.42121,18.342554,1242.924574,424.063298,27.695435,1877.764593,642.025512,57.548946,3749.078284,1289.093882
36958,610000,0,5367,59,4,2018,194.354172,28,610059,61005928,...,313.42121,21.463429,1325.396521,424.063298,32.801477,2021.405097,642.025512,65.532078,4046.532725,1289.093882


In [15]:
# Average 'label' per calendar month over the rows with seq > 24.
data.loc[data['seq'] > 24].groupby('regMonth')['label'].mean()

regMonth
1    473.868935
2    318.250062
3    497.139422
4    490.399432
Name: label, dtype: float64