In [37]:
import sys
import numpy as np
import pandas as pd
import os
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
import math
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)


def get_stat_feature(df_):
    """Add lag ("shift") features for label, popularity and area sales volume.

    A composite integer key ``model_adcode_mt`` is built as
    ``(adcode + model) * 100 + mt``. For each source column and each lag
    ``i`` in 1..9, a row receives the value that the source column had on
    the row whose key is exactly ``i`` smaller, i.e. ``i`` steps of ``mt``
    earlier for the same ``adcode + model`` sum.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain ``adcode``, ``model``, ``mt``, ``label``,
        ``popularity`` and ``area_sales_volume``. It is not modified.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A copy of ``df_`` with the key columns and the 27 ``shift_*``
        feature columns added, and the names of those feature columns.
    """
    df = df_.copy()
    # NOTE(review): assumes adcode + model is unique per (region, model) pair
    # and that mt + lag stays below 100, so keys never collide -- confirm.
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt']

    stat_feat = []
    for col in tqdm(['label', 'popularity', 'area_sales_volume']):
        # Only rows where the source value is known can act as lag donors.
        observed = df[col].notnull()
        for lag in range(1, 10):
            key_name = 'model_adcode_mt_{}_{}'.format(col, lag)
            feat_name = 'shift_model_adcode_mt_{}_{}'.format(col, lag)
            stat_feat.append(feat_name)
            # Donor key moved forward by `lag`, so it equals the key of the
            # row that should receive the value.
            df[key_name] = df['model_adcode_mt'] + lag
            donor_values = df[observed].set_index(key_name)[col]
            df[feat_name] = df['model_adcode_mt'].map(donor_values)

    return df, stat_feat

def score(data, pred='pred_label', label='label', group='model'):
    """Return 1 minus the mean per-group normalised RMSE, and print it.

    Predictions are clipped at zero and rounded to integers before scoring.
    For every value of ``group`` the RMSE between predictions and labels is
    divided by that group's mean label; the result is one minus the average
    of those ratios over all groups.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the prediction, label and grouping columns. It is not
        modified.
    pred, label, group : str
        Column names for predictions, ground truth and the grouping key.

    Returns
    -------
    float
        ``1 - mean(NRMSE)``; higher is better.
    """
    # Work on derived Series so the caller's frame is left untouched.
    pred_int = data[pred].clip(lower=0).round().astype(int)
    squared_error = (pred_int - data[label]) ** 2
    keys = data[group]

    rmse_per_group = np.sqrt(squared_error.groupby(keys).mean())
    label_mean_per_group = data[label].groupby(keys).mean()
    nrmse = (rmse_per_group / label_mean_per_group).values

    result = 1 - np.mean(nrmse)
    print(result)
    return result

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):
    """Build a gradient-boosting regressor and fit it with early stopping.

    Parameters
    ----------
    train_x, train_y : training features and target.
    valid_x, valid_y : validation features and target; the validation set is
        the second entry of ``eval_set``.
    m_type : {'lgb', 'xgb'}
        Which library to use.

    Returns
    -------
    The fitted ``lgb.LGBMRegressor`` or ``xgb.XGBRegressor``.

    Raises
    ------
    ValueError
        If ``m_type`` is not one of the supported values.

    Notes
    -----
    The LightGBM branch reads the module-level ``cate_feat`` list.
    """
    # Fail early with a clear message instead of an UnboundLocalError below.
    if m_type not in ('lgb', 'xgb'):
        raise ValueError("m_type must be 'lgb' or 'xgb', got {!r}".format(m_type))

    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2333,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              categorical_feature=cate_feat,
              early_stopping_rounds=100, verbose=100)
    else:
        # NOTE(review): min_child_samples is a LightGBM parameter name;
        # XGBoost's counterpart is min_child_weight -- confirm whether this
        # setting has any effect here.
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000,
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9,
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse'
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              early_stopping_rounds=100, verbose=100)
    return model

def get_train_model(df_, m, m_type='lgb'):
    """Train, validate and forecast a single target month.

    With ``st = 13`` the rows are split on ``mt``:
    train = ``st..m-5``, validation = ``m-4``, test = ``m`` and
    "all" = ``st..m-1``. A model is first fitted on train with early
    stopping on validation and scored; it is then refitted on "all" with
    ``best iteration + 100`` trees and used to forecast month ``m``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Feature frame containing ``mt``, ``label``, ``id``, ``model`` and the
        columns named in the module-level ``features`` list. Not modified.
    m : int
        Month index (``mt``) to forecast.
    m_type : {'lgb', 'xgb'}
        Model family, forwarded to ``get_model_type``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``sub`` with columns ``id`` and ``forecastVolum`` (clipped at zero,
        rounded to int) for the test rows, and the raw validation-month
        predictions of the first (early-stopped) model.

    Notes
    -----
    Reads the module-level ``features`` and ``cate_feat``.
    """
    df = df_.copy()
    # Dataset split
    st = 13
    all_idx   = df['mt'].between(st , m-1)
    train_idx = df['mt'].between(st , m-5)
    valid_idx = df['mt'].between(m-4, m-4)
    test_idx  = df['mt'].between(m  , m  )
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )
    # Final train / validation matrices
    train_x = df.loc[train_idx, features]
    train_y = df.loc[train_idx, 'label']
    valid_x = df.loc[valid_idx, features]
    valid_y = df.loc[valid_idx, 'label']
    # Fit with early stopping on the validation month
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)
    # Offline evaluation; score() prints the metric. An explicit copy is
    # passed so nothing is ever written into a slice of df.
    df['pred_label'] = model.predict(df[features])
    score(df.loc[valid_idx].copy())
    # Online model: refit on every labelled month before m
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'])
    df['forecastVolum'] = model.predict(df[features])
    print('valid mean:',df.loc[valid_idx, 'pred_label'].mean())
    print('true  mean:',df.loc[valid_idx, 'label'].mean())
    print('test  mean:',df.loc[test_idx, 'forecastVolum'].mean())
    # Result for this stage: non-negative integer forecasts for month m
    sub = df.loc[test_idx, ['id']].copy()
    sub['forecastVolum'] = df.loc[test_idx, 'forecastVolum'].clip(lower=0).round().astype(int)
    return sub, df.loc[valid_idx, 'pred_label']

In [38]:
if __name__ == '__main__':
    # ---- Load the raw tables ----
    path = '../ccf_car/'
    train_sales = pd.read_csv(path + 'train_sales_data.csv')
    train_search = pd.read_csv(path + 'train_search_data.csv')
    train_user = pd.read_csv(path + 'train_user_reply_data.csv')
    evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
    submit_example = pd.read_csv(path + 'submit_example.csv')

    # ---- Stack training rows and rows to forecast, attach side tables ----
    data = pd.concat([train_sales, evaluation_public], ignore_index=True)
    data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
    data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
    # Exponential smoothing of salesVolume was tried at this point and is disabled:
    #mask=~data['salesVolume'].isnull()
    #data['salesVolume'][mask] = exponential_smoothing(data['salesVolume'][mask],0.95)
    data['label'] = data['salesVolume']
    data['id'] = data['id'].fillna(0).astype(int)
    data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
    # Integer-encode the string categories in order of first appearance
    for i in ['bodyType', 'model']:
        data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
    # Month index counted from January 2016 (= 1)
    data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

    # ---- Total sales per province and month ----
    # Rows without an observed salesVolume stay NaN.
    # NOTE(review): this column is computed once and is not refreshed after
    # forecasts are written back in the loop below -- confirm intended.
    has_sales = data['salesVolume'].notnull()
    data['area_sales_volume'] = (
        data.groupby(['province', 'regYear', 'regMonth'])['salesVolume']
        .transform('sum')
        .where(has_sales)
    )

    # ---- Month-by-month forecast; each forecast feeds the next month's lags ----
    m_type = 'xgb'
    cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']
    for month in [25, 26, 27, 28]:
        # Rebuilt every iteration because `data` now holds earlier forecasts.
        data_df, stat_feat = get_stat_feature(data)

        num_feat = ['regYear'] + stat_feat
        if m_type == 'lgb':
            for i in cate_feat:
                data_df[i] = data_df[i].astype('category')
        elif m_type == 'xgb':
            lbl = LabelEncoder()
            for i in tqdm(cate_feat):
                data_df[i] = lbl.fit_transform(data_df[i].astype(str))

        features = num_feat + cate_feat
        print(len(features), len(set(features)))

        #data_df.to_csv('middle_rst.csv',index=False)
        #break
        sub, val_pred = get_train_model(data_df, month, m_type)
        # Write the forecast back as if it were observed (mt 25 == Jan 2018).
        target_rows = (data.regMonth == (month - 24)) & (data.regYear == 2018)
        data.loc[target_rows, 'salesVolume'] = sub['forecastVolum'].values
        data.loc[target_rows, 'label'] = sub['forecastVolum'].values

    # ---- Assemble and save the submission ----
    sub = data.loc[(data.regMonth >= 1) & (data.regYear == 2018), ['id', 'salesVolume']].copy()
    sub.columns = ['id', 'forecastVolum']
    sub[['id', 'forecastVolum']].round().astype(int).to_csv('../rst/car_prediction_sales.csv', index=False)

100%|██████████| 3/3 [00:01<00:00,  2.87it/s]
100%|██████████| 4/4 [00:00<00:00, 27.81it/s]


32 32
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
[0]	validation_0-rmse:742.849	validation_1-rmse:916.995
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:702.771	validation_1-rmse:876.321
[200]	validation_0-rmse:222.565	validation_1-rmse:354.341
[300]	validation_0-rmse:123.294	validation_1-rmse:245.748
[400]	validation_0-rmse:107.931	validation_1-rmse:238.778
[500]	validation_0-rmse:99.3993	validation_1-rmse:236.557
[600]	validation_0-rmse:90.9904	validation_1-rmse:232.451
[700]	validation_0-rmse:86.105	validation_1-rmse:231.115
[800]	validation_0-rmse:81.0385	validation_1-rmse:231.079
[900]	validation_0-rmse:76.7003	validation_1-rmse:230.648
[1000]	validation_0-rmse:72.2561	validation_1-rmse:228.458
[1100]	validation_0-rmse:68.6501	validation_1-rmse:227.311
[1200]	validation_0-rmse:65.4614	validation_1-rmse:225.554
[1300]	valid

100%|██████████| 3/3 [00:01<00:00,  3.04it/s]
100%|██████████| 4/4 [00:00<00:00, 27.62it/s]


32 32
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
[0]	validation_0-rmse:764.161	validation_1-rmse:881.979
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:724.045	validation_1-rmse:842.767
[200]	validation_0-rmse:229.363	validation_1-rmse:435.032
[300]	validation_0-rmse:123.235	validation_1-rmse:378.546
[400]	validation_0-rmse:110.499	validation_1-rmse:377.78
[500]	validation_0-rmse:102.6	validation_1-rmse:379.815
Stopping. Best iteration:
[435]	validation_0-rmse:107.415	validation_1-rmse:376.813

0.5680454367352807
valid mean: 364.9231872558594
true  mean: 531.3914267715669
test  mean: 336.1412048339844


100%|██████████| 3/3 [00:01<00:00,  3.01it/s]
100%|██████████| 4/4 [00:00<00:00, 27.33it/s]


32 32
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
[0]	validation_0-rmse:776.747	validation_1-rmse:939.855
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:736.65	validation_1-rmse:899.532
[200]	validation_0-rmse:237.025	validation_1-rmse:422.125
[300]	validation_0-rmse:131.544	validation_1-rmse:328.373
[400]	validation_0-rmse:117.682	validation_1-rmse:319.741
Stopping. Best iteration:
[389]	validation_0-rmse:118.266	validation_1-rmse:319.349

0.6671840992711282
valid mean: 436.0978698730469
true  mean: 577.0072390614604
test  mean: 370.2229309082031


100%|██████████| 3/3 [00:01<00:00,  2.88it/s]
100%|██████████| 4/4 [00:00<00:00, 30.48it/s]


32 32
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
[0]	validation_0-rmse:792.963	validation_1-rmse:1271.5
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:752.784	validation_1-rmse:1230.87
[200]	validation_0-rmse:243.32	validation_1-rmse:707.234
[300]	validation_0-rmse:129.555	validation_1-rmse:636.811
Stopping. Best iteration:
[234]	validation_0-rmse:157.385	validation_1-rmse:632.894

0.38363959207905995
valid mean: 395.715576171875
true  mean: 768.2413899779178
test  mean: 361.51641845703125


# 平滑

In [3]:
#simple

# given a series and alpha, return series of smoothed points
def exponential_smoothing(series, alpha):
    """Simple exponential smoothing seeded with the series mean.

    Parameters
    ----------
    series : iterable of numbers
        List, numpy array or pandas Series. Values are consumed in iteration
        order, so a Series with any index works.
    alpha : float
        Smoothing factor; the weight given to the current observation.

    Returns
    -------
    list
        Smoothed values, same length as ``series``. The first element is the
        mean of the whole series (not the first observation); an empty input
        yields an empty list.
    """
    # Materialise once so indexing is positional, never label-based.
    values = list(series)
    if not values:
        return []
    result = [sum(values) / len(values)]  # average used as the first value
    for value in values[1:]:
        result.append(alpha * value + (1 - alpha) * result[-1])
    return result

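#double (NOTE: the definition below is identical to the simple version above; double exponential smoothing is not implemented)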

# given a series and alpha, return series of smoothed points
def exponential_smoothing(series, alpha):
    """Simple exponential smoothing seeded with the series mean.

    NOTE(review): this re-defines a function of the same name that appears
    earlier in this cell with the same logic; this later definition is the
    one in effect once the cell has run.

    Parameters
    ----------
    series : iterable of numbers
        List, numpy array or pandas Series. Values are consumed in iteration
        order, so a Series with any index works.
    alpha : float
        Smoothing factor; the weight given to the current observation.

    Returns
    -------
    list
        Smoothed values, same length as ``series``. The first element is the
        mean of the whole series; an empty input yields an empty list.
    """
    # Materialise once so indexing is positional, never label-based.
    values = list(series)
    if not values:
        return []
    result = [sum(values) / len(values)]  # average used as the first value
    for value in values[1:]:
        result.append(alpha * value + (1 - alpha) * result[-1])
    return result

#triple
def initial_trend(series, slen):
    """Estimate the initial per-step trend from the first two seasons.

    Averages ``(series[i + slen] - series[i]) / slen`` over the first
    ``slen`` positions.

    Parameters
    ----------
    series : sequence of numbers
        Needs at least ``2 * slen`` values; read positionally.
    slen : int
        Season length.

    Returns
    -------
    float
    """
    # Positional access regardless of any pandas index on the input.
    values = list(series)
    total = 0.0  # deliberately not named `sum`, to keep the built-in usable
    for i in range(slen):
        total += float(values[i + slen] - values[i]) / slen
    return total / slen
def initial_seasonal_components(series, slen):
    """Compute initial additive seasonal offsets.

    The series is cut into ``len(series) // slen`` complete seasons. For each
    position ``i`` within a season the result is the average, over seasons,
    of the value at that position minus that season's mean.

    Parameters
    ----------
    series : sequence of numbers
        Read positionally; trailing values beyond the last complete season
        are ignored.
    slen : int
        Season length.

    Returns
    -------
    dict
        Maps position ``0..slen-1`` to its seasonal offset.
    """
    # One positional view of the data, so slicing and item access agree even
    # when the input is a pandas Series with a non-default index.
    values = list(series)
    n_seasons = len(values) // slen
    # Mean of every complete season
    season_averages = [
        sum(values[slen * j:slen * j + slen]) / float(slen)
        for j in range(n_seasons)
    ]
    # Average deviation from the season mean at each in-season position
    seasonals = {}
    for i in range(slen):
        deviation_total = 0.0
        for j in range(n_seasons):
            deviation_total += values[slen * j + i] - season_averages[j]
        seasonals[i] = deviation_total / n_seasons
    return seasonals
def triple_exponential_smoothing(series, slen, alpha, beta, gamma, n_preds):
    """Additive triple exponential smoothing with optional forecasting.

    Parameters
    ----------
    series : sequence of numbers
        Observed values, read positionally.
    slen : int
        Season length.
    alpha, beta, gamma : float
        Smoothing factors for level, trend and seasonal components.
    n_preds : int
        Number of steps to forecast past the end of ``series``.

    Returns
    -------
    list
        ``len(series) + n_preds`` values: the first observation, then the
        fitted values, then the forecasts.
    """
    # Positional access regardless of any pandas index on the input.
    values = list(series)
    n_obs = len(values)
    seasonals = initial_seasonal_components(values, slen)
    result = []
    for i in range(n_obs + n_preds):
        if i == 0:
            # Initial level is the first observation; trend from two seasons.
            smooth = values[0]
            trend = initial_trend(values, slen)
            result.append(values[0])
            continue
        season_pos = i % slen
        if i >= n_obs:
            # Forecasting: extrapolate level + trend, add the seasonal offset.
            m = i - n_obs + 1
            result.append((smooth + m*trend) + seasonals[season_pos])
        else:
            val = values[i]
            last_smooth, smooth = smooth, alpha*(val-seasonals[season_pos]) + (1-alpha)*(smooth+trend)
            trend = beta * (smooth-last_smooth) + (1-beta)*trend
            seasonals[season_pos] = gamma*(val-smooth) + (1-gamma)*seasonals[season_pos]
            result.append(smooth+trend+seasonals[season_pos])
    return result

In [7]:
# Preview the first rows of the feature columns passed to the model.
data_df[features].head()

Unnamed: 0,regYear,shift_model_adcode_mt_label_1,shift_model_adcode_mt_label_2,shift_model_adcode_mt_label_3,shift_model_adcode_mt_label_4,shift_model_adcode_mt_label_5,shift_model_adcode_mt_label_6,shift_model_adcode_mt_label_7,shift_model_adcode_mt_label_8,shift_model_adcode_mt_label_9,shift_model_adcode_mt_popularity_1,shift_model_adcode_mt_popularity_2,shift_model_adcode_mt_popularity_3,shift_model_adcode_mt_popularity_4,shift_model_adcode_mt_popularity_5,shift_model_adcode_mt_popularity_6,shift_model_adcode_mt_popularity_7,shift_model_adcode_mt_popularity_8,shift_model_adcode_mt_popularity_9,shift_model_adcode_mt_area_sales_volume_1,shift_model_adcode_mt_area_sales_volume_2,shift_model_adcode_mt_area_sales_volume_3,shift_model_adcode_mt_area_sales_volume_4,shift_model_adcode_mt_area_sales_volume_5,shift_model_adcode_mt_area_sales_volume_6,shift_model_adcode_mt_area_sales_volume_7,shift_model_adcode_mt_area_sales_volume_8,shift_model_adcode_mt_area_sales_volume_9,adcode,bodyType,model,regMonth
0,2016,,,,,,,,,,,,,,,,,,,,,,,,,,,,6,0,0,0
1,2016,,,,,,,,,,,,,,,,,,,,,,,,,,,,20,0,0,0
2,2016,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,0,0,0
3,2016,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0
4,2016,,,,,,,,,,,,,,,,,,,,,,,,,,,,19,0,0,0


In [41]:
# Display the list of feature column names.
features

['regYear',
 'shift_model_adcode_mt_label_1',
 'shift_model_adcode_mt_label_2',
 'shift_model_adcode_mt_label_3',
 'shift_model_adcode_mt_label_4',
 'shift_model_adcode_mt_label_5',
 'shift_model_adcode_mt_label_6',
 'shift_model_adcode_mt_label_7',
 'shift_model_adcode_mt_label_8',
 'shift_model_adcode_mt_label_9',
 'shift_model_adcode_mt_popularity_1',
 'shift_model_adcode_mt_popularity_2',
 'shift_model_adcode_mt_popularity_3',
 'shift_model_adcode_mt_popularity_4',
 'shift_model_adcode_mt_popularity_5',
 'shift_model_adcode_mt_popularity_6',
 'shift_model_adcode_mt_popularity_7',
 'shift_model_adcode_mt_popularity_8',
 'shift_model_adcode_mt_popularity_9',
 'shift_model_adcode_mt_area_sales_volume_1',
 'shift_model_adcode_mt_area_sales_volume_2',
 'shift_model_adcode_mt_area_sales_volume_3',
 'shift_model_adcode_mt_area_sales_volume_4',
 'shift_model_adcode_mt_area_sales_volume_5',
 'shift_model_adcode_mt_area_sales_volume_6',
 'shift_model_adcode_mt_area_sales_volume_7',
 'shift_m

In [8]:
# Preview the first rows of the merged data frame.
data.head()

Unnamed: 0,adcode,bodyType,forecastVolum,id,model,province,regMonth,regYear,salesVolume,popularity,carCommentVolum,newsReplyVolum,label,mt,area_sales_volume
0,310000,0,,0,0,上海,1,2016,292.0,1479.0,11.0,106.0,292.0,1,38525.0
1,530000,0,,0,0,云南,1,2016,466.0,1594.0,11.0,106.0,466.0,1,36511.0
2,150000,0,,0,0,内蒙古,1,2016,257.0,1479.0,11.0,106.0,257.0,1,25295.0
3,110000,0,,0,0,北京,1,2016,408.0,2370.0,11.0,106.0,408.0,1,43567.0
4,510000,0,,0,0,四川,1,2016,610.0,3562.0,11.0,106.0,610.0,1,72231.0


In [35]:
# Smooth the observed sales values in place (alpha = 0.95).
# .loc is used instead of chained indexing so the write reliably reaches `data`.
# NOTE(review): the smoother runs over all non-null rows in frame order, i.e.
# across different model/province series -- confirm this is intended.
# NOTE(review): re-running this cell smooths already-smoothed values again.
mask = data['salesVolume'].notnull()
data.loc[mask, 'salesVolume'] = exponential_smoothing(data.loc[mask, 'salesVolume'], 0.95)

In [32]:
# NOTE(review): `tmpprice` is not assigned in any visible cell; this cell
# relies on state from a cell that is not shown -- confirm where it comes from.
tmpprice

[532.8333102365115,
 469.34166551182557,
 267.61708327559126,
 400.98085416377955,
 599.549042708189,
 225.67745213540945,
 489.13387260677047,
 248.65669363033854,
 3465.682834681517,
 600.784141734076,
 862.2392070867038,
 283.46196035433525,
 304.87309801771676,
 525.3936549008858,
 643.7696827450443,
 635.4384841372522,
 530.5219242068626,
 465.42609621034313,
 774.7213048105172,
 223.98606524052587,
 294.2993032620263,
 308.26496516310135,
 384.96324825815503,
 309.9481624129078,
 262.4974081206454,
 439.67487040603226,
 741.1337435203017,
 298.3066871760151,
 860.4153343588008,
 410.67076671794007,
 7934.033538335897,
 1157.6516769167952,
 1658.63258384584,
 456.28162919229203,
 771.4140814596145,
 846.0707040729808,
 1408.403535203649,
 794.3201767601824,
 683.8160088380092,
 1093.4408004419006,
 1293.472040022095,
 243.2736020011048,
 240.16368010005525,
 455.65818400500274,
 243.18290920025012,
 294.3091454600125,
 260.7654572730006,
 777.78827286365,
 440.7394136431825,
 153.

In [36]:
# Show the first ten salesVolume values.
data['salesVolume'][:10]

0     532.833310
1     469.341666
2     267.617083
3     400.980854
4     599.549043
5     225.677452
6     489.133873
7     248.656694
8    3465.682835
9     600.784142
Name: salesVolume, dtype: float64

In [30]:
# Show the first five entries of the non-null mask.
mask[:5]

0    True
1    True
2    True
3    True
4    True
Name: salesVolume, dtype: bool

# 为new_xgb保存数据

In [42]:
# Re-read the saved intermediate frame and export only the model feature columns.
# NOTE(review): './middle_rst.csv' is only written by a to_csv line that is
# commented out in the training cells, so it must exist from an earlier run.
data_df_old = pd.read_csv('./middle_rst.csv')
feature_frame = data_df_old[features]
feature_frame.to_csv('./middle_rst_features.csv')

In [44]:
if __name__ == '__main__':
    # ---- Load the raw tables ----
    path = '../ccf_car/'
    train_sales = pd.read_csv(path + 'train_sales_data.csv')
    train_search = pd.read_csv(path + 'train_search_data.csv')
    train_user = pd.read_csv(path + 'train_user_reply_data.csv')
    evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
    submit_example = pd.read_csv(path + 'submit_example.csv')

    # ---- Stack training rows and rows to forecast, attach side tables ----
    data = pd.concat([train_sales, evaluation_public], ignore_index=True)
    data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
    data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
    # Exponential smoothing of salesVolume was tried at this point and is disabled:
    #mask=~data['salesVolume'].isnull()
    #data['salesVolume'][mask] = exponential_smoothing(data['salesVolume'][mask],0.95)
    data['label'] = data['salesVolume']
    data['id'] = data['id'].fillna(0).astype(int)
    data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
    # Integer-encode the string categories in order of first appearance
    for i in ['bodyType', 'model']:
        data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
    # Month index counted from January 2016 (= 1)
    data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

    # ---- Total sales per province and month ----
    # Rows without an observed salesVolume stay NaN.
    has_sales = data['salesVolume'].notnull()
    data['area_sales_volume'] = (
        data.groupby(['province', 'regYear', 'regMonth'])['salesVolume']
        .transform('sum')
        .where(has_sales)
    )

    # ---- Save the prepared frame for use outside this notebook ----
    data.to_csv('original_data.csv',index=False)

# 调参

In [5]:
import sys
import numpy as np
import pandas as pd
import os
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
import math
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)


def get_stat_feature(df_):
    """Add lag ("shift") features for label, popularity and area sales volume.

    A composite integer key ``model_adcode_mt`` is built as
    ``(adcode + model) * 100 + mt``. For each source column and each lag
    ``i`` in 1..9, a row receives the value that the source column had on
    the row whose key is exactly ``i`` smaller, i.e. ``i`` steps of ``mt``
    earlier for the same ``adcode + model`` sum.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain ``adcode``, ``model``, ``mt``, ``label``,
        ``popularity`` and ``area_sales_volume``. It is not modified.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A copy of ``df_`` with the key columns and the 27 ``shift_*``
        feature columns added, and the names of those feature columns.
    """
    df = df_.copy()
    # NOTE(review): assumes adcode + model is unique per (region, model) pair
    # and that mt + lag stays below 100, so keys never collide -- confirm.
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt']

    stat_feat = []
    for col in tqdm(['label', 'popularity', 'area_sales_volume']):
        # Only rows where the source value is known can act as lag donors.
        observed = df[col].notnull()
        for lag in range(1, 10):
            key_name = 'model_adcode_mt_{}_{}'.format(col, lag)
            feat_name = 'shift_model_adcode_mt_{}_{}'.format(col, lag)
            stat_feat.append(feat_name)
            # Donor key moved forward by `lag`, so it equals the key of the
            # row that should receive the value.
            df[key_name] = df['model_adcode_mt'] + lag
            donor_values = df[observed].set_index(key_name)[col]
            df[feat_name] = df['model_adcode_mt'].map(donor_values)

    return df, stat_feat

def score(data, pred='pred_label', label='label', group='model'):
    """Return 1 minus the mean per-group normalised RMSE, and print it.

    Predictions are clipped at zero and rounded to integers before scoring.
    For every value of ``group`` the RMSE between predictions and labels is
    divided by that group's mean label; the result is one minus the average
    of those ratios over all groups.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the prediction, label and grouping columns. It is not
        modified.
    pred, label, group : str
        Column names for predictions, ground truth and the grouping key.

    Returns
    -------
    float
        ``1 - mean(NRMSE)``; higher is better.
    """
    # Work on derived Series so the caller's frame is left untouched.
    pred_int = data[pred].clip(lower=0).round().astype(int)
    squared_error = (pred_int - data[label]) ** 2
    keys = data[group]

    rmse_per_group = np.sqrt(squared_error.groupby(keys).mean())
    label_mean_per_group = data[label].groupby(keys).mean()
    nrmse = (rmse_per_group / label_mean_per_group).values

    result = 1 - np.mean(nrmse)
    print(result)
    return result

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):
    """Build a gradient-boosting regressor and fit it with early stopping.

    Parameters
    ----------
    train_x, train_y : training features and target.
    valid_x, valid_y : validation features and target; the validation set is
        the second entry of ``eval_set``.
    m_type : {'lgb', 'xgb'}
        Which library to use.

    Returns
    -------
    The fitted ``lgb.LGBMRegressor`` or ``xgb.XGBRegressor``.

    Raises
    ------
    ValueError
        If ``m_type`` is not one of the supported values.

    Notes
    -----
    The LightGBM branch reads the module-level ``cate_feat`` list.
    """
    # Fail early with a clear message instead of an UnboundLocalError below.
    if m_type not in ('lgb', 'xgb'):
        raise ValueError("m_type must be 'lgb' or 'xgb', got {!r}".format(m_type))

    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2333,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              categorical_feature=cate_feat,
              early_stopping_rounds=100, verbose=100)
    else:
        # NOTE(review): bagging_fraction, num_leaves, num_trees and
        # min_child_samples are LightGBM-style parameter names -- confirm
        # whether XGBoost uses or ignores them.
        model = xgb.XGBRegressor(
                                bagging_fraction=0.58,num_leaves=127,num_trees=527,
                                max_depth=5,learning_rate=0.0511,n_estimators=8679,
                                objective='reg:gamma',tree_method = 'hist',subsample=0.9,
                                colsample_bytree=0.7,min_child_samples=6,eval_metric = 'rmse'
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              early_stopping_rounds=100, verbose=100)
    return model

def get_train_model(df_, m, m_type='lgb'):
    """Train, validate and forecast a single target month.

    With ``st = 13`` the rows are split on ``mt``:
    train = ``st..m-5``, validation = ``m-4``, test = ``m`` and
    "all" = ``st..m-1``. A model is first fitted on train with early
    stopping on validation and scored; it is then refitted on "all" with
    ``best iteration + 100`` trees and used to forecast month ``m``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Feature frame containing ``mt``, ``label``, ``id``, ``model`` and the
        columns named in the module-level ``features`` list. Not modified.
    m : int
        Month index (``mt``) to forecast.
    m_type : {'lgb', 'xgb'}
        Model family, forwarded to ``get_model_type``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``sub`` with columns ``id`` and ``forecastVolum`` (clipped at zero,
        rounded to int) for the test rows, and the raw validation-month
        predictions of the first (early-stopped) model.

    Notes
    -----
    Reads the module-level ``features`` and ``cate_feat``.
    """
    df = df_.copy()
    # Dataset split
    st = 13
    all_idx   = df['mt'].between(st , m-1)
    train_idx = df['mt'].between(st , m-5)
    valid_idx = df['mt'].between(m-4, m-4)
    test_idx  = df['mt'].between(m  , m  )
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )
    # Final train / validation matrices
    train_x = df.loc[train_idx, features]
    train_y = df.loc[train_idx, 'label']
    valid_x = df.loc[valid_idx, features]
    valid_y = df.loc[valid_idx, 'label']
    # Fit with early stopping on the validation month
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)
    # Offline evaluation; score() prints the metric. An explicit copy is
    # passed so nothing is ever written into a slice of df.
    df['pred_label'] = model.predict(df[features])
    score(df.loc[valid_idx].copy())
    # Online model: refit on every labelled month before m
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'])
    df['forecastVolum'] = model.predict(df[features])
    print('valid mean:',df.loc[valid_idx, 'pred_label'].mean())
    print('true  mean:',df.loc[valid_idx, 'label'].mean())
    print('test  mean:',df.loc[test_idx, 'forecastVolum'].mean())
    # Result for this stage: non-negative integer forecasts for month m
    sub = df.loc[test_idx, ['id']].copy()
    sub['forecastVolum'] = df.loc[test_idx, 'forecastVolum'].clip(lower=0).round().astype(int)
    return sub, df.loc[valid_idx, 'pred_label']

In [6]:
if __name__ == '__main__':
    # ---- Load the raw tables ----
    path = '../ccf_car/'
    train_sales = pd.read_csv(path + 'train_sales_data.csv')
    train_search = pd.read_csv(path + 'train_search_data.csv')
    train_user = pd.read_csv(path + 'train_user_reply_data.csv')
    evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
    submit_example = pd.read_csv(path + 'submit_example.csv')

    # ---- Stack training rows and rows to forecast, attach side tables ----
    data = pd.concat([train_sales, evaluation_public], ignore_index=True)
    data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
    data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
    # Exponential smoothing of salesVolume was tried at this point and is disabled:
    #mask=~data['salesVolume'].isnull()
    #data['salesVolume'][mask] = exponential_smoothing(data['salesVolume'][mask],0.95)
    data['label'] = data['salesVolume']
    data['id'] = data['id'].fillna(0).astype(int)
    data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
    # Integer-encode the string categories in order of first appearance
    for i in ['bodyType', 'model']:
        data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
    # Month index counted from January 2016 (= 1)
    data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

    # ---- Total sales per province and month ----
    # Rows without an observed salesVolume stay NaN.
    # NOTE(review): this column is computed once and is not refreshed after
    # forecasts are written back in the loop below -- confirm intended.
    has_sales = data['salesVolume'].notnull()
    data['area_sales_volume'] = (
        data.groupby(['province', 'regYear', 'regMonth'])['salesVolume']
        .transform('sum')
        .where(has_sales)
    )

    # ---- Month-by-month forecast; each forecast feeds the next month's lags ----
    m_type = 'lgb'
    cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']
    for month in [25, 26, 27, 28]:
        # Rebuilt every iteration because `data` now holds earlier forecasts.
        data_df, stat_feat = get_stat_feature(data)

        num_feat = ['regYear'] + stat_feat
        if m_type == 'lgb':
            for i in cate_feat:
                data_df[i] = data_df[i].astype('category')
        elif m_type == 'xgb':
            lbl = LabelEncoder()
            for i in tqdm(cate_feat):
                data_df[i] = lbl.fit_transform(data_df[i].astype(str))

        features = num_feat + cate_feat
        print(len(features), len(set(features)))

        #data_df.to_csv('middle_rst.csv',index=False)
        #break
        sub, val_pred = get_train_model(data_df, month, m_type)
        # Write the forecast back as if it were observed (mt 25 == Jan 2018).
        target_rows = (data.regMonth == (month - 24)) & (data.regYear == 2018)
        data.loc[target_rows, 'salesVolume'] = sub['forecastVolum'].values
        data.loc[target_rows, 'label'] = sub['forecastVolum'].values

    # ---- Assemble and save the submission ----
    sub = data.loc[(data.regMonth >= 1) & (data.regYear == 2018), ['id', 'salesVolume']].copy()
    sub.columns = ['id', 'forecastVolum']
    sub[['id', 'forecastVolum']].round().astype(int).to_csv('../rst/myx_lgb_tiaocan.csv', index=False)

100%|██████████| 3/3 [00:01<00:00,  3.03it/s]


32 32
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 8309.82	valid_1's l2: 50784.8
[200]	training's l2: 4524.09	valid_1's l2: 47517.7
[300]	training's l2: 3121.02	valid_1's l2: 46722.2
[400]	training's l2: 2261.11	valid_1's l2: 45954.5
[500]	training's l2: 1788.28	valid_1's l2: 45656.2
[600]	training's l2: 1442.11	valid_1's l2: 45398.4
[700]	training's l2: 1177.82	valid_1's l2: 45271.1
[800]	training's l2: 974.946	valid_1's l2: 45068.9
[900]	training's l2: 823.477	valid_1's l2: 45055.5
Early stopping, best iteration is:
[828]	training's l2: 927.752	valid_1's l2: 45039.4
0.7189541796021575
valid mean: 490.3871820360307
true  mean: 559.0532150776053
test  mean: 490.1664339729076


100%|██████████| 3/3 [00:01<00:00,  3.04it/s]


32 32
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 8653.88	valid_1's l2: 63392.8
[200]	training's l2: 4744.69	valid_1's l2: 60506.5
[300]	training's l2: 3402.46	valid_1's l2: 59542
[400]	training's l2: 2580.75	valid_1's l2: 58958.2
[500]	training's l2: 2061.37	valid_1's l2: 58827.6
Early stopping, best iteration is:
[489]	training's l2: 2100.58	valid_1's l2: 58727.1
0.6716090665954307
valid mean: 465.393969897045
true  mean: 531.319290465632
test  mean: 351.98349570504763


100%|██████████| 3/3 [00:01<00:00,  2.93it/s]


32 32
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 9879.59	valid_1's l2: 62845.8
[200]	training's l2: 5476.27	valid_1's l2: 57839.9
[300]	training's l2: 3892.06	valid_1's l2: 57434.2
[400]	training's l2: 2950.35	valid_1's l2: 57096.5
[500]	training's l2: 2353.28	valid_1's l2: 56798.9
[600]	training's l2: 1920.32	valid_1's l2: 56673.4
[700]	training's l2: 1602.34	valid_1's l2: 56757.5
Early stopping, best iteration is:
[613]	training's l2: 1870.6	valid_1's l2: 56647.9
0.7062427177346114
valid mean: 478.71703367452415
true  mean: 577.2344789356985
test  mean: 485.7528446871517


100%|██████████| 3/3 [00:01<00:00,  3.03it/s]


32 32
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 10185	valid_1's l2: 194624
[200]	training's l2: 5905.16	valid_1's l2: 184138
[300]	training's l2: 4160.35	valid_1's l2: 181575
[400]	training's l2: 3194.96	valid_1's l2: 180172
[500]	training's l2: 2606.81	valid_1's l2: 179475
[600]	training's l2: 2122.77	valid_1's l2: 178825
[700]	training's l2: 1798.56	valid_1's l2: 178431
[800]	training's l2: 1536.8	valid_1's l2: 178093
[900]	training's l2: 1344.6	valid_1's l2: 177870
[1000]	training's l2: 1146.65	valid_1's l2: 177603
[1100]	training's l2: 1005.4	valid_1's l2: 177455
[1200]	training's l2: 889.179	valid_1's l2: 177099
[1300]	training's l2: 789.104	valid_1's l2: 177057
[1400]	training's l2: 707.078	valid_1's l2: 176992
[1500]	training's l2: 631.296	valid_1's l2: 177007
[1600]	training's l2: 567.262	valid_1's l2: 176927
[1700]	training's l2: 512.983	valid_1's l2: 176942
[1800]	trai

# 去异常值

In [10]:
import sys
import numpy as np
import pandas as pd
import os
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
import math
# Show all columns
pd.set_option('display.max_columns', None)
# Show all rows
pd.set_option('display.max_rows', None)
# Set the display width of cell values to 100 (default is 50)
pd.set_option('max_colwidth',100)


def get_stat_feature(df_):
    """Add lagged ("shift") features per (adcode, model) series.

    A composite integer key ``model_adcode_mt`` is built as
    ``(adcode + model) * 100 + mt``. For every source column and every lag
    ``k`` in 1..9, the value of that column at key ``- k`` is looked up and
    stored in ``shift_model_adcode_mt_<col>_<k>``. Rows whose source value is
    null are excluded from the lookup table.

    The input frame is not modified; a copy is returned together with the
    list of generated lag-feature names.
    """
    frame = df_.copy()
    frame['model_adcode'] = frame['adcode'] + frame['model']
    frame['model_adcode_mt'] = frame['model_adcode'] * 100 + frame['mt']

    lag_names = []
    for source_col in tqdm(['label', 'popularity', 'area_sales_volume']):
        for lag in range(1, 10):
            key_col = 'model_adcode_mt_{}_{}'.format(source_col, lag)
            lag_col = 'shift_' + key_col
            lag_names.append(lag_col)
            # Key moved forward by `lag`: a row with key K then finds the
            # value that was recorded at key K - lag.
            frame[key_col] = frame['model_adcode_mt'] + lag
            known = frame[frame[source_col].notnull()]
            lookup = known.set_index(key_col)[source_col]
            frame[lag_col] = frame['model_adcode_mt'].map(lookup)

    return frame, lag_names

def score(data, pred='pred_label', label='label', group='model'):
    """Return ``1 - mean(NRMSE)`` where NRMSE is computed per ``group``.

    Predictions are clipped at zero and rounded to integers before scoring.
    For every group the RMSE between predictions and labels is divided by the
    group's mean label; the final score is one minus the average of those
    ratios. The score is printed and returned.

    Parameters
    ----------
    data : DataFrame containing the ``pred``, ``label`` and ``group`` columns.
    pred : name of the prediction column.
    label : name of the ground-truth column.
    group : name of the column that defines the groups.

    Notes
    -----
    Earlier revisions ignored ``pred``/``group`` (the names were hard-coded)
    and overwrote the prediction column of the frame passed in. The frame is
    now left untouched.
    """
    # Private copy so the caller's predictions are not rounded in place.
    cols = data[[group, pred, label]].copy()
    cols[pred] = cols[pred].clip(lower=0).round().astype(int)

    nrmse_score = []
    for _, part in cols.groupby(group):
        rmse = mse(part[pred].tolist(), part[label].tolist()) ** 0.5
        nrmse_score.append(rmse / part[label].mean())

    result = 1 - np.mean(nrmse_score)
    print(result)
    return result

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):
    """Build and fit a gradient-boosting regressor with early stopping.

    Parameters
    ----------
    train_x, train_y : training features and target.
    valid_x, valid_y : hold-out features and target; passed as the second
        entry of ``eval_set``.
    m_type : {'lgb', 'xgb'}
        'lgb' fits a ``lgb.LGBMRegressor`` and uses the notebook-level
        ``cate_feat`` list as categorical features; 'xgb' fits an
        ``xgb.XGBRegressor``.

    Returns
    -------
    The fitted model.

    Raises
    ------
    ValueError
        If ``m_type`` is not supported. Previously this fell through to
        ``return model`` and failed with an UnboundLocalError.
    """
    if m_type not in ('lgb', 'xgb'):
        raise ValueError(
            "unsupported m_type {!r}; expected 'lgb' or 'xgb'".format(m_type))

    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2333,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              categorical_feature=cate_feat,
              early_stopping_rounds=100, verbose=100)
    else:
        # NOTE(review): min_child_samples is LightGBM's parameter name; the
        # XGBoost counterpart appears to be min_child_weight, so this setting
        # is probably ignored. Left unchanged to keep results reproducible.
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000,
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9,
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse'
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              early_stopping_rounds=100, verbose=100)
    return model

def get_train_model(df_, m, m_type='lgb'):
    """Validate on month ``m-4``, refit on all history, and forecast month ``m``.

    Relies on the notebook-level globals ``features`` and ``cate_feat`` and on
    the sibling helpers ``get_model_type`` and ``score``.

    Parameters
    ----------
    df_ : feature frame with at least ``mt``, ``label``, ``id`` and every
        column listed in ``features``. It is copied, not modified.
    m : integer month index (``mt``) to forecast.
    m_type : 'lgb' or 'xgb', forwarded to ``get_model_type``.

    Returns
    -------
    (sub, valid_pred)
        ``sub`` holds ``id`` and the clipped, rounded integer ``forecastVolum``
        for month ``m``; ``valid_pred`` is the raw validation-month prediction
        of the early-stopped model.
    """
    df = df_.copy()
    # Dataset split
    st = 13
    all_idx   = (df['mt'].between(st , m-1))
    train_idx = (df['mt'].between(st , m-5))
    valid_idx = (df['mt'].between(m-4, m-4))
    test_idx  = (df['mt'].between(m  , m  ))
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )
    # Final train / validation matrices
    train_x = df.loc[train_idx, features]
    train_y = df.loc[train_idx, 'label']
    valid_x = df.loc[valid_idx, features]
    valid_y = df.loc[valid_idx, 'label']
    # get model
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)
    # offline: score() prints the validation metric; a boolean-mask slice is
    # passed so the scorer never touches `df` itself.
    df['pred_label'] = model.predict(df[features])
    score(df[valid_idx])
    # online: refit on all months before `m`, with the tree count fixed to
    # the early-stopped iteration plus a margin of 100
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'])
    df['forecastVolum'] = model.predict(df[features])
    print('valid mean:',df.loc[valid_idx, 'pred_label'].mean())
    print('true  mean:',df.loc[valid_idx, 'label'].mean())
    print('test  mean:',df.loc[test_idx, 'forecastVolum'].mean())
    # Stage result: explicit copy so adding a column does not write into a
    # slice of `df` (avoids SettingWithCopyWarning).
    sub = df.loc[test_idx, ['id']].copy()
    sub['forecastVolum'] = df.loc[test_idx, 'forecastVolum'].clip(lower=0).round().astype(int)
    return sub, df.loc[valid_idx, 'pred_label']

def quantile_clip(group):
    """Clip a Series to its own 5th-95th percentile band.

    Values below the 5% quantile are raised to it and values above the 95%
    quantile are lowered to it; missing values are left as they are. Both
    quantiles are taken from the unmodified input and a new Series is
    returned, so the object handed in (e.g. by ``groupby.transform``) is not
    mutated.
    """
    lower = group.quantile(.05)
    upper = group.quantile(.95)
    return group.clip(lower=lower, upper=upper)

In [11]:
if __name__ == '__main__':
    # Load the raw tables and stack the evaluation rows under the training
    # rows, then attach the search and user-reply tables.
    path = '../ccf_car/'
    train_sales = pd.read_csv(path + 'train_sales_data.csv')
    train_search = pd.read_csv(path + 'train_search_data.csv')
    train_user = pd.read_csv(path + 'train_user_reply_data.csv')
    evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
    submit_example = pd.read_csv(path + 'submit_example.csv')
    data = pd.concat([train_sales, evaluation_public], ignore_index=True)
    data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
    data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
    # Outlier handling: clip every (adcode, model) sales series with
    # quantile_clip before it becomes the training label.
    data['salesVolume'] = data.groupby(['adcode', 'model'])['salesVolume'].transform(quantile_clip)
    data['label'] = data['salesVolume']
    data['id'] = data['id'].fillna(0).astype(int)
    data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
    # Integer-encode the string columns in order of first appearance
    for i in ['bodyType', 'model']:
        data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
    # Running month index: January 2016 -> 1
    data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

    # Total known sales per (province, year, month). Rows without a sales
    # figure contribute nothing to the total and receive NaN themselves.
    has_sales = data['salesVolume'].notna()
    area_total = data.groupby(['province', 'regYear', 'regMonth'])['salesVolume'].transform('sum')
    data['area_sales_volume'] = area_total.where(has_sales)

    # Forecast the four evaluation months one at a time; each forecast is
    # written back so the next month's lag features can use it.
    for month in [25, 26, 27, 28]:
        m_type = 'xgb'

        data_df, stat_feat = get_stat_feature(data)

        num_feat = ['regYear'] + stat_feat
        cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']
        if m_type == 'lgb':
            for i in cate_feat:
                data_df[i] = data_df[i].astype('category')
        elif m_type == 'xgb':
            lbl = LabelEncoder()
            for i in tqdm(cate_feat):
                data_df[i] = lbl.fit_transform(data_df[i].astype(str))

        # `features` and `cate_feat` are read as globals by get_train_model
        # and get_model_type.
        features = num_feat + cate_feat
        print(len(features), len(set(features)))

        sub, val_pred = get_train_model(data_df, month, m_type)
        month_rows = (data.regMonth == (month - 24)) & (data.regYear == 2018)
        data.loc[month_rows, 'salesVolume'] = sub['forecastVolum'].values
        data.loc[month_rows, 'label'] = sub['forecastVolum'].values
    sub = data.loc[(data.regMonth >= 1) & (data.regYear == 2018), ['id', 'salesVolume']]
    sub.columns = ['id', 'forecastVolum']
    sub[['id', 'forecastVolum']].round().astype(int).to_csv('../rst/myx_xgb_abnormal.csv', index=False)

100%|██████████| 3/3 [00:01<00:00,  2.93it/s]
100%|██████████| 4/4 [00:00<00:00, 27.03it/s]


32 32
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
[0]	validation_0-rmse:757.617	validation_1-rmse:931.125
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:718.08	validation_1-rmse:891.171
[200]	validation_0-rmse:229.523	validation_1-rmse:365.122
[300]	validation_0-rmse:124.108	validation_1-rmse:252.668
[400]	validation_0-rmse:107.776	validation_1-rmse:243.108
[500]	validation_0-rmse:100.375	validation_1-rmse:238.056
[600]	validation_0-rmse:94.4662	validation_1-rmse:234.209
[700]	validation_0-rmse:88.8747	validation_1-rmse:231.041
[800]	validation_0-rmse:83.6664	validation_1-rmse:228.863
[900]	validation_0-rmse:79.7014	validation_1-rmse:225.99
[1000]	validation_0-rmse:75.7866	validation_1-rmse:224.489
[1100]	validation_0-rmse:71.9663	validation_1-rmse:223.368
[1200]	validation_0-rmse:68.4655	validation_1-rmse:223.076
Stopping. Bes

100%|██████████| 3/3 [00:01<00:00,  2.88it/s]
100%|██████████| 4/4 [00:00<00:00, 26.06it/s]


32 32
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
[0]	validation_0-rmse:778.807	validation_1-rmse:898.617
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:739.264	validation_1-rmse:859.909
[200]	validation_0-rmse:235.811	validation_1-rmse:423.555
[300]	validation_0-rmse:123.719	validation_1-rmse:357.222
[400]	validation_0-rmse:112.581	validation_1-rmse:351.103
[500]	validation_0-rmse:105.382	validation_1-rmse:350.471
Stopping. Best iteration:
[436]	validation_0-rmse:109.313	validation_1-rmse:349.817

0.5938173711118979
valid mean: 380.1831970214844
true  mean: 531.6218680709541
test  mean: 356.3283386230469


100%|██████████| 3/3 [00:01<00:00,  2.83it/s]
100%|██████████| 4/4 [00:00<00:00, 24.90it/s]


32 32
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
[0]	validation_0-rmse:791.604	validation_1-rmse:956.125
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:752.093	validation_1-rmse:916.346
[200]	validation_0-rmse:244.98	validation_1-rmse:427.263
[300]	validation_0-rmse:130.936	validation_1-rmse:334.986
[400]	validation_0-rmse:115.848	validation_1-rmse:329.047
[500]	validation_0-rmse:107.751	validation_1-rmse:325.475
[600]	validation_0-rmse:101.715	validation_1-rmse:323.231
[700]	validation_0-rmse:98.2076	validation_1-rmse:321.05
[800]	validation_0-rmse:93.8661	validation_1-rmse:320.444
[900]	validation_0-rmse:89.82	validation_1-rmse:320.186
[1000]	validation_0-rmse:86.3518	validation_1-rmse:320.719
Stopping. Best iteration:
[913]	validation_0-rmse:89.2528	validation_1-rmse:319.882

0.6508045630795574
valid mean: 443.1093444824219

100%|██████████| 3/3 [00:01<00:00,  2.72it/s]
100%|██████████| 4/4 [00:00<00:00, 26.35it/s]


32 32
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
[0]	validation_0-rmse:807.946	validation_1-rmse:1189.55
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:768.364	validation_1-rmse:1148.69
[200]	validation_0-rmse:252.186	validation_1-rmse:606.758
[300]	validation_0-rmse:134.437	validation_1-rmse:536.715
Stopping. Best iteration:
[274]	validation_0-rmse:141.189	validation_1-rmse:532.994

0.40847847763405454
valid mean: 395.0877990722656
true  mean: 719.7307926829268
test  mean: 285.6610107421875


# model sales

In [1]:
import sys
import numpy as np
import pandas as pd
import os
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
import math
# Show all columns
pd.set_option('display.max_columns', None)
# Show all rows
pd.set_option('display.max_rows', None)
# Set the display width of cell values to 100 (default is 50)
pd.set_option('max_colwidth',100)


def get_stat_feature(df_):
    """Add lagged ("shift") features per (adcode, model) series.

    A composite integer key ``model_adcode_mt`` is built as
    ``(adcode + model) * 100 + mt``. For every source column and every lag
    ``k`` in 1..9, the value of that column at key ``- k`` is looked up and
    stored in ``shift_model_adcode_mt_<col>_<k>``. Rows whose source value is
    null are excluded from the lookup table.

    The input frame is not modified; a copy is returned together with the
    list of generated lag-feature names.
    """
    frame = df_.copy()
    frame['model_adcode'] = frame['adcode'] + frame['model']
    frame['model_adcode_mt'] = frame['model_adcode'] * 100 + frame['mt']

    lag_names = []
    for source_col in tqdm(['label', 'popularity', 'area_sales_volume']):
        for lag in range(1, 10):
            key_col = 'model_adcode_mt_{}_{}'.format(source_col, lag)
            lag_col = 'shift_' + key_col
            lag_names.append(lag_col)
            # Key moved forward by `lag`: a row with key K then finds the
            # value that was recorded at key K - lag.
            frame[key_col] = frame['model_adcode_mt'] + lag
            known = frame[frame[source_col].notnull()]
            lookup = known.set_index(key_col)[source_col]
            frame[lag_col] = frame['model_adcode_mt'].map(lookup)

    return frame, lag_names

def score(data, pred='pred_label', label='label', group='model'):
    """Return ``1 - mean(NRMSE)`` where NRMSE is computed per ``group``.

    Predictions are clipped at zero and rounded to integers before scoring.
    For every group the RMSE between predictions and labels is divided by the
    group's mean label; the final score is one minus the average of those
    ratios. The score is printed and returned.

    Parameters
    ----------
    data : DataFrame containing the ``pred``, ``label`` and ``group`` columns.
    pred : name of the prediction column.
    label : name of the ground-truth column.
    group : name of the column that defines the groups.

    Notes
    -----
    Earlier revisions ignored ``pred``/``group`` (the names were hard-coded)
    and overwrote the prediction column of the frame passed in. The frame is
    now left untouched.
    """
    # Private copy so the caller's predictions are not rounded in place.
    cols = data[[group, pred, label]].copy()
    cols[pred] = cols[pred].clip(lower=0).round().astype(int)

    nrmse_score = []
    for _, part in cols.groupby(group):
        rmse = mse(part[pred].tolist(), part[label].tolist()) ** 0.5
        nrmse_score.append(rmse / part[label].mean())

    result = 1 - np.mean(nrmse_score)
    print(result)
    return result

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):
    """Build and fit a gradient-boosting regressor with early stopping.

    Parameters
    ----------
    train_x, train_y : training features and target.
    valid_x, valid_y : hold-out features and target; passed as the second
        entry of ``eval_set``.
    m_type : {'lgb', 'xgb'}
        'lgb' fits a ``lgb.LGBMRegressor`` and uses the notebook-level
        ``cate_feat`` list as categorical features; 'xgb' fits an
        ``xgb.XGBRegressor``.

    Returns
    -------
    The fitted model.

    Raises
    ------
    ValueError
        If ``m_type`` is not supported. Previously this fell through to
        ``return model`` and failed with an UnboundLocalError.
    """
    if m_type not in ('lgb', 'xgb'):
        raise ValueError(
            "unsupported m_type {!r}; expected 'lgb' or 'xgb'".format(m_type))

    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2333,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              categorical_feature=cate_feat,
              early_stopping_rounds=100, verbose=100)
    else:
        # NOTE(review): min_child_samples is LightGBM's parameter name; the
        # XGBoost counterpart appears to be min_child_weight, so this setting
        # is probably ignored. Left unchanged to keep results reproducible.
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000,
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9,
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse'
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              early_stopping_rounds=100, verbose=100)
    return model

def get_train_model(df_, m, m_type='lgb'):
    """Validate on month ``m-4``, refit on all history, and forecast month ``m``.

    Relies on the notebook-level globals ``features`` and ``cate_feat`` and on
    the sibling helpers ``get_model_type`` and ``score``.

    Parameters
    ----------
    df_ : feature frame with at least ``mt``, ``label``, ``id`` and every
        column listed in ``features``. It is copied, not modified.
    m : integer month index (``mt``) to forecast.
    m_type : 'lgb' or 'xgb', forwarded to ``get_model_type``.

    Returns
    -------
    (sub, valid_pred)
        ``sub`` holds ``id`` and the clipped, rounded integer ``forecastVolum``
        for month ``m``; ``valid_pred`` is the raw validation-month prediction
        of the early-stopped model.
    """
    df = df_.copy()
    # Dataset split
    st = 13
    all_idx   = (df['mt'].between(st , m-1))
    train_idx = (df['mt'].between(st , m-5))
    valid_idx = (df['mt'].between(m-4, m-4))
    test_idx  = (df['mt'].between(m  , m  ))
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )
    # Final train / validation matrices
    train_x = df.loc[train_idx, features]
    train_y = df.loc[train_idx, 'label']
    valid_x = df.loc[valid_idx, features]
    valid_y = df.loc[valid_idx, 'label']
    # get model
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)
    # offline: score() prints the validation metric; a boolean-mask slice is
    # passed so the scorer never touches `df` itself.
    df['pred_label'] = model.predict(df[features])
    score(df[valid_idx])
    # online: refit on all months before `m`, with the tree count fixed to
    # the early-stopped iteration plus a margin of 100
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'])
    df['forecastVolum'] = model.predict(df[features])
    print('valid mean:',df.loc[valid_idx, 'pred_label'].mean())
    print('true  mean:',df.loc[valid_idx, 'label'].mean())
    print('test  mean:',df.loc[test_idx, 'forecastVolum'].mean())
    # Stage result: explicit copy so adding a column does not write into a
    # slice of `df` (avoids SettingWithCopyWarning).
    sub = df.loc[test_idx, ['id']].copy()
    sub['forecastVolum'] = df.loc[test_idx, 'forecastVolum'].clip(lower=0).round().astype(int)
    return sub, df.loc[valid_idx, 'pred_label']

In [3]:
if __name__ == '__main__':
    # Load the raw tables and stack the evaluation rows under the training
    # rows, then attach the search and user-reply tables.
    path = '../ccf_car/'
    train_sales = pd.read_csv(path + 'train_sales_data.csv')
    train_search = pd.read_csv(path + 'train_search_data.csv')
    train_user = pd.read_csv(path + 'train_user_reply_data.csv')
    evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
    submit_example = pd.read_csv(path + 'submit_example.csv')
    data = pd.concat([train_sales, evaluation_public], ignore_index=True)
    data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
    data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
    data['label'] = data['salesVolume']
    data['id'] = data['id'].fillna(0).astype(int)
    data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
    # Integer-encode the string columns in order of first appearance
    for i in ['bodyType', 'model']:
        data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
    # Running month index: January 2016 -> 1
    data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

    # Total known sales per (model, year, month). The column keeps the name
    # 'area_sales_volume' because get_stat_feature looks it up by that name,
    # even though in this cell the aggregate is per car model. Rows without
    # a sales figure contribute nothing and receive NaN themselves.
    has_sales = data['salesVolume'].notna()
    model_total = data.groupby(['model', 'regYear', 'regMonth'])['salesVolume'].transform('sum')
    data['area_sales_volume'] = model_total.where(has_sales)

    # Forecast the four evaluation months one at a time; each forecast is
    # written back so the next month's lag features can use it.
    for month in [25, 26, 27, 28]:
        m_type = 'xgb'

        data_df, stat_feat = get_stat_feature(data)

        num_feat = ['regYear'] + stat_feat
        cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']
        if m_type == 'lgb':
            for i in cate_feat:
                data_df[i] = data_df[i].astype('category')
        elif m_type == 'xgb':
            lbl = LabelEncoder()
            for i in tqdm(cate_feat):
                data_df[i] = lbl.fit_transform(data_df[i].astype(str))

        # `features` and `cate_feat` are read as globals by get_train_model
        # and get_model_type.
        features = num_feat + cate_feat
        print(len(features), len(set(features)))

        sub, val_pred = get_train_model(data_df, month, m_type)
        month_rows = (data.regMonth == (month - 24)) & (data.regYear == 2018)
        data.loc[month_rows, 'salesVolume'] = sub['forecastVolum'].values
        data.loc[month_rows, 'label'] = sub['forecastVolum'].values
    sub = data.loc[(data.regMonth >= 1) & (data.regYear == 2018), ['id', 'salesVolume']]
    sub.columns = ['id', 'forecastVolum']
    sub[['id', 'forecastVolum']].round().astype(int).to_csv('../rst/myx_model_sales.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.95it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 26.94it/s]


32 32
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
[0]	validation_0-rmse:756.046	validation_1-rmse:934.136
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:716.706	validation_1-rmse:894.206
[200]	validation_0-rmse:233.236	validation_1-rmse:366.55
[300]	validation_0-rmse:121.341	validation_1-rmse:237.091
[400]	validation_0-rmse:104.028	validation_1-rmse:225.787
[500]	validation_0-rmse:95.5082	validation_1-rmse:226.834
Stopping. Best iteration:
[407]	validation_0-rmse:103.426	validation_1-rmse:225.111

0.7538564197390623
valid mean: 501.8374938964844
true  mean: 559.0532150776053
test  mean: 461.4864501953125


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.90it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 27.03it/s]


32 32
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
[0]	validation_0-rmse:777.85	validation_1-rmse:898.685
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:738.473	validation_1-rmse:860.178
[200]	validation_0-rmse:240.043	validation_1-rmse:453.467
[300]	validation_0-rmse:126.359	validation_1-rmse:398.044
Stopping. Best iteration:
[283]	validation_0-rmse:132.573	validation_1-rmse:395.611

0.5687436258401762
valid mean: 370.1124572753906
true  mean: 531.319290465632
test  mean: 280.6367492675781


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.88it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 26.40it/s]


32 32
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
[0]	validation_0-rmse:790.765	validation_1-rmse:957.098
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:751.406	validation_1-rmse:917.475
[200]	validation_0-rmse:250.605	validation_1-rmse:418.734
[300]	validation_0-rmse:137.449	validation_1-rmse:298.219
[400]	validation_0-rmse:120.178	validation_1-rmse:289.474
[500]	validation_0-rmse:108.515	validation_1-rmse:286.026
[600]	validation_0-rmse:101.124	validation_1-rmse:287.188
Stopping. Best iteration:
[556]	validation_0-rmse:104.118	validation_1-rmse:284.691

0.6928770654964207
valid mean: 459.1951599121094
true  mean: 577.2344789356985
test  mean: 291.9375


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.66it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 26.93it/s]


32 32
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
[0]	validation_0-rmse:807.303	validation_1-rmse:1292.89
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:767.859	validation_1-rmse:1252.75
[200]	validation_0-rmse:257.339	validation_1-rmse:727.719
[300]	validation_0-rmse:133.302	validation_1-rmse:664.588
Stopping. Best iteration:
[227]	validation_0-rmse:176.042	validation_1-rmse:661.768

0.35097409977608596
valid mean: 382.886962890625
true  mean: 769.5532150776053
test  mean: 273.4256896972656


# area_sales&model_sales

In [1]:
import sys
import numpy as np
import pandas as pd
import os
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
import math
# Show all columns
pd.set_option('display.max_columns', None)
# Show all rows
pd.set_option('display.max_rows', None)
# Set the display width of cell values to 100 (default is 50)
pd.set_option('max_colwidth',100)


def get_stat_feature(df_):
    """Add lagged ("shift") features per (adcode, model) series.

    A composite integer key ``model_adcode_mt`` is built as
    ``(adcode + model) * 100 + mt``. For every source column and every lag
    ``k`` in 1..9, the value of that column at key ``- k`` is looked up and
    stored in ``shift_model_adcode_mt_<col>_<k>``. Rows whose source value is
    null are excluded from the lookup table.

    The input frame is not modified; a copy is returned together with the
    list of generated lag-feature names.
    """
    frame = df_.copy()
    frame['model_adcode'] = frame['adcode'] + frame['model']
    frame['model_adcode_mt'] = frame['model_adcode'] * 100 + frame['mt']

    lag_names = []
    for source_col in tqdm(['label', 'popularity', 'area_sales_volume', 'model_sales_volume']):
        for lag in range(1, 10):
            key_col = 'model_adcode_mt_{}_{}'.format(source_col, lag)
            lag_col = 'shift_' + key_col
            lag_names.append(lag_col)
            # Key moved forward by `lag`: a row with key K then finds the
            # value that was recorded at key K - lag.
            frame[key_col] = frame['model_adcode_mt'] + lag
            known = frame[frame[source_col].notnull()]
            lookup = known.set_index(key_col)[source_col]
            frame[lag_col] = frame['model_adcode_mt'].map(lookup)

    return frame, lag_names

def score(data, pred='pred_label', label='label', group='model'):
    """Return ``1 - mean(NRMSE)`` where NRMSE is computed per ``group``.

    Predictions are clipped at zero and rounded to integers before scoring.
    For every group the RMSE between predictions and labels is divided by the
    group's mean label; the final score is one minus the average of those
    ratios. The score is printed and returned.

    Parameters
    ----------
    data : DataFrame containing the ``pred``, ``label`` and ``group`` columns.
    pred : name of the prediction column.
    label : name of the ground-truth column.
    group : name of the column that defines the groups.

    Notes
    -----
    Earlier revisions ignored ``pred``/``group`` (the names were hard-coded)
    and overwrote the prediction column of the frame passed in. The frame is
    now left untouched.
    """
    # Private copy so the caller's predictions are not rounded in place.
    cols = data[[group, pred, label]].copy()
    cols[pred] = cols[pred].clip(lower=0).round().astype(int)

    nrmse_score = []
    for _, part in cols.groupby(group):
        rmse = mse(part[pred].tolist(), part[label].tolist()) ** 0.5
        nrmse_score.append(rmse / part[label].mean())

    result = 1 - np.mean(nrmse_score)
    print(result)
    return result

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):
    """Build and fit a gradient-boosting regressor with early stopping.

    Parameters
    ----------
    train_x, train_y : training features and target.
    valid_x, valid_y : hold-out features and target; passed as the second
        entry of ``eval_set``.
    m_type : {'lgb', 'xgb'}
        'lgb' fits a ``lgb.LGBMRegressor`` and uses the notebook-level
        ``cate_feat`` list as categorical features; 'xgb' fits an
        ``xgb.XGBRegressor``.

    Returns
    -------
    The fitted model.

    Raises
    ------
    ValueError
        If ``m_type`` is not supported. Previously this fell through to
        ``return model`` and failed with an UnboundLocalError.
    """
    if m_type not in ('lgb', 'xgb'):
        raise ValueError(
            "unsupported m_type {!r}; expected 'lgb' or 'xgb'".format(m_type))

    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2333,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              categorical_feature=cate_feat,
              early_stopping_rounds=100, verbose=100)
    else:
        # NOTE(review): min_child_samples is LightGBM's parameter name; the
        # XGBoost counterpart appears to be min_child_weight, so this setting
        # is probably ignored. Left unchanged to keep results reproducible.
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000,
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9,
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse'
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              early_stopping_rounds=100, verbose=100)
    return model

def get_train_model(df_, m, m_type='lgb'):
    """Validate on month ``m-4``, refit on all history and forecast month ``m``.

    Reads the module-level ``features`` and ``cate_feat`` lists.

    Args:
        df_: Feature frame with ``mt``, ``label``, ``id`` and every column
            named in ``features``. A copy is taken; the input is not modified.
        m: Month index (value of ``mt``) to forecast.
        m_type: ``'lgb'`` or ``'xgb'``; forwarded to ``get_model_type``.

    Returns:
        A tuple ``(sub, valid_pred)``: ``sub`` has the ``id`` and the
        non-negative integer ``forecastVolum`` of the rows with ``mt == m``;
        ``valid_pred`` is the raw ``pred_label`` series of the validation month.
    """
    df = df_.copy()
    # Dataset split by month index; history starts at month 13.
    st = 13
    all_idx   = (df['mt'].between(st , m-1))
    train_idx = (df['mt'].between(st , m-5))
    valid_idx = (df['mt'].between(m-4, m-4))
    test_idx  = (df['mt'].between(m  , m  ))
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )
    # Final train / validation matrices
    train_x = df.loc[train_idx, features]
    train_y = df.loc[train_idx, 'label']
    valid_x = df.loc[valid_idx, features]
    valid_y = df.loc[valid_idx, 'label']
    # get model
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)
    # offline: report the validation score (score() prints it)
    df['pred_label'] = model.predict(df[features])
    score(df[valid_idx])
    # online: refit on every labelled month, with a tree budget derived from
    # the early-stopped run plus a margin of 100
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'])
    df['forecastVolum'] = model.predict(df[features])
    print('valid mean:',df.loc[valid_idx, 'pred_label'].mean())
    print('true  mean:',df.loc[valid_idx, 'label'].mean())
    print('test  mean:',df.loc[test_idx, 'forecastVolum'].mean())
    # Stage result: explicit copy so adding a column does not write into a
    # slice of df (SettingWithCopy).
    sub = df.loc[test_idx, ['id']].copy()
    sub['forecastVolum'] = df.loc[test_idx, 'forecastVolum'].clip(lower=0).round().astype(int)
    return sub, df.loc[valid_idx, 'pred_label']

In [2]:
if __name__ == '__main__':
    # Load the raw competition files.
    path = '../ccf_car/'
    train_sales = pd.read_csv(path + 'train_sales_data.csv')
    train_search = pd.read_csv(path + 'train_search_data.csv')
    train_user = pd.read_csv(path + 'train_user_reply_data.csv')
    evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
    submit_example = pd.read_csv(path + 'submit_example.csv')

    # Stack history and the rows to forecast, then attach search and
    # user-reply data.
    data = pd.concat([train_sales, evaluation_public], ignore_index=True)
    data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
    data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
    data['label'] = data['salesVolume']
    data['id'] = data['id'].fillna(0).astype(int)
    data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
    # Integer-encode the string columns in order of first appearance.
    for i in ['bodyType', 'model']:
        data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
    # Running month index: January 2016 -> 1.
    data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

    # Monthly sales totals per province and per model. Rows without a sales
    # figure (the months to forecast) keep a missing total. The totals are
    # computed once, before any forecast is written back into `data`.
    has_sales = data['salesVolume'].notna()
    data['area_sales_volume'] = (
        data.groupby(['province', 'regYear', 'regMonth'])['salesVolume']
            .transform('sum')
            .where(has_sales)
    )
    data['model_sales_volume'] = (
        data.groupby(['model', 'regYear', 'regMonth'])['salesVolume']
            .transform('sum')
            .where(has_sales)
    )

    # Forecast one month at a time; each forecast becomes the label used to
    # build lag features for the following month.
    for month in [25, 26, 27, 28]:
        m_type = 'xgb'

        data_df, stat_feat = get_stat_feature(data)

        # `features` and `cate_feat` are read as globals by the model helpers.
        num_feat = ['regYear'] + stat_feat
        cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']
        if m_type == 'lgb':
            for i in cate_feat:
                data_df[i] = data_df[i].astype('category')
        elif m_type == 'xgb':
            lbl = LabelEncoder()
            for i in tqdm(cate_feat):
                data_df[i] = lbl.fit_transform(data_df[i].astype(str))

        features = num_feat + cate_feat
        print(len(features), len(set(features)))

        sub, val_pred = get_train_model(data_df, month, m_type)
        # Relies on `sub` having the same row order as the selected rows.
        target_rows = (data.regMonth == (month - 24)) & (data.regYear == 2018)
        data.loc[target_rows, 'salesVolume'] = sub['forecastVolum'].values
        data.loc[target_rows, 'label'] = sub['forecastVolum'].values

    # Collect all 2018 forecasts into the submission file.
    sub = data.loc[(data.regMonth >= 1) & (data.regYear == 2018), ['id', 'salesVolume']]
    sub = sub.rename(columns={'salesVolume': 'forecastVolum'})
    sub[['id', 'forecastVolum']].round().astype(int).to_csv('../rst/myx_area_model_sales.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.32it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 22.47it/s]


41 41
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
[0]	validation_0-rmse:756.046	validation_1-rmse:934.136
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:716.709	validation_1-rmse:894.329
[200]	validation_0-rmse:233.675	validation_1-rmse:389.818
[300]	validation_0-rmse:123.913	validation_1-rmse:289.517
[400]	validation_0-rmse:105.543	validation_1-rmse:277.879
[500]	validation_0-rmse:95.2349	validation_1-rmse:273.872
[600]	validation_0-rmse:86.256	validation_1-rmse:269.975
[700]	validation_0-rmse:80.3876	validation_1-rmse:267.794
[800]	validation_0-rmse:75.7549	validation_1-rmse:265.475
[900]	validation_0-rmse:71.9827	validation_1-rmse:266.068
Stopping. Best iteration:
[816]	validation_0-rmse:75.0684	validation_1-rmse:265.045

0.7417953529701986
valid mean: 488.8229064941406
true  mean: 559.0532150776053
test  mean: 469.514129638

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.19it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 24.53it/s]


41 41
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
[0]	validation_0-rmse:777.85	validation_1-rmse:898.685
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:738.477	validation_1-rmse:860.275
[200]	validation_0-rmse:239.929	validation_1-rmse:451.203
[300]	validation_0-rmse:125.964	validation_1-rmse:386.132
[400]	validation_0-rmse:107.577	validation_1-rmse:385.179
[500]	validation_0-rmse:97.5323	validation_1-rmse:384.702
Stopping. Best iteration:
[458]	validation_0-rmse:101.331	validation_1-rmse:384.231

0.5685182210206637
valid mean: 367.9149169921875
true  mean: 531.319290465632
test  mean: 286.6864318847656


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.24it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 23.53it/s]


41 41
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
[0]	validation_0-rmse:790.765	validation_1-rmse:957.098
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:751.409	validation_1-rmse:917.38
[200]	validation_0-rmse:248.649	validation_1-rmse:434.082
[300]	validation_0-rmse:132.976	validation_1-rmse:346.697
[400]	validation_0-rmse:113.165	validation_1-rmse:341.257
[500]	validation_0-rmse:101.661	validation_1-rmse:339.11
[600]	validation_0-rmse:94.0367	validation_1-rmse:339.573
Stopping. Best iteration:
[509]	validation_0-rmse:100.833	validation_1-rmse:338.667

0.6631120802468129
valid mean: 432.3575134277344
true  mean: 577.2344789356985
test  mean: 281.7676086425781


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.26it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 24.99it/s]


41 41
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
[0]	validation_0-rmse:807.303	validation_1-rmse:1292.89
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:767.862	validation_1-rmse:1252.77
[200]	validation_0-rmse:255.167	validation_1-rmse:721.513
[300]	validation_0-rmse:133.44	validation_1-rmse:637.152
[400]	validation_0-rmse:115.09	validation_1-rmse:633.112
[500]	validation_0-rmse:103.399	validation_1-rmse:631.293
[600]	validation_0-rmse:96.5808	validation_1-rmse:629.917
[700]	validation_0-rmse:90.4574	validation_1-rmse:631.398
Stopping. Best iteration:
[668]	validation_0-rmse:92.0926	validation_1-rmse:629.337

0.38084102185024893
valid mean: 401.1592712402344
true  mean: 769.5532150776053
test  mean: 262.2773132324219


# catboost

In [2]:
import sys
import numpy as np
import pandas as pd
import os
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import catboost
from catboost import Pool, CatBoostRegressor
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
import math
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)


def get_stat_feature(df_):
    """Add lagged copies of selected columns as features.

    A numeric key ``(adcode + model) * 100 + mt`` is built for every row. For
    each of ``label``, ``popularity`` and ``area_sales_volume`` and for each
    lag from 1 to 9, the feature holds the column's value from the row whose
    key plus the lag equals this row's key; rows with no such match get NaN.

    Args:
        df_: Input frame with ``adcode``, ``model``, ``mt`` and the three
            source columns. A copy is taken; the input is not modified.

    Returns:
        A tuple ``(df, stat_feat)``: the extended copy and the list of the
        generated feature column names.
    """
    df = df_.copy()
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt']

    stat_feat = []
    for col in tqdm(['label', 'popularity', 'area_sales_volume']):
        # Only rows that actually carry a value can serve as a lag source.
        known = df[col].notnull()
        for lag in range(1, 10):
            key_col = 'model_adcode_mt_{}_{}'.format(col, lag)
            feat_col = 'shift_model_adcode_mt_{}_{}'.format(col, lag)
            stat_feat.append(feat_col)
            # Shift the key forward so it lines up with the row `lag` later.
            df[key_col] = df['model_adcode_mt'] + lag
            lookup = df.loc[known].set_index(key_col)[col]
            df[feat_col] = df['model_adcode_mt'].map(lookup)

    return df, stat_feat

def score(data, pred='pred_label', label='label', group='model'):
    """Print and return the score ``1 - mean(NRMSE)`` over groups.

    For every distinct value of ``group`` the RMSE between the predictions
    (clipped at zero and rounded to integers) and the labels is divided by the
    mean label of that group. The score is one minus the average of those
    per-group ratios.

    Args:
        data: DataFrame holding the ``pred``, ``label`` and ``group`` columns.
            It is not modified.
        pred: Name of the prediction column.
        label: Name of the ground-truth column.
        group: Name of the column whose values define the groups.

    Returns:
        The score as a float (also printed).
    """
    # Work on a private frame: the caller's data is never written to, and a
    # categorical group column is flattened so only observed values are used.
    frame = pd.DataFrame({
        'group': np.asarray(data[group]),
        'pred': np.asarray(data[pred].clip(lower=0).round().astype(int)),
        'label': np.asarray(data[label], dtype=float),
    })
    nrmse_score = []
    for _, part in frame.groupby('group'):
        err = part['pred'].values - part['label'].values
        rmse = np.mean(err ** 2) ** 0.5
        # Normalise by the group's mean label so large and small groups are
        # weighted equally in the final average.
        nrmse_score.append(rmse / part['label'].mean())
    result = 1 - np.mean(nrmse_score)
    print(result)
    return result

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):
    """Build a gradient-boosting regressor and fit it against a validation set.

    The LightGBM and CatBoost branches read the module-level ``cate_feat``
    list to declare categorical columns.

    Args:
        train_x, train_y: Training features and target.
        valid_x, valid_y: Validation features and target.
        m_type: ``'lgb'`` (LightGBM), ``'xgb'`` (XGBoost) or ``'ctb'``
            (CatBoost).

    Returns:
        The fitted model.

    Raises:
        ValueError: If ``m_type`` is not one of the supported values.
    """
    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2333,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              categorical_feature=cate_feat,
              early_stopping_rounds=100, verbose=100)
    elif m_type == 'xgb':
        # NOTE(review): min_child_samples is a LightGBM parameter name; the
        # XGBoost counterpart appears to be min_child_weight - confirm intent.
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000,
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9,
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse'
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              early_stopping_rounds=100, verbose=100)
    elif m_type == 'ctb':
        model = CatBoostRegressor(iterations=250,
                                 learning_rate=0.05,
                                 depth=10,
                                 eval_metric='RMSE',
                                 random_seed = 42,
                                 bagging_temperature = 0.2,
                                 od_type='Iter',
                                 metric_period = 50,
                                 od_wait=20)
        model.fit(train_x, train_y,
                  cat_features=cate_feat,
                  eval_set=(valid_x, valid_y),
                  use_best_model=True,
                  verbose=True)
    else:
        # Without this branch an unknown type reached `return model` and
        # failed with an unhelpful UnboundLocalError.
        raise ValueError("unsupported m_type: {!r} (expected 'lgb', 'xgb' or 'ctb')".format(m_type))
    return model

def get_train_model(df_, m, m_type='lgb'):
    """Validate on month ``m-4``, refit on all history and forecast month ``m``.

    Reads the module-level ``features`` and ``cate_feat`` lists.

    Args:
        df_: Feature frame with ``mt``, ``label``, ``id`` and every column
            named in ``features``. A copy is taken; the input is not modified.
        m: Month index (value of ``mt``) to forecast.
        m_type: ``'lgb'``, ``'xgb'`` or ``'ctb'``; forwarded to
            ``get_model_type``.

    Returns:
        A tuple ``(sub, valid_pred)``: ``sub`` has the ``id`` and the
        non-negative integer ``forecastVolum`` of the rows with ``mt == m``;
        ``valid_pred`` is the raw ``pred_label`` series of the validation month.
    """
    df = df_.copy()
    # Dataset split by month index; history starts at month 13.
    st = 13
    all_idx   = (df['mt'].between(st , m-1))
    train_idx = (df['mt'].between(st , m-5))
    valid_idx = (df['mt'].between(m-4, m-4))
    test_idx  = (df['mt'].between(m  , m  ))
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )
    # Final train / validation matrices
    train_x = df.loc[train_idx, features]
    train_y = df.loc[train_idx, 'label']
    valid_x = df.loc[valid_idx, features]
    valid_y = df.loc[valid_idx, 'label']
    # get model
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)
    # offline: report the validation score (score() prints it)
    df['pred_label'] = model.predict(df[features])
    score(df[valid_idx])
    # online: refit on every labelled month
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'])
    elif m_type == 'ctb':
        # The CatBoost model is refit as constructed; its iteration count is
        # not adjusted from the validation run.
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'], cat_features=cate_feat)
    df['forecastVolum'] = model.predict(df[features])
    print('valid mean:',df.loc[valid_idx, 'pred_label'].mean())
    print('true  mean:',df.loc[valid_idx, 'label'].mean())
    print('test  mean:',df.loc[test_idx, 'forecastVolum'].mean())
    # Stage result: explicit copy so adding a column does not write into a
    # slice of df (SettingWithCopy).
    sub = df.loc[test_idx, ['id']].copy()
    sub['forecastVolum'] = df.loc[test_idx, 'forecastVolum'].clip(lower=0).round().astype(int)
    return sub, df.loc[valid_idx, 'pred_label']

In [None]:
if __name__ == '__main__':
    # Load the raw competition files.
    path = '../ccf_car/'
    train_sales = pd.read_csv(path + 'train_sales_data.csv')
    train_search = pd.read_csv(path + 'train_search_data.csv')
    train_user = pd.read_csv(path + 'train_user_reply_data.csv')
    evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
    submit_example = pd.read_csv(path + 'submit_example.csv')

    # Stack history and the rows to forecast, then attach search and
    # user-reply data.
    data = pd.concat([train_sales, evaluation_public], ignore_index=True)
    data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
    data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
    data['label'] = data['salesVolume']
    data['id'] = data['id'].fillna(0).astype(int)
    data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
    # Integer-encode the string columns in order of first appearance.
    for i in ['bodyType', 'model']:
        data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
    # Running month index: January 2016 -> 1.
    data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

    # Monthly sales total per province. Rows without a sales figure (the
    # months to forecast) keep a missing total. The total is computed once,
    # before any forecast is written back into `data`.
    has_sales = data['salesVolume'].notna()
    data['area_sales_volume'] = (
        data.groupby(['province', 'regYear', 'regMonth'])['salesVolume']
            .transform('sum')
            .where(has_sales)
    )

    # Forecast one month at a time; each forecast becomes the label used to
    # build lag features for the following month.
    for month in [25, 26, 27, 28]:
        m_type = 'ctb'

        data_df, stat_feat = get_stat_feature(data)

        # `features` and `cate_feat` are read as globals by the model helpers.
        num_feat = ['regYear'] + stat_feat
        cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']
        if m_type == 'lgb':
            for i in cate_feat:
                data_df[i] = data_df[i].astype('category')
        elif m_type == 'xgb':
            lbl = LabelEncoder()
            for i in tqdm(cate_feat):
                data_df[i] = lbl.fit_transform(data_df[i].astype(str))
        elif m_type == 'ctb':
            for i in cate_feat:
                data_df[i] = data_df[i].astype('category')

        features = num_feat + cate_feat
        print(len(features), len(set(features)))

        sub, val_pred = get_train_model(data_df, month, m_type)
        # Relies on `sub` having the same row order as the selected rows.
        target_rows = (data.regMonth == (month - 24)) & (data.regYear == 2018)
        data.loc[target_rows, 'salesVolume'] = sub['forecastVolum'].values
        data.loc[target_rows, 'label'] = sub['forecastVolum'].values

    # Collect all 2018 forecasts into the submission file.
    sub = data.loc[(data.regMonth >= 1) & (data.regYear == 2018), ['id', 'salesVolume']]
    sub = sub.rename(columns={'salesVolume': 'forecastVolum'})
    sub[['id', 'forecastVolum']].round().astype(int).to_csv('../rst/myx_cat_car_prediction_sales.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.96it/s]


32 32
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25




0:	learn: 582.9760460	test: 734.7351976	best: 734.7351976 (0)	total: 145ms	remaining: 36s
50:	learn: 173.3949164	test: 359.2651602	best: 359.2651602 (50)	total: 4.42s	remaining: 17.3s
100:	learn: 129.2509500	test: 329.7306158	best: 329.6868145 (99)	total: 8.61s	remaining: 12.7s
150:	learn: 111.1436697	test: 316.7218825	best: 316.7218825 (150)	total: 12.8s	remaining: 8.4s
200:	learn: 99.6286812	test: 311.6190871	best: 311.6190871 (200)	total: 17s	remaining: 4.14s
249:	learn: 90.5104003	test: 310.2387810	best: 310.2387810 (249)	total: 21.1s	remaining: 0us

bestTest = 310.238781
bestIteration = 249

0.6840336497059338
0:	learn: 667.3371076	total: 101ms	remaining: 25s
50:	learn: 205.2793746	total: 4.8s	remaining: 18.7s
100:	learn: 155.0376196	total: 9.55s	remaining: 14.1s
150:	learn: 137.6242529	total: 14.2s	remaining: 9.34s
200:	learn: 125.9528792	total: 19.1s	remaining: 4.65s
249:	learn: 117.0525871	total: 23.9s	remaining: 0us
valid mean: 462.45061324238145
true  mean: 559.0532150776053


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.86it/s]


32 32
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26




0:	learn: 599.8632954	test: 707.1474115	best: 707.1474115 (0)	total: 95ms	remaining: 23.7s
50:	learn: 173.3080054	test: 384.7781387	best: 384.7781387 (50)	total: 4.63s	remaining: 18.1s


# lgb+abnormal

In [5]:
import sys
import numpy as np
import pandas as pd
import os
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
import math
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)


def get_stat_feature(df_):
    """Add lagged copies of selected columns as features.

    A numeric key ``(adcode + model) * 100 + mt`` is built for every row. For
    each of ``label``, ``popularity`` and ``area_sales_volume`` and for each
    lag from 1 to 9, the feature holds the column's value from the row whose
    key plus the lag equals this row's key; rows with no such match get NaN.

    Args:
        df_: Input frame with ``adcode``, ``model``, ``mt`` and the three
            source columns. A copy is taken; the input is not modified.

    Returns:
        A tuple ``(df, stat_feat)``: the extended copy and the list of the
        generated feature column names.
    """
    df = df_.copy()
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt']

    stat_feat = []
    for col in tqdm(['label', 'popularity', 'area_sales_volume']):
        # Only rows that actually carry a value can serve as a lag source.
        known = df[col].notnull()
        for lag in range(1, 10):
            key_col = 'model_adcode_mt_{}_{}'.format(col, lag)
            feat_col = 'shift_model_adcode_mt_{}_{}'.format(col, lag)
            stat_feat.append(feat_col)
            # Shift the key forward so it lines up with the row `lag` later.
            df[key_col] = df['model_adcode_mt'] + lag
            lookup = df.loc[known].set_index(key_col)[col]
            df[feat_col] = df['model_adcode_mt'].map(lookup)

    return df, stat_feat

def score(data, pred='pred_label', label='label', group='model'):
    """Print and return the score ``1 - mean(NRMSE)`` over groups.

    For every distinct value of ``group`` the RMSE between the predictions
    (clipped at zero and rounded to integers) and the labels is divided by the
    mean label of that group. The score is one minus the average of those
    per-group ratios.

    Args:
        data: DataFrame holding the ``pred``, ``label`` and ``group`` columns.
            It is not modified.
        pred: Name of the prediction column.
        label: Name of the ground-truth column.
        group: Name of the column whose values define the groups.

    Returns:
        The score as a float (also printed).
    """
    # Work on a private frame: the caller's data is never written to, and a
    # categorical group column is flattened so only observed values are used.
    frame = pd.DataFrame({
        'group': np.asarray(data[group]),
        'pred': np.asarray(data[pred].clip(lower=0).round().astype(int)),
        'label': np.asarray(data[label], dtype=float),
    })
    nrmse_score = []
    for _, part in frame.groupby('group'):
        err = part['pred'].values - part['label'].values
        rmse = np.mean(err ** 2) ** 0.5
        # Normalise by the group's mean label so large and small groups are
        # weighted equally in the final average.
        nrmse_score.append(rmse / part['label'].mean())
    result = 1 - np.mean(nrmse_score)
    print(result)
    return result

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):
    """Build a gradient-boosting regressor and fit it with early stopping.

    The LightGBM branch reads the module-level ``cate_feat`` list to declare
    categorical columns.

    Args:
        train_x, train_y: Training features and target.
        valid_x, valid_y: Validation features and target; the validation set
            is the last entry of ``eval_set``.
        m_type: ``'lgb'`` for LightGBM or ``'xgb'`` for XGBoost.

    Returns:
        The fitted model.

    Raises:
        ValueError: If ``m_type`` is not one of the supported values.
    """
    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2333,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              categorical_feature=cate_feat,
              early_stopping_rounds=100, verbose=100)
    elif m_type == 'xgb':
        # NOTE(review): min_child_samples is a LightGBM parameter name; the
        # XGBoost counterpart appears to be min_child_weight - confirm intent.
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000,
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9,
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse'
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              early_stopping_rounds=100, verbose=100)
    else:
        # Without this branch an unknown type reached `return model` and
        # failed with an unhelpful UnboundLocalError.
        raise ValueError("unsupported m_type: {!r} (expected 'lgb' or 'xgb')".format(m_type))
    return model

def get_train_model(df_, m, m_type='lgb'):
    """Validate on month ``m-4``, refit on all history and forecast month ``m``.

    Reads the module-level ``features`` and ``cate_feat`` lists.

    Args:
        df_: Feature frame with ``mt``, ``label``, ``id`` and every column
            named in ``features``. A copy is taken; the input is not modified.
        m: Month index (value of ``mt``) to forecast.
        m_type: ``'lgb'`` or ``'xgb'``; forwarded to ``get_model_type``.

    Returns:
        A tuple ``(sub, valid_pred)``: ``sub`` has the ``id`` and the
        non-negative integer ``forecastVolum`` of the rows with ``mt == m``;
        ``valid_pred`` is the raw ``pred_label`` series of the validation month.
    """
    df = df_.copy()
    # Dataset split by month index; history starts at month 13.
    st = 13
    all_idx   = (df['mt'].between(st , m-1))
    train_idx = (df['mt'].between(st , m-5))
    valid_idx = (df['mt'].between(m-4, m-4))
    test_idx  = (df['mt'].between(m  , m  ))
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )
    # Final train / validation matrices
    train_x = df.loc[train_idx, features]
    train_y = df.loc[train_idx, 'label']
    valid_x = df.loc[valid_idx, features]
    valid_y = df.loc[valid_idx, 'label']
    # get model
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)
    # offline: report the validation score (score() prints it)
    df['pred_label'] = model.predict(df[features])
    score(df[valid_idx])
    # online: refit on every labelled month, with a tree budget derived from
    # the early-stopped run plus a margin of 100
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df.loc[all_idx, features], df.loc[all_idx, 'label'])
    df['forecastVolum'] = model.predict(df[features])
    print('valid mean:',df.loc[valid_idx, 'pred_label'].mean())
    print('true  mean:',df.loc[valid_idx, 'label'].mean())
    print('test  mean:',df.loc[test_idx, 'forecastVolum'].mean())
    # Stage result: explicit copy so adding a column does not write into a
    # slice of df (SettingWithCopy).
    sub = df.loc[test_idx, ['id']].copy()
    sub['forecastVolum'] = df.loc[test_idx, 'forecastVolum'].clip(lower=0).round().astype(int)
    return sub, df.loc[valid_idx, 'pred_label']
def quantile_clip(group):
    """Return ``group`` with values limited to its 5%-95% quantile range.

    Both bounds are taken from the original values, and the input series is
    left untouched (mutating the object handed over by ``groupby.transform``
    is not supported by pandas). Missing values stay missing.

    Args:
        group: Numeric pandas Series.

    Returns:
        A new Series of the same index with out-of-range values replaced by
        the nearest bound.
    """
    lower = group.quantile(.05)
    upper = group.quantile(.95)
    return group.clip(lower=lower, upper=upper)

In [6]:
if __name__ == '__main__':
    # Load the raw competition files.
    path = '../ccf_car/'
    train_sales = pd.read_csv(path + 'train_sales_data.csv')
    train_search = pd.read_csv(path + 'train_search_data.csv')
    train_user = pd.read_csv(path + 'train_user_reply_data.csv')
    evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
    submit_example = pd.read_csv(path + 'submit_example.csv')

    # Stack history and the rows to forecast, then attach search and
    # user-reply data.
    data = pd.concat([train_sales, evaluation_public], ignore_index=True)
    data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
    data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
    # Outlier handling: clip each (adcode, model) sales series to its own
    # 5%-95% quantile range before it is used as the label.
    data['salesVolume'] = data.groupby(['adcode', 'model'])['salesVolume'].transform(quantile_clip)
    data['label'] = data['salesVolume']
    data['id'] = data['id'].fillna(0).astype(int)
    data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
    # Integer-encode the string columns in order of first appearance.
    for i in ['bodyType', 'model']:
        data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
    # Running month index: January 2016 -> 1.
    data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

    # Monthly sales total per province. Rows without a sales figure (the
    # months to forecast) keep a missing total. The total is computed once,
    # before any forecast is written back into `data`.
    has_sales = data['salesVolume'].notna()
    data['area_sales_volume'] = (
        data.groupby(['province', 'regYear', 'regMonth'])['salesVolume']
            .transform('sum')
            .where(has_sales)
    )

    # Forecast one month at a time; each forecast becomes the label used to
    # build lag features for the following month.
    for month in [25, 26, 27, 28]:
        m_type = 'lgb'

        data_df, stat_feat = get_stat_feature(data)

        # `features` and `cate_feat` are read as globals by the model helpers.
        num_feat = ['regYear'] + stat_feat
        cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']
        if m_type == 'lgb':
            for i in cate_feat:
                data_df[i] = data_df[i].astype('category')
        elif m_type == 'xgb':
            lbl = LabelEncoder()
            for i in tqdm(cate_feat):
                data_df[i] = lbl.fit_transform(data_df[i].astype(str))

        features = num_feat + cate_feat
        print(len(features), len(set(features)))

        sub, val_pred = get_train_model(data_df, month, m_type)
        # Relies on `sub` having the same row order as the selected rows.
        target_rows = (data.regMonth == (month - 24)) & (data.regYear == 2018)
        data.loc[target_rows, 'salesVolume'] = sub['forecastVolum'].values
        data.loc[target_rows, 'label'] = sub['forecastVolum'].values

    # Collect all 2018 forecasts into the submission file.
    sub = data.loc[(data.regMonth >= 1) & (data.regYear == 2018), ['id', 'salesVolume']]
    sub = sub.rename(columns={'salesVolume': 'forecastVolum'})
    sub[['id', 'forecastVolum']].round().astype(int).to_csv('../rst/myx_lgb_areaSale_abnormal.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.93it/s]


32 32
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 7773.44	valid_1's l2: 40836.7
[200]	training's l2: 4167.36	valid_1's l2: 37648.5
[300]	training's l2: 2851.13	valid_1's l2: 36708.2
[400]	training's l2: 2118.29	valid_1's l2: 36228.3
[500]	training's l2: 1634.66	valid_1's l2: 35964.7
[600]	training's l2: 1330.14	valid_1's l2: 35773.2
[700]	training's l2: 1091.04	valid_1's l2: 35770.9
[800]	training's l2: 903.396	valid_1's l2: 35687.8
[900]	training's l2: 763.602	valid_1's l2: 35588.6
Early stopping, best iteration is:
[880]	training's l2: 788.617	valid_1's l2: 35583
0.7269185701756633
valid mean: 477.82292290364705
true  mean: 556.6262195121955
test  mean: 493.7830262177206


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.96it/s]


32 32
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 8009.68	valid_1's l2: 65741.6
[200]	training's l2: 4459.18	valid_1's l2: 63086.7
[300]	training's l2: 3227.4	valid_1's l2: 62356.3
[400]	training's l2: 2399.48	valid_1's l2: 61795.4
[500]	training's l2: 1886.05	valid_1's l2: 61964.8
Early stopping, best iteration is:
[410]	training's l2: 2336.22	valid_1's l2: 61759.7
0.6923002828429897
valid mean: 472.22941153739583
true  mean: 531.6218680709541
test  mean: 350.65750731980637


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  3.01it/s]


32 32
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 9202.31	valid_1's l2: 68055
[200]	training's l2: 5077.15	valid_1's l2: 63724.3
[300]	training's l2: 3624.83	valid_1's l2: 62794.4
[400]	training's l2: 2738.63	valid_1's l2: 62678.1
Early stopping, best iteration is:
[356]	training's l2: 3092.65	valid_1's l2: 62526.1
0.7196731877424329
valid mean: 484.4388083229382
true  mean: 576.4874168514414
test  mean: 487.60449235983504


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.96it/s]


32 32
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 9551.76	valid_1's l2: 107199
[200]	training's l2: 5504.08	valid_1's l2: 100623
[300]	training's l2: 3894.6	valid_1's l2: 98426.4
[400]	training's l2: 3002.51	valid_1's l2: 98027.6
[500]	training's l2: 2441.45	valid_1's l2: 97732.2
Early stopping, best iteration is:
[497]	training's l2: 2455.98	valid_1's l2: 97727.2
0.613942784638706
valid mean: 523.592314687327
true  mean: 719.7307926829268
test  mean: 487.8328066197992


# lgb+model sale+abnormal

In [7]:
import sys
import numpy as np
import pandas as pd
import os
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
import math
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)


def get_stat_feature(df_):
    """Add lagged copies of the label, popularity and aggregated-sales columns.

    Every row gets an integer key ``(adcode + model) * 100 + mt``.  For each
    source column and each lag ``i`` in 1..9, the rows that have a value are
    re-keyed to ``key + i`` and looked up from the un-shifted key, so a row
    receives the value stored ``i`` key steps earlier (NaN when there is none).

    Args:
        df_: frame with 'adcode', 'model', 'mt', 'label', 'popularity' and
            'area_sales_volume' columns; it is copied, not modified.

    Returns:
        ``(df, stat_feat)``: the enriched copy and the list of the 27
        ``shift_model_adcode_mt_<col>_<i>`` column names.
    """
    df = df_.copy()
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt']

    stat_feat = []
    lag_steps = range(1, 10)
    for col in tqdm(['label', 'popularity', 'area_sales_volume']):
        for i in lag_steps:
            key_name = 'model_adcode_mt_{}_{}'.format(col, i)
            shift_name = 'shift_' + key_name
            stat_feat.append(shift_name)
            # Key of the row that should receive this row's value.
            df[key_name] = df['model_adcode_mt'] + i
            # Only rows with a known value may act as a source.
            known = df.loc[df[col].notnull()]
            lookup = known.set_index(key_name)[col]
            df[shift_name] = df['model_adcode_mt'].map(lookup)

    return df, stat_feat

def score(data, pred='pred_label', label='label', group='model'):
    """Print and return ``1 - mean(NRMSE)`` computed per group.

    For every distinct value of ``group`` the RMSE between ``pred`` and
    ``label`` is divided by that group's mean label; the result is one minus
    the average of these normalised errors.

    Args:
        data: DataFrame holding the ``pred``, ``label`` and ``group`` columns.
            The ``pred`` column is overwritten in place with its
            non-negative, rounded, integer values.
        pred: name of the prediction column.
        label: name of the ground-truth column.
        group: name of the column to group by.

    Returns:
        The score (also printed).
    """
    # Evaluate predictions after clamping negatives to zero and rounding.
    # The column names now come from the arguments instead of being
    # hard-coded to 'pred_label' / 'model'.
    data[pred] = data[pred].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    data_agg = data.groupby(group).agg({
        pred:  list,
        label: [list, 'mean']
    }).reset_index()
    # Flatten the two-level column index into '<column>_<aggregation>' names.
    data_agg.columns = ['_'.join(col).strip() for col in data_agg.columns]
    nrmse_score = []
    for raw in data_agg[['{0}_list'.format(pred), '{0}_list'.format(label), '{0}_mean'.format(label)]].values:
        # RMSE of the group divided by the group's mean label.
        nrmse_score.append(
            mse(raw[0], raw[1]) ** 0.5 / raw[2]
        )
    result = 1 - np.mean(nrmse_score)
    print(result)
    return result

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):
    """Create a gradient-boosting regressor and fit it with early stopping.

    Both the training and the validation set are passed as ``eval_set`` and
    fitting uses ``early_stopping_rounds=100``.  The LightGBM branch reads
    the module-level list ``cate_feat`` for its categorical columns.

    Args:
        train_x, train_y: training features and target.
        valid_x, valid_y: validation features and target.
        m_type: ``'lgb'`` for ``lgb.LGBMRegressor`` or ``'xgb'`` for
            ``xgb.XGBRegressor``.

    Returns:
        The fitted model.

    Raises:
        ValueError: if ``m_type`` is neither ``'lgb'`` nor ``'xgb'``.
    """
    # Reject unknown model names up front; they used to fall through both
    # branches and fail on the unbound ``model`` at the return statement.
    if m_type not in ('lgb', 'xgb'):
        raise ValueError("m_type must be 'lgb' or 'xgb', got {!r}".format(m_type))
    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2333,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              categorical_feature=cate_feat,
              early_stopping_rounds=100, verbose=100)
    elif m_type == 'xgb':
        # NOTE(review): min_child_samples is a LightGBM parameter name; the
        # XGBoost counterpart is min_child_weight - confirm this setting has
        # any effect here.
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000,
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9,
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse'
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              early_stopping_rounds=100, verbose=100)
    return model

def get_train_model(df_, m, m_type='lgb'):
    """Validate on month ``m-4``, refit on months 13..``m-1`` and predict month ``m``.

    Relies on the module-level names ``features`` and ``cate_feat`` and on
    the sibling helpers ``get_model_type`` and ``score``.

    Args:
        df_: feature frame with at least 'mt', 'label', 'id' and every column
            listed in ``features``; it is copied, not modified.
        m: value of the 'mt' column to forecast.
        m_type: ``'lgb'`` or ``'xgb'``.

    Returns:
        ``(sub, valid_pred)``: ``sub`` holds 'id' plus the non-negative
        integer 'forecastVolum' of month ``m``; ``valid_pred`` is the
        unrounded 'pred_label' series of the validation month.
    """
    df = df_.copy()
    # Dataset split, expressed as inclusive [first, last] ranges of 'mt'.
    st = 13
    windows = [
        ('all_idx  :', st, m - 1),
        ('train_idx:', st, m - 5),
        ('valid_idx:', m - 4, m - 4),
        ('test_idx :', m, m),
    ]
    all_idx, train_idx, valid_idx, test_idx = [
        df['mt'].between(first, last) for _, first, last in windows
    ]
    for tag, first, last in windows:
        print(tag, first, last)

    # Fit with early stopping against the hold-out month.
    train_part = df[train_idx]
    valid_part = df[valid_idx]
    model = get_model_type(
        train_part[features], train_part['label'],
        valid_part[features], valid_part['label'],
        m_type,
    )

    # Offline: score the hold-out month (score() prints the result).
    df['pred_label'] = model.predict(df[features])
    score(df[valid_idx])

    # Online: refit on every month before ``m`` with a fixed tree count
    # (best iteration found above plus 100).
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        history = df[all_idx]
        model.fit(history[features], history['label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        history = df[all_idx]
        model.fit(history[features], history['label'])
    df['forecastVolum'] = model.predict(df[features])

    valid_rows = df[valid_idx]
    test_rows = df[test_idx]
    print('valid mean:', valid_rows['pred_label'].mean())
    print('true  mean:', valid_rows['label'].mean())
    print('test  mean:', test_rows['forecastVolum'].mean())

    # Stage result: clamp negatives to zero and round to integers.
    forecast = test_rows['forecastVolum'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    sub = test_rows[['id']].assign(forecastVolum=forecast)
    return sub, valid_rows['pred_label']
def quantile_clip(group, lower=.05, upper=.95):
    """Clip a Series to the range spanned by two of its own quantiles.

    Both bounds are taken from the untouched input, and a new Series is
    returned; the argument itself is left unmodified.  Missing values stay
    missing.

    Args:
        group: numeric Series (for example one group passed in by
            ``groupby(...).transform``).
        lower: quantile used as the floor.
        upper: quantile used as the ceiling.

    Returns:
        A Series with the same index whose values lie within the two
        quantile bounds.
    """
    # Compute both bounds before changing anything, so the ceiling is not
    # influenced by values that were already raised to the floor.
    low = group.quantile(lower)
    high = group.quantile(upper)
    # mask() returns a new object; comparisons against NaN are False, so
    # missing values (and NaN bounds of an all-missing group) change nothing.
    return group.mask(group < low, low).mask(group > high, high)

In [8]:
if __name__ == '__main__':
    # LightGBM run with per-model monthly sales totals as an extra lag source.
    path = '../ccf_car/'
    train_sales = pd.read_csv(path + 'train_sales_data.csv')
    train_search = pd.read_csv(path + 'train_search_data.csv')
    train_user = pd.read_csv(path + 'train_user_reply_data.csv')
    evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
    submit_example = pd.read_csv(path + 'submit_example.csv')

    # Stack the sales history and the evaluation rows, then attach the search
    # and user-reply tables.
    data = pd.concat([train_sales, evaluation_public], ignore_index=True)
    data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
    data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
    # Clip outliers inside every (adcode, model) series.
    data['salesVolume'] = data.groupby(['adcode', 'model'])['salesVolume'].transform(quantile_clip)
    data['label'] = data['salesVolume']
    data['id'] = data['id'].fillna(0).astype(int)
    data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
    # Label-encode the string columns as consecutive integers.
    for i in ['bodyType', 'model']:
        data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
    # Month index counted from January 2016 (= 1).
    data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

    # Total sales of each model in each calendar month, summed over all rows
    # of that model/month.  Rows without a sales figure keep NaN.
    model_month_total = data.groupby(['model', 'regYear', 'regMonth'])['salesVolume'].transform('sum')
    data['area_sales_volume'] = model_month_total.where(data['salesVolume'].notna())

    m_type = 'lgb'
    cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']
    # Forecast one month at a time (mt 25..28 = months 1..4 of 2018).  Each
    # forecast is written back into 'salesVolume' / 'label', so the next
    # iteration's lag features are rebuilt with it.
    for month in [25, 26, 27, 28]:
        data_df, stat_feat = get_stat_feature(data)

        num_feat = ['regYear'] + stat_feat
        if m_type == 'lgb':
            for i in cate_feat:
                data_df[i] = data_df[i].astype('category')
        elif m_type == 'xgb':
            lbl = LabelEncoder()
            for i in tqdm(cate_feat):
                data_df[i] = lbl.fit_transform(data_df[i].astype(str))

        # `features` and `cate_feat` are read as globals by the helpers.
        features = num_feat + cate_feat
        print(len(features), len(set(features)))

        sub, val_pred = get_train_model(data_df, month, m_type)
        target_rows = (data.regMonth == (month - 24)) & (data.regYear == 2018)
        data.loc[target_rows, 'salesVolume'] = sub['forecastVolum'].values
        data.loc[target_rows, 'label'] = sub['forecastVolum'].values

    # Collect every 2018 row and write the submission file.
    sub = data.loc[(data.regMonth >= 1) & (data.regYear == 2018), ['id', 'salesVolume']]
    sub.columns = ['id', 'forecastVolum']
    sub[['id', 'forecastVolum']].round().astype(int).to_csv('../rst/myx_lgb_modelSale_abnormal.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  3.02it/s]


32 32
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 7806.64	valid_1's l2: 31541.9
[200]	training's l2: 4033.63	valid_1's l2: 27632.1
[300]	training's l2: 2805.65	valid_1's l2: 27381.2
[400]	training's l2: 2052.03	valid_1's l2: 27276.9
[500]	training's l2: 1592.2	valid_1's l2: 27251.3
[600]	training's l2: 1286.5	valid_1's l2: 27175.1
[700]	training's l2: 1061.27	valid_1's l2: 27224.1
Early stopping, best iteration is:
[616]	training's l2: 1243.98	valid_1's l2: 27135.8
0.7441883782427814
valid mean: 488.53109678322113
true  mean: 556.6262195121955
test  mean: 487.7110367813596


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  3.03it/s]


32 32
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 7959.01	valid_1's l2: 57823.4
[200]	training's l2: 4290.25	valid_1's l2: 55260.8
[300]	training's l2: 3035.35	valid_1's l2: 54483.7
[400]	training's l2: 2316.93	valid_1's l2: 54331.3
Early stopping, best iteration is:
[390]	training's l2: 2371.58	valid_1's l2: 54322.8
0.7166771624346866
valid mean: 482.10188124434563
true  mean: 531.6218680709541
test  mean: 331.93550951531665


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  3.01it/s]


32 32
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 8820.31	valid_1's l2: 55799.8
[200]	training's l2: 4928.44	valid_1's l2: 53035.7
[300]	training's l2: 3457.39	valid_1's l2: 52620.7
[400]	training's l2: 2649.33	valid_1's l2: 52353.5
[500]	training's l2: 2117.74	valid_1's l2: 52255.1
Early stopping, best iteration is:
[420]	training's l2: 2535.46	valid_1's l2: 52183.5
0.7367270153834871
valid mean: 487.65161699093915
true  mean: 576.4874168514414
test  mean: 463.87019116875126


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  3.03it/s]


32 32
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 9176.6	valid_1's l2: 125075
[200]	training's l2: 5121.08	valid_1's l2: 118001
[300]	training's l2: 3638.31	valid_1's l2: 116718
[400]	training's l2: 2813.51	valid_1's l2: 115893
[500]	training's l2: 2264.39	valid_1's l2: 115771
[600]	training's l2: 1866.7	valid_1's l2: 115730
Early stopping, best iteration is:
[554]	training's l2: 2030.75	valid_1's l2: 115650
0.6380907009838253
valid mean: 529.961575920203
true  mean: 719.7307926829268
test  mean: 463.7539452157097


# xgb+areaSale+abnormal

In [9]:
import sys
import numpy as np
import pandas as pd
import os
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
import math
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)


def get_stat_feature(df_):
    """Add lagged copies of the label, popularity and aggregated-sales columns.

    Every row gets an integer key ``(adcode + model) * 100 + mt``.  For each
    source column and each lag ``i`` in 1..9, the rows that have a value are
    re-keyed to ``key + i`` and looked up from the un-shifted key, so a row
    receives the value stored ``i`` key steps earlier (NaN when there is none).

    Args:
        df_: frame with 'adcode', 'model', 'mt', 'label', 'popularity' and
            'area_sales_volume' columns; it is copied, not modified.

    Returns:
        ``(df, stat_feat)``: the enriched copy and the list of the 27
        ``shift_model_adcode_mt_<col>_<i>`` column names.
    """
    df = df_.copy()
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt']

    stat_feat = []
    lag_steps = range(1, 10)
    for col in tqdm(['label', 'popularity', 'area_sales_volume']):
        for i in lag_steps:
            key_name = 'model_adcode_mt_{}_{}'.format(col, i)
            shift_name = 'shift_' + key_name
            stat_feat.append(shift_name)
            # Key of the row that should receive this row's value.
            df[key_name] = df['model_adcode_mt'] + i
            # Only rows with a known value may act as a source.
            known = df.loc[df[col].notnull()]
            lookup = known.set_index(key_name)[col]
            df[shift_name] = df['model_adcode_mt'].map(lookup)

    return df, stat_feat

def score(data, pred='pred_label', label='label', group='model'):
    """Print and return ``1 - mean(NRMSE)`` computed per group.

    For every distinct value of ``group`` the RMSE between ``pred`` and
    ``label`` is divided by that group's mean label; the result is one minus
    the average of these normalised errors.

    Args:
        data: DataFrame holding the ``pred``, ``label`` and ``group`` columns.
            The ``pred`` column is overwritten in place with its
            non-negative, rounded, integer values.
        pred: name of the prediction column.
        label: name of the ground-truth column.
        group: name of the column to group by.

    Returns:
        The score (also printed).
    """
    # Evaluate predictions after clamping negatives to zero and rounding.
    # The column names now come from the arguments instead of being
    # hard-coded to 'pred_label' / 'model'.
    data[pred] = data[pred].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    data_agg = data.groupby(group).agg({
        pred:  list,
        label: [list, 'mean']
    }).reset_index()
    # Flatten the two-level column index into '<column>_<aggregation>' names.
    data_agg.columns = ['_'.join(col).strip() for col in data_agg.columns]
    nrmse_score = []
    for raw in data_agg[['{0}_list'.format(pred), '{0}_list'.format(label), '{0}_mean'.format(label)]].values:
        # RMSE of the group divided by the group's mean label.
        nrmse_score.append(
            mse(raw[0], raw[1]) ** 0.5 / raw[2]
        )
    result = 1 - np.mean(nrmse_score)
    print(result)
    return result

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):
    """Create a gradient-boosting regressor and fit it with early stopping.

    Both the training and the validation set are passed as ``eval_set`` and
    fitting uses ``early_stopping_rounds=100``.  The LightGBM branch reads
    the module-level list ``cate_feat`` for its categorical columns.

    Args:
        train_x, train_y: training features and target.
        valid_x, valid_y: validation features and target.
        m_type: ``'lgb'`` for ``lgb.LGBMRegressor`` or ``'xgb'`` for
            ``xgb.XGBRegressor``.

    Returns:
        The fitted model.

    Raises:
        ValueError: if ``m_type`` is neither ``'lgb'`` nor ``'xgb'``.
    """
    # Reject unknown model names up front; they used to fall through both
    # branches and fail on the unbound ``model`` at the return statement.
    if m_type not in ('lgb', 'xgb'):
        raise ValueError("m_type must be 'lgb' or 'xgb', got {!r}".format(m_type))
    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2333,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              categorical_feature=cate_feat,
              early_stopping_rounds=100, verbose=100)
    elif m_type == 'xgb':
        # NOTE(review): min_child_samples is a LightGBM parameter name; the
        # XGBoost counterpart is min_child_weight - confirm this setting has
        # any effect here.
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000,
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9,
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse'
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              early_stopping_rounds=100, verbose=100)
    return model

def get_train_model(df_, m, m_type='lgb'):
    """Validate on month ``m-4``, refit on months 13..``m-1`` and predict month ``m``.

    Relies on the module-level names ``features`` and ``cate_feat`` and on
    the sibling helpers ``get_model_type`` and ``score``.

    Args:
        df_: feature frame with at least 'mt', 'label', 'id' and every column
            listed in ``features``; it is copied, not modified.
        m: value of the 'mt' column to forecast.
        m_type: ``'lgb'`` or ``'xgb'``.

    Returns:
        ``(sub, valid_pred)``: ``sub`` holds 'id' plus the non-negative
        integer 'forecastVolum' of month ``m``; ``valid_pred`` is the
        unrounded 'pred_label' series of the validation month.
    """
    df = df_.copy()
    # Dataset split, expressed as inclusive [first, last] ranges of 'mt'.
    st = 13
    windows = [
        ('all_idx  :', st, m - 1),
        ('train_idx:', st, m - 5),
        ('valid_idx:', m - 4, m - 4),
        ('test_idx :', m, m),
    ]
    all_idx, train_idx, valid_idx, test_idx = [
        df['mt'].between(first, last) for _, first, last in windows
    ]
    for tag, first, last in windows:
        print(tag, first, last)

    # Fit with early stopping against the hold-out month.
    train_part = df[train_idx]
    valid_part = df[valid_idx]
    model = get_model_type(
        train_part[features], train_part['label'],
        valid_part[features], valid_part['label'],
        m_type,
    )

    # Offline: score the hold-out month (score() prints the result).
    df['pred_label'] = model.predict(df[features])
    score(df[valid_idx])

    # Online: refit on every month before ``m`` with a fixed tree count
    # (best iteration found above plus 100).
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        history = df[all_idx]
        model.fit(history[features], history['label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        history = df[all_idx]
        model.fit(history[features], history['label'])
    df['forecastVolum'] = model.predict(df[features])

    valid_rows = df[valid_idx]
    test_rows = df[test_idx]
    print('valid mean:', valid_rows['pred_label'].mean())
    print('true  mean:', valid_rows['label'].mean())
    print('test  mean:', test_rows['forecastVolum'].mean())

    # Stage result: clamp negatives to zero and round to integers.
    forecast = test_rows['forecastVolum'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    sub = test_rows[['id']].assign(forecastVolum=forecast)
    return sub, valid_rows['pred_label']
def quantile_clip(group, lower=.05, upper=.95):
    """Clip a Series to the range spanned by two of its own quantiles.

    Both bounds are taken from the untouched input, and a new Series is
    returned; the argument itself is left unmodified.  Missing values stay
    missing.

    Args:
        group: numeric Series (for example one group passed in by
            ``groupby(...).transform``).
        lower: quantile used as the floor.
        upper: quantile used as the ceiling.

    Returns:
        A Series with the same index whose values lie within the two
        quantile bounds.
    """
    # Compute both bounds before changing anything, so the ceiling is not
    # influenced by values that were already raised to the floor.
    low = group.quantile(lower)
    high = group.quantile(upper)
    # mask() returns a new object; comparisons against NaN are False, so
    # missing values (and NaN bounds of an all-missing group) change nothing.
    return group.mask(group < low, low).mask(group > high, high)
if __name__ == '__main__':
    # XGBoost run with per-province monthly sales totals as an extra lag source.
    path = '../ccf_car/'
    train_sales = pd.read_csv(path + 'train_sales_data.csv')
    train_search = pd.read_csv(path + 'train_search_data.csv')
    train_user = pd.read_csv(path + 'train_user_reply_data.csv')
    evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
    submit_example = pd.read_csv(path + 'submit_example.csv')

    # Stack the sales history and the evaluation rows, then attach the search
    # and user-reply tables.
    data = pd.concat([train_sales, evaluation_public], ignore_index=True)
    data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
    data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
    # Clip outliers inside every (adcode, model) series.
    data['salesVolume'] = data.groupby(['adcode', 'model'])['salesVolume'].transform(quantile_clip)
    data['label'] = data['salesVolume']
    data['id'] = data['id'].fillna(0).astype(int)
    data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
    # Label-encode the string columns as consecutive integers.
    for i in ['bodyType', 'model']:
        data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
    # Month index counted from January 2016 (= 1).
    data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

    # Total sales of each province in each calendar month, summed over all
    # rows of that province/month.  Rows without a sales figure keep NaN.
    province_month_total = data.groupby(['province', 'regYear', 'regMonth'])['salesVolume'].transform('sum')
    data['area_sales_volume'] = province_month_total.where(data['salesVolume'].notna())

    m_type = 'xgb'
    cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']
    # Forecast one month at a time (mt 25..28 = months 1..4 of 2018).  Each
    # forecast is written back into 'salesVolume' / 'label', so the next
    # iteration's lag features are rebuilt with it.
    for month in [25, 26, 27, 28]:
        data_df, stat_feat = get_stat_feature(data)

        num_feat = ['regYear'] + stat_feat
        if m_type == 'lgb':
            for i in cate_feat:
                data_df[i] = data_df[i].astype('category')
        elif m_type == 'xgb':
            lbl = LabelEncoder()
            for i in tqdm(cate_feat):
                data_df[i] = lbl.fit_transform(data_df[i].astype(str))

        # `features` and `cate_feat` are read as globals by the helpers.
        features = num_feat + cate_feat
        print(len(features), len(set(features)))

        sub, val_pred = get_train_model(data_df, month, m_type)
        target_rows = (data.regMonth == (month - 24)) & (data.regYear == 2018)
        data.loc[target_rows, 'salesVolume'] = sub['forecastVolum'].values
        data.loc[target_rows, 'label'] = sub['forecastVolum'].values

    # Collect every 2018 row and write the submission file.
    sub = data.loc[(data.regMonth >= 1) & (data.regYear == 2018), ['id', 'salesVolume']]
    sub.columns = ['id', 'forecastVolum']
    sub[['id', 'forecastVolum']].round().astype(int).to_csv('../rst/myx_xgb_areaSale_abnormal.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.97it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 24.84it/s]


32 32
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
[0]	validation_0-rmse:757.617	validation_1-rmse:931.125
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:718.08	validation_1-rmse:891.171
[200]	validation_0-rmse:229.523	validation_1-rmse:365.122
[300]	validation_0-rmse:124.108	validation_1-rmse:252.668
[400]	validation_0-rmse:107.776	validation_1-rmse:243.108
[500]	validation_0-rmse:100.375	validation_1-rmse:238.056
[600]	validation_0-rmse:94.4662	validation_1-rmse:234.209
[700]	validation_0-rmse:88.8747	validation_1-rmse:231.041
[800]	validation_0-rmse:83.6664	validation_1-rmse:228.863
[900]	validation_0-rmse:79.7014	validation_1-rmse:225.99
[1000]	validation_0-rmse:75.7866	validation_1-rmse:224.489
[1100]	validation_0-rmse:71.9663	validation_1-rmse:223.368
[1200]	validation_0-rmse:68.4655	validation_1-rmse:223.076
Stopping. Bes

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.99it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 25.64it/s]


32 32
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
[0]	validation_0-rmse:778.807	validation_1-rmse:898.617
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:739.264	validation_1-rmse:859.909
[200]	validation_0-rmse:235.811	validation_1-rmse:423.555
[300]	validation_0-rmse:123.719	validation_1-rmse:357.222
[400]	validation_0-rmse:112.581	validation_1-rmse:351.103
[500]	validation_0-rmse:105.382	validation_1-rmse:350.471
Stopping. Best iteration:
[436]	validation_0-rmse:109.313	validation_1-rmse:349.817

0.5938173711118979
valid mean: 380.1831970214844
true  mean: 531.6218680709541
test  mean: 356.3283386230469


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  3.04it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 25.80it/s]


32 32
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
[0]	validation_0-rmse:791.604	validation_1-rmse:956.125
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:752.093	validation_1-rmse:916.346
[200]	validation_0-rmse:244.98	validation_1-rmse:427.263
[300]	validation_0-rmse:130.936	validation_1-rmse:334.986
[400]	validation_0-rmse:115.848	validation_1-rmse:329.047
[500]	validation_0-rmse:107.751	validation_1-rmse:325.475
[600]	validation_0-rmse:101.715	validation_1-rmse:323.231
[700]	validation_0-rmse:98.2076	validation_1-rmse:321.05
[800]	validation_0-rmse:93.8661	validation_1-rmse:320.444
[900]	validation_0-rmse:89.82	validation_1-rmse:320.186
[1000]	validation_0-rmse:86.3518	validation_1-rmse:320.719
Stopping. Best iteration:
[913]	validation_0-rmse:89.2528	validation_1-rmse:319.882

0.6508045630795574
valid mean: 443.1093444824219

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.81it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 25.32it/s]


32 32
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
[0]	validation_0-rmse:807.946	validation_1-rmse:1189.55
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:768.364	validation_1-rmse:1148.69
[200]	validation_0-rmse:252.186	validation_1-rmse:606.758
[300]	validation_0-rmse:134.437	validation_1-rmse:536.715
Stopping. Best iteration:
[274]	validation_0-rmse:141.189	validation_1-rmse:532.994

0.40847847763405454
valid mean: 395.0877990722656
true  mean: 719.7307926829268
test  mean: 285.6610107421875


# xgb+modelSale+abnormal

In [10]:
import sys
import numpy as np
import pandas as pd
import os
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
import math
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)


def get_stat_feature(df_):
    """Add lagged copies of the label, popularity and aggregated-sales columns.

    Every row gets an integer key ``(adcode + model) * 100 + mt``.  For each
    source column and each lag ``i`` in 1..9, the rows that have a value are
    re-keyed to ``key + i`` and looked up from the un-shifted key, so a row
    receives the value stored ``i`` key steps earlier (NaN when there is none).

    Args:
        df_: frame with 'adcode', 'model', 'mt', 'label', 'popularity' and
            'area_sales_volume' columns; it is copied, not modified.

    Returns:
        ``(df, stat_feat)``: the enriched copy and the list of the 27
        ``shift_model_adcode_mt_<col>_<i>`` column names.
    """
    df = df_.copy()
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt']

    stat_feat = []
    lag_steps = range(1, 10)
    for col in tqdm(['label', 'popularity', 'area_sales_volume']):
        for i in lag_steps:
            key_name = 'model_adcode_mt_{}_{}'.format(col, i)
            shift_name = 'shift_' + key_name
            stat_feat.append(shift_name)
            # Key of the row that should receive this row's value.
            df[key_name] = df['model_adcode_mt'] + i
            # Only rows with a known value may act as a source.
            known = df.loc[df[col].notnull()]
            lookup = known.set_index(key_name)[col]
            df[shift_name] = df['model_adcode_mt'].map(lookup)

    return df, stat_feat

def score(data, pred='pred_label', label='label', group='model'):
    """Print and return ``1 - mean(NRMSE)`` computed per group.

    For every distinct value of ``group`` the RMSE between ``pred`` and
    ``label`` is divided by that group's mean label; the result is one minus
    the average of these normalised errors.

    Args:
        data: DataFrame holding the ``pred``, ``label`` and ``group`` columns.
            The ``pred`` column is overwritten in place with its
            non-negative, rounded, integer values.
        pred: name of the prediction column.
        label: name of the ground-truth column.
        group: name of the column to group by.

    Returns:
        The score (also printed).
    """
    # Evaluate predictions after clamping negatives to zero and rounding.
    # The column names now come from the arguments instead of being
    # hard-coded to 'pred_label' / 'model'.
    data[pred] = data[pred].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    data_agg = data.groupby(group).agg({
        pred:  list,
        label: [list, 'mean']
    }).reset_index()
    # Flatten the two-level column index into '<column>_<aggregation>' names.
    data_agg.columns = ['_'.join(col).strip() for col in data_agg.columns]
    nrmse_score = []
    for raw in data_agg[['{0}_list'.format(pred), '{0}_list'.format(label), '{0}_mean'.format(label)]].values:
        # RMSE of the group divided by the group's mean label.
        nrmse_score.append(
            mse(raw[0], raw[1]) ** 0.5 / raw[2]
        )
    result = 1 - np.mean(nrmse_score)
    print(result)
    return result

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):
    """Create a gradient-boosting regressor and fit it with early stopping.

    Both the training and the validation set are passed as ``eval_set`` and
    fitting uses ``early_stopping_rounds=100``.  The LightGBM branch reads
    the module-level list ``cate_feat`` for its categorical columns.

    Args:
        train_x, train_y: training features and target.
        valid_x, valid_y: validation features and target.
        m_type: ``'lgb'`` for ``lgb.LGBMRegressor`` or ``'xgb'`` for
            ``xgb.XGBRegressor``.

    Returns:
        The fitted model.

    Raises:
        ValueError: if ``m_type`` is neither ``'lgb'`` nor ``'xgb'``.
    """
    # Reject unknown model names up front; they used to fall through both
    # branches and fail on the unbound ``model`` at the return statement.
    if m_type not in ('lgb', 'xgb'):
        raise ValueError("m_type must be 'lgb' or 'xgb', got {!r}".format(m_type))
    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2333,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              categorical_feature=cate_feat,
              early_stopping_rounds=100, verbose=100)
    elif m_type == 'xgb':
        # NOTE(review): min_child_samples is a LightGBM parameter name; the
        # XGBoost counterpart is min_child_weight - confirm this setting has
        # any effect here.
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000,
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9,
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse'
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              early_stopping_rounds=100, verbose=100)
    return model

def get_train_model(df_, m, m_type='lgb'):
    """Validate on month ``m-4``, refit on months 13..``m-1`` and predict month ``m``.

    Relies on the module-level names ``features`` and ``cate_feat`` and on
    the sibling helpers ``get_model_type`` and ``score``.

    Args:
        df_: feature frame with at least 'mt', 'label', 'id' and every column
            listed in ``features``; it is copied, not modified.
        m: value of the 'mt' column to forecast.
        m_type: ``'lgb'`` or ``'xgb'``.

    Returns:
        ``(sub, valid_pred)``: ``sub`` holds 'id' plus the non-negative
        integer 'forecastVolum' of month ``m``; ``valid_pred`` is the
        unrounded 'pred_label' series of the validation month.
    """
    df = df_.copy()
    # Dataset split, expressed as inclusive [first, last] ranges of 'mt'.
    st = 13
    windows = [
        ('all_idx  :', st, m - 1),
        ('train_idx:', st, m - 5),
        ('valid_idx:', m - 4, m - 4),
        ('test_idx :', m, m),
    ]
    all_idx, train_idx, valid_idx, test_idx = [
        df['mt'].between(first, last) for _, first, last in windows
    ]
    for tag, first, last in windows:
        print(tag, first, last)

    # Fit with early stopping against the hold-out month.
    train_part = df[train_idx]
    valid_part = df[valid_idx]
    model = get_model_type(
        train_part[features], train_part['label'],
        valid_part[features], valid_part['label'],
        m_type,
    )

    # Offline: score the hold-out month (score() prints the result).
    df['pred_label'] = model.predict(df[features])
    score(df[valid_idx])

    # Online: refit on every month before ``m`` with a fixed tree count
    # (best iteration found above plus 100).
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        history = df[all_idx]
        model.fit(history[features], history['label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        history = df[all_idx]
        model.fit(history[features], history['label'])
    df['forecastVolum'] = model.predict(df[features])

    valid_rows = df[valid_idx]
    test_rows = df[test_idx]
    print('valid mean:', valid_rows['pred_label'].mean())
    print('true  mean:', valid_rows['label'].mean())
    print('test  mean:', test_rows['forecastVolum'].mean())

    # Stage result: clamp negatives to zero and round to integers.
    forecast = test_rows['forecastVolum'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    sub = test_rows[['id']].assign(forecastVolum=forecast)
    return sub, valid_rows['pred_label']
def quantile_clip(group):
    """Clip a numeric Series to its own 5th-95th percentile range.

    Both bounds are computed from the untouched input before any clipping, and
    a new Series is returned; the input is not modified. (Previously the upper
    bound was recomputed after the lower tail had been overwritten, which
    shifts the bound for short groups, and the input was mutated in place.)

    Missing values stay missing. If the bounds themselves are NaN (empty or
    all-NaN input) nothing is clipped.

    Parameters
    ----------
    group : pandas.Series
        Numeric values, e.g. one group passed in by ``groupby(...).transform``.

    Returns
    -------
    pandas.Series
        Same index as ``group``, values limited to ``[q05, q95]``.
    """
    lower = group.quantile(.05)
    upper = group.quantile(.95)
    return group.clip(lower=lower, upper=upper)
if __name__ == '__main__':
    # ---- Load the raw CSV files ----
    path = '../ccf_car/'
    train_sales = pd.read_csv(path + 'train_sales_data.csv')
    train_search = pd.read_csv(path + 'train_search_data.csv')
    train_user = pd.read_csv(path + 'train_user_reply_data.csv')
    evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
    submit_example = pd.read_csv(path + 'submit_example.csv')  # not used below in this block

    # ---- Build one table: labelled history followed by the rows to forecast ----
    data = pd.concat([train_sales, evaluation_public], ignore_index=True)
    data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
    data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
    # Clip sales within each (adcode, model) series to its 5%-95% quantile range.
    data['salesVolume'] = data.groupby(['adcode', 'model'])['salesVolume'].transform(quantile_clip)
    data['label'] = data['salesVolume']
    data['id'] = data['id'].fillna(0).astype(int)
    data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
    # Integer-encode the string columns in order of first appearance.
    for i in ['bodyType', 'model']:
        data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
    # Month index counted from January 2016 (= 1); January 2018 is therefore 25.
    data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

    # ---- area_sales_volume: total sales of a model in a given month ----
    # Despite the column name, the grouping key is (model, regYear, regMonth),
    # i.e. the sum runs over all rows of that model in that month.
    # Rows without a sales figure keep a missing value instead of a total.
    # Totals equal the previous row-by-row accumulation up to floating-point
    # summation order.
    # NOTE(review): this column is computed once, before the forecasting loop,
    # and is not refreshed after forecasts are written back into salesVolume -
    # confirm that this is intended.
    has_sales = data['salesVolume'].notna()
    data['area_sales_volume'] = (
        data.groupby(['model', 'regYear', 'regMonth'])['salesVolume']
            .transform('sum')
            .where(has_sales)
    )

    # ---- Recursive forecasting: one month at a time, feeding forecasts back ----
    m_type = 'xgb'
    for month in [25, 26, 27, 28]:
        # Features are rebuilt every round so lags can see the forecasts that
        # earlier rounds wrote into 'label'.
        data_df, stat_feat = get_stat_feature(data)

        num_feat = ['regYear'] + stat_feat
        cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']
        if m_type == 'lgb':
            for i in cate_feat:
                data_df[i] = data_df[i].astype('category')
        elif m_type == 'xgb':
            lbl = LabelEncoder()
            for i in tqdm(cate_feat):
                data_df[i] = lbl.fit_transform(data_df[i].astype(str))

        # 'features' and 'cate_feat' are read as module-level names by get_train_model.
        features = num_feat + cate_feat
        print(len(features), len(set(features)))

        sub, val_pred = get_train_model(data_df, month, m_type)
        # mt = 24 + regMonth for 2018, so this mask selects exactly the rows
        # that were forecast (same rows, same order as 'sub').
        month_mask = (data.regMonth == (month - 24)) & (data.regYear == 2018)
        data.loc[month_mask, 'salesVolume'] = sub['forecastVolum'].values
        data.loc[month_mask, 'label'] = sub['forecastVolum'].values

    # ---- Write the submission file ----
    out_path = '../rst/myx_xgb_modelSale_abnormal.csv'
    # Create the output directory up front so a missing folder cannot discard
    # the finished forecasts.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    sub = data.loc[(data.regMonth >= 1) & (data.regYear == 2018), ['id', 'salesVolume']].copy()
    sub.columns = ['id', 'forecastVolum']
    sub[['id', 'forecastVolum']].round().astype(int).to_csv(out_path, index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.98it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 23.12it/s]


32 32
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
[0]	validation_0-rmse:757.617	validation_1-rmse:931.125
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:718.08	validation_1-rmse:891.171
[200]	validation_0-rmse:229.192	validation_1-rmse:360.437
[300]	validation_0-rmse:119.722	validation_1-rmse:230.494
[400]	validation_0-rmse:103.698	validation_1-rmse:218.83
[500]	validation_0-rmse:94.7486	validation_1-rmse:212.202
[600]	validation_0-rmse:89.3369	validation_1-rmse:206.905
[700]	validation_0-rmse:82.5827	validation_1-rmse:202.677
[800]	validation_0-rmse:77.5794	validation_1-rmse:199.445
[900]	validation_0-rmse:74.1125	validation_1-rmse:199.228
[1000]	validation_0-rmse:70.213	validation_1-rmse:199.106
Stopping. Best iteration:
[915]	validation_0-rmse:73.3155	validation_1-rmse:198.483

0.7721931151635194
valid mean: 511.846343994140

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.97it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 24.84it/s]


32 32
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
[0]	validation_0-rmse:778.807	validation_1-rmse:898.617
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:739.264	validation_1-rmse:859.893
[200]	validation_0-rmse:235.884	validation_1-rmse:424.178
[300]	validation_0-rmse:120.441	validation_1-rmse:368.182
Stopping. Best iteration:
[276]	validation_0-rmse:127.413	validation_1-rmse:367.475

0.6007052563358208
valid mean: 382.11505126953125
true  mean: 531.6218680709541
test  mean: 308.2127685546875


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.92it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 25.15it/s]


32 32
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
[0]	validation_0-rmse:791.604	validation_1-rmse:956.125
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:752.093	validation_1-rmse:916.34
[200]	validation_0-rmse:245.967	validation_1-rmse:409.766
[300]	validation_0-rmse:131.308	validation_1-rmse:282.392
[400]	validation_0-rmse:114.264	validation_1-rmse:275.566
[500]	validation_0-rmse:103.827	validation_1-rmse:270.372
[600]	validation_0-rmse:96.5414	validation_1-rmse:269.005
[700]	validation_0-rmse:91.3058	validation_1-rmse:267.183
[800]	validation_0-rmse:86.1202	validation_1-rmse:265.523
[900]	validation_0-rmse:82.629	validation_1-rmse:265.318
[1000]	validation_0-rmse:78.8261	validation_1-rmse:264.432
[1100]	validation_0-rmse:74.9661	validation_1-rmse:263.766
Stopping. Best iteration:
[1077]	validation_0-rmse:75.6335	validation_1-

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.97it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 25.32it/s]


32 32
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
[0]	validation_0-rmse:807.946	validation_1-rmse:1189.55
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:768.363	validation_1-rmse:1148.7
[200]	validation_0-rmse:252.286	validation_1-rmse:617.703
[300]	validation_0-rmse:132.2	validation_1-rmse:560.667
Stopping. Best iteration:
[243]	validation_0-rmse:154.627	validation_1-rmse:557.602

0.39344372386673965
valid mean: 384.47479248046875
true  mean: 719.7307926829268
test  mean: 350.4908142089844
