In [1]:
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
#import catboost as ctb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, RobustScaler
warnings.filterwarnings('ignore')

In [2]:
# Directory that holds the competition CSV files.
path = './ccf_car'

# The three training tables: sales, search and user-reply data.
train_sales_data, train_search_data, train_user_reply_data = [
    pd.read_csv(path + csv_name)
    for csv_name in (
        '/train_sales_data.csv',
        '/train_search_data.csv',
        '/train_user_reply_data.csv',
    )
]

# Rows for which a forecast has to be produced.
test = pd.read_csv(path + '/evaluation_public.csv')

In [3]:
# Stack the labelled sales rows on top of the evaluation rows, then attach the
# search table (per province/adcode, model and month) and the user-reply table
# (per model and month). Left joins keep every sales/evaluation row.
data = (
    pd.concat([train_sales_data, test], ignore_index=True)
    .merge(
        train_search_data,
        how='left',
        on=['province', 'adcode', 'model', 'regYear', 'regMonth'],
    )
    .merge(
        train_user_reply_data,
        how='left',
        on=['model', 'regYear', 'regMonth'],
    )
)

In [4]:
# Move the target into a column called 'label' (the source column is removed).
data['label'] = data.pop('salesVolume')

# Rows without an id get id 0 so the column can be stored as integers.
data['id'] = data['id'].fillna(0).astype(int)

# The forecast placeholder column is not needed until the submission is built.
del data['forecastVolum']

# Fill bodyType for every row from the model -> bodyType pairs of the sales table.
data['bodyType'] = data['model'].map(
    train_sales_data
    .drop_duplicates('model')
    .set_index('model')['bodyType']
)

In [5]:
# Replace each distinct value of the categorical columns by an integer code,
# numbered in order of first appearance.
for col in ('bodyType', 'model'):
    distinct_values = data[col].unique()
    code_of = {
        value: code
        for value, code in zip(distinct_values, range(data[col].nunique()))
    }
    data[col] = data[col].map(code_of)

In [6]:
# Running month index: January 2016 -> 1, January 2017 -> 13, and so on.
data['mt'] = data['regMonth'] + 12 * (data['regYear'] - 2016)

In [8]:
# Preview the first five rows of the assembled frame.
data.head(n=5)

Unnamed: 0,adcode,bodyType,id,model,province,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,mt
0,310000,0,0,0,上海,1,2016,1479.0,11.0,106.0,292.0,1
1,530000,0,0,0,云南,1,2016,1594.0,11.0,106.0,466.0,1
2,150000,0,0,0,内蒙古,1,2016,1479.0,11.0,106.0,257.0,1
3,110000,0,0,0,北京,1,2016,2370.0,11.0,106.0,408.0,1
4,510000,0,0,0,四川,1,2016,3562.0,11.0,106.0,610.0,1


In [7]:
shift_feat = []

# Numeric key for an (adcode, model) pair, then extended with the month index.
data['model_adcode'] = data['adcode'] + data['model']
data['model_adcode_mt'] = data['model_adcode'] * 100 + data['mt']

for lag in (12,):
    shifted_key = 'model_adcode_mt_{0}'.format(lag)
    lag_feature = 'shift_model_adcode_mt_label_{0}'.format(lag)
    shift_feat.append(lag_feature)

    # Re-key every labelled row `lag` months into the future; looking a row's
    # own key up in that index yields the label seen `lag` months earlier.
    data[shifted_key] = data['model_adcode_mt'] + lag
    data_last = data[data.label.notnull()].set_index(shifted_key)
    data[lag_feature] = data['model_adcode_mt'].map(data_last['label'])

# Feature groups: numeric columns, and columns handed to LightGBM as categorical.
num_feat = ['regYear'] + shift_feat
cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']

In [8]:
features = num_feat + cate_feat

# Boolean row masks by month index: months 1-20 train, 21-24 validate,
# everything after month 24 is the forecast horizon.
train_idx = data['mt'].le(20)
valid_idx = data['mt'].between(21, 24)
test_idx = data['mt'].gt(24)

# Per-model mean label, used to normalise the regression target.
# NOTE(review): the mean is taken over every labelled row of `data`, which
# includes the validation months - confirm this is intended.
data['model_weight'] = data.groupby('model')['label'].transform('mean')
data['n_label'] = data['label'].div(data['model_weight'])

In [9]:
# Design matrices and normalised targets for the training and validation splits.
train_x = data.loc[train_idx, features]
train_y = data.loc[train_idx, 'n_label']

valid_x = data.loc[valid_idx, features]
valid_y = data.loc[valid_idx, 'n_label']

# The forecast rows are selected later with data[test_idx][features].

In [14]:
# Baseline LightGBM regressor on the normalised label.
# The seed is pinned so the run can be reproduced; 372 is the value shown in
# the recorded estimator repr of this cell (previously a fresh random seed was
# drawn on every execution).
lgb_model = lgb.LGBMRegressor(
    num_leaves=32, reg_alpha=1, reg_lambda=0.1, objective='mse',
    max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=372,
    n_estimators=5000, subsample=0.8, colsample_bytree=0.8,
)

# Stop once the validation loss has not improved for 100 rounds; log every 100.
lgb_model.fit(train_x, train_y, eval_set=[
    (valid_x, valid_y),
], categorical_feature=cate_feat, early_stopping_rounds=100, verbose=100)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 0.289116
[200]	valid_0's l2: 0.247219
[300]	valid_0's l2: 0.238665
[400]	valid_0's l2: 0.239144
Early stopping, best iteration is:
[314]	valid_0's l2: 0.237607


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
              importance_type='split', learning_rate=0.05, max_depth=-1,
              min_child_samples=5, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=5000, n_jobs=-1, num_leaves=32, objective='mse',
              random_state=372, reg_alpha=1, reg_lambda=0.1, silent=True,
              subsample=0.8, subsample_for_bin=200000, subsample_freq=0)

In [10]:
def score(data, pred='pred_label', label='label', group='model'):
    """Print and return ``1 - mean(NRMSE)`` computed per group.

    Predictions are clipped at zero and rounded to integers first. For every
    group the RMSE between prediction and label is divided by the group's mean
    label; the result is one minus the average of those ratios.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the prediction, label and grouping columns. It is not
        modified.
    pred : str
        Name of the prediction column.
    label : str
        Name of the ground-truth column.
    group : str
        Name of the column whose values define the groups.

    Returns
    -------
    numpy.float64
        The score; higher is better.
    """
    # Work on derived Series so the caller's frame (often a slice) is untouched.
    predicted = data[pred].clip(lower=0).round().astype(int)
    squared_error = (predicted - data[label]) ** 2

    # Group by raw key values so the result does not depend on index alignment.
    keys = data[group].values
    rmse = np.sqrt(squared_error.groupby(keys).mean())
    mean_label = data[label].groupby(keys).mean()
    nrmse = rmse / mean_label

    result = 1 - np.mean(nrmse.values)
    print('scoring:')
    print(result)
    return result

In [18]:
def rmsle(y, y_pred):
    """Return the root-mean-squared error between ``y`` and ``y_pred``.

    Despite the name, no logarithm is applied here; pass log-transformed
    values if a log-scale error is wanted.

    Parameters
    ----------
    y, y_pred : array-like
        Ground truth and predictions, with identical shapes.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y - y_pred) ** 2))``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.shape != predicted.shape:
        # Refuse silent broadcasting, which would produce a meaningless error.
        raise ValueError(
            'y and y_pred must have the same shape, got {0} and {1}'.format(
                actual.shape, predicted.shape
            )
        )
    return np.sqrt(np.mean((actual - predicted) ** 2))

def xgb_Regressor(train_x, train_y, val_x, val_y):
    """Fit an XGBoost regressor and print its score on the validation rows.

    Side effects: writes the rescaled predictions for every row into the
    module-level ``data['xgb_pred']`` and prints the ``score`` of
    ``data[valid_idx]``.

    Parameters
    ----------
    train_x, train_y
        Training features and normalised target.
    val_x, val_y
        Validation features and normalised target, monitored for early
        stopping.

    Returns
    -------
    xgboost.XGBRegressor
        The fitted model.
    """
    xgb_model = xgb.XGBRegressor(
        max_depth=6,
        eta=0.02,
        objective='reg:linear',
        silent=0,
    )

    # Monitor the validation split that was passed in (the last entry drives
    # early stopping) rather than whatever globals happen to be defined.
    xgb_model.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y), (val_x, val_y)],
        early_stopping_rounds=100, verbose=100,
    )

    # The model predicts the normalised label; multiply by the per-model weight
    # to return to sales-volume units.
    data['xgb_pred'] = xgb_model.predict(data[features]) * data['model_weight']
    score(data[valid_idx], pred='xgb_pred')
    return xgb_model
    

def lgb_Regressor(train_x, train_y, valid_x, valid_y, random_state=None):
    """Fit a LightGBM regressor and print its score on the validation rows.

    Side effects: writes the rescaled predictions for every row into the
    module-level ``data['lgb_pred']`` and prints the ``score`` of
    ``data[valid_idx]``.

    Parameters
    ----------
    train_x, train_y
        Training features and normalised target.
    valid_x, valid_y
        Validation features and normalised target, monitored for early
        stopping.
    random_state : int, optional
        Seed for the model. When omitted a random seed below 1000 is drawn,
        which matches the previous behaviour; pass a value to make a run
        reproducible.

    Returns
    -------
    lightgbm.LGBMRegressor
        The fitted model, with ``n_estimators`` afterwards set to 666.
    """
    if random_state is None:
        random_state = np.random.randint(1000)

    lgb_model = lgb.LGBMRegressor(
        num_leaves=32, reg_alpha=1, reg_lambda=0.1, objective='mse',
        max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=random_state,
        n_estimators=5000, subsample=0.8, colsample_bytree=0.8,
    )

    # The last eval_set entry (validation) drives early stopping.
    lgb_model.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        categorical_feature=cate_feat, early_stopping_rounds=100, verbose=100,
    )

    # The model predicts the normalised label; multiply by the per-model weight
    # to return to sales-volume units.
    data['lgb_pred'] = lgb_model.predict(data[features]) * data['model_weight']
    score(data[valid_idx], pred='lgb_pred')

    # NOTE(review): hard-coded round count, set after fitting; presumably meant
    # for a later refit without early stopping - confirm.
    lgb_model.n_estimators = 666
    return lgb_model

def base_model(alpha=0.0005, l1_ratio=.9):
    """Build two unfitted linear baselines, each behind a ``RobustScaler``.

    Parameters
    ----------
    alpha : float
        Regularisation strength used by both models.
    l1_ratio : float
        L1/L2 mixing ratio of the ElasticNet.

    Returns
    -------
    tuple
        ``(ENet, lasso)`` - the ElasticNet pipeline and the Lasso pipeline.
    """
    ENet = make_pipeline(
        RobustScaler(),
        ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=3),
    )
    lasso = make_pipeline(
        RobustScaler(),
        Lasso(alpha=alpha, random_state=1),
    )
    return ENet, lasso

def gboost_Regressor(train_x, train_y, val_x, val_y, train_X, y):
    """Validate a gradient-boosting regressor, then refit it on ``train_X``/``y``.

    Parameters
    ----------
    train_x, train_y
        Data used for the first fit.
    val_x, val_y
        Hold-out data the first fit is evaluated on.
    train_X, y
        Data used for the final refit of the same estimator.

    Returns
    -------
    tuple
        ``(gb_model, val_rmse, pred_val)``: the estimator after the refit, the
        ``rmsle`` value on the hold-out data, and the hold-out predictions made
        before the refit.
    """
    gb_model = GradientBoostingRegressor(
        n_estimators=3600, learning_rate=0.05,
        max_depth=4, max_features='sqrt',
        min_samples_leaf=20, min_samples_split=20,
        loss='huber', random_state=5,
    )

    gb_model.fit(train_x, train_y)
    pred_val = gb_model.predict(val_x)
    # Named val_rmse so the module-level score() function is not shadowed.
    val_rmse = rmsle(val_y, pred_val)

    # Refitting replaces the model evaluated above; pred_val and val_rmse
    # describe the first fit only.
    gb_model.fit(train_X, y)

    return gb_model, val_rmse, pred_val

In [20]:
print("LGBRegressor开始训练...")
# lgb_Regressor prints the validation score itself. The former `print(score)`
# only echoed the repr of the score *function*, so it has been removed.
lgb_reg = lgb_Regressor(train_x, train_y, valid_x, valid_y)
# NOTE(review): these predictions are on the normalised-label scale; multiply
# by data[test_idx]['model_weight'] if sales volumes are wanted.
lgb_pred = lgb_reg.predict(data[test_idx][features])

LGBRegressor开始训练...
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.162392	valid_1's l2: 0.289954
[200]	training's l2: 0.102711	valid_1's l2: 0.26303
[300]	training's l2: 0.0817219	valid_1's l2: 0.258462
[400]	training's l2: 0.0683372	valid_1's l2: 0.254229
[500]	training's l2: 0.0590822	valid_1's l2: 0.254438
Early stopping, best iteration is:
[468]	training's l2: 0.0610146	valid_1's l2: 0.253616
scoring:
0.5953230848000274
<function score at 0x00000242DE912510>


In [19]:
print("XGBOOSTRegressor开始训练...")
# xgb_Regressor prints the validation score itself. The former `print(score)`
# only echoed the repr of the score *function*, so it has been removed.
xgb_reg = xgb_Regressor(train_x, train_y, valid_x, valid_y)
# NOTE(review): these predictions are on the normalised-label scale; multiply
# by data[test_idx]['model_weight'] if sales volumes are wanted.
xgb_pred = xgb_reg.predict(data[test_idx][features])

XGBOOSTRegressor开始训练...
[0]	validation_0-rmse:0.959292	validation_1-rmse:1.14599
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[99]	validation_0-rmse:0.418656	validation_1-rmse:0.519423
scoring:
0.5586724872501776
<function score at 0x00000242DE912510>


In [61]:
# Undo the per-model normalisation so predictions are in label units.
data['pred_label'] = data['model_weight'] * lgb_model.predict(data[features])

In [19]:
# Persist the full frame, row index included, for offline inspection.
data.to_csv('data_lgb_rst.csv', index=True)

In [62]:
# Score the validation months using the default 'pred_label' column.
best_score = score(data.loc[valid_idx])
# Number of boosting rounds that any later refit of lgb_model will use.
lgb_model.set_params(n_estimators=666)

scoring:
0.243033927745538


In [18]:
# Refit on every labelled month (train + validation). No early stopping is used
# here, so the round count is whatever lgb_model.n_estimators currently holds.
lgb_model.fit(data[~test_idx][features], data[~test_idx]['n_label'], categorical_feature=cate_feat)

# Predict the normalised label and scale back with the per-model weight.
data['forecastVolum'] = lgb_model.predict(data[features]) * data['model_weight']

# Take an explicit copy so adding a column does not write into a slice of `data`.
sub = data.loc[test_idx, ['id']].copy()
# Submission values: clipped at zero and rounded to integers.
sub['forecastVolum'] = data.loc[test_idx, 'forecastVolum'].clip(lower=0).round().astype(int)

# The separator was missing, which wrote './ccf_carlgb_base_0_46.csv' instead
# of a file inside the data directory.
sub.to_csv(path + '/lgb_base_0_46.csv', index=False)