In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from mlxtend.regressor import StackingRegressor
import xgboost as xgb
import catboost as ctb
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

In [2]:
path = './ccf_car'

train_sales_data = pd.read_csv(path + '/train_sales_data.csv')
train_search_data = pd.read_csv(path + '/train_search_data.csv')
train_user_reply_data = pd.read_csv(path + '/train_user_reply_data.csv')

test = pd.read_csv(path + '/evaluation_public.csv')

In [3]:
data = pd.concat([train_sales_data, test], ignore_index=True)
data = data.merge(train_search_data, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
data = data.merge(train_user_reply_data, 'left', on=['model', 'regYear', 'regMonth'])

In [4]:
data['label'] = data['salesVolume']
data['id'] = data['id'].fillna(0).astype(int)
del data['salesVolume'], data['forecastVolum']
data['bodyType'] = data['model'].map(train_sales_data.drop_duplicates('model').set_index('model')['bodyType'])

In [5]:
for i in ['bodyType', 'model']:
    data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))

In [6]:
data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

In [7]:
data.head()

Unnamed: 0,adcode,bodyType,id,model,province,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,mt
0,310000,0,0,0,上海,1,2016,1479.0,11.0,106.0,292.0,1
1,530000,0,0,0,云南,1,2016,1594.0,11.0,106.0,466.0,1
2,150000,0,0,0,内蒙古,1,2016,1479.0,11.0,106.0,257.0,1
3,110000,0,0,0,北京,1,2016,2370.0,11.0,106.0,408.0,1
4,510000,0,0,0,四川,1,2016,3562.0,11.0,106.0,610.0,1


In [8]:
shift_feat = []

data['model_adcode'] = data['adcode'] + data['model']
data['model_adcode_mt'] = data['model_adcode'] * 100 + data['mt']
for i in [11]:
    i = i + 1
    shift_feat.append('shift_model_adcode_mt_label_{0}'.format(i))
    data['model_adcode_mt_{0}'.format(i)] = data['model_adcode_mt'] + i
    data_last = data[~data.label.isnull()].set_index('model_adcode_mt_{0}'.format(i))
    data['shift_model_adcode_mt_label_{0}'.format(i)] = data['model_adcode_mt'].map(data_last['label'])

num_feat = ['regYear'] + shift_feat
cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']

In [9]:
features = num_feat + cate_feat

# data['n_label'] = data['label'] / data.groupby('model')['label'].transform('mean')
train_idx = (data['mt'] <= 20)

valid_idx = (data['mt'].between(21, 24))

test_idx = (data['mt'] > 24)

data['model_weight'] = data.groupby('model')['label'].transform('mean')
data['n_label'] = data['label'] / data['model_weight']

In [15]:
X=data[~test_idx][features]
y=data[~test_idx]['n_label']
test_X=data[test_idx][features]
test_y=data[test_idx]['n_label']

In [14]:
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import lightgbm as lgb
xgb_mdl=xgb.XGBRegressor()
lgb_mdl=lgb.LGBMRegressor()
print(cross_val_score(xgb_mdl,X,y,cv=5,scoring="neg_mean_squared_error").mean())
print(cross_val_score(lgb_mdl,X,y,cv=5,scoring="neg_mean_squared_error").mean())

-0.4325446284994026
-0.2861643153264876


In [17]:
from tpot import TPOTRegressor

tpot = TPOTRegressor(verbosity=2,n_jobs=-1)
tpot.fit(X,y)
tpot.export('tpot_plain.py')

Imputing missing values in feature set


HBox(children=(IntProgress(value=0, description='Optimization Progress', max=10100, style=ProgressStyle(descri…

Generation 1 - Current best internal CV score: -0.2005924830269578


TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: RandomForestRegressor(input_matrix, bootstrap=False, max_features=0.9000000000000001, min_samples_leaf=4, min_samples_split=6, n_estimators=100)


In [10]:
train_x = data[train_idx][features]
train_y = data[train_idx]['n_label']

valid_x = data[valid_idx][features]
valid_y = data[valid_idx]['n_label']

# test_x = data[test_idx][features]

In [11]:
def score(data, pred='pred_label', label='label', group='model'):
    data[pred] = data[pred].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    data_agg = data.groupby('model').agg({
        pred: list,
        label: [list, 'mean'],

    }).reset_index()

    data_agg.columns = ['_'.join(col).strip() for col in data_agg.columns]
    nrmse_score = []
    for raw in data_agg[['{0}_list'.format(pred), '{0}_list'.format(label), '{0}_mean'.format(label)]].values:
        nrmse_score.append(
            mse(raw[0], raw[1]) ** 0.5 / raw[2]
        )
    print('scoring:')
    print(1 - np.mean(nrmse_score))
    return 1 - np.mean(nrmse_score)

In [12]:
def rmsle(y, y_pred):
    return np.sqrt(mse(y, y_pred))

def cat_Regressor(train_x,train_y,valid_x,valid_y):
    cat_model=ctb.CatBoostRegressor(iterations=2000, learning_rate=0.05,
                depth=7, eval_metric='RMSE', cat_features=cate_feat)
    cat_model.fit(train_x,train_y,eval_set=[(train_x, train_y), (valid_x, valid_y),
            ], early_stopping_rounds=100, verbose=100)
    data['cat_pred'] = cat_model.predict(data[features]) * data['model_weight']
    best_score = score(data[valid_idx],pred='cat_pred')
    return cat_model

def xgb_Regressor(train_x, train_y, val_x, val_y):
    xgb_model = xgb.XGBRegressor(max_depth=6,
        eta=0.02,
        objective='reg:linear',
        silent=0)

    xgb_model.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y),
            ], early_stopping_rounds=100, verbose=100)
    data['xgb_pred'] = xgb_model.predict(data[features]) * data['model_weight']
    best_score = score(data[valid_idx],pred='xgb_pred')
    return xgb_model
    
def gboost_Regressor(train_x, train_y, valid_x, valid_y):
    gb_model = GradientBoostingRegressor(n_estimators=3600, learning_rate=0.05,
                                   max_depth=6, max_features='sqrt',
                                   min_samples_leaf=20, min_samples_split=20, 
                    loss='huber', random_state =5)
    train_x=train_x.fillna(0)
    gb_model.fit(train_x, train_y)
    data['gb_pred'] = gb_model.predict(data[features].fillna(0)) * data['model_weight']
    best_score = score(data[valid_idx],pred='gb_pred')
    return gb_model

def lgb_Regressor(train_x, train_y, valid_x, valid_y):
    lgb_model = lgb.LGBMRegressor(
        num_leaves=32, reg_alpha=1, reg_lambda=0.1, objective='mse',
        max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=np.random.randint(1000),
        n_estimators=5000, subsample=0.8, colsample_bytree=0.8
    )
    
    lgb_model.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y),
            ], categorical_feature=cate_feat, early_stopping_rounds=100, verbose=100)
    
    data['lgb_pred'] = lgb_model.predict(data[features]) * data['model_weight']
    best_score = score(data[valid_idx],pred='lgb_pred')
    lgb_model.n_estimators = 666
    return lgb_model

def base_model():
    ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
    lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))

    return ENet,lasso

In [13]:
print("LGBRegressor开始训练...")
lgb_reg= lgb_Regressor(train_x, train_y, valid_x, valid_y)
lgb_pred = lgb_reg.predict(data[test_idx][features])

LGBRegressor开始训练...
Training until validation scores don't improve for 100 rounds.
[100]	training's l2: 0.176296	valid_1's l2: 0.299142
[200]	training's l2: 0.109714	valid_1's l2: 0.263745
[300]	training's l2: 0.0846756	valid_1's l2: 0.261737
[400]	training's l2: 0.0697191	valid_1's l2: 0.258047
[500]	training's l2: 0.0618694	valid_1's l2: 0.258151
Early stopping, best iteration is:
[420]	training's l2: 0.0676747	valid_1's l2: 0.257694
scoring:
0.5944981182653273


In [14]:
print("XGBOOSTRegressor开始训练...")
xgb_reg = xgb_Regressor(train_x, train_y, valid_x, valid_y)
xgb_pred = xgb_reg.predict(data[test_idx][features])

XGBOOSTRegressor开始训练...
[0]	validation_0-rmse:0.959292	validation_1-rmse:1.14599
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[99]	validation_0-rmse:0.418656	validation_1-rmse:0.519423
scoring:
0.5586724872501776


In [15]:
# 效果太差，直接舍弃
print("CatBOOSTRegressor开始训练...")
cat_reg = cat_Regressor(train_x, train_y, valid_x, valid_y)
cat_pred = cat_reg.predict(data[test_idx][features])

CatBOOSTRegressor开始训练...
0:	learn: 1.2738830	test: 1.2727952	test1: 1.5364990	best: 1.5364990 (0)	total: 132ms	remaining: 4m 24s
100:	learn: 0.5246755	test: 0.4895168	test1: 0.6572954	best: 0.6572954 (100)	total: 3.6s	remaining: 1m 7s
200:	learn: 0.4790760	test: 0.4375386	test1: 0.6172712	best: 0.6172712 (200)	total: 6.2s	remaining: 55.5s
300:	learn: 0.4471853	test: 0.4038843	test1: 0.6007424	best: 0.6007422 (299)	total: 9.54s	remaining: 53.9s
400:	learn: 0.4279009	test: 0.3838780	test1: 0.5939624	best: 0.5939624 (400)	total: 13.2s	remaining: 52.7s
500:	learn: 0.4079982	test: 0.3650929	test1: 0.5864255	best: 0.5861429 (499)	total: 17.6s	remaining: 52.7s
600:	learn: 0.3917887	test: 0.3505379	test1: 0.5877531	best: 0.5849090 (533)	total: 22.4s	remaining: 52.1s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.5849089645
bestIteration = 533

Shrink model to first 534 iterations.
scoring:
0.5103347207485311


In [16]:
print("GDBTRegressor开始训练...")
gb_reg = gboost_Regressor(train_x, train_y, valid_x, valid_y)
gb_pred = gb_reg.predict(data[test_idx][features])

GDBTRegressor开始训练...
scoring:
0.600581130327456


In [17]:
# ####Stacking####
print('Stacking...')
stacked_averaged_models = StackingRegressor(
regressors=[lgb_reg, gb_reg],
meta_regressor= xgb_reg
)
stacked_averaged_models.fit(train_x.fillna(0), train_y.fillna(0))
stacked_train_pred = stacked_averaged_models.predict(valid_x.fillna(0))
stacked_averaged_models.fit(data[~test_idx][features].fillna(0), data[~test_idx]['n_label'])
stacked_pred = stacked_averaged_models.predict(data[test_idx][features])

Stacking...


In [18]:
print(rmsle(valid_y, stacked_train_pred))
print(rmsle(valid_y, stacked_train_pred*0.55 + gb_pred*0.20 + 
       lgb_pred*0.25 + xgb_pred*0))
print(rmsle(valid_y, stacked_train_pred))
ensemble = stacked_pred*0.30 + gb_pred*0.30  + lgb_pred*0.20 + xgb_pred*0.20

0.5139839728842504
0.4986124939777301
0.5139839728842504


In [None]:
data.to_csv('data_lgb_rst.csv')

In [None]:
best_score = score(data[valid_idx])
lgb_model.n_estimators = 666

In [None]:
lgb_model.fit(data[~test_idx][features], data[~test_idx]['n_label'], categorical_feature=cate_feat)
data['forecastVolum'] = lgb_model.predict(data[features]) * data['model_weight']
sub = data[test_idx][['id']]
sub['forecastVolum'] = data[test_idx]['forecastVolum'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
sub.to_csv(path + 'lgb_base_0_46.csv', index=False)

In [None]:
sub = data[test_idx][['id']]
sub['forecastVolum'] = (data[test_idx]['cat_pred']).apply(lambda x: 0 if x < 0 else x).round().astype(int)
sub.to_csv('cat.csv', index=False)

In [None]:
sub = data[test_idx][['id']]
data['forecastVolum'] = stacked_averaged_models.predict(data[features].fillna(0)) * data['model_weight']
sub['forecastVolum'] = (data[test_idx]['forecastVolum']).apply(lambda x: 0 if x < 0 else x).round().astype(int)
sub.to_csv('stack_rst.csv', index=False)

In [None]:
best=pd.read_csv('./rst/stacking_lgb_0_46And0_49_original.csv')
today=pd.read_csv('lgb_xgb_cat_gb.csv')
best['forecastVolum'] = (best['forecastVolum']*0.6+today['forecastVolum']*0.4).apply(lambda x: 0 if x < 0 else x).round().astype(int)
best.to_csv('bestAndToday_rst.csv', index=False)