In [1]:
## 基础工具
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time

warnings.filterwarnings('ignore')
%matplotlib inline

## 模型预测的
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

## 数据降维处理的
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA

import lightgbm as lgb
import xgboost as xgb

## 参数搜索和评价的
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

In [2]:
## 通过Pandas对于数据进行读取 (pandas是一个很友好的数据读取函数库)
Train_data = pd.read_csv('used_car_train_20200313.csv', sep=' ')
TestA_data = pd.read_csv('used_car_testB_20200421.csv', sep=' ')

## 输出数据的大小信息
print('Train data shape:',Train_data.shape)
print('TestA data shape:',TestA_data.shape)

Train data shape: (150000, 31)
TestA data shape: (50000, 30)


In [None]:
Field	Description
SaleID	交易ID，唯一编码
name	汽车交易名称，已脱敏
regDate	汽车注册日期，例如20160101，2016年01月01日
model	车型编码，已脱敏
brand	汽车品牌，已脱敏
bodyType	车身类型：豪华轿车：0，微型车：1，厢型车：2，大巴车：3，敞篷车：4，双门汽车：5，商务车：6，搅拌车：7
fuelType	燃油类型：汽油：0，柴油：1，液化石油气：2，天然气：3，混合动力：4，其他：5，电动：6
gearbox	变速箱：手动：0，自动：1
power	发动机功率：范围 [ 0, 600 ]
kilometer	汽车已行驶公里，单位万km
notRepairedDamage	汽车有尚未修复的损坏：是：0，否：1
regionCode	地区编码，已脱敏
seller	销售方：个体：0，非个体：1
offerType	报价类型：提供：0，请求：1
creatDate	汽车上线时间，即开始售卖时间
price	二手车交易价格（预测目标）
v系列特征	匿名特征，包含v0-14在内15个匿名特征

In [3]:
Train_data.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,price,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,0.0,1046,0,0,20160404,1850,43.357796,3.966344,0.050257,2.159744,1.143786,0.235676,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,-,4366,0,0,20160309,3600,45.305273,5.236112,0.137925,1.380657,-1.422165,0.264777,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,0.0,2806,0,0,20160402,6222,45.978359,4.823792,1.319524,-0.998467,-0.996911,0.25141,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.56533,-0.832687,-0.229963
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,0.0,434,0,0,20160312,2400,45.687478,4.492574,-0.050616,0.8836,-2.228079,0.274293,0.1103,0.121964,0.033395,0.0,-4.509599,1.28594,-0.501868,-2.438353,-0.478699
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,0.0,6977,0,0,20160313,5200,44.383511,2.031433,0.572169,-1.571239,2.246088,0.228036,0.073205,0.09188,0.078819,0.121534,-1.89624,0.910783,0.93111,2.834518,1.923482


In [19]:
int(str(Train_data['regDate'][0])[4:6])

4

In [21]:
# 汽车注册日期拆分年月特征
Train_data['regDate_year'] = Train_data['regDate'].apply(lambda x:int(str(x)[:4]))
Train_data['regDate_month'] = Train_data['regDate'].apply(lambda x:int(str(x)[4:6]))

In [27]:
# 汽车注册日期拆分年月特征
TestA_data['regDate_year'] = TestA_data['regDate'].apply(lambda x:int(str(x)[:4]))
TestA_data['regDate_month'] = TestA_data['regDate'].apply(lambda x:int(str(x)[4:6]))

In [23]:
Train_data['regDate_month'][:3]

0    4
1    3
2    4
Name: regDate_month, dtype: int64

In [24]:
# 获取连续变量，得到的是Index类型
numerical_cols = Train_data.select_dtypes(exclude='object').columns
print(numerical_cols)

Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
       'gearbox', 'power', 'kilometer', 'regionCode', 'seller', 'offerType',
       'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6',
       'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14',
       'regDate_year', 'regDate_month'],
      dtype='object')


In [25]:
# 获取离散变量，得到的是Index类型
categorical_cols = Train_data.select_dtypes(include='object').columns
print(categorical_cols)

Index(['notRepairedDamage'], dtype='object')


In [28]:
# 选择特征列
feature_cols = [col for col in numerical_cols if col not in ['SaleID','name','regDate','creatDate','price','model','brand','regionCode','seller']]
feature_cols = [col for col in feature_cols if 'Type' not in col]

# 分开特征列和标签列
X_data = Train_data[feature_cols]
Y_data = Train_data['price']

X_test = TestA_data[feature_cols]

print('X train shape:', X_data.shape)
print('X test shape:', X_test.shape)

X train shape: (150000, 20)
X test shape: (50000, 20)


In [29]:
X_data = X_data.fillna(-1)
X_test = X_test.fillna(-1)

In [32]:
xgr = xgb.XGBRegressor(n_estimators=120, learning_rate=0.1, gamma=0,subsample=0.8,colsample_bytree=0.9,max_depth=7)

scores_train = []
scores = []

sk = StratifiedKFold(n_splits=5,shuffle=True,random_state=0)
for train_ind, val_ind in sk.split(X_data,Y_data):
    train_x = X_data.iloc[train_ind].values
    train_y = Y_data.iloc[train_ind]
    val_x = X_data.iloc[val_ind].values
    val_y = Y_data.iloc[val_ind]
    
    xgr.fit(train_x, train_y)
    pred_train_xgb = xgr.predict(train_x)
    pred_xgb = xgr.predict(val_x)
    
    #  mean_absolute_error(y_true, y_pred) 计算平均绝对误差
    score_train = mean_absolute_error(train_y, pred_train_xgb)
    scores_train.append(score_train)
    score = mean_absolute_error(val_y, pred_xgb)
    scores.append(score)

print('Train mae:', np.mean(score_train))
print('Val mae:', np.mean(scores))

Train mae: 554.4277917253668
Val mae: 642.2329763262685


In [33]:
def bulid_model_xgb(x_train, y_train):
    model = xgb.XGBRegressor(n_estimators=120, learning_rate=0.1, gamma=0,subsample=0.8,colsample_bytree=0.9,max_depth=7)
    model.fit(x_train, y_train)
    
    return model

def bulid_model_lgb(x_train, y_train):
    model = lgb.LGBMRegressor(num_leaves=127, n_estimators = 150)
    param_gird = {'learning_rate':[0.01, 0.05, 0.1, 0.2]}
    gbm = GridSearchCV(model, param_gird)
    gbm.fit(x_train, y_train)
    
    return gbm

In [34]:
x_train, x_val, y_train, y_val = train_test_split(X_data,Y_data, test_size=0.3)

In [35]:
def Sta_inf(data):
    print('_min',np.min(data))
    print('_max:',np.max(data))
    print('_mean',np.mean(data))
    print('_ptp',np.ptp(data))
    print('_std',np.std(data))
    print('_var',np.var(data))

In [39]:
print('Train lgb...')
model_lgb = bulid_model_lgb(x_train, y_train)
val_lgb = model_lgb.predict(x_val)
MAE_lgb = mean_absolute_error(y_val, val_lgb)
print('MAE of val with lgb:', MAE_lgb)

print('Predict lgb..')
model_lgb_pre = bulid_model_lgb(X_data, Y_data)
subA_lgb = model_lgb_pre.predict(X_test)
print('Sta of Predict lgb:')
Sta_inf(subA_lgb)

Train lgb...
MAE of val with lgb: 615.3086368008733
Predict lgb..
Sta of Predict lgb:
_min -644.0403677804414
_max: 91840.55356525585
_mean 5906.069365995844
_ptp 92484.59393303629
_std 7353.997388602548
_var 54081277.591573104


In [41]:
print('Train xgb...')
model_xgb = bulid_model_xgb(x_train, y_train)
val_xgb = model_xgb.predict(x_val)
MAE_xgb = mean_absolute_error(y_val, val_xgb)
print('MAE of val with xgb:', MAE_xgb)

print('Predict xbg...')
model_xgb_pre = bulid_model_xgb(X_data, Y_data)
subA_xgb = model_xgb_pre.predict(X_test)
print('Sta of Predict xgb:')
Sta_inf(subA_xgb)

Train xgb...
MAE of val with xgb: 654.568998788876
Predict xbg...
Sta of Predict xgb:
_min -969.0882
_max: 90517.86
_mean 5905.0337
_ptp 91486.945
_std 7332.701
_var 53768504.0


In [48]:
#  简单的加权融合方式
val_Weighted = (1-MAE_lbg/(MAE_xgb+MAE_lgb))*val_lgb+(1-MAE_xgb/(MAE_xgb+MAE_lgb))*val_xgb
val_Weighted[val_Weighted < 0] = 10

In [49]:
print('MAE of val with Weighted ensemble:',mean_absolute_error(y_val,val_Weighted))

MAE of val with Weighted ensemble: 615.2875961245525


In [51]:
sub_Weighted =  (1-MAE_lbg/(MAE_xgb+MAE_lgb))*subA_lgb+(1-MAE_xgb/(MAE_xgb+MAE_lgb))*subA_xgb
sub_Weighted[sub_Weighted < 0] = 10

In [52]:
sub = pd.DataFrame()
sub['SaleID'] = TestA_data.SaleID
sub['price'] = sub_Weighted

In [53]:
sub.head()

Unnamed: 0,SaleID,price
0,200000,1169.822129
1,200001,1839.169026
2,200002,8502.65725
3,200003,1273.072256
4,200004,1949.556477


In [54]:
# 保存至CSV文件
# sub.to_csv('val_Weighted.csv', index=False)