In [1]:
# Install the third-party packages this notebook depends on.
!pip install seaborn
# NOTE(review): the captured output of this cell contains a
# `cmdoptions.check_install_build_global(options)` line, presumably pip's warning
# about the `--install-option` flag -- confirm the GPU build is actually installed.
!pip install lightgbm --install-option=--gpu
!pip install xgboost
!pip install missingno
!pip install bayesian-optimization
!pip install pandas_profiling
!pip install ipywidgets

Looking in indexes: https://mirror.baidu.com/pypi/simple/
  cmdoptions.check_install_build_global(options)
Looking in indexes: https://mirror.baidu.com/pypi/simple/
Looking in indexes: https://mirror.baidu.com/pypi/simple/
Looking in indexes: https://mirror.baidu.com/pypi/simple/
Looking in indexes: https://mirror.baidu.com/pypi/simple/
Looking in indexes: https://mirror.baidu.com/pypi/simple/
Looking in indexes: https://mirror.baidu.com/pypi/simple/


## 准备工作

In [2]:
## 基础工具
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time

warnings.filterwarnings('ignore')
%matplotlib inline

## 模型预测
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.model_selection  import cross_val_score,KFold
## 数据降维处理
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA

import lightgbm as lgb
import xgboost as xgb

## 参数搜索和评价的
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from bayes_opt import BayesianOptimization
## 数据分析
import pandas_profiling as pp



## 2 数据处理

### Step 1:数据读取

In [3]:
## Load the train and test sets with pandas; both files are read as space-separated.
Train_data = pd.read_csv('./work/used_car_train_20200313.csv', sep=' ')
Test_data = pd.read_csv('./work/used_car_testB_20200421.csv', sep=' ')

## Print the (rows, columns) shape of each dataset.
print('Train data shape:',Train_data.shape)
print('Test data shape:',Test_data.shape)

Train data shape: (150000, 31)
Test data shape: (50000, 30)


In [None]:
## Build a pandas_profiling report of the raw training data (minimal mode).
profile = Train_data.profile_report(title="原始数据",minimal=True)
# Save the report as an HTML file next to the data.
profile.to_file("./work/origin_report.html")
# Render the same report inside the notebook, once as widgets and once as an iframe.
profile.to_widgets()
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/39 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

### Step 2:数据分析与预处理
查看导出的数据报告分析可知:
+ offerType 值为常量0，无法对price做出影响，故删去此特征
+ SaleID 为编号，看不出与价格有潜在关系，不作为建模特征（该列保留用于统计与生成提交文件，仅在选择特征列时排除）
+ 分析报告中指出 bodyType、fuelType、gearbox 存在缺失值，如需使用后续需要处理（可以考虑采用能够处理缺省值的模型）
+ 观察可知price并不是一个常见分布，可以对其进行一定处理

In [None]:
# Stack the train and test rows into one frame so every preprocessing step is
# applied to both. The 'train' flag (1 = train row, 0 = test row) is what later
# cells filter on to split the frame apart again.
Train_data['train']=1
Test_data['train']=0
data = pd.concat([Train_data, Test_data], ignore_index=True)
data.columns

In [None]:
# 'name' contains repeated values: record how many rows share each name.
data['name_count'] = data.groupby('name')['SaleID'].transform('count')

# Drop the columns that are not used any further.
data = data.drop(columns=['name', 'offerType', 'seller'])

# Model log(1 + price) instead of the raw price.
data['price'] = np.log1p(data['price'])

In [None]:
# Fill missing values with 0 in the four columns below (per the original note,
# 0 is the mode of each column as read off the profiling report).
data = data.fillna({'fuelType': 0, 'gearbox': 0, 'bodyType': 0, 'model': 0})

# Visualise what is still missing on a random sample of 1500 rows.
msno.matrix(data.sample(1500))

In [None]:
# Outlier handling.
# Cap power at 600 (the competition statement limits power to <= 600).
data['power'] = data['power'].clip(upper=600)

# notRepairedDamage uses '-' as a placeholder: replace it with 1 and store the
# column as float32.
data['notRepairedDamage'] = (
    data['notRepairedDamage']
    .astype('str')
    .where(lambda text: text != '-', '1')
    .astype('float32')
)

In [None]:
# Bucket continuous features into 10-unit-wide bins (kilometer already arrives
# bucketed). The edge lists get their own names instead of `bin`, which would
# shadow Python's built-in bin() for the rest of the kernel session.
#
# pd.cut uses right-closed intervals by default and does not include the lowest
# edge, so a value of exactly 0, or any value above the last edge, gets NaN as
# its bin index.
power_edges = list(range(0, 310, 10))   # 0, 10, ..., 300
data['power_bin'] = pd.cut(data['power'], power_edges, labels=False)

model_edges = list(range(0, 240, 10))   # 0, 10, ..., 230
data['model_bin'] = pd.cut(data['model'], model_edges, labels=False)

### Step 3:特征与标签构建

#### 1) 特征构造

In [None]:
# 构造使用时间特征：data['creatDate'] - data['regDate']，反应汽车使用时间，一般来说价格与使用时间成反比

#时间提取出年，月，日和使用时间
from datetime import datetime
def date_process(x):
    """Convert a ``YYYYMMDD`` value into a ``datetime``.

    Args:
        x: Anything whose ``str()`` form starts with eight digits laid out as
            year (4), month (2), day (2), e.g. ``20040402``.

    Returns:
        ``datetime(year, month, day)`` at midnight.

    Raises:
        ValueError: If a slice is not an integer, or the resulting year/month/day
            combination is not a valid calendar date.
    """
    digits = str(x)
    year, month, day = (int(digits[lo:hi]) for lo, hi in ((0, 4), (4, 6), (6, 8)))

    # A month below 1 (e.g. "00") is coerced to January so the date is constructible.
    return datetime(year, max(month, 1), day)

# Parse the raw YYYYMMDD columns into datetimes.
data['regDate'] = data['regDate'].apply(date_process)
data['creatDate'] = data['creatDate'].apply(date_process)

# Vehicle age at listing time, in days and in years (one decimal place).
# The per-date year/month/day columns were previously created and then deleted
# again without being used, so they are no longer materialised at all.
data['car_age_day'] = (data['creatDate'] - data['regDate']).dt.days
data['car_age_year'] = round(data['car_age_day'] / 365, 1)

# The datetime columns themselves are not kept as features.
data = data.drop(columns=['creatDate', 'regDate'])

In [None]:
# Visualise missing values on a random sample of 1500 rows.
msno.matrix(data.sample(1500))

In [None]:
# Derive a city identifier from the region (postal) code by dropping its last
# three digits -- prior knowledge: listings from the same city are expected to
# follow the same price pattern.
#
# Integer division keeps the column numeric. Slicing the string form instead
# yields an object column (and '' for codes below 1000), which the
# select_dtypes(exclude='object') feature selection later in this notebook
# silently drops.
# NOTE(review): assumes regionCode is numeric with no missing values -- confirm.
data['city'] = data['regionCode'] // 1000
del data['regionCode']

In [None]:
# Show the shape and column names of the combined frame at this point.
print(data.shape)
data.columns

In [None]:
# Split the combined frame back apart: training rows are those flagged train == 1.
# .drop() returns a new frame, so the helper column is removed without deleting
# from a filtered slice of `data` (a chained assignment pandas warns about).
Train_data = data[data['train'] == 1].drop(columns=['train'])

## Print the shape and per-column dtype / non-null summary.
print('Train data shape:',Train_data.shape)
Train_data.info()

In [None]:
# Test rows are those flagged train == 0. Both the helper flag and the 'price'
# target column are removed. .drop() returns a new frame instead of deleting
# from a filtered slice of `data` (a chained assignment pandas warns about).
Test_data = data[data['train'] == 0].drop(columns=['train', 'price'])
print('Test data shape:',Test_data.shape)
Test_data.info()

In [None]:
# Column names split by dtype: everything that is not `object`, and everything that is.
numerical_cols = Train_data.select_dtypes(exclude = 'object').columns
categorical_cols = Train_data.select_dtypes(include = 'object').columns

## Feature columns: numeric columns, minus the id / target / name columns and
## minus any column whose name contains 'Type'.
feature_cols = [
    col
    for col in numerical_cols
    if col not in ['SaleID','price','name'] and 'Type' not in col
]

## Build the training features, the training target and the test features.
Y_train = Train_data['price']
X_train = Train_data[feature_cols]
test  = Test_data[feature_cols]

print('X train shape:',X_train.shape)
print('X test shape:',test.shape)

In [None]:
# Profile the engineered training features (minimal mode); the report is the cell output.
X_train.profile_report(title="新数据",minimal=True)

## 模型训练与预测

#### 1) 模型定义

In [None]:
def build_model_xgb(x_train,y_train):
    """Grid-search an XGBoost regressor and return the fitted search object.

    Args:
        x_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``GridSearchCV`` instance. Candidates are ranked by negative
        mean absolute error over ``max_depth``, ``learning_rate`` and
        ``n_estimators``.
    """
    search_space = {
        'max_depth': [4, 6, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.5],
        'n_estimators': [100, 150, 200],
    }
    regressor = xgb.XGBRegressor(nthread=15)
    search = GridSearchCV(
        regressor,
        search_space,
        scoring="neg_mean_absolute_error",
    )
    search.fit(x_train, y_train)
    return search

def build_model_lgb(x_train,y_train):
    """Grid-search a LightGBM regressor and return the fitted search object.

    Args:
        x_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``GridSearchCV`` instance; only ``learning_rate`` is searched,
        with ``num_leaves=127`` and ``n_estimators=150`` held fixed.
    """
    estimator = lgb.LGBMRegressor(num_leaves=127,n_estimators = 150)
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
    }
    # Rank candidates by MAE, matching build_model_xgb and the metric reported
    # for the validation split. Without `scoring`, GridSearchCV falls back to
    # the estimator's own score (R^2 for regressors).
    gbm = GridSearchCV(estimator, param_grid, scoring="neg_mean_absolute_error")
    gbm.fit(x_train, y_train)
    return gbm

In [None]:
## Hold out 20% of the training rows as a validation set.
## A fixed random_state keeps the split -- and therefore the reported MAE values
## and the blend weights derived from them -- reproducible across kernel restarts.
x_train,x_val,y_train,y_val = train_test_split(X_train,Y_train,test_size=0.2,random_state=42)

In [None]:
# Fit the XGBoost grid search, then predict the validation and test sets.
print('predict XGB...')
model_xgb = build_model_xgb(x_train, y_train)
val_xgb = model_xgb.predict(x_val)
subA_xgb = model_xgb.predict(test)
print('predict XGB...done')

# Predictions are in log1p space; undo the transform so the MAE is in price units.
MAE_xgb = mean_absolute_error(
    y_true=np.expm1(y_val),
    y_pred=np.expm1(val_xgb),
)
print(MAE_xgb)

In [None]:
# Fit the LightGBM grid search, then predict the validation and test sets.
print('predict lgb...')
model_lgb = build_model_lgb(x_train, y_train)
val_lgb = model_lgb.predict(x_val)
subA_lgb = model_lgb.predict(test)
print('predict done...')

# Predictions are in log1p space; undo the transform so the MAE is in price units.
MAE_lgb = mean_absolute_error(
    y_true=np.expm1(y_val),
    y_pred=np.expm1(val_lgb),
)
print(MAE_lgb)

In [None]:
# Blend the two models' log-space predictions. Each model is weighted by the
# *other* model's share of the combined validation MAE, so the model with the
# smaller error gets the larger weight; the two weights sum to 1.
weight_lgb = 1 - MAE_lgb / (MAE_xgb + MAE_lgb)
weight_xgb = 1 - MAE_xgb / (MAE_xgb + MAE_lgb)

# Validation blend: back to price units, negative prices replaced by 10, then scored.
val_Weighted = weight_lgb * val_lgb + weight_xgb * val_xgb
val_Weighted = np.expm1(val_Weighted)
val_Weighted[val_Weighted < 0] = 10
MAE = mean_absolute_error(y_true=np.expm1(y_val), y_pred=val_Weighted)
print(MAE)

# Test blend with the same weights and the same post-processing.
sub_Weighted = weight_lgb * subA_lgb + weight_xgb * subA_xgb
sub_Weighted = np.expm1(sub_Weighted)
sub_Weighted[sub_Weighted < 0] = 10

In [None]:
# Rebuild sub_Weighted from the XGBoost test predictions alone: undo the log1p
# transform and replace negative prices with 10.
# NOTE(review): this rebinds sub_Weighted, so when it runs after the blending
# cell the weighted result is discarded and the CSV written afterwards contains
# XGBoost-only predictions -- presumably a leftover experiment; confirm which
# submission is intended.
sub_Weighted = np.expm1(subA_xgb)
sub_Weighted[sub_Weighted<0]=10

In [None]:
# Write the submission file: one row per test listing with its predicted price.
sub = pd.DataFrame({'SaleID': Test_data.SaleID, 'price': sub_Weighted})
sub.to_csv('./work/sub_Weighted.csv', index=False)