In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import datetime


input_path = '.'

def load_order_data(file_name):
    df = pd.read_csv('%s/%s' % (input_path, file_name))
    c = 'order_unix_time'
    mask = pd.notnull(df[c])#notnull,是一个标注，可以判断每个数据是否为空。为以后可以进行布尔索引
    df.loc[mask, c] = df.loc[mask, c].apply(lambda x: datetime.datetime.fromtimestamp(x))#参考核心编程p286-288
    df.loc[mask, 'date'] = df.loc[mask, c].apply(lambda x: x.strftime('%Y%m%d'))#df.loc[]的用法，[行标，列标]
    df.loc[mask, 'hour'] = df.loc[mask, c].apply(lambda x: x.hour)
    df.loc[mask, 'minute'] = df.loc[mask, c].apply(lambda x: x.minute)
    df.loc[mask, 'weekday'] = df.loc[mask, c].apply(lambda x: x.strftime('%u'))
    df.loc[mask, 'is_weekend'] = df.loc[mask, 'weekday'].apply(lambda x: 1 if (x == 6 or x == 7) else 0)
    df['weekday'] = df['weekday'].apply(lambda x : int(x))
    df.loc[mask, 'is_hot_hour'] = df.loc[mask, 'hour'].apply(lambda x: 1 if x in (11, 17) else 0)
    return df

def load_area_data(file_name):
    df = pd.read_csv('%s/%s' % (input_path, file_name), dtype={'date': str, 'time': str})
    mask = pd.notnull(df['time'])
    df.loc[mask, 'hour'] = df.loc[mask, 'time'].apply(lambda x: int(x[:2]))
    df.loc[mask, 'minute'] = df.loc[mask, 'time'].apply(lambda x: int(x[2:]))
    df.drop(['log_unix_time', 'time'], axis=1, inplace=True)
    df['not_fetched_order_num'] = df['not_fetched_order_num'].apply(lambda x : 0 if x<0 else x);
    df['deliverying_order_num'] = df['deliverying_order_num'].apply(lambda x : 0 if x<0 else x);
    return df

def load_weather_data(file_name):
    df = pd.read_csv('%s/%s' % (input_path, file_name), dtype={'date': str, 'time': str})
    mask = pd.notnull(df['time'])
    df.loc[mask, 'hour'] = df.loc[mask, 'time'].apply(lambda x: int(x[:2]))
    df.loc[mask, 'minute'] = df.loc[mask, 'time'].apply(lambda x: int(x[2:]))
    df.drop(['log_unix_time', 'time'], axis=1, inplace=True)
    return df



In [2]:
print('loading data...')
df_tr = load_order_data('./all_data/waybill_info.csv')
#先查看数据，及describe，发现最大值与最小值，然后找出11点与17点的数据用于以后的训练
mask = (df_tr.delivery_duration < 4654.0) & (df_tr.delivery_duration > 663.0) & ((df_tr.hour.values == 11) | (df_tr.hour.values == 17))
df_tr = df_tr.loc[mask]
df_te = load_order_data('./all_data/waybill_info_test_b.csv')

df_tr_weather = load_weather_data('./all_data/weather_realtime.csv')
df_te_weather = load_weather_data('./all_data/weather_realtime_test.csv')

df_tr_area = load_area_data('./all_data/area_realtime.csv')
df_te_area = load_area_data('./all_data/area_realtime_test.csv')

print('merging data...')
df_tr = pd.merge(df_tr, df_tr_weather, on=['date', 'hour', 'minute', 'area_id'], how='left')
df_tr = pd.merge(df_tr, df_tr_area, on=['date', 'hour', 'minute', 'area_id'], how='left')

df_te = pd.merge(df_te, df_te_weather, on=['date', 'hour', 'minute', 'area_id'], how='left')
df_te = pd.merge(df_te, df_te_area, on=['date', 'hour', 'minute', 'area_id'], how='left')

loading data...
merging data...


In [3]:
print('constructing training data...')
cols = df_tr.columns.tolist()
to_drop = ['order_unix_time', 'arriveshop_unix_time', 'fetch_unix_time', 'finish_unix_time', 'order_id', 'delivery_duration', 'date']
features = list(np.setdiff1d(cols, to_drop))#通过np.setdiffd返回不同将无用特征去掉
print(features)


constructing training data...
['area_id', 'box_total_value', 'customer_latitude', 'customer_longitude', 'delivery_distance', 'deliverying_order_num', 'food_num', 'food_total_value', 'hour', 'is_hot_hour', 'is_weekend', 'minute', 'not_fetched_order_num', 'notbusy_working_rider_num', 'poi_id', 'poi_lat', 'poi_lng', 'rain', 'temperature', 'waiting_order_num', 'weekday', 'wind', 'working_rider_num']


In [4]:
x_train = df_tr[features]
y_train = df_tr['delivery_duration']

x_test = df_te[features]
id_test = df_te['order_id']

In [5]:
from sklearn.preprocessing  import PolynomialFeatures #构造特征的函数
def feat(x_mat,x_mat1,str1,str2):
    poly=PolynomialFeatures(2)
    str3=poly.fit_transform(x_mat1[[str1,str2]])
    str3=pd.DataFrame(str3)
    x_mat[[str1+str2+'1',str1+str2+'2',str1+str2+'3']]=str3[[3,4,5]]
    return x_mat

###开始构造特征

In [6]:
x_train1=x_train.fillna(0)
x_test1=x_test.fillna(0)

In [7]:
#距离2次特征
x_train=feat(x_train,x_train1,'customer_latitude','poi_lat')
x_test=feat(x_test,x_test1,'customer_latitude','poi_lat')

x_train=feat(x_train,x_train1,'customer_longitude','poi_lng')
x_test=feat(x_test,x_test1,'customer_longitude','poi_lng')

#
x_train=feat(x_train,x_train1,'waiting_order_num','working_rider_num')
x_test=feat(x_test,x_test1,'waiting_order_num','working_rider_num')

x_train=feat(x_train,x_train1,'not_fetched_order_num','notbusy_working_rider_num')
x_test=feat(x_test,x_test1,'not_fetched_order_num','notbusy_working_rider_num')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


###结束构造特征

In [8]:
print(x_train.shape)
print(x_test.shape)

dtrain = xgb.DMatrix(x_train.values, y_train)
dtest = xgb.DMatrix(x_test.values)

(85887, 35)
(251864, 35)


In [9]:
print('training model...')
watchlist = [(dtrain, 'train')]#用于evals_result的参数，来显示进度
param = {
        'booster': 'gbtree',
        'objective': 'reg:linear',
        'eval_metric': 'mae',
        'eta': 0.01,
        'num_round': 2000,
        'colsample_bytree': 0.65,
        'subsample': 0.5,
        'max_depth': 5,
        'nthread': -1,
        'seed': 20171001,
        'silent': 1,
    }
bst = xgb.train(param, dtrain, param['num_round'],watchlist, verbose_eval=10)

training model...
[0]	train-mae:1832.67
[10]	train-mae:1657.53
[20]	train-mae:1499.25
[30]	train-mae:1356.01
[40]	train-mae:1226.48
[50]	train-mae:1109.44
[60]	train-mae:1004.32
[70]	train-mae:911.337
[80]	train-mae:830.055
[90]	train-mae:759.446
[100]	train-mae:699.179
[110]	train-mae:647.274
[120]	train-mae:603.275
[130]	train-mae:566.06
[140]	train-mae:534.807
[150]	train-mae:508.547
[160]	train-mae:486.962
[170]	train-mae:468.618
[180]	train-mae:453.473
[190]	train-mae:440.894
[200]	train-mae:430.356
[210]	train-mae:421.732
[220]	train-mae:414.53
[230]	train-mae:408.471
[240]	train-mae:403.431
[250]	train-mae:399.238
[260]	train-mae:395.736
[270]	train-mae:392.715
[280]	train-mae:390.187
[290]	train-mae:388.048
[300]	train-mae:386.062
[310]	train-mae:384.316
[320]	train-mae:382.872
[330]	train-mae:381.593
[340]	train-mae:380.497
[350]	train-mae:379.363
[360]	train-mae:378.36
[370]	train-mae:377.556
[380]	train-mae:376.805
[390]	train-mae:376.017
[400]	train-mae:375.329
[410]	train-

In [None]:
df_tr.to_csv('other_train_feature1.csv', index=False)
df_te.to_csv('other_test_feature1.csv', index=False)

In [10]:
print('generating prediction...')
pred = bst.predict(dtest)



generating prediction...


In [11]:
print('generating submission...')
sub = pd.DataFrame({'order_id': id_test, 'delivery_duration': pred})

print('saving submission...')
sub.to_csv('sub_xgb_starter2000_0.5.csv', index=False)

generating submission...
saving submission...
