In [3]:
import pandas as pd
from tqdm import tqdm
import numpy as np

from sklearn.metrics import mean_squared_error,explained_variance_score
from sklearn.model_selection import KFold
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

In [8]:
# The baseline only uses the GPS positioning data, i.e. train_gps_path.
train_gps_path = '../train0523/train0523.csv'
test_data_path = '../data/A_testData0531.csv'
# NOTE(review): the two paths below are not read by any of the visible cells.
order_data_path = '../data/loadingOrderEvent.csv'
port_data_path = '../data/port.csv'

In [9]:
# Debug switch: when enabled, only the first NDATA rows of the GPS file are read.
debug = True
NDATA = 1000000

# nrows=None tells read_csv to load the whole file, so one call serves both modes.
train_data = pd.read_csv(train_gps_path, nrows=NDATA if debug else None, header=None)

# The training CSV is read with header=None, so the column names are assigned here.
train_data.columns = [
    'loadingOrder', 'carrierName', 'timestamp', 'longitude', 'latitude',
    'vesselMMSI', 'speed', 'direction', 'vesselNextport', 'vesselNextportETA',
    'vesselStatus', 'vesselDatasource', 'TRANSPORT_TRACE',
]
test_data = pd.read_csv(test_data_path)

In [10]:
def get_data(data, mode='train'):
    """Cast the raw columns of a GPS frame to their working dtypes.

    The frame is modified in place and the same object is returned.

    Args:
        data: DataFrame holding ``timestamp``, ``longitude``, ``latitude``,
            ``speed``, ``direction`` and ``loadingOrder`` columns, plus
            ``vesselNextportETA`` in train mode or ``onboardDate`` in test mode.
        mode: ``'train'`` or ``'test'``.

    Returns:
        ``data`` with datetime columns parsed, numeric columns cast to float
        and ``loadingOrder`` cast to str. In test mode the unparsed timestamp
        values are additionally kept in a ``temp_timestamp`` column.

    Raises:
        AssertionError: if ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    assert mode == 'train' or mode == 'test'

    # pd.to_datetime is called without infer_datetime_format: the flag is
    # deprecated since pandas 2.0 (format inference is the default there) and
    # passing it only produces a warning.
    if mode == 'train':
        data['vesselNextportETA'] = pd.to_datetime(data['vesselNextportETA'])
    elif mode == 'test':
        # Keep the unparsed timestamp values so they can be restored later.
        data['temp_timestamp'] = data['timestamp']
        data['onboardDate'] = pd.to_datetime(data['onboardDate'])
    data['timestamp'] = pd.to_datetime(data['timestamp'])

    data['longitude'] = data['longitude'].astype(float)
    data['loadingOrder'] = data['loadingOrder'].astype(str)
    data['latitude'] = data['latitude'].astype(float)
    data['speed'] = data['speed'].astype(float)
    data['direction'] = data['direction'].astype(float)

    return data

# Normalise the dtypes of both frames before feature extraction.
train_data = get_data(data=train_data, mode='train')
test_data = get_data(data=test_data, mode='test')

In [18]:
# Code adapted from: https://github.com/juzstu/TianChi_HaiYang
def get_feature(df, mode='train'):
    """Aggregate per-point GPS records into one feature row per loadingOrder.

    Side effects: ``df`` is sorted in place by (loadingOrder, timestamp) and
    gains the helper columns ``lat_diff``, ``lon_diff``, ``speed_diff``,
    ``diff_minutes`` and ``anchor``.

    Args:
        df: DataFrame with ``loadingOrder``, ``timestamp`` (datetime),
            ``latitude``, ``longitude``, ``speed`` and ``direction`` columns.
        mode: ``'train'`` or ``'test'``. In train mode the output also carries
            ``mmax``/``mmin`` (last/first timestamp of the order) and
            ``label`` (their difference in seconds).

    Returns:
        DataFrame with one row per loadingOrder: the point ``count``, the
        ``anchor_cnt``/``anchor_ratio`` features and min/max/mean/median of
        latitude, longitude, speed and direction.

    Raises:
        AssertionError: if ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    assert mode == 'train' or mode == 'test'

    df.sort_values(['loadingOrder', 'timestamp'], inplace=True)

    # Point-to-point deltas inside each order; the first point of every order
    # has no predecessor and therefore gets NaN.
    df['lat_diff'] = df.groupby('loadingOrder')['latitude'].diff(1)
    df['lon_diff'] = df.groupby('loadingOrder')['longitude'].diff(1)
    df['speed_diff'] = df.groupby('loadingOrder')['speed'].diff(1)
    df['diff_minutes'] = df.groupby('loadingOrder')['timestamp'].diff(1).dt.total_seconds() // 60

    # A point is flagged as "anchored" when all four deltas are at or below
    # their thresholds. Comparisons against NaN are False, so the first point
    # of an order is never flagged.
    # NOTE(review): the deltas are signed, so any negative change passes the
    # test regardless of magnitude -- confirm whether abs() was intended.
    df['anchor'] = (
        (df['lat_diff'] <= 0.03)
        & (df['lon_diff'] <= 0.03)
        & (df['speed_diff'] <= 0.3)
        & (df['diff_minutes'] <= 10)
    ).astype(int)

    # A list aggregation followed by a rename is used instead of a renaming
    # dict, which SeriesGroupBy.agg rejects on pandas >= 1.0.
    stamp_stats = df.groupby('loadingOrder')['timestamp'].agg(['max', 'count', 'min'])
    if mode == 'train':
        group_df = stamp_stats.rename(columns={'max': 'mmax', 'min': 'mmin'}).reset_index()
        # The label is the time span covered by the order's records, in seconds.
        group_df['label'] = (group_df['mmax'] - group_df['mmin']).dt.total_seconds()
    else:
        group_df = stamp_stats[['count']].reset_index()

    anchor_df = df.groupby('loadingOrder')['anchor'].agg('sum').reset_index()
    anchor_df.columns = ['loadingOrder', 'anchor_cnt']
    group_df = group_df.merge(anchor_df, on='loadingOrder', how='left')
    group_df['anchor_ratio'] = group_df['anchor_cnt'] / group_df['count']

    # Only position, speed and direction are summarised as features.
    agg_function = ['min', 'max', 'mean', 'median']
    agg_col = ['latitude', 'longitude', 'speed', 'direction']

    group = df.groupby('loadingOrder')[agg_col].agg(agg_function).reset_index()
    # Flatten the (column, statistic) MultiIndex into "<column>_<statistic>".
    group.columns = ['loadingOrder'] + ['{}_{}'.format(i, j) for i in agg_col for j in agg_function]
    group_df = group_df.merge(group, on='loadingOrder', how='left')

    return group_df
    
# Build one feature row per loadingOrder for both splits.
train = get_feature(df=train_data, mode='train')
test = get_feature(df=test_data, mode='test')
# Model inputs: every train column except the key, the target and the
# timestamp aggregates.
features = [
    col for col in train.columns
    if col not in ('loadingOrder', 'label', 'mmin', 'mmax', 'count')
]

In [19]:
def build_model(train, test, pred, label, seed=1080, is_shuffle=True, n_splits=10):
    """Train one LightGBM regressor per K-fold split and average test predictions.

    Side effect: a ``label`` column holding the averaged prediction is written
    into ``test``.

    Args:
        train: DataFrame containing the feature columns and the target column.
        test: DataFrame containing the feature columns and ``loadingOrder``.
        pred: list of feature column names.
        label: name of the target column in ``train``.
        seed: random state of the K-fold shuffling (used only when shuffling).
        is_shuffle: whether KFold shuffles the rows before splitting.
        n_splits: number of folds.

    Returns:
        ``test[['loadingOrder', 'label']]`` with the fold-averaged prediction.
    """
    test_pred = np.zeros((test.shape[0], ))

    # scikit-learn raises when a random_state is combined with shuffle=False,
    # so the seed is only forwarded when shuffling is requested.
    fold = KFold(n_splits=n_splits, shuffle=is_shuffle,
                 random_state=seed if is_shuffle else None)

    params = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'num_leaves': 36,
        'metric': 'mse',
        'feature_fraction': 0.6,
        'bagging_fraction': 0.7,
        'bagging_freq': 6,
        'seed': 8,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': 8,
        'verbose': 1,
    }

    # Early stopping and periodic logging are configured through callbacks:
    # the early_stopping_rounds / verbose_eval keyword arguments of lgb.train
    # no longer exist in LightGBM >= 4.0. log_evaluation is the current name of
    # the logging callback; older releases only provide print_evaluation.
    make_logger = getattr(lgb, 'log_evaluation', None) or getattr(lgb, 'print_evaluation')

    for train_idx, valid_idx in fold.split(train[pred]):
        train_x, train_y = train[pred].iloc[train_idx], train[label].iloc[train_idx]
        valid_x, valid_y = train[pred].iloc[valid_idx], train[label].iloc[valid_idx]

        n_train = lgb.Dataset(train_x, label=train_y)
        n_valid = lgb.Dataset(valid_x, label=valid_y)

        clf = lgb.train(
            params=params,
            train_set=n_train,
            num_boost_round=3000,
            valid_sets=[n_valid],
            # Fresh callback objects per fold: the early-stopping callback
            # keeps per-run state.
            callbacks=[lgb.early_stopping(stopping_rounds=100), make_logger(100)],
        )
        # Each fold contributes an equal share of the final test prediction.
        test_pred += clf.predict(test[pred], num_iteration=clf.best_iteration) / n_splits

    test['label'] = test_pred

    return test[['loadingOrder', 'label']]

# Fit the K-fold models and collect the per-order predictions for the test set.
result = build_model(train=train, test=test, pred=features, label='label', is_shuffle=True)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 4.10178e+10
[200]	valid_0's l2: 1.68453e+10
[300]	valid_0's l2: 1.14587e+10
[400]	valid_0's l2: 9.92733e+09
[500]	valid_0's l2: 9.34437e+09
[600]	valid_0's l2: 9.07332e+09
[700]	valid_0's l2: 8.81451e+09
[800]	valid_0's l2: 8.75262e+09
[900]	valid_0's l2: 8.68566e+09
Early stopping, best iteration is:
[843]	valid_0's l2: 8.63748e+09
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 5.57278e+10
[200]	valid_0's l2: 2.43323e+10
[300]	valid_0's l2: 1.49322e+10
[400]	valid_0's l2: 1.0925e+10
[500]	valid_0's l2: 8.97585e+09
[600]	valid_0's l2: 7.74915e+09
[700]	valid_0's l2: 6.88507e+09
[800]	valid_0's l2: 6.41256e+09
[900]	valid_0's l2: 6.09157e+09
[1000]	valid_0's l2: 5.67147e+09
[1100]	valid_0's l2: 5.39496e+09
[1200]	valid_0's l2: 5.1354e+09
[1300]	valid_0's l2: 4.97892e+09
[1400]	valid_0's l2: 4.85253e+09
[1500]	valid_0's l2: 4.73472e+09
[1600]	valid_0's l2: 4.63557e+09
[170

[400]	valid_0's l2: 1.39659e+10
[500]	valid_0's l2: 1.28897e+10
[600]	valid_0's l2: 1.21006e+10
[700]	valid_0's l2: 1.17397e+10
[800]	valid_0's l2: 1.14014e+10
[900]	valid_0's l2: 1.11774e+10
[1000]	valid_0's l2: 1.0961e+10
[1100]	valid_0's l2: 1.08757e+10
[1200]	valid_0's l2: 1.07819e+10
[1300]	valid_0's l2: 1.06504e+10
[1400]	valid_0's l2: 1.06117e+10
Early stopping, best iteration is:
[1357]	valid_0's l2: 1.05667e+10


In [20]:
# NOTE: this cell rebinds both `test_data` and `result`, so it must be run
# exactly once after the model cell; re-running it without re-running the
# earlier cells will fail.

# strftime layout shared by every date column of the submission.
# NOTE(review): the layout has two spaces between date and time -- confirm
# against the required submission format.
time_format = '%Y/%m/%d  %H:%M:%S'

# Attach the predicted duration (in seconds) to every test row of the order.
test_data = test_data.merge(result, on='loadingOrder', how='left')
# ETA = onboard time + predicted duration.
eta = test_data['onboardDate'] + pd.to_timedelta(test_data['label'], unit='s')
test_data['ETA'] = eta.dt.strftime(time_format)
test_data = test_data.drop(['direction', 'TRANSPORT_TRACE'], axis=1)
test_data['onboardDate'] = test_data['onboardDate'].dt.strftime(time_format)
# pd.Timestamp.now() is used because the pd.datetime alias no longer exists
# in pandas >= 2.0.
test_data['creatDate'] = pd.Timestamp.now().strftime(time_format)
# Restore the unparsed timestamp values saved in temp_timestamp.
test_data['timestamp'] = test_data['temp_timestamp']
# Arrange the columns in the submission order.
result = test_data[['loadingOrder', 'timestamp', 'longitude', 'latitude', 'carrierName', 'vesselMMSI', 'onboardDate', 'ETA', 'creatDate']]

In [21]:
# Write the frame to result.csv (relative to the working directory) without the index column.
result.to_csv('result.csv', index=False)

In [22]:
# Preview only the first rows instead of rendering the whole frame.
result.head(10)

Unnamed: 0,loadingOrder,timestamp,longitude,latitude,carrierName,vesselMMSI,onboardDate,ETA,creatDate
0,CF946210847851,2019-04-02T02:42:28.000Z,138.471062,40.278787,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/03 12:40:14,2020/06/26 18:41:26
1,CF946210847851,2019-04-02T02:59:28.000Z,138.552168,40.327785,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/03 12:40:14,2020/06/26 18:41:26
2,CF946210847851,2019-04-02T03:07:28.000Z,138.588250,40.352542,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/03 12:40:14,2020/06/26 18:41:26
3,CF946210847851,2019-04-02T03:43:28.000Z,138.751325,40.459447,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/03 12:40:14,2020/06/26 18:41:26
4,CF946210847851,2019-04-02T04:29:28.000Z,138.969782,40.581485,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/03 12:40:14,2020/06/26 18:41:26
5,CF946210847851,2019-04-02T04:41:28.000Z,139.023647,40.617033,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/03 12:40:14,2020/06/26 18:41:26
6,CF946210847851,2019-04-02T04:49:28.000Z,139.059750,40.641672,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/03 12:40:14,2020/06/26 18:41:26
7,CF946210847851,2019-04-02T04:53:28.000Z,139.077772,40.654002,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/03 12:40:14,2020/06/26 18:41:26
8,CF946210847851,2019-04-02T05:13:28.000Z,139.171020,40.710698,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/03 12:40:14,2020/06/26 18:41:26
9,CF946210847851,2019-04-02T05:17:28.000Z,139.190528,40.720732,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/03 12:40:14,2020/06/26 18:41:26
