In [2]:
import os, sys, glob
import numpy as np
import pandas as pd

import time
import datetime

from joblib import Parallel, delayed
from sklearn.metrics import f1_score, log_loss, classification_report
from sklearn.model_selection import StratifiedKFold

import lightgbm as lgb  # 系统首次安装

In [3]:
test = pd.read_csv('hy_round1_testA_20200102/7000.csv')

In [4]:
test.head()

Unnamed: 0,渔船ID,x,y,速度,方向,time
0,7000,7118845.0,5918277.0,0.11,0,1103 11:54:32
1,7000,7118940.0,5918285.0,0.32,346,1103 11:44:32
2,7000,7118948.0,5918174.0,0.11,0,1103 11:34:43
3,7000,7118948.0,5918174.0,0.11,71,1103 11:14:30
4,7000,7118948.0,5918174.0,0.11,30,1103 11:04:46


In [5]:
train = pd.read_csv('hy_round1_train_20200102/7.csv')

In [6]:
train.head()

Unnamed: 0,渔船ID,x,y,速度,方向,time,type
0,7,6588193.0,5813085.0,0.27,37,1123 23:54:17,围网
1,7,6588193.0,5813085.0,0.05,0,1123 23:44:19,围网
2,7,6588193.0,5813085.0,0.0,0,1123 23:34:19,围网
3,7,6588193.0,5813085.0,0.05,0,1123 23:24:19,围网
4,7,6588193.0,5813085.0,0.0,0,1123 23:14:19,围网


In [7]:
def read_featrue(path, test_mode=False):
    df = pd.read_csv(path)
    df = df.iloc[::-1]
    
    if test_mode:
        df_feat = [df['渔船ID'].iloc[0], df['type'].iloc[0]]
        df = df.drop(['type'], axis=1)
    else:
        df_feat = [df['渔船ID'].iloc[0]]
    
    df['time'] = df['time'].apply(lambda x: datetime.datetime.strptime(x, '%m%d %H:%M:%S'))
    # 下一列 - 上一列
    df_diff = df.diff(1).iloc[1:]
    df_diff['time_seconds'] = df_diff['time'].dt.total_seconds()
    df_diff['dis'] = np.sqrt(df_diff['x']**2 + df_diff['y']**2)
    df_feat.append(df['time'].dt.day.nunique())
    df_feat.append(df['time'].dt.hour.min())
    df_feat.append(df['time'].dt.hour.max())
    df_feat.append(df['time'].dt.hour.value_counts().index[0])
    
    df_feat.append(df['速度'].min())
    df_feat.append(df['速度'].max())
    df_feat.append(df['速度'].mean())
    
    df_feat.append(df_diff['速度'].min())
    df_feat.append(df_diff['速度'].max())
    df_feat.append(df_diff['速度'].mean())
    df_feat.append((df_diff['速度']).mean())
    df_feat.append((df_diff['速度']).mean())
    
    df_feat.append(df_diff['方向'].min())
    df_feat.append(df_diff['方向'].max())
    df_feat.append(df_diff['方向'].mean())
    df_feat.append((df_diff['方向'] > 0).mean())
    df_feat.append((df_diff['方向'] == 0).mean())
    
    
    df_feat.append((df_diff['x'].abs() / df_diff['time_seconds']).min())
    df_feat.append((df_diff['x'].abs() / df_diff['time_seconds']).max())
    df_feat.append((df_diff['x'].abs() / df_diff['time_seconds']).mean())
    df_feat.append((df_diff['x'] > 0).mean())
    df_feat.append((df_diff['x'] == 0).mean())
    
    df_feat.append((df_diff['y'].abs() / df_diff['time_seconds']).min())
    df_feat.append((df_diff['y'].abs() / df_diff['time_seconds']).max())
    df_feat.append((df_diff['y'].abs() / df_diff['time_seconds']).mean())
    df_feat.append((df_diff['y'] > 0).mean())
    df_feat.append((df_diff['y'] == 0).mean())
    
    df_feat.append(df_diff['dis'].min())
    df_feat.append(df_diff['dis'].max())
    df_feat.append(df_diff['dis'].mean())
    
    df_feat.append((df_diff['dis'] / df_diff['time_seconds']).min())
    df_feat.append((df_diff['dis'] / df_diff['time_seconds']).max())
    df_feat.append((df_diff['dis'] / df_diff['time_seconds']).mean())
    
    return df_feat

In [8]:
a = read_featrue('hy_round1_testA_20200102/7000.csv')
type(a)

list

In [9]:
train_feat = Parallel(n_jobs=10)(delayed(read_featrue)(path, True) for path in glob.glob('hy_round1_train_20200102/*')[:])

train_feat = pd.DataFrame(train_feat)

test_feat = Parallel(n_jobs=10)(delayed(read_featrue)(path, False) for path in glob.glob('hy_round1_testA_20200102/*')[:])

test_feat = pd.DataFrame(test_feat)
test_feat = test_feat.sort_values(by=0)

In [10]:
train_feat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,0,拖网,4,0,23,15,0.0,9.39,0.265966,-6.8,...,2.181406,0.029993,0.01937,0.94431,0.0,4745.887438,87.089644,0.0,5.125652,0.13668
1,1,拖网,4,0,23,19,0.0,10.47,1.607922,-3.19,...,4.886008,0.462122,0.153646,0.632812,0.0,5828.114792,494.874699,0.0,5.232657,0.740035
2,10,拖网,4,0,23,23,0.0,10.09,1.313854,-6.8,...,2.536566,0.320474,0.191919,0.608586,0.0,5526.89741,414.501179,0.0,4.694111,0.648485
3,100,拖网,3,0,23,11,0.0,8.69,2.965864,-5.4,...,4.030897,0.413737,0.456098,0.170732,0.0,3266.637624,930.294733,0.0,4.271592,1.477071
4,1000,围网,3,0,23,0,0.0,8.9,2.08557,-5.77,...,3.94312,0.673804,0.31383,0.25266,0.0,10831.412476,679.109667,0.0,4.317332,0.946626


In [11]:
train_feat.shape

(7000, 35)

In [12]:
test_feat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,7000,4,0,23,1,0.0,10.09,1.656139,-9.39,9.12,...,4.994467,0.496579,0.236559,0.497312,0.0,6082.455686,593.535312,0.0,5.089298,0.83755
1,7001,3,0,23,1,0.0,10.09,3.074476,-9.39,9.29,...,126.363692,1.499711,0.396061,0.157549,0.0,13958.643844,995.684493,0.0,126.896762,1.90365
2,7002,3,0,23,11,0.0,10.09,2.985488,-7.23,8.09,...,2.991093,0.867465,0.488998,0.031785,0.0,5690.304305,892.238179,0.0,4.761761,1.410623
3,7003,4,0,23,23,0.0,10.09,1.132212,-9.98,8.47,...,3.723221,0.283304,0.132075,0.709906,0.0,4694.982758,338.245548,0.0,5.461123,0.545136
4,7004,4,0,23,20,0.0,10.09,1.473442,-10.09,9.77,...,4.237397,0.441845,0.115869,0.700252,0.0,5556.659805,441.670678,0.0,7.158958,0.654401


In [13]:
train_feat[1] = train_feat[1].map({'围网':0, '刺网':1, '拖网':2})

In [14]:
test_feat.shape

(2000, 34)

In [19]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

n_fold = 10
skf = StratifiedKFold(n_splits=n_fold, shuffle=True)
eval_fun = f1_score

In [24]:
def run_oof(clf, x_train, y_train, x_test, kf):
    print(clf)
    preds_train = np.zeros((len(x_train), 3), dtype=np.float)
    preds_test = np.zeros((len(x_test), 3), dtype=np.float)
    train_loss = []
    test_loss = []
    
    i = 1
    for train_index, test_index in kf.split(x_train, y_train):
        x_tr = x_train[train_index]; x_te = x_train[test_index]
        y_tr = y_train[train_index]; y_te = y_train[test_index]
        
        clf.fit(x_tr, y_tr, eval_set = [(x_te, y_te)], early_stopping_rounds = 500, verbose = False)
        
        train_loss.append(eval_fun(y_tr, np.argmax(clf.predict_proba(x_tr)[:], 1), average='macro'))
        test_loss.append(eval_fun(y_te, np.argmax(clf.predict_proba(x_te)[:], 1), average='macro'))
        
        preds_train[test_index] = clf.predict_proba(x_te)[:]
        preds_test += clf.predict_proba(x_test)[:]
        
        print('{0}: Train {1:0.7f} Val {2:0.7f}/{3:0.7f}'.format(i, train_loss[-1], test_loss[-1], np.mean(test_loss)))
        print('-' * 50)
        i += 1
    print('Train: ', train_loss)
    print('Val: ', test_loss)
    print('-' * 50)
    print('Train{0:0.5f}_Test{1:0.5f}\n\n'.format(np.mean(train_loss), np.mean(test_loss)))
    preds_test /= n_fold
    return preds_train, preds_test

In [25]:
params = {
    'learning_rate': 0.01,
    'min_child_samples': 5,
    'max_depth': 7,
    'lambda_l1': 2,
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'n_estimators': 2000,
    'metric': 'multi_error',
    'num_class': 3,
    'feature_fraction': .75,
    'bagging_fraction': .85,
    'seed': 99,
    'num_threads': 20,
    'verbose': -1
}


In [26]:
train_pred, test_pred = run_oof(lgb.LGBMClassifier(**params), 
                                train_feat.iloc[:, 2:].values, 
                                train_feat.iloc[:, 1].values, 
                                test_feat.iloc[:, 1:].values, 
                                skf)

LGBMClassifier(bagging_fraction=0.85, boosting='gbdt', boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.75,
               importance_type='split', lambda_l1=2, learning_rate=0.01,
               max_depth=7, metric='multi_error', min_child_samples=5,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=2000,
               n_jobs=-1, num_class=3, num_leaves=31, num_threads=20,
               objective='multiclass', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, seed=99, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0, verbose=-1)
1: Train 0.8902068 Val 0.6296377/0.6296377
--------------------------------------------------
2: Train 0.9402708 Val 0.6379494/0.6337935
--------------------------------------------------
3: Train 0.8956711 Val 0.6535876/0.6403916
--------------------------------------------------
4: Train 0.8944852 Val 0.6786027/0.6499443
---------------

In [27]:
test_feat['label'] = np.argmax(test_pred, 1)
test_feat['label'] = test_feat['label'].map({0:'围网',1:'刺网',2:'拖网'})
test_feat[[0, 'label']].to_csv('baseline.csv',index=None, header=None)