In [48]:
%matplotlib inline
import datetime
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

### lat,lon -> x,y https://blog.csdn.net/weixin_43428682/article/details/87889753
* 经度：longitude 纬度： latitude
* 椭圆的长半轴: a
* 椭圆的短半轴: b

https://baike.baidu.com/item/%E6%A4%AD%E7%90%83%E5%81%8F%E5%BF%83%E7%8E%87/4944476?fr=aladdin
* 椭圆的第一偏心率: $ e=\frac{\sqrt{a^2-b^2}}{a} $  
* 椭圆的第二偏心率: $ e'=\frac{\sqrt{a^2-b^2}}{b} $

In [2]:
a=6378137.0000
b=6356752.3142
B0=0
lon0=0
def k_val_compute():
    e_=np.sqrt(a**2-b**2)/b
    return a**2*np.cos(B0)/(b*np.sqrt((1+(e_*np.cos(B0))**2)))

def X_unit_trans(lon):
    lon_rad=lon*np.pi/180 #角度-> 弧度
    k=k_val_compute()
    return k*(lon_rad-lon0)

def Y_unit_trans(lat):
    lat_rad=lat*np.pi/180 #角度-> 弧度
    k=k_val_compute()
    e=np.sqrt(a**2-b**2)/a
    dot_val=np.tan(np.pi/4+lat_rad/2)*((1-e*np.sin(lat_rad))/(1+e*np.sin(lat_rad)))**(e/2)                        
    return k*np.log(dot_val)

### Load data

In [3]:
train_path = '../../hy_round2_train_20200225'
test_path = '../../hy_round2_testA_20200225'
train_files = os.listdir(train_path)
test_files = os.listdir(test_path)

In [4]:
ret = []
for file in tqdm(train_files):
    df = pd.read_csv(f'{train_path}/{file}')
    ret.append(df)
df_train = pd.concat(ret)
df_train.columns = ['ship','lat','lon','v','d','time','type']

100%|███████████████████████████████████████████████████████████████████| 8166/8166 [01:58<00:00, 132.08it/s]


In [5]:
ret = []
for file in tqdm(test_files):
    df = pd.read_csv(f'{test_path}/{file}')
    ret.append(df)
df_test = pd.concat(ret)
df_test.columns = ['ship','lat','lon','v','d','time','type']

100%|██████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.23s/it]


### Feature extract

In [65]:
def group_feature(df, key, target, aggs):   
    agg_dict = {}
    for ag in aggs:
        agg_dict[f'{target}_{ag}'] = ag
    print(agg_dict)
    t = df.groupby(key)[target].agg(agg_dict).reset_index()
    return t

def extract_feature(df, train,test_model=False):
    # 基本 统计特征
    t = group_feature(df, 'ship','x',['max','min','mean','std','skew','count'])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','y',['max','min','mean','skew'])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','v',['mean','std','skew'])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','d',['mean','std','sum'])
    train = pd.merge(train, t, on='ship', how='left')
    
    t = group_feature(df, 'ship','lat',['max','min','mean','std','skew','sum'])
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','lon',['max','min','mean','skew'])
    train = pd.merge(train, t, on='ship', how='left')
    # 上下1/4分位数，x y的协方差，相关系数
    t=df.groupby('ship')['x'].agg({'x_25':lambda x: np.percentile(x,25),'x_50':lambda x: np.percentile(x,50)
                                   ,'x_75':lambda x: np.percentile(x,75)})
    train = pd.merge(train, t, on='ship', how='left')
    t=df.groupby('ship')['y'].agg({'y_25':lambda x: np.percentile(x,25),'y_50':lambda x: np.percentile(x,50),
                                   'y_75':lambda x: np.percentile(x,75)})
    train = pd.merge(train, t, on='ship', how='left') 
    t=df.groupby('ship')['v'].agg({'v_25':lambda x: np.percentile(x,25),'v_50':lambda x: np.percentile(x,50),
                                   'v_75':lambda x: np.percentile(x,75)})
    train = pd.merge(train, t, on='ship', how='left')
    t=df.groupby('ship')['d'].agg({'d_50':lambda x: np.percentile(x,50),
                                   'd_75':lambda x: np.percentile(x,75)})
    train = pd.merge(train, t, on='ship', how='left')
    train['xy_cov']=df[['ship','x','y']].groupby('ship').cov().values[::2,1]
    train['xy_corr']=df[['ship','x','y']].groupby('ship').corr().values[::2,1]
    train['lon_lat_cov']=df[['ship','lat','lon']].groupby('ship').cov().values[::2,1]
    train['lon_lat_corr']=df[['ship','lat','lon']].groupby('ship').corr().values[::2,1]
    # x,y 交叉特征
    train['x_max_x_min'] = train['x_max'] - train['x_min']
    train['y_max_y_min'] = train['y_max'] - train['y_min']
    train['y_max_x_min'] = train['y_max'] - train['x_min']
    train['x_max_y_min'] = train['x_max'] - train['y_min']
    train['slope'] = train['y_max_y_min'] / np.where(train['x_max_x_min']==0, 0.001, train['x_max_x_min'])
    train['area'] = train['x_max_x_min'] * train['y_max_y_min']
    # lat,lon 交叉特征
    train['lat_max_lat_min'] = train['lat_max'] - train['lat_min']
    train['lon_max_lat_min'] = train['lon_max'] - train['lat_min']
    train['lon_max_lon_min'] = train['lon_max'] - train['lon_min']
    train['lat_max_lon_min'] = train['lat_max'] - train['lon_min']
    train['lat_lon_slope'] = train['lat_max_lat_min'] / np.where(train['lon_max_lon_min']==0, 0.001, train['lon_max_lon_min'])
    train['lat_lon_area'] = train['lon_max_lon_min'] * train['lat_max_lat_min']
    # 时间特征 
    mode_hour = df.groupby('ship')['hour'].agg(lambda x:x.value_counts().index[0]).to_dict() # 最频繁的时间
    train['mode_hour'] = train['ship'].map(mode_hour)
    t = group_feature(df, 'ship','day',['nunique']) # day nunique
    train = pd.merge(train, t, on='ship', how='left')
    t = df.groupby('ship')['time'].agg({'dif_time':lambda x:np.max(x)-np.min(x)}).reset_index() # 耗时长(以秒计算)
    t['dif_second'] = t['dif_time'].dt.seconds
    train = pd.merge(train, t, on='ship', how='left')
    #
    df_diff=df.groupby('ship')[['x','y','d','v','lon','lat']].apply(lambda x: x.diff(1)[1:])
    df_diff['dis'] = (df_diff['x']**2 + df_diff['y']**2)**0.5
    t= df_diff['dis'].agg({'dis_mean':'mean','dis_max':'max','dis_min':'min'})
    train = pd.merge(train, t, on='ship', how='left')
#     t=(df_diff['x'].apply(lambda x:x.abs()) / df_diff['total_seconds']).agg({'x_sec_mean':'mean','x_sec_max':'max','x_sec_min':'min'})
#     train = pd.merge(train, t, on='ship', how='left')
#     t=(df_diff['y'].apply(lambda x:x.abs()) / df_diff['total_seconds']).agg({'y_sec_mean':'mean','y_sec_max':'max','y_sec_min':'min'})
#     train = pd.merge(train, t, on='ship', how='left')
    t=df_diff['x'].apply(lambda x:1 if x>0 else 0).agg({'x_pos_mean':'mean'})
    train = pd.merge(train, t, on='ship', how='left')
    t=df_diff['y'].apply(lambda x:1 if x>0 else 0).agg({'y_pos_mean':'mean'})
    train = pd.merge(train, t, on='ship', how='left')
    t=df_diff['x'].apply(lambda x:1 if x==0 else 0).agg({'x_zero_mean':'mean'})
    train = pd.merge(train, t, on='ship', how='left')
    t=df_diff['y'].apply(lambda x:1 if x==0 else 0).agg({'y_zero_mean':'mean'})
    train = pd.merge(train, t, on='ship', how='left')
    t=df_diff['v'].apply(lambda x:1 if x==0 else 0).agg({'v_zero_mean':'mean'})
    train = pd.merge(train, t, on='ship', how='left')
    t=df_diff['d'].apply(lambda x: 1 if x==0 else 0).agg({'d_zero_mean':'mean'})
    train = pd.merge(train, t, on='ship', how='left')
    t=df_diff['lon'].apply(lambda x:1 if x==0 else 0).agg({'lon_zero_mean':'mean'})
    train = pd.merge(train, t, on='ship', how='left')
    t=df_diff['lat'].apply(lambda x: 1 if x==0 else 0).agg({'lat_zero_mean':'mean'})
    train = pd.merge(train, t, on='ship', how='left')

    return train

def extract_dt(df):
#     df['total_seconds']=df['time'].dt.total_seconds()
    df['time'] =pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    df['day']=df.time.dt.day
    df['hour']=df.time.dt.hour
    df['second']=df.time.dt.second
    df['minute']=df.time.dt.minute
    
    return df

In [68]:
diff=df_train.groupby('ship')[['x','y','d','v','lon','lat']].apply(lambda x: x.diff(1)[1:])

In [72]:
diff.reset_index()

Unnamed: 0,ship,level_1,x,y,d,v,lon,lat
0,20000,1,-667.916945,-712.677802,-20.0,0.97,-0.006,-0.006
1,20000,2,-667.916945,-831.420255,10.0,-0.21,-0.006,-0.007
2,20000,3,-445.277963,-950.145566,-20.0,0.43,-0.004,-0.008
3,20000,4,-445.277963,-831.334505,-10.0,-0.43,-0.004,-0.007
4,20000,5,-556.597454,-950.047609,20.0,0.00,-0.005,-0.008
5,20000,6,-556.597454,-831.248830,-10.0,0.10,-0.005,-0.007
6,20000,7,-445.277963,-831.208874,0.0,-1.40,-0.004,-0.007
7,20000,8,-556.597454,-831.168934,20.0,0.43,-0.005,-0.007
8,20000,9,-556.597454,-712.398739,-10.0,0.16,-0.005,-0.006
9,20000,10,-556.597454,-712.369418,0.0,0.44,-0.005,-0.006


### lat,lon -> x,y

In [7]:
df_train['x']=df_train.lon.apply(X_unit_trans)
df_train['y']=df_train.lat.apply(Y_unit_trans)

df_test['x']=df_test.lon.apply(X_unit_trans)
df_test['y']=df_test.lat.apply(Y_unit_trans)

In [8]:
train_label = df_train.drop_duplicates('ship')
test_label = df_test.drop_duplicates('ship')

In [9]:
train_label.head()

Unnamed: 0,ship,lat,lon,v,d,time,type,x,y
0,20000,21.295,115.563,2.32,50,0912 23:59:55,拖网,12864410.0,2411578.0
0,20001,25.172,119.131,0.05,0,0930 23:53:42,围网,13261600.0,2878718.0
0,20002,27.287,120.261,0.0,20,1027 11:51:43,刺网,13387390.0,3139791.0
0,20003,23.125,118.315,0.27,0,1012 23:54:28,拖网,13170770.0,2630367.0
0,20004,26.068,119.51,0.22,91,1012 23:55:19,刺网,13303790.0,2988734.0


In [10]:
train_label['type'].value_counts(1)

围网    0.433627
拖网    0.403135
刺网    0.163238
Name: type, dtype: float64

In [11]:
type_map = dict(zip(train_label['type'].unique(), np.arange(3)))
type_map_rev = {v:k for k,v in type_map.items()}
train_label['type'] = train_label['type'].map(type_map)

In [66]:
train_label =  extract_feature(extract_dt(df_train), train_label) 

{'x_max': 'max', 'x_min': 'min', 'x_mean': 'mean', 'x_std': 'std', 'x_skew': 'skew', 'x_count': 'count'}
{'y_max': 'max', 'y_min': 'min', 'y_mean': 'mean', 'y_skew': 'skew'}
{'v_mean': 'mean', 'v_std': 'std', 'v_skew': 'skew'}
{'d_mean': 'mean', 'd_std': 'std', 'd_sum': 'sum'}
{'lat_max': 'max', 'lat_min': 'min', 'lat_mean': 'mean', 'lat_std': 'std', 'lat_skew': 'skew', 'lat_sum': 'sum'}
{'lon_max': 'max', 'lon_min': 'min', 'lon_mean': 'mean', 'lon_skew': 'skew'}
{'day_nunique': 'nunique'}


KeyError: 'ship'

In [None]:
test_label =  extract_feature(extract_dt(df_test), test_label,test_model=True) 

In [None]:
features = [x for x in train_label.columns if x not in ['ship','type','time','dif_time',                                                      
                                                       ]]
target = 'type'

In [None]:
print(len(features),len(train_label.columns), ','.join(features))

### How to train? 

- Lightgbm 单模 
- 总共提取了60个特征，丢弃一些不重要特征，最终34个特征用于训练
- Way: [OOF](https://stackoverflow.com/questions/52396191/what-is-oof-approach-in-machine-learning).10 folds cross-validation

In [None]:
lgb_params = {
    'learning_rate' : 0.03,
    'n_estimators': 1000,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 3,
    'early_stopping_rounds': 100,
}

In [None]:
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X = train_label[features].copy()
y = train_label[target]
X_test=test_label[features].copy()
models = []
pred = np.zeros((len(test_label),3))
oof = np.zeros((len(X), 3))

for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):

    train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

    model = lgb.train(lgb_params, train_set, valid_sets=[train_set, val_set], verbose_eval=100)
    models.append(model)
    val_pred = model.predict(X.iloc[val_idx])
    oof[val_idx] = val_pred
    val_y = y[val_idx]
    val_pred = np.argmax(val_pred, axis=1)
    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))

    test_pred = model.predict(X_test)
    pred += test_pred/fold.n_splits

In [None]:
oof = np.argmax(oof, axis=1)
print('oof f1', metrics.f1_score(oof, y, average='macro'))
# oof f1 0.8901439764950653 5folds  oof f1 0.8925009171023909  37-features

### Write submission

In [None]:
pred = np.argmax(pred, axis=1)
sub = test_label[['ship']]
sub['pred'] = pred

print(sub['pred'].value_counts(1))
sub['pred'] = sub['pred'].map(type_map_rev)
sub.to_csv('result.csv', index=None, header=None)   

### Features importance

In [None]:
ret = []
for index, model in enumerate(models):
    df = pd.DataFrame()
    df['name'] = model.feature_name()
    df['score'] = model.feature_importance()
    df['fold'] = index
    ret.append(df)
    
df = pd.concat(ret)

In [None]:
df = df.groupby('name', as_index=False)['score'].mean()
df = df.sort_values(['score'], ascending=False)

In [None]:
df