## ForVizorの実装

[The DEBS 2013 Grand Challenge](https://www2.informatik.uni-erlangen.de/publication/download/DEBS2013b.pdf)で提案されているデータセットを使用。  
[ForVizor](https://www.semanticscholar.org/paper/ForVizor%3A-Visualizing-Spatio-Temporal-Team-in-Wu-Xie/d72e58f168fd3df4ff40c045db2b57d60a638912)と[Large-Scale Analysis of Soccer Matches using Spatiotemporal Tracking Data](http://www.yisongyue.com/publications/icdm2014_soccer_formation.pdf)を参考にしながら解析を進める。  
このデータの解析の目的は以下の2つである。  
1. フォーメーションの時間的変化を解析、可視化する
2. 守備時のフォーメーションを平均的な矢印の向きとともに可視化する

#### 必要なライブラリのインポート

In [16]:
import os, random, time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import japanize_matplotlib

from scipy.stats import multivariate_normal, gaussian_kde, zscore, entropy
from munkres import Munkres

#### データの読み込み

In [17]:
# Location of the raw sensor dump; it is read as a header-less CSV below.
infile = os.path.join('.', 'full-game')

# Total number of rows in the dump. Hard-coded so the file does not have to
# be scanned just to count lines.
n = 49576080

# Knobs for loading only part of the file while experimenting:
#   * n_rows could be passed as `nrows=` to read only the first half,
#   * p could drive a random `skiprows=` callable to keep ~1% of the rows.
# Neither is applied here: the whole file is loaded.
p = 0.01
n_rows = n // 2

df = pd.read_csv(infile, header=None)

# The file has no header row, so name the 13 columns explicitly.
df.columns = [
    'sid', 'ts',
    'x', 'y', 'z',
    '|v|', '|a|',
    'vx', 'vy', 'vz',
    'ax', 'ay', 'az',
]

# Summary statistics for every column except the sensor id.
df.iloc[:, 1:].describe()

Unnamed: 0,ts,x,y,z,|v|,|a|,vx,vy,vz,ax,ay,az
count,49576080.0,49576080.0,49576080.0,49576080.0,49576080.0,49576080.0,49576080.0,49576080.0,49576080.0,49576080.0,49576080.0,49576080.0
mean,1.286534e+16,18761.72,-359.5071,12.43526,1730069.0,10266350.0,-27.35336,-26.60484,-208.0428,2.175055,-2.280792,-13.26106
std,1336397000000000.0,19796.24,22018.71,553.1649,2780320.0,19994300.0,5883.493,6132.355,5266.312,6395.994,6517.742,4075.545
min,1.062934e+16,-23010.0,-62545.0,-13675.0,0.0,0.0,-32768.0,-32768.0,-32768.0,-32768.0,-32768.0,-32768.0
25%,1.15944e+16,-5092.0,-15688.0,-279.0,155653.0,3303185.0,-4954.0,-5405.0,-3703.0,-5958.0,-6197.0,-2385.0
50%,1.322602e+16,22150.0,-269.0,-35.0,491350.0,6144118.0,-16.0,-15.0,-58.0,2.0,3.0,7.0
75%,1.406254e+16,31203.0,15902.0,231.0,2299802.0,11876240.0,4864.0,5325.0,2940.0,5966.0,6188.0,2382.0
max,1.489395e+16,64513.0,56066.0,18215.0,65490510.0,1327792000.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0


#### データの理解（センサーID）

In [18]:
# Distinct sensor ids present in the data, reported unsorted-count first and
# then as a sorted plain list.
unique_sids = df['sid'].unique()
print('センサー数 :{}'.format(len(unique_sids)))
sid_list = np.sort(unique_sids).tolist()
print('センサーID :{}'.format(sid_list))

# Sensor ids for the referee, the balls and each player.
# Referee sensors.
left_r, right_r = 105, 106

# Ball sensors (12 is used in the second half only).
ball_list = [4, 8, 10, 12]

# Player sensors for team A. The left/right lists are paired by position:
# entry k of each list belongs to the same player.
left_a_list = [13, 47, 49, 19, 53, 23, 57, 59]
right_a_list = [14, 16, 88, 52, 54, 24, 58, 28]

# Player sensors for team B, paired the same way.
left_b_list = [61, 63, 65, 67, 69, 71, 73, 75]
right_b_list = [62, 64, 66, 68, 38, 40, 74, 44]

センサー数 :42
センサーID :[4, 8, 10, 12, 13, 14, 16, 19, 23, 24, 28, 38, 40, 44, 47, 49, 52, 53, 54, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 73, 74, 75, 88, 97, 98, 99, 100, 105, 106]


#### データの理解（タイムスタンプ）
タイムスタンプは17桁で、秒の小数部が12桁ある。

In [19]:
# Number of distinct raw timestamps in the data.
t_array = np.sort(df['ts'].unique())
print('タイムスタンプ数: {}'.format(len(t_array)))

# Convert the raw integer timestamps (12 fractional digits of a second) to
# float seconds. This is done with vectorized integer arithmetic instead of
# slicing the decimal string of every row: the whole-second part and the
# sub-second remainder are each exactly representable as floats, so only the
# final division and sum round. It also no longer assumes that every
# timestamp has exactly 17 digits.
# NOTE(review): values may differ from the string-based conversion in the
# last bit for some rows; confirm that is acceptable downstream.
ticks_per_second = 10**12
whole_seconds = df['ts'] // ticks_per_second
sub_second = (df['ts'] % ticks_per_second) / ticks_per_second
df['ts'] = whole_seconds + sub_second

# Start/end of the first and second half, in seconds on the raw clock.
st_1st = 10753.295594424116
ed_1st = 12557.295594424116
st_2nd = 13086.639146403495
ed_2nd = 14879.639146403495

# Keep everything from the start of the first half up to (not including) the
# end of the second half, then re-base time so the first half starts at 0.
# NOTE(review): this range also keeps the samples between ed_1st and st_2nd.
in_match = (st_1st <= df.ts) & (df.ts < ed_2nd)
df = df[in_match]
df['ts'] -= st_1st

print('センサー数 :{}'.format(len(df['sid'].unique())))

タイムスタンプ数: 49576080
センサー数 :42


#### データを吐き出す

In [41]:
from ipywidgets import FloatProgress
from IPython.display import display

# Split the first part of the match into fixed-length windows, decide which
# team holds the ball in each window, and collect the mean (x, y) of every
# player into per-team "offense" / "defense" frame lists.

dt = 0.1                   # window length, in seconds
threshold_pass = 0.5e+7    # mean ball |v| above this is treated as a pass in flight
threshold_distance = 2e+3  # max ball-player distance to accept a new holder mid-pass

# Only the first half of the first period is processed here.
T = (ed_1st-st_1st)/2

fp = FloatProgress(min=0, max=int(T/dt))
print(T/dt)
display(fp)

# Team 0 is "a", team 1 is "b"; both leg sensors of a player share one slot.
team_sensor_lists = [(left_a_list, right_a_list), (left_b_list, right_b_list)]
n_teams = len(team_sensor_lists)
n_players = len(left_a_list)

sid_to_slot = {}
for team_idx, (left_list, right_list) in enumerate(team_sensor_lists):
    for player_idx, (left, right) in enumerate(zip(left_list, right_list)):
        slot = team_idx * n_players + player_idx
        sid_to_slot[left] = slot
        sid_to_slot[right] = slot
slot_index = list(range(n_teams * n_players))

# Extract the ball and player rows once, sorted by time, so that each window
# becomes a cheap positional slice instead of a boolean scan of the full frame.
in_range = df.ts < T + 2 * dt
ball_rows = df.loc[in_range & df.sid.isin(ball_list), ['ts', 'x', 'y', '|v|']].sort_values('ts')
player_rows = df.loc[in_range & df.sid.isin(list(sid_to_slot)), ['ts', 'sid', 'x', 'y']].sort_values('ts')
player_rows = player_rows.assign(slot=player_rows['sid'].map(sid_to_slot))
ball_ts = ball_rows['ts'].values
player_ts = player_rows['ts'].values


def _window(frame, ts_values, start, stop):
    """Return the rows of time-sorted *frame* with start <= ts < stop."""
    lo, hi = np.searchsorted(ts_values, [start, stop], side='left')
    return frame.iloc[lo:hi]


st, ed = 0, dt
data_a_of_list, data_a_df_list, data_b_of_list, data_b_df_list = [], [], [], []
t = 0

# Holder (team index, player index) decided in the previous window, if any.
pre_tid, pre_pid = None, None

while st < T:
    start_time = time.time()

    # Mean ball position and speed over the window.
    # NOTE(review): this pools every sensor in ball_list, as before; confirm
    # that balls not in play do not distort the mean.
    ball_features = _window(ball_rows, ball_ts, st, ed)[['x', 'y', '|v|']].mean().values

    # Mean (x, y) per player, pooled over both of the player's sensors.
    # Shape (n_teams, n_players, 2); players without samples are NaN.
    data = (_window(player_rows, player_ts, st, ed)
            .groupby('slot')[['x', 'y']].mean()
            .reindex(slot_index).values
            .reshape(n_teams, n_players, 2))

    # Distance from every player to the ball, shape (n_teams, n_players).
    d_array = np.linalg.norm(data - ball_features[:2], axis=2)

    if np.isfinite(d_array).any():
        # Closest player with a valid distance; NaN entries are ignored so a
        # player without samples is never picked as the holder.
        tid, pid = np.unravel_index(np.nanargmin(d_array), d_array.shape)
        min_d = d_array[tid, pid]

        # While the ball is moving fast, possession stays with the previous
        # team; the holder only changes to a nearby player of that same team.
        if ball_features[-1] > threshold_pass and pre_tid is not None:
            pid = pid if min_d < threshold_distance and tid == pre_tid else pre_pid
            tid = pre_tid
    else:
        # No usable distance in this window: keep the previous possession.
        tid, pid = pre_tid, pre_pid

    # Windows before any possession could be determined are skipped.
    if tid is not None:
        pre_tid, pre_pid = tid, pid
        # Possession is decided by the team index, not the player index.
        if tid == 0:
            data_a_of_list.append(data[0].tolist())
            data_b_df_list.append(data[1].tolist())
        else:
            data_a_df_list.append(data[0].tolist())
            data_b_of_list.append(data[1].tolist())

    st += dt; ed += dt
    print(time.time()-start_time)
    fp.value = t
    t += 1

9020.0


FloatProgress(value=0.0, max=9020.0)

KeyboardInterrupt: 

In [26]:
# Write one CSV per (team, role): each row is one frame holding the
# per-frame standardized (x, y) of the 8 players, flattened to 16 values.
team_list = ['a', 'b']
name_list = ['of', 'df']
n_players, n_coords = 8, 2

# np.savetxt does not create missing directories, so make sure it exists.
out_dir = '_csv'
os.makedirs(out_dir, exist_ok=True)

for i, data_list_list in enumerate(zip([data_a_of_list, data_b_of_list], [data_a_df_list, data_b_df_list])):
    for j, data_list in enumerate(data_list_list):
        data_array = np.array(data_list, dtype=float).reshape(-1, n_players, n_coords)

        # Drop every frame in which any player coordinate is missing.
        index_list = np.isnan(data_array).any(axis=(1, 2))
        data_array = data_array[~index_list]

        # Standardize each frame across its players (per coordinate);
        # skipped when no frame is left so an empty file is still written.
        if data_array.size:
            data_array = zscore(data_array, axis=1)

        out_path = os.path.join(out_dir, '{}_1st_1_{}.csv'.format(team_list[i], name_list[j]))
        np.savetxt(out_path, data_array.reshape(-1, n_players*n_coords), delimiter=',')

0 0 (0,)
0 1 (11, 8, 2)
1 0 (11, 8, 2)
1 1 (0,)


'\ndata_array = np.array(data_list).transpose(1,0,2)\nindex_list = np.isnan(data_array.reshape(-1, 8*2)).any(axis=1)\ndata_array = data_array[~index_list].reshape(-1, 8, 2)\n'