In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [9]:
# Load the raw tracking data. Column names are passed explicitly via `names=`,
# so every line of the CSV is read as a data row.
columns = ['timestamp', 'tag_id', 'x_pos', 'y_pos', 'heading', 'direction', 'energy', 'speed', 'total_distance']
data = pd.read_csv('../data/input/raw/tromso_stromsgodset_first.csv', names=columns)

# Convert the timestamps to seconds elapsed since the first row.
# `.iloc[0]` selects the first row by position rather than by index label.
data['timestamp'] = pd.to_datetime(data['timestamp'])
data['timestamp'] = (data['timestamp'] - data['timestamp'].iloc[0]).dt.total_seconds()

# All tag ids, in order of first appearance.
player_ids = data['tag_id'].unique()

# Mean x/y position of every player, aligned element-for-element with player_ids.
x_means = [data.loc[data['tag_id'] == player_id, 'x_pos'].mean() for player_id in player_ids]
y_means = [data.loc[data['tag_id'] == player_id, 'y_pos'].mean() for player_id in player_ids]

# Heuristics used to drop non-outfield players:
#   * the goalkeeper is the player with the lowest mean x-position
#   * the substitutes are the three players with the highest mean y-position
# Both index sets refer to positions in the ORIGINAL player_ids array, so they
# must be removed in a single np.delete call. Deleting the goalkeeper first
# would shift the later positions and make the substitute indices point at the
# wrong players (or past the end of the array).
goalkeeper_idx = np.argsort(x_means)[:1]
substitute_idx = np.argsort(y_means)[-3:]
player_ids = np.delete(player_ids, np.union1d(goalkeeper_idx, substitute_idx))

# Keep only the rows that belong to the remaining players.
data = data[data['tag_id'].isin(player_ids)]

print(data)

        timestamp  tag_id      x_pos      y_pos   heading  direction  \
0             0.0       2  26.572793  29.435691  0.799873   0.824138   
1             0.0       5  35.550286  30.268062  1.157589  -0.174832   
2             0.0       7  41.586068  38.675045  2.315807  -2.309638   
3             0.0       8  28.506505  39.611611  0.975878   2.526083   
4             0.0       9  32.253800  12.724300  0.238786   0.000000   
...           ...     ...        ...        ...       ...        ...   
623511     2833.0       9  28.875295   9.418390 -0.980497  -0.136579   
623512     2833.0      10  36.042101   9.025376 -0.348861   0.188951   
623514     2833.0      13  25.004687  26.885823  1.178887   2.565675   
623515     2833.0      14  35.959117  17.466859  1.499948   2.121866   
623516     2833.0      15  49.905929  27.317175 -2.147617  -2.927301   

             energy     speed  total_distance  
0        150.661796  0.967681      255.584300  
1        364.308659  0.624163      297.

In [10]:
# Resample the positions to quarter-second windows.
#
# A window [t, t + 0.25) is considered only when t itself occurs as a timestamp
# in the data and is a multiple of 0.25. A window is kept only if every player
# has a non-NaN mean x- and y-position inside it.
#
# Results:
#   timestamp_new : window start times that were kept (order of first appearance)
#   x_new, y_new  : one list per kept window holding the mean position of each
#                   player, ordered like data['tag_id'].unique()
tag_ids = data['tag_id'].unique()
timestamp_quarter = [t for t in data['timestamp'].unique() if t % 0.25 == 0]

# Floor every timestamp to the start of its quarter-second window. Dividing and
# multiplying by 0.25 (a power of two) is exact in floating point, so this is
# the same membership rule as `t <= timestamp < t + 0.25`.
window_start = (np.floor(data['timestamp'] / 0.25) * 0.25).rename('window_start')

# One pass over the data instead of building a boolean mask per (window, player).
window_means = data.groupby([window_start, 'tag_id'])[['x_pos', 'y_pos']].mean()

# Rows = candidate windows, columns = players; combinations without data are NaN.
x_table = window_means['x_pos'].unstack('tag_id').reindex(index=timestamp_quarter, columns=tag_ids)
y_table = window_means['y_pos'].unstack('tag_id').reindex(index=timestamp_quarter, columns=tag_ids)

# NaN never compares equal to itself, so `np.nan in some_list` is not a reliable
# test; check for missing values explicitly instead.
complete = (x_table.notna().all(axis=1) & y_table.notna().all(axis=1)).to_numpy()

timestamp_new = x_table.index[complete].tolist()
x_new = x_table.loc[complete].to_numpy().tolist()
y_new = y_table.loc[complete].to_numpy().tolist()

In [None]:
# Flatten the per-window results into one row per (window, player).
# x_new / y_new hold one list per window with one value per player, so each
# timestamp is repeated once per player and the tag ids are cycled once per
# window. The player count is taken from the data instead of being hard-coded,
# and the timestamps are repeated in place (not re-sorted) so they always stay
# aligned with their position rows.
tag_ids = data['tag_id'].unique()
new_data = pd.DataFrame({
    'timestamp': np.repeat(timestamp_new, len(tag_ids)),
    'tag_id': np.tile(tag_ids, len(timestamp_new)),
    'x_pos': [x_pos for sublist in x_new for x_pos in sublist],
    'y_pos': [y_pos for sublist in y_new for y_pos in sublist],
})

In [None]:
# Preview the first five rows of the resampled frame.
new_data.head(n=5)

In [None]:
# Write the resampled positions to disk; index=False leaves the row index out of the CSV.
output_path = '../data/input/processed/prepped_tromso_stromsgodset_first.csv'
new_data.to_csv(output_path, index=False)