In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
#original dataset
data = pd.read_csv('../data/raw/all_coordinates-45min.csv') 
print(len(data))


In [81]:
#remove columns with no value

import ast

for col in [col for col in data.columns if col.startswith('team_0') or col.startswith('team_1')]:
    data[col] = data[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

t0 = [col for col in data.columns if col.startswith('team_0')]
t1 = [col for col in data.columns if col.startswith('team_1')]

def algum_array_vazio(row, cols):
    for col in cols:
        val = row[col]
        if isinstance(val, list):
            if len(val) == 0:
                return True
        elif pd.isnull(val):
            return True
    return False

mask_t0 = data.apply(lambda row: algum_array_vazio(row, t0), axis=1)
mask_t1 = data.apply(lambda row: algum_array_vazio(row, t1), axis=1)

# Elimina linhas onde qualquer array de t0 ou t1 está vazio
data_prep = data[~(mask_t0 | mask_t1)]

print(len(data_prep))

593


In [82]:
#split coords
data_prep[['ball_x', 'ball_y']] = data_prep['ball_coords'].str.extract(r'\((.*), (.*)\)').astype(float)
data_prep = data_prep.drop(columns=['ball_coords'])

data_prep[['gk_t1_coords_x', 'gk_t1_coords_y']] = data_prep['gk_t1_coords'].str.extract(r'\((.*), (.*)\)').astype(float)
data_prep = data_prep.drop(columns=['gk_t1_coords'])

data_prep[['gk_t2_coords_x', 'gk_t2_coords_y']] = data_prep['gk_t2_coords'].str.extract(r'\((.*), (.*)\)').astype(float)
data_prep = data_prep.drop(columns=['gk_t2_coords'])

# Expand team_0_players and team_1_players into columns
for team in [0, 1]:
    player_col = f'team_{team}_players'
    # Each row is a list of (x, y) tuples
    coords = pd.DataFrame(data_prep[player_col].tolist(), columns=[f'player_{i+1}' for i in range(13)])
    for i in range(13):
        data_prep[f'team_{team}_player_{i+1}_x'] = coords[f'player_{i+1}'].apply(lambda x: x[0] if isinstance(x, (list, tuple)) else np.nan)
        data_prep[f'team_{team}_player_{i+1}_y'] = coords[f'player_{i+1}'].apply(lambda x: x[1] if isinstance(x, (list, tuple)) else np.nan)
    data_prep = data_prep.drop(columns=[player_col])

data_prep.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prep[['ball_x', 'ball_y']] = data_prep['ball_coords'].str.extract(r'\((.*), (.*)\)').astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prep[['ball_x', 'ball_y']] = data_prep['ball_coords'].str.extract(r'\((.*), (.*)\)').astype(float)


Unnamed: 0,frame_index,ball_x,ball_y,gk_t1_coords_x,gk_t1_coords_y,gk_t2_coords_x,gk_t2_coords_y,team_0_player_1_x,team_0_player_1_y,team_0_player_2_x,...,team_1_player_9_x,team_1_player_9_y,team_1_player_10_x,team_1_player_10_y,team_1_player_11_x,team_1_player_11_y,team_1_player_12_x,team_1_player_12_y,team_1_player_13_x,team_1_player_13_y
0,0,,,,,,,803.5,634.5,1850.5,...,475.0,955.0,,,,,,,,
1,1,891.5,209.0,,,,,1128.0,322.0,1214.0,...,,,,,,,,,,
2,2,129.5,595.5,,,,,1110.0,402.0,1551.0,...,936.5,235.0,,,,,,,,
3,3,,,,,,,1132.0,614.5,1787.5,...,1585.0,342.0,1003.0,237.5,,,,,,
4,4,1778.5,756.0,115.5,319.0,1314.5,246.0,138.5,847.0,1362.0,...,,,,,,,,,,


In [86]:
#format coords to be in 120x70

# Define scaling factors
x_scale = 120 / 1920
y_scale = 70 / 1024

# List all columns that are x or y coordinates
x_cols = [col for col in data_prep.columns if col.endswith('_x')]
y_cols = [col for col in data_prep.columns if col.endswith('_y')]

# Scale x and y columns
data_prep[x_cols] = data_prep[x_cols] * x_scale
data_prep[y_cols] = data_prep[y_cols] * y_scale

In [87]:
# Encontrar grupos de frame_index consecutivos
data_prep = data_prep.sort_values('frame_index').reset_index(drop=True)
groups = (data_prep['frame_index'].diff(1) != 1).cumsum()

# Criar uma lista de DataFrames, cada um com uma sequência de frame_index consecutivos
subsets = [group for _, group in data_prep.groupby(groups)]

# Exemplo: mostrar o tamanho e o range de frame_index de cada subset
for i, subset in enumerate(subsets):
    if len(subset) > 15:
        subset.to_csv(f'../data/raw/subset_{i+1}.csv', index=False)
        print(f"Subset {i+1} saved as subset_{i+1}.csv")

Subset 1 saved as subset_1.csv
Subset 2 saved as subset_2.csv
Subset 8 saved as subset_8.csv
Subset 25 saved as subset_25.csv
Subset 30 saved as subset_30.csv
Subset 31 saved as subset_31.csv
Subset 36 saved as subset_36.csv
Subset 39 saved as subset_39.csv
Subset 44 saved as subset_44.csv
Subset 45 saved as subset_45.csv
Subset 55 saved as subset_55.csv
Subset 70 saved as subset_70.csv
