In [9]:
import pandas as pd
import numpy as np
import re

df = pd.read_json('../Data/Serie_A_2014_games.json')

df_transposto = df.transpose()

df_transposto.head()

Unnamed: 0,Home,Away,Result,Players,Goals,Changes,Yellow cards,Red cards
1,Flamengo / RJ,Goiás / GO,0 X 0,[[1Felipe Luiz Felipe Ventura ... T(g)P141698...,[],[45:00 INTFlamengo/RJ 17 - Gabriel Santana Pin...,"[24:00 1T40Mauricio Azevedo Alves Flamengo/RJ,...",[]
2,Fluminense / RJ,Figueirense / SC,3 X 0,"[[12Diego Diego Cavalieri T(g)P137990, Flumine...",[31:00 1T23NRRafael Augusto Sobis Fluminense/R...,[45:00 INTFigueirense/SC 15 - Paulo Roberto da...,[44:00 1T4Thiago Heleno Henrique Ferreira Figu...,[]
3,São Paulo / SP,Botafogo / RJ,3 X 0,"[[1Rogerio Rogerio Ceni T(g)P107639, São Paulo...",[12:00 1T4NRAntonio Carlos dos Santos Aguiar S...,[45:00 INTBotafogo/RJ 23 - Mario Ariel Bolatti...,[30:00 1T6Alvaro Daniel Pereira Barragan São P...,[]
4,Santos / SP,Sport / PE,1 X 1,[[1Aranha Mario Lucio Duarte Costa T(g)P142720...,[24:00 2T9NREuvaldo Jose de Aguiar Neto Sport/...,[10:00 2TSport/PE 15 - Francisco Rithely da Si...,"[42:00 1T8Rodrigo Marcos dos Santos Sport/PE, ...",[]
5,Athletico Paranaense / PR,Grêmio / RS,1 X 0,[[1Aderbar Aderbar Melo dos San ... T(g)P29342...,[15:00 1T55NRDrausio Luis Salla Gil Atletico/PR],[14:00 2TGrêmio/RS 27 - Rodrigo Eduardo Costa ...,[37:00 1T17Paulo Henrique Dias da Cruz Atletic...,[]


Teste inicial utilizando um jogo do brasileirão Série A 2014. Pegamos o primeiro jogo do Fluminense x Figueirense para criarmos a formatação das colunas e extração de informações da súmula.

In [1]:
import pandas as pd
import numpy as np
import re

class FootballDataProcessor:
    def __init__(self, json_path):
        self.df = pd.read_json(json_path)
        self.df_transposed = self.df.transpose()
        self.players_teams = self.df_transposed.iloc[1, 3]
        self.team_changes = self.df_transposed.iloc[1, 5]
        self.home_team = self.df_transposed.iloc[1, 0]
        self.new_df_players = pd.DataFrame(self.players_teams, columns=['player', 'team'])
        self.parsed_changes = []
        self.parsed_goals = []
    
    @staticmethod
    def extract_id(player_string):
        match = re.search(r'T\(g\)?P(\d+)|RP(\d+)|TP(\d+)', player_string, re.IGNORECASE)
        if match:
            return next((m for m in match.groups() if m), None)
        return None
    
    @staticmethod
    def clean_player_name(player):
        clean_name = re.sub(r'\s+T.*|\s+TP.*|\s+RP.*|\s+T(g).*', '', player)
        return clean_name
    
    @staticmethod
    def parse_team_changes(changes):
        pattern = re.compile(r'(\d{2}:\d{2}) (INT|\d+T)([\w\s]+/\w+) (\d+) - [^\d]+ (\d+) - [^\d]+')
        parsed_data = []
        for change in changes:
            match = pattern.search(change)
            if match:
                time, half, team, player_out_number, player_in_number = match.groups()
                team = team.strip()
                parsed_data.append((time, half, team, player_out_number, player_in_number))
        return parsed_data
    
    @staticmethod
    def parse_goals(goals, home_team):
        pattern = re.compile(r'(\d+):00 (\d+T)([\d\w]+)([A-Za-z ]+) ([\w\s/]+)')
        parsed_goals = []
        for goal in goals:
            match = pattern.search(goal)
            if match:
                minute = int(match.group(1))
                half = match.group(2)
                scorer_info = match.group(3)
                player_name = match.group(4).strip()
                team = match.group(5).strip()
                team = team.replace('/', ' / ')
                
                if '2T' in half and minute != 45:
                    minute += 45
                
                if 'CT' in scorer_info:
                    team_status = 'Away' if team == home_team else 'Home'
                else:
                    team_status = 'Home' if team == home_team else 'Away'
                
                parsed_goals.append((minute, team_status))
        return parsed_goals
    
    def process_players(self):
        self.new_df_players['player_id'] = self.new_df_players['player'].apply(self.extract_id)
        self.new_df_players['player_name'] = self.new_df_players['player'].apply(self.clean_player_name)
        self.new_df_players['Minutes Played'] = 90
        self.new_df_players['Minute Entered'] = 0
        self.new_df_players['Minute Exited'] = 90
    
    def process_team_changes(self):
        self.parsed_changes = self.parse_team_changes(self.team_changes)
        
        for time, half, team, player_out_number, player_in_number in self.parsed_changes:
            minute = int(time.split(':')[0])
            team = team.replace('/', ' / ')
            
            minute_entered = 45 + minute if '2T' in half else minute
            minute_exited = minute_entered
            
            mask_in = (
                self.new_df_players['player_name'].apply(
                    lambda x: re.match(r'^' + player_in_number + r'\D', x) is not None
                ) & (self.new_df_players['team'] == team)
            )
            self.new_df_players.loc[mask_in, 'Minute Entered'] = minute_entered
            self.new_df_players.loc[mask_in, 'Minute Exited'] = 90
            
            mask_out = (
                self.new_df_players['player_name'].apply(
                    lambda x: re.match(r'^' + player_out_number + r'\D', x) is not None
                ) & (self.new_df_players['team'] == team)
            )
            self.new_df_players.loc[mask_out, 'Minute Exited'] = minute_exited
        
        self.new_df_players['Minutes Played'] = self.new_df_players['Minute Exited'] - self.new_df_players['Minute Entered']
    
    def process_goals(self):
        self.parsed_goals = self.parse_goals(self.df_transposed.iloc[1, 4], self.home_team)
        
        self.new_df_players['Goals For'] = 0
        self.new_df_players['Goals Against'] = 0
        
        for minute, team in self.parsed_goals:
            mask = (self.new_df_players['Minute Entered'] <= minute) & (self.new_df_players['Minute Exited'] >= minute)
            mask_for = mask & (self.new_df_players['status'] == team)
            self.new_df_players.loc[mask_for, 'Goals For'] += 1
            
            mask_against = mask & (self.new_df_players['status'] != team)
            self.new_df_players.loc[mask_against, 'Goals Against'] += 1
    
    def set_status(self):
        self.new_df_players['status'] = np.where(self.new_df_players['team'] == self.home_team, 'Home', 'Away')
    
    def process(self):
        self.process_players()
        self.process_team_changes()
        self.set_status()
        self.process_goals()
        self.new_df_players = self.new_df_players.drop(columns='player')
        return self.new_df_players


In [4]:
# Exemplo de uso
processor = FootballDataProcessor('../Data/Serie_A_2014_games.json')
final_df = processor.process()
final_df


Unnamed: 0,team,player_id,player_name,Minutes Played,Minute Entered,Minute Exited,status,Goals For,Goals Against
0,Ponte Preta / SP,135494,1Volpato Roberto Volpato Netto,90,0,90,Home,0,2
1,Ponte Preta / SP,298023,2Neilson Neilson dos Santos V ...,90,0,90,Home,0,2
2,Ponte Preta / SP,312663,3Cesar Cesar Henrique Martins,90,0,90,Home,0,2
3,Ponte Preta / SP,162757,4Diego Diego Alessandro Apa ...,90,0,90,Home,0,2
4,Ponte Preta / SP,291874,5Dilsinho Adilson Carlos,45,45,90,Home,0,1
5,Ponte Preta / SP,176461,6Magal Magno Aparecido de A ...,90,0,90,Home,0,2
6,Ponte Preta / SP,181803,7Elton Elton Junior Melo Ataide,90,0,90,Home,0,2
7,Ponte Preta / SP,172039,8Fernando Fernando Paixao da Silva,90,0,90,Home,0,2
8,Ponte Preta / SP,172513,9Alexandro Alexandro da Silva B ...,12,78,90,Home,0,0
9,Ponte Preta / SP,133023,10Adriano Adriano Laaber,18,72,90,Home,0,0
