# Retrieve All PDF File Paths from a Directory and Subdirectories

In [4]:
import os

# Function to retrieve paths of all PDF files
def get_all_pdf_files(root_folder):
    pdf_files = []
    # Traverse the directory and its subdirectories to find PDF files
    for folder_name, subfolders, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith('.pdf'):
                pdf_files.append(os.path.join(folder_name, filename))
    return pdf_files

# Example usage
os.getcwd()
root_folder = os.path.join(os.getcwd(), "match-reports")
pdf_files = get_all_pdf_files(root_folder)
print(f"A total of {len(pdf_files)} PDF files were found.")

A total of 184 PDF files were found.


# Match Basic Information

In [7]:
import os
import pdfplumber
import pandas as pd
import re

# Get a list of all PDF files from the specified folder and its subfolders
def get_all_pdf_files(root_folder):
    pdf_files = []
    for folder_name, subfolders, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith('.pdf'):
                pdf_files.append(os.path.join(folder_name, filename))
    return pdf_files

# Extract team, score, and date information from a PDF file
def extract_match_data(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        # Extract data from the second page (index starts at 0)
        page = pdf.pages[1]  # Select the second page
        text = page.extract_text()
        
        # Split the text into lines
        lines = text.split('\n')

        # Extract team names and scores (find the line containing '–')
        match_line = [line for line in lines if '–' in line][0]
        teams_and_scores = match_line.strip()

        # Split team names and scores using regex
        match_parts = re.split(r' (\d+) – (\d+) ', teams_and_scores)
        team1 = match_parts[0].strip().replace("MATCH SHEET", "").strip()  # Remove "MATCH SHEET"
        score1 = int(match_parts[1])
        score2 = int(match_parts[2])
        team2 = match_parts[3].strip().replace("MATCH SHEET", "").strip()  # Remove "MATCH SHEET"

        # Extract the date (find the line enclosed in parentheses)
        date_line = [line for line in lines if '(' in line][0]
        match_date = date_line.split('(')[-1].replace(')', '').strip()

        return {
            "file": pdf_path,  # Path to the PDF file
            "team1": team1,    # Name of the first team
            "team2": team2,    # Name of the second team
            "score1": score1,  # Score of the first team
            "score2": score2,  # Score of the second team
            "date": match_date # Match date
        }

# Process all PDF files in the folder
def process_all_pdfs(root_folder):
    pdf_files = get_all_pdf_files(root_folder)
    all_data = []

    for pdf_file in pdf_files:
        try:
            data = extract_match_data(pdf_file)
            all_data.append(data)
        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")
    
    return all_data

# Example usage
root_folder = os.path.join(os.getcwd(), "match-reports")
all_pdf_data = process_all_pdfs(root_folder)

# Convert to DataFrame
df = pd.DataFrame(all_pdf_data)

# Save to CSV
df.to_csv("match_data_cleaned.csv", index=False, encoding='utf-8-sig')

# Display the top 5 rows
print(df.head())

                                                file             team1  \
0  c:\Users\John\Desktop\Hackathon 2024\hackathon...   Butler Bulldogs   
1  c:\Users\John\Desktop\Hackathon 2024\hackathon...  Indiana Hoosiers   
2  c:\Users\John\Desktop\Hackathon 2024\hackathon...  Indiana Hoosiers   
3  c:\Users\John\Desktop\Hackathon 2024\hackathon...  Indiana Hoosiers   
4  c:\Users\John\Desktop\Hackathon 2024\hackathon...  Indiana Hoosiers   

                             team2  score1  score2        date  
0                 Indiana Hoosiers       1       1  05.09.2024  
1                    Dayton Flyers       0       2  10.09.2024  
2           Evansville Purple Aces       1       1  18.09.2024  
3                Kentucky Wildcats       3       2  09.10.2024  
4  Maryland College Park Terrapins       0       1  21.09.2024  


# Substitution Records

In [8]:
import os
import pdfplumber
import pandas as pd
import re

# Function to get all PDF files from the specified folder and its subfolders
def get_all_pdf_files(root_folder):
    pdf_files = []
    for folder_name, subfolders, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith('.pdf'):
                pdf_files.append(os.path.join(folder_name, filename))
    return pdf_files

# Function to parse the starting lineup and substitutes from a PDF
def parse_lineup_and_substitutes(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[1]  # Select the 2nd page
        text = page.extract_text()

        # Extract "Starting Lineup" and "Substitutes" sections
        starting_lineup_match = re.search(r"Starting lineup\n(.*?)\nSubstitutes", text, re.S)
        substitutes_match = re.search(r"Substitutes\n(.*?)\nCoaches", text, re.S)

        # Parse each section
        starting_lineup_data = parse_players(starting_lineup_match.group(1)) if starting_lineup_match else []
        substitutes_data = parse_players(substitutes_match.group(1)) if substitutes_match else []

        # Convert to DataFrame
        starting_lineup_df = pd.DataFrame(starting_lineup_data)
        substitutes_df = pd.DataFrame(substitutes_data)

        return starting_lineup_df, substitutes_df

# Function to parse player data with support for multiple in/out times
def parse_players(section_text):
    pattern = r'(\w+)\s+(\d+)\s+([A-Z]\.\s*[A-Za-z]+)(?:\s*((?:\d+(?:\+\d+)?\')*(?:\s+\d+(?:\+\d+)?\')*))'
    matches = re.findall(pattern, section_text)
    
    data = []
    for match in matches:
        position, number, player, times = match
        
        time_pattern = r'(\d+(?:\+\d+)?)\''
        times_list = re.findall(time_pattern, times)
        
        player_data = {
            'Position': position,
            'Number': number,
            'Player': player,
            'In': None,
            'Out': None,
            'In2': None,
            'Out2': None,
            'In3': None
        }
        
        if times_list:
            if len(times_list) >= 1:
                player_data['In'] = times_list[0] + "'"
            if len(times_list) >= 2:
                player_data['Out'] = times_list[1] + "'"
            if len(times_list) >= 3:
                player_data['In2'] = times_list[2] + "'"
            if len(times_list) >= 4:
                player_data['Out2'] = times_list[3] + "'"
            if len(times_list) >= 5:
                player_data['In3'] = times_list[4] + "'"
        
        data.append(player_data)
    
    return data

# Function to process all PDFs in the folder
def process_all_pdfs(root_folder):
    pdf_files = get_all_pdf_files(root_folder)
    all_starting_lineups = []
    all_substitutes = []

    for pdf_file in pdf_files:
        try:
            starting_lineup_df, substitutes_df = parse_lineup_and_substitutes(pdf_file)
            # Add a column to indicate the file source
            starting_lineup_df['file'] = pdf_file
            substitutes_df['file'] = pdf_file
            all_starting_lineups.append(starting_lineup_df)
            all_substitutes.append(substitutes_df)
        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")

    # Combine all data into single DataFrames
    combined_starting_lineups = pd.concat(all_starting_lineups, ignore_index=True)
    combined_substitutes = pd.concat(all_substitutes, ignore_index=True)
    
    return combined_starting_lineups, combined_substitutes

# Example usage
root_folder = os.path.join(os.getcwd(), "match-reports")
starting_lineups_df, substitutes_df = process_all_pdfs(root_folder)

# Display results
print("Starting Lineups:")
print(starting_lineups_df.head())
print("\nSubstitutes:")
print(substitutes_df.head())

Starting Lineups:
  Position Number         Player    In   Out   In2  Out2   In3  \
0       GK      1      C. Norris  None  None  None  None  None   
1       GK      1       J. Harms  None  None  None  None  None   
2       RB     17    H. Kumwenda   44'  None  None  None  None   
3       RB      2      Q. Elliot  None  None  None  None  None   
4      RCB      3  V. Verkooijen  None  None  None  None  None   

                                                file  
0  c:\Users\John\Desktop\Hackathon 2024\hackathon...  
1  c:\Users\John\Desktop\Hackathon 2024\hackathon...  
2  c:\Users\John\Desktop\Hackathon 2024\hackathon...  
3  c:\Users\John\Desktop\Hackathon 2024\hackathon...  
4  c:\Users\John\Desktop\Hackathon 2024\hackathon...  

Substitutes:
  Position Number      Player   In  Out   In2  Out2   In3  \
0     RAMF      7    N. Okoro  31'  46'   49'   59'   85'   
1       CF     12    M. Nesci  30'  46'   82'  None  None   
2     LAMF     27     L. Raso  39'  46'  None  None  None 

### Extract Match information, Starting line up and substitutes data as CSV

In [None]:
import os
import pdfplumber
import pandas as pd
import re

# Get a list of all PDF files from the specified folder and its subfolders
def get_all_pdf_files(root_folder):
    pdf_files = []
    for folder_name, subfolders, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith('.pdf'):
                pdf_files.append(os.path.join(folder_name, filename))
    return pdf_files

# Extract match information: team names, scores, and date
def extract_match_data(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[1]  # Second page
        text = page.extract_text()
        
        # Extract teams and scores
        match_line = [line for line in text.split('\n') if '–' in line][0]
        match_parts = re.split(r' (\d+) – (\d+) ', match_line)
        team1 = match_parts[0].strip().replace("MATCH SHEET", "").strip()  # Remove "MATCH SHEET"
        score1 = int(match_parts[1])
        score2 = int(match_parts[2])
        team2 = match_parts[3].strip().replace("MATCH SHEET", "").strip()  # Remove "MATCH SHEET"
        
        # Extract match date
        date_line = [line for line in text.split('\n') if '(' in line][0]
        match_date = date_line.split('(')[-1].replace(')', '').strip()

        return {
            "team1": team1,
            "score1": score1,
            "score2": score2,
            "team2": team2,
            "date": match_date
        }

# Extract starting lineup and substitutes
def parse_lineup_and_substitutes(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[1]  # Second page
        text = page.extract_text()

        # Extract "Starting Lineup" and "Substitutes" sections
        starting_lineup_match = re.search(r"Starting lineup\n(.*?)\nSubstitutes", text, re.S)
        substitutes_match = re.search(r"Substitutes\n(.*?)\nCoaches", text, re.S)

        # Parse each section
        starting_lineup_data = parse_players(starting_lineup_match.group(1)) if starting_lineup_match else []
        substitutes_data = parse_players(substitutes_match.group(1)) if substitutes_match else []

        return pd.DataFrame(starting_lineup_data), pd.DataFrame(substitutes_data)

# Parse player data with support for multiple in/out times
def parse_players(section_text):
    pattern = r'(\w+)\s+(\d+)\s+([A-Z]\.\s*[A-Za-z]+)(?:\s*((?:\d+(?:\+\d+)?\')*(?:\s+\d+(?:\+\d+)?\')*))'
    matches = re.findall(pattern, section_text)
    
    data = []
    for match in matches:
        position, number, player, times = match
        times_list = re.findall(r'(\d+(?:\+\d+)?)\'', times)
        
        player_data = {
            'Position': position,
            'Number': number,
            'Player': player,
            'In': times_list[0] + "'" if len(times_list) > 0 else None,
            'Out': times_list[1] + "'" if len(times_list) > 1 else None,
            'In2': times_list[2] + "'" if len(times_list) > 2 else None,
            'Out2': times_list[3] + "'" if len(times_list) > 3 else None,
            'In3': times_list[4] + "'" if len(times_list) > 4 else None,
        }
        data.append(player_data)
    return data

# Process all PDF files in the folder
def process_all_pdfs(root_folder):
    pdf_files = get_all_pdf_files(root_folder)
    match_data = []
    all_starting_lineups = []
    all_substitutes = []

    for pdf_file in pdf_files:
        try:
            # Extract match information
            match_info = extract_match_data(pdf_file)
            match_info["file"] = pdf_file  # Add file path for reference
            
            # Extract lineup and substitutes
            starting_lineup_df, substitutes_df = parse_lineup_and_substitutes(pdf_file)
            starting_lineup_df['file'] = pdf_file
            substitutes_df['file'] = pdf_file
            
            # Add match information to each player row
            for key, value in match_info.items():
                starting_lineup_df[key] = value
                substitutes_df[key] = value
            
            # Append to results
            match_data.append(match_info)
            all_starting_lineups.append(starting_lineup_df)
            all_substitutes.append(substitutes_df)
        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")

    # Combine data into DataFrames
    match_df = pd.DataFrame(match_data)
    starting_lineups_df = pd.concat(all_starting_lineups, ignore_index=True)
    substitutes_df = pd.concat(all_substitutes, ignore_index=True)
    
    return match_df, starting_lineups_df, substitutes_df

# Example usage
root_folder = "/Users/da-eunji/Downloads/Hackathon Data/match-reports"
match_df, starting_lineups_df, substitutes_df = process_all_pdfs(root_folder)

# Save to CSV
match_df.to_csv("match_data.csv", index=False, encoding='utf-8-sig')
starting_lineups_df.to_csv("starting_lineups.csv", index=False, encoding='utf-8-sig')
substitutes_df.to_csv("substitutes.csv", index=False, encoding='utf-8-sig')

print("CSV files created: match_data.csv, starting_lineups.csv, substitutes.csv")

CSV files created: match_data.csv, starting_lineups.csv, substitutes.csv


# Formation and Position

In [None]:
import pdfplumber
import re
import pandas as pd
import os

def extract_formation_changes(pdf_path):
    try:
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[2]  # Extract from page 3 
            text = page.extract_text()
            
            # Pattern for formations and times
            pattern = r"(\d-\d-\d)\s+(\d+'\s*[–—-]\s*\d+(?:\+\d+)?')"
            formation_data = []
            
            # Extract team names from filename
            filename = os.path.basename(pdf_path)
            teams = re.findall(r"(.+?)\s*-\s*(.+?)\s*\d+-\d+\.pdf", filename)
            
            if teams:
                team1, team2 = teams[0]
            else:
                # Try to extract team names from page content
                team_pattern = r"([A-Za-z\s]+)\s+\d+\s*[–—-]\s*\d+"
                teams = re.findall(team_pattern, text)
                if len(teams) >= 2:
                    team1, team2 = teams[:2]
                else:
                    return pd.DataFrame()  # Return empty DataFrame if no teams found
            
            # Extract formations and time ranges
            matches = re.findall(pattern, text)
            
            for idx, match in enumerate(matches):
                formation, time_range = match
                start_time, end_time = re.split(r'\s*[–—-]\s*', time_range)
                
                team = team1 if idx % 2 == 0 else team2
                
                formation_data.append({
                    "Match": filename.replace(".pdf", ""),
                    "Team": team.strip(),
                    "Formation": formation,
                    "Start_Time": start_time.strip(),
                    "End_Time": end_time.strip()
                })
            
            return pd.DataFrame(formation_data)
    except Exception as e:
        print(f"Error processing {pdf_path}: {str(e)}")
        return pd.DataFrame()

def process_all_pdfs(root_folder):
    all_formations = []
    
    # Traverse all subdirectories
    for root, dirs, files in os.walk(root_folder):
        for file in files:
            if file.endswith('.pdf'):
                pdf_path = os.path.join(root, file)
                formations_df = extract_formation_changes(pdf_path)
                if not formations_df.empty:
                    all_formations.append(formations_df)
    
    # Combine all results
    if all_formations:
        return pd.concat(all_formations, ignore_index=True)
    return pd.DataFrame()

# Example usage
root_folder = "/Users/da-eunji/Downloads/Hackathon Data/match-reports"
all_formations_df = process_all_pdfs(root_folder)

# Display results
print("All Formation Changes:")
print(all_formations_df)

All Formation Changes:
                                                 Match  \
0     Rutgers Scarlet Knights - Seton Hall Pirates 1-3   
1     Rutgers Scarlet Knights - Seton Hall Pirates 1-3   
2     Rutgers Scarlet Knights - Seton Hall Pirates 1-3   
3     Rutgers Scarlet Knights - Seton Hall Pirates 1-3   
4     Rutgers Scarlet Knights - Seton Hall Pirates 1-3   
...                                                ...   
1462        San Diego Toreros - Washington Huskies 1-1   
1463        San Diego Toreros - Washington Huskies 1-1   
1464        San Diego Toreros - Washington Huskies 1-1   
1465        San Diego Toreros - Washington Huskies 1-1   
1466        San Diego Toreros - Washington Huskies 1-1   

                         Team Formation Start_Time End_Time  
0     Rutgers Scarlet Knights     4-1-2         1'      30'  
1          Seton Hall Pirates     3-4-2        46'      59'  
2     Rutgers Scarlet Knights     4-4-2         1'      34'  
3          Seton Hall Pirates   