# Mercedes F1 Infringement Document Filter

This notebook extracts Mercedes-specific infringement documents from the FIA PDF collection and converts them to text files for further processing.

## Objective
- Parse all PDF files in the Documents folder
- Filter for documents addressed to "Mercedes-AMG PETRONAS F1 Team"
- Extract text content from filtered PDFs
- Save as individual .txt files organized by year


In [10]:
pip install PyPDF2

Note: you may need to restart the kernel to use updated packages.


In [11]:
# Import required libraries
import os
import re
from pathlib import Path
import PyPDF2
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')


In [13]:
# Define paths
base_path = Path("Documents")
years = ["2020-infringement_profile", "2021-infringement_profile", 
         "2022-infringement_profile", "2023-infringement_profile" , "2024-infringement_profile"]

print("Available year folders:")
for year in years:
    year_path = base_path / year
    if year_path.exists():
        pdf_count = len(list(year_path.glob("*.pdf")))
        print(f"  {year}: {pdf_count} PDF files")


Available year folders:
  2020-infringement_profile: 131 PDF files
  2021-infringement_profile: 175 PDF files
  2022-infringement_profile: 227 PDF files
  2023-infringement_profile: 194 PDF files
  2024-infringement_profile: 215 PDF files


In [14]:
# Function to extract text from PDF
def extract_pdf_text(pdf_path):
    """
    Extract text from PDF file
    Returns the full text content
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ""
            
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num]
                text += page.extract_text() + "\n"
            
            return text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return None


In [15]:
# Updated function to check if document is addressed to Mercedes (includes car numbers)
def is_mercedes_document_updated(text):
    """
    Check if the document is addressed to Mercedes team or involves Mercedes drivers
    Look for the specific header pattern and car numbers
    """
    if not text:
        return False
    
    # Convert to lowercase for case-insensitive matching
    text_lower = text.lower()
    
    # Look for Mercedes team references in the header section
    mercedes_patterns = [
        r'to:\s*the team manager,?\s*mercedes[-\s]*amg\s*petronas\s*f1\s*team',
        r'mercedes[-\s]*amg\s*petronas\s*f1\s*team',
        r'mercedes\s*petronas\s*f1\s*team',
        r'mercedes\s*amg\s*petronas'
    ]
    
    # Mercedes car numbers by year:
    # 2020-2021: Car 44 (Hamilton), Car 77 (Bottas)
    # 2022-2023: Car 44 (Hamilton), Car 63 (Russell)
    # Also Car 42 (Bottas in some contexts)
    mercedes_car_numbers = [44, 77, 63, 42]
    
    # Check first 2000 characters (header section)
    header_section = text_lower[:2000]
    
    # Check for Mercedes team references
    for pattern in mercedes_patterns:
        if re.search(pattern, header_section, re.IGNORECASE):
            return True
    
    # Check for Mercedes car numbers in document title/header
    # Look for patterns like "Car 44", "Car 77", etc.
    for car_num in mercedes_car_numbers:
        car_pattern = rf'\bcar\s*{car_num}\b'
        if re.search(car_pattern, header_section, re.IGNORECASE):
            return True
    
    return False

# Use the updated function instead of the original
is_mercedes_document = is_mercedes_document_updated


In [16]:
# Function to check if document is addressed to Mercedes
def is_mercedes_document(text):
    """
    Check if the document is addressed to Mercedes team
    Look for the specific header pattern
    """
    if not text:
        return False
    
    # Convert to lowercase for case-insensitive matching
    text_lower = text.lower()
    
    # Look for Mercedes team references in the header section
    mercedes_patterns = [
        r'to:\s*the team manager,?\s*mercedes[-\s]*amg\s*petronas\s*f1\s*team',
        r'mercedes[-\s]*amg\s*petronas\s*f1\s*team',
        r'mercedes\s*petronas\s*f1\s*team',
        r'mercedes\s*amg\s*petronas'
    ]
    
    # Check first 2000 characters (header section)
    header_section = text_lower[:2000]
    
    for pattern in mercedes_patterns:
        if re.search(pattern, header_section, re.IGNORECASE):
            return True
    
    return False


In [17]:
# Function to clean and save text content
def clean_text(text):
    """
    Clean extracted text
    """
    if not text:
        return ""
    
    # Remove excessive whitespace and normalize
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    
    return text


In [18]:
# Main processing function
def process_year_folder(year_folder):
    """
    Process all PDFs in a year folder and extract Mercedes documents
    """
    year_path = base_path / year_folder
    if not year_path.exists():
        print(f"Folder {year_path} does not exist")
        return
    
    pdf_files = list(year_path.glob("*.pdf"))
    print(f"\nProcessing {len(pdf_files)} PDF files in {year_folder}...")
    
    mercedes_docs = []
    
    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        # Extract text from PDF
        text = extract_pdf_text(pdf_file)
        
        if text and is_mercedes_document(text):
            # Clean the text
            cleaned_text = clean_text(text)
            
            # Create output filename
            txt_filename = pdf_file.stem + ".txt"
            txt_path = year_path / txt_filename
            
            # Save as text file
            try:
                with open(txt_path, 'w', encoding='utf-8') as f:
                    f.write(cleaned_text)
                
                mercedes_docs.append({
                    'pdf_file': pdf_file.name,
                    'txt_file': txt_filename,
                    'year': year_folder,
                    'text_length': len(cleaned_text)
                })
                
                print(f"✓ Found Mercedes document: {pdf_file.name}")
                
            except Exception as e:
                print(f"Error saving {txt_filename}: {e}")
    
    return mercedes_docs


In [19]:
# Process all year folders
all_mercedes_docs = []

for year_folder in years:
    mercedes_docs = process_year_folder(year_folder)
    if mercedes_docs:
        all_mercedes_docs.extend(mercedes_docs)
        print(f"Found {len(mercedes_docs)} Mercedes documents in {year_folder}")
    else:
        print(f"No Mercedes documents found in {year_folder}")



Processing 131 PDF files in 2020-infringement_profile...


Processing PDFs:  15%|█▍        | 19/131 [00:00<00:00, 184.35it/s]

✓ Found Mercedes document: 2020 Austrian Grand Prix - Decision - Car 44 - alleged failure to slow for yellow flags.pdf
✓ Found Mercedes document: 2020 Austrian Grand Prix - Offence - Car 44 - Failure to slow for yellow flags (post review).pdf


Processing PDFs:  49%|████▉     | 64/131 [00:00<00:00, 155.07it/s]

✓ Found Mercedes document: 2020 Austrian Grand Prix - Offence - Car 44 - Track Limits turn 10.pdf
✓ Found Mercedes document: 2020 Austrian Grand Prix - Decision - review of decision (document 33).pdf
✓ Found Mercedes document: 2020 Russian Grand Prix - Offence - Car 44 - 2nd Practice start .pdf
✓ Found Mercedes document: 2020 Russian Grand Prix - Replacement for Document 47 - Offence - Car 44 - 2nd Practice Start.pdf


Processing PDFs:  81%|████████  | 106/131 [00:00<00:00, 162.09it/s]

✓ Found Mercedes document: 2020 Austrian Grand Prix - Offence - Car 44 - incident with car 23.pdf
✓ Found Mercedes document: 2020 Russian Grand Prix - Decision - Car 44 - Turn 2 .pdf
✓ Found Mercedes document: 2020 Russian Grand Prix - Offence - Car 44 - 1st Practice start.pdf
✓ Found Mercedes document: 2020 Russian Grand Prix - Replacement for Document 46 - Offence - Car 44 - 1st Practice Start.pdf


Processing PDFs: 100%|██████████| 131/131 [00:00<00:00, 163.71it/s]


✓ Found Mercedes document: 2020 Sakhir Grand Prix - Offence - Mercedes - Car 63 incorrect use of tyres.pdf
✓ Found Mercedes document: 2020 Italian Grand Prix - Offence - Car 44 - Entering closed pit lane.pdf
✓ Found Mercedes document: 2020 Austrian Grand Prix - Offence - Car 44 - Leaving the track in turn 10.pdf
Found 13 Mercedes documents in 2020-infringement_profile

Processing 175 PDF files in 2021-infringement_profile...


Processing PDFs:  10%|▉         | 17/175 [00:00<00:00, 167.37it/s]

✓ Found Mercedes document: 2021 Brazilian Grand Prix - Offence - Car 44 - DRS.pdf
✓ Found Mercedes document: 2021 Qatar Grand Prix - Offence - Car 77 - Single waved yellow flag.pdf
✓ Found Mercedes document: 2021 Hungarian Grand Prix - Offence - Car 77 - causing a collision.pdf


Processing PDFs:  25%|██▍       | 43/175 [00:00<00:00, 220.60it/s]

✓ Found Mercedes document: 2021 Brazilian Grand Prix - Offence - Car 44 - PU element.pdf


Processing PDFs:  38%|███▊      | 66/175 [00:00<00:00, 190.77it/s]

✓ Found Mercedes document: 2021 United States Grand Prix - Offence - Car 77 - PU elements.pdf


Processing PDFs:  51%|█████▏    | 90/175 [00:00<00:00, 207.99it/s]

✓ Found Mercedes document: 2021 Italian Grand Prix - Offence - Car 77 - PU elements.pdf
✓ Found Mercedes document: 2021 Russian Grand Prix - Offence - Car 77 - PU elements.pdf
✓ Found Mercedes document: 2021 Austrian Grand Prix - Decision - Car 77 - Alleged driving unnecessarily slowly .pdf
✓ Found Mercedes document: 2021 Saudi Arabian Grand Prix - Offence - Car 44 - Impeding.pdf
✓ Found Mercedes document: 2021 Turkish Grand Prix - Offence - Car 44 - PU element.pdf


Processing PDFs:  65%|██████▌   | 114/175 [00:00<00:00, 217.47it/s]

✓ Found Mercedes document: 2021 Hungarian Grand Prix - Offence - Car 77 - Pre-Race procedure.pdf


Processing PDFs:  79%|███████▉  | 139/175 [00:00<00:00, 225.29it/s]

✓ Found Mercedes document: 2021 Brazilian Grand Prix - Offence - Car 44 - Safety Belts.pdf
✓ Found Mercedes document: 2021 Italian Grand Prix - Offence - Car 77 - PU element.pdf
✓ Found Mercedes document: 2021 Mexican Grand Prix - Offence - Car 44 - Turn 2 .pdf


Processing PDFs:  93%|█████████▎| 162/175 [00:00<00:00, 190.76it/s]

✓ Found Mercedes document: 2021 Saudi Arabian Grand Prix - Decision - Car 44 - double yellow.pdf
✓ Found Mercedes document: 2021 Abu Dhabi Grand Prix - Decision - Mercedes Protest Art. 48.8.pdf
✓ Found Mercedes document: 2021 British Grand Prix - Offence - Car 44 - Causing a collision with car 33.pdf


Processing PDFs: 100%|██████████| 175/175 [00:00<00:00, 202.33it/s]


Found 17 Mercedes documents in 2021-infringement_profile

Processing 227 PDF files in 2022-infringement_profile...


Processing PDFs:  11%|█         | 25/227 [00:00<00:00, 241.09it/s]

✓ Found Mercedes document: 2022 Abu Dhabi Grand Prix - Offence - Car 63 - Unsafe release.pdf
✓ Found Mercedes document: 2022 Austrian Grand Prix - Offence - Car 44 - Parc Ferme Instructions.pdf
✓ Found Mercedes document: 2022 Dutch Grand Prix - Offence - Car 44 - Alleged impeding of Car 55.pdf
✓ Found Mercedes document: 2022 Austrian Grand Prix - Offence - Car 63 - Entered the track on foot.pdf


Processing PDFs:  22%|██▏       | 50/227 [00:00<00:00, 241.05it/s]

✓ Found Mercedes document: 2022 Azerbaijan Grand Prix - Decision - Car 44 - Allegedly driving unnecessarily slowly during Qualifying.pdf


Processing PDFs:  43%|████▎     | 98/227 [00:00<00:00, 214.10it/s]

✓ Found Mercedes document: 2022 Singapore Grand Prix - Offence - Car 63 - PU elements.pdf


Processing PDFs:  54%|█████▍    | 123/227 [00:00<00:00, 224.31it/s]

✓ Found Mercedes document: 2022 Austrian Grand Prix - Offence - Car 63 - Causing a collision.pdf
✓ Found Mercedes document: 2022 Singapore Grand Prix - Offence - Car 63 - Pit lane speeding.pdf


Processing PDFs:  65%|██████▌   | 148/227 [00:00<00:00, 232.23it/s]

✓ Found Mercedes document: 2022 Abu Dhabi Grand Prix - Offence - Car 44 - Pit lane speeding.pdf
✓ Found Mercedes document: 2022 Belgian Grand Prix - Offence - Car 44 - Alleged causing a collision.pdf
✓ Found Mercedes document: 2022 Belgian Grand Prix - Offence - Car 44 - Refusal to visit Medical Centre.pdf
✓ Found Mercedes document: 2022 Italian Grand Prix - Offence - Car 44 - PU element.pdf
✓ Found Mercedes document: 2022 Australian Grand Prix - Decision - Car 44 - Alleged impeding of Car 18 at turn 13.pdf


Processing PDFs:  76%|███████▌  | 172/227 [00:00<00:00, 229.89it/s]

✓ Found Mercedes document: 2022 Austrian Grand Prix - Decision - Team Radio Communication - Formation Lap.pdf
✓ Found Mercedes document: 2022 Abu Dhabi Grand Prix - Decision - Car 44 - Red Flag_0.pdf


Processing PDFs:  86%|████████▋ | 196/227 [00:00<00:00, 227.31it/s]

✓ Found Mercedes document: 2022 Singapore Grand Prix - Decision - Car 44 - Breach of Appendix L.pdf


Processing PDFs: 100%|██████████| 227/227 [00:01<00:00, 224.35it/s]


✓ Found Mercedes document: 2022 United States Grand Prix - Offence - Car 63 - T1 Incident with car 55.pdf
Found 17 Mercedes documents in 2022-infringement_profile

Processing 194 PDF files in 2023-infringement_profile...


Processing PDFs:   0%|          | 0/194 [00:00<?, ?it/s]

✓ Found Mercedes document: 2023 Qatar Grand Prix - Infringement - Car 44 - Crossing the track.pdf
✓ Found Mercedes document: 2023 Monaco Grand Prix - Infringement - Car 44 - Pit Lane Speeding.pdf


Processing PDFs:  12%|█▏        | 23/194 [00:00<00:00, 229.01it/s]

✓ Found Mercedes document: 2023 United States Grand Prix - Infringement - Car 63 - Impeding of Car 16.pdf
✓ Found Mercedes document: 2023 Abu Dhabi Grand Prix - Infringement - Mercedes - Team Principal (Updated).pdf
✓ Found Mercedes document: 2023 Australian Grand Prix - Decision - Mercedes - Inaccurate Self Scrutineering Form.pdf


Processing PDFs:  24%|██▍       | 47/194 [00:00<00:00, 214.38it/s]

✓ Found Mercedes document: 2023 Spanish Grand Prix - Infringement - Car 63 - Abnormal change of direction.pdf
✓ Found Mercedes document: 2023 Austrian Grand Prix - Infringement - Car 44 - Pit Lane Speeding.pdf
✓ Found Mercedes document: 2023 Monaco Grand Prix - Infringement - Car 63 - Unsafe Rejoin.pdf


Processing PDFs:  36%|███▌      | 69/194 [00:00<00:00, 207.17it/s]

✓ Found Mercedes document: 2023 Miami Grand Prix - Decision - Car 44 - Turn 17 Incident.pdf
✓ Found Mercedes document: 2023 United States Grand Prix - Infringement - Car 63 - Leaving the track.pdf
✓ Found Mercedes document: 2023 Canadian Grand Prix - Decision - Car 44 - Alleged Unsafe Release.pdf


Processing PDFs:  48%|████▊     | 93/194 [00:00<00:00, 216.61it/s]

✓ Found Mercedes document: 2023 São Paulo Grand Prix - Infringement - Car 63 - Impeding at Pit Exit.pdf
✓ Found Mercedes document: 2023 Spanish Grand Prix - Infringement - Mercedes - Parc Ferme.pdf
✓ Found Mercedes document: 2023 Italian Grand Prix - Infringement - Car 63 - Leaving the track.pdf
✓ Found Mercedes document: 2023 Bahrain Grand Prix - Decision - Car 44 - Wearing of Jewellery.pdf
✓ Found Mercedes document: 2023 Saudi Arabian Grand Prix - Offence - Mercedes - Inaccurate Scrutineering Form.pdf


Processing PDFs:  59%|█████▉    | 115/194 [00:00<00:00, 178.29it/s]

✓ Found Mercedes document: 2023 Monaco Grand Prix - Infringement - Car 63 - Pit Lane Speeding.pdf
✓ Found Mercedes document: 2023 Belgian Grand Prix - Infringement - Car 44 - Causing a Collision.pdf


Processing PDFs:  92%|█████████▏| 179/194 [00:00<00:00, 173.71it/s]

✓ Found Mercedes document: 2023 United States Grand Prix - Infringement - Car 44 - Technical non-compliance (Plank).pdf
✓ Found Mercedes document: 2023 Las Vegas Grand Prix - Infringement - Car 63 - Causing a collision.pdf
✓ Found Mercedes document: 2023 British Grand Prix - Infringement - Mercedes - Thursday Press Conference.pdf
✓ Found Mercedes document: 2023 Italian Grand Prix - Infringement - Car 44 - Causing a Collision.pdf


Processing PDFs: 100%|██████████| 194/194 [00:01<00:00, 188.21it/s]


✓ Found Mercedes document: 2023 Austrian Grand Prix - Infringement - Car 44 - Leaving the track multiple times.pdf
Found 23 Mercedes documents in 2023-infringement_profile

Processing 215 PDF files in 2024-infringement_profile...


Processing PDFs:   9%|▉         | 19/215 [00:00<00:01, 181.32it/s]

✓ Found Mercedes document: 2024 São Paulo Grand Prix - Infringement - Car 63 - Tyre Pressure Checks.pdf
✓ Found Mercedes document: 2024 Dutch Grand Prix - Infringement - Car 44 - impeding of Car 11.pdf
✓ Found Mercedes document: 2024 Belgian Grand Prix - Infringement - Race Deleted Lap Times.pdf
✓ Found Mercedes document: 2024 Qatar Grand Prix - Infringement - Car 44 - False Start.pdf
✓ Found Mercedes document: 2024 Belgian Grand Prix - Infringement - Car 63 - Technical non-compliance (Weight).pdf
✓ Found Mercedes document: 2024 Saudi Arabian Grand Prix - Infringement - Car 44 - Impeding of Car 2.pdf


Processing PDFs:  20%|█▉        | 42/215 [00:00<00:00, 204.07it/s]

✓ Found Mercedes document: 2024 São Paulo Grand Prix - Infringement - Qualifying Deleted Lap Times.pdf


Processing PDFs:  29%|██▉       | 63/215 [00:00<00:00, 177.51it/s]

✓ Found Mercedes document: 2024 British Grand Prix - Infringement - Qualifying Deleted Lap Times.pdf
✓ Found Mercedes document: 2024 United States Grand Prix - Infringement - Car 63 - Breach of Parc Ferme.pdf
✓ Found Mercedes document: 2024 Italian Grand Prix - Infringement - Race Deleted Lap Times.pdf
✓ Found Mercedes document: 2024 Las Vegas Grand Prix - Infringement - Qualifying Deleted Lap Times.pdf


Processing PDFs:  38%|███▊      | 82/215 [00:00<00:00, 138.14it/s]

✓ Found Mercedes document: 2024 United States Grand Prix - Infringement - Race Deleted Lap Times.pdf
✓ Found Mercedes document: 2024 São Paulo Grand Prix - Infringement - Race Deleted Lap Times.pdf
✓ Found Mercedes document: 2024 Las Vegas Grand Prix - Infringement - Race Deleted Lap Times.pdf
✓ Found Mercedes document: 2024 Miami Grand Prix - Infringement - Car 44 - Pit Lane Speeding.pdf
✓ Found Mercedes document: 2024 Italian Grand Prix - Infringement - Qualifying Deleted Lap Times.pdf
✓ Found Mercedes document: 2024 Qatar Grand Prix - Infringement - Car 63 - Failing to maintain distance behind the Safety Car.pdf
✓ Found Mercedes document: 2024 Qatar Grand Prix - Infringement - Race Deleted Lap Times.pdf


Processing PDFs:  65%|██████▍   | 139/215 [00:00<00:00, 162.44it/s]

✓ Found Mercedes document: 2024 Azerbaijan Grand Prix - Infringement - Car 44 - Changes made during Parc Ferme.pdf
✓ Found Mercedes document: 2024 Singapore Grand Prix - Infringement - Qualifying Deleted Lap Times (Corrected).pdf
✓ Found Mercedes document: 2024 British Grand Prix - Infringement - Race Deleted Lap Times.pdf
✓ Found Mercedes document: 2024 São Paulo Grand Prix - Infringement - Car 44 - Tyre Pressure Checks.pdf
✓ Found Mercedes document: 2024 Hungarian Grand Prix - Infringement - Qualifying Deleted Lap Times.pdf
✓ Found Mercedes document: 2024 São Paulo Grand Prix - Infringement - Qualifying Deleted Lap Times - Yellow Flags.pdf
✓ Found Mercedes document: 2024 United States Grand Prix - Infringement - Qualifying Deleted Lap Times.pdf
✓ Found Mercedes document: 2024 Austrian Grand Prix - Infringement - Car 44 - Unsafe release.pdf


Processing PDFs:  83%|████████▎ | 178/215 [00:01<00:00, 158.92it/s]

✓ Found Mercedes document: 2024 United States Grand Prix - Infringement - Sprint Deleted Lap Times.pdf
✓ Found Mercedes document: 2024 United States Grand Prix - Infringement - Car 63 - Forcing another driver off the track.pdf
✓ Found Mercedes document: 2024 Azerbaijan Grand Prix - Infringement - Qualifying Deleted Lap Times.pdf
✓ Found Mercedes document: 2024 Mexico City Grand Prix - Infringement - Race Deleted Lap Times.pdf
✓ Found Mercedes document: 2024 Azerbaijan Grand Prix - Infringement - Car 63 - Failing to slow for yellow flags.pdf


Processing PDFs: 100%|██████████| 215/215 [00:01<00:00, 153.91it/s]

✓ Found Mercedes document: 2024 Austrian Grand Prix - Infringement - Car 44 - Crossing the line at Pit Entry.pdf
✓ Found Mercedes document: 2024 Qatar Grand Prix - Infringement - Car 44 - Speeding in the Pit Lane.pdf
✓ Found Mercedes document: 2024 São Paulo Grand Prix - Infringement - Sprint Deleted Lap Times.pdf
✓ Found Mercedes document: 2024 São Paulo Grand Prix - Infringement - Car 63 - Aborted Start incident.pdf
✓ Found Mercedes document: 2024 Japanese Grand Prix - Infringement - Car 63 - Unsafe release.pdf
Found 36 Mercedes documents in 2024-infringement_profile





In [20]:
# Summary statistics
if all_mercedes_docs:
    df_mercedes = pd.DataFrame(all_mercedes_docs)
    
    print("\n" + "="*50)
    print("MERCEDES DOCUMENT EXTRACTION SUMMARY")
    print("="*50)
    
    print(f"Total Mercedes documents found: {len(all_mercedes_docs)}")
    print(f"Total text length: {sum([doc['text_length'] for doc in all_mercedes_docs]):,} characters")
    
    print("\nDocuments by year:")
    year_counts = df_mercedes['year'].value_counts().sort_index()
    for year, count in year_counts.items():
        print(f"  {year}: {count} documents")
    
    print("\nSample documents:")
    for _, row in df_mercedes.head().iterrows():
        print(f"  - {row['pdf_file']} ({row['text_length']} chars)")
    
    # Save summary
    df_mercedes.to_csv('mercedes_documents_summary.csv', index=False)
    print("\nSummary saved to: mercedes_documents_summary.csv")
    
else:
    print("\nNo Mercedes documents found. Please check the filtering criteria.")



MERCEDES DOCUMENT EXTRACTION SUMMARY
Total Mercedes documents found: 106
Total text length: 190,742 characters

Documents by year:
  2020-infringement_profile: 13 documents
  2021-infringement_profile: 17 documents
  2022-infringement_profile: 17 documents
  2023-infringement_profile: 23 documents
  2024-infringement_profile: 36 documents

Sample documents:
  - 2020 Austrian Grand Prix - Decision - Car 44 - alleged failure to slow for yellow flags.pdf (1438 chars)
  - 2020 Austrian Grand Prix - Offence - Car 44 - Failure to slow for yellow flags (post review).pdf (1632 chars)
  - 2020 Austrian Grand Prix - Offence - Car 44 - Track Limits turn 10.pdf (1361 chars)
  - 2020 Austrian Grand Prix - Decision - review of decision (document 33).pdf (1124 chars)
  - 2020 Russian Grand Prix - Offence - Car 44 - 2nd Practice start .pdf (1603 chars)

Summary saved to: mercedes_documents_summary.csv


In [21]:
# Display sample of extracted text for verification
if all_mercedes_docs:
    print("\n" + "="*50)
    print("SAMPLE EXTRACTED TEXT")
    print("="*50)
    
    # Get first Mercedes document
    first_doc = all_mercedes_docs[0]
    year_path = base_path / first_doc['year']
    txt_path = year_path / first_doc['txt_file']
    
    if txt_path.exists():
        with open(txt_path, 'r', encoding='utf-8') as f:
            sample_text = f.read()
        
        print(f"Document: {first_doc['pdf_file']}")
        print(f"Year: {first_doc['year']}")
        print(f"Length: {len(sample_text)} characters")
        print("\nFirst 1000 characters:")
        print("-" * 50)
        print(sample_text[:1000] + "..." if len(sample_text) > 1000 else sample_text)



SAMPLE EXTRACTED TEXT
Document: 2020 Austrian Grand Prix - Decision - Car 44 - alleged failure to slow for yellow flags.pdf
Year: 2020-infringement_profile
Length: 1438 characters

First 1000 characters:
--------------------------------------------------
From The Stewards To The Team Manager, Mercedes-AMG Petronas F1 TeamDocument 33 Date 04 July 2020 Time 19:44 2020 AUSTRIAN GRAND PRIX 2 - 5 July 2020 The StewardsThe Stewards, having received a report from the Race Director, summoned (document 29) and heard from the driver and team representative, have considered the following matter and determine the following: No / Driver 44 - Lewis Hamilton Competitor Mercedes-AMG Petronas F1 Team Time 15:59 Session Qualifying Fact Alleged failure to slow for single waved yellow flags between turn 5 and 7. Offence Alleged breach of Appendix H Article 2.5.5.1.b) of the FIA International Sporting Code. Decision No further action. Reason The Stewards heard from the driver of Car 44 (Lewis Hamilton) an

## Next Steps

After running this notebook:
1. Review the extracted Mercedes documents
2. Check the sample text to ensure proper extraction
3. Proceed to consolidation and preprocessing steps
4. Begin text summarization process


In [28]:
import pandas as pd
import os
from pathlib import Path
def combine_year_texts():
    # Create empty list to store year, race and text data
    race_texts = []
    
    # Iterate through each year folder
    for year in years:
        year_path = base_path / year
        
        if year_path.exists():
            # Get all txt files in the year folder
            txt_files = list(year_path.glob('*.txt'))
            
            # Process each race document separately
            for txt_file in txt_files:
                try:
                    with open(txt_file, 'r', encoding='utf-8') as f:
                        # Extract race name from filename
                        race_name = txt_file.stem.split('-')[0].strip()
                        
                        # Add as separate row with year, race and content
                        race_texts.append({
                            'year': year.split('-')[0],  # Extract year from folder name
                            'race': race_name,
                            'text': f.read().strip()  # Remove extra whitespace
                        })
                except Exception as e:
                    print(f"Error reading {txt_file}: {e}")
    
    # Create DataFrame with columns for year, race and text
    df = pd.DataFrame(race_texts)
    
    # Sort by year and race
    df = df.sort_values(['year', 'race'])
    
    # Save to CSV
    df.to_csv('group.csv', index=False)
    print("Created group.csv with documents organized by year and race")
# Run the function
combine_year_texts()


Created group.csv with documents organized by year and race


In [34]:
import os
from dotenv import load_dotenv
import openai
import pandas as pd
from tqdm import tqdm
from openai import OpenAI

# Load environment variables
load_dotenv()

# Set up OpenAI client
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Read the grouped data
df = pd.read_csv('group.csv')

# Create empty list to store summaries
year_summaries = []

# Group texts by year
grouped_by_year = df.groupby('year')

# Process each year
for year, group in tqdm(grouped_by_year, desc="Generating summaries"):
    # Combine all texts for the year
    year_texts = group['text'].tolist()
    
    # For large texts, split into two parts
    if len(" ".join(year_texts)) > 12000:  # Adjust threshold as needed
        mid_point = len(year_texts) // 2
        text_parts = [
            " ".join(year_texts[:mid_point]),
            " ".join(year_texts[mid_point:])
        ]
    else:
        text_parts = [" ".join(year_texts)]
    
    summaries = []
    for i, text_part in enumerate(text_parts):
        try:
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that summarizes Formula 1 stewards' decisions and incidents."},
                    {"role": "user", "content": f"Summarize the following F1 stewards' decisions from {year} part {i+1} in 100 words:\n\n{text_part}"}
                ]
            )
            summaries.append(response.choices[0].message.content)
        except Exception as e:
            print(f"Error processing year {year} part {i+1}: {e}")
            summaries.append(f"Error processing part {i+1}")
    
    # Combine summaries for the year
    year_summaries.append({
        'year': year,
        'summary': " ".join(summaries)
    })

# Create DataFrame with summaries
summary_df = pd.DataFrame(year_summaries)

# Save summaries to CSV
summary_df.to_csv('cluster_summary.csv', index=False)
print("Created cluster_summary.csv with yearly summaries")


Generating summaries:  80%|████████  | 4/5 [02:05<00:38, 38.54s/it]

Error processing year 2024 part 1: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 9584 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


Generating summaries: 100%|██████████| 5/5 [02:50<00:00, 34.15s/it]

Error processing year 2024 part 2: Error code: 429 - {'error': {'message': 'Request too large for gpt-4 in organization org-e8EBhIEVL8bFtiiZiLieriaX on tokens per min (TPM): Limit 10000, Requested 10009. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Created cluster_summary.csv with yearly summaries



