# F1 Infringement Analysis - Exploratory Notebook

This notebook provides an interactive exploration of the F1 infringement analysis pipeline.

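## Setup

The import cell below needs the project's third-party dependencies installed in the kernel's environment; for example, it fails with `ModuleNotFoundError` when `PyPDF2` is missing.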

In [1]:
import sys
from pathlib import Path
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Add src to path
# Prepends "<parent of the current working directory>/src" to sys.path so the
# project-local imports that follow can be resolved. Because the location is
# computed from Path.cwd(), the kernel must be started from a directory that
# sits next to `src`.
sys.path.insert(0, str(Path.cwd().parent / 'src'))

from data_processing.pdf_extractor import PDFExtractor
from utils.team_detector import TeamDetector
from entity_extraction.entity_extractor import EntityExtractor
from summarization.extractive_summarizer import ExtractiveSummarizer
from summarization.team_summarizer import TeamYearSummarizer

# Set style
sns.set_style('whitegrid')
%matplotlib inline

ModuleNotFoundError: No module named 'PyPDF2'

## 1. Load Data

In [None]:
# Read the two CSV outputs of the pipeline (paths are relative to the
# kernel's working directory).
records_path = '../outputs/infringement_records.csv'
summaries_path = '../outputs/team_year_summaries.csv'
records_df = pd.read_csv(records_path)
summaries_df = pd.read_csv(summaries_path)

# Report the row count of each table.
for label, frame in (("records", records_df), ("summaries", summaries_df)):
    print(f"Total {label}: {len(frame)}")

# Last expression of the cell: preview the first rows of the records table.
records_df.head()

## 2. Exploratory Data Analysis

In [None]:
# Bar chart of the number of records per value of the 'year' column,
# ordered by year.
yearly_counts = records_df['year'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 6))
yearly_counts.plot(kind='bar', ax=ax)
ax.set_title('Infractions by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Infractions')
plt.show()

In [None]:
# Infractions by team
# Horizontal bar chart of the number of records per value of the 'team' column.
#
# The year range shown in the title is derived from the data rather than being
# hard-coded, so the title stays correct when the input CSV covers a different
# period. Values that cannot be read as a year are ignored for the label.
years_covered = pd.to_numeric(records_df['year'], errors='coerce').dropna().astype(int)
if years_covered.empty:
    year_span = ''
elif years_covered.min() == years_covered.max():
    year_span = f" ({years_covered.min()})"
else:
    year_span = f" ({years_covered.min()}-{years_covered.max()})"

plt.figure(figsize=(14, 6))
records_df['team'].value_counts().plot(kind='barh')
plt.title(f'Infractions by Team{year_span}')
plt.xlabel('Number of Infractions')
plt.ylabel('Team')
plt.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of the ten most frequent values in the
# 'primary_infraction' column.
top_infractions = records_df['primary_infraction'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12, 6))
top_infractions.plot(kind='barh', ax=ax)
ax.set_title('Top 10 Infraction Types')
ax.set_xlabel('Count')
ax.set_ylabel('Infraction Type')
fig.tight_layout()
plt.show()

In [None]:
# Pie chart of how often each value of the 'penalty_type' column occurs;
# each wedge is labelled with its percentage share (one decimal place).
penalty_counts = records_df['penalty_type'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
penalty_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax)
ax.set_title('Penalty Type Distribution')
# Blank out the y-axis label so no label text is drawn next to the pie.
ax.set_ylabel('')
plt.show()

## 3. Team-Specific Analysis

In [None]:
# Team to analyse -- edit this value to look at a different team.
team_name = 'Mercedes'

# Keep only the rows whose 'team' column equals the selected name.
is_selected_team = records_df['team'] == team_name
team_data = records_df.loc[is_selected_team]

# Headline count, followed by the number of records per year in year order.
yearly_breakdown = team_data['year'].value_counts().sort_index()
print(f"\n{team_name} Analysis:")
print(f"Total infractions: {len(team_data)}")
print("\nYearly breakdown:")
print(yearly_breakdown)

In [None]:
# Line chart of the selected team's record count per year.
# Uses the notebook-level `team_name`, so it reflects whatever value that
# variable currently holds.
team_rows = records_df.loc[records_df['team'] == team_name]
team_yearly = team_rows.groupby('year').size()

fig, ax = plt.subplots(figsize=(10, 6))
team_yearly.plot(kind='line', marker='o', ax=ax)
ax.set_title(f'{team_name} Infractions Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Infractions')
ax.grid(True)
plt.show()

## 4. Test Individual Components

In [None]:
# Test PDF extraction on a sample file
# NOTE(review): the directory name is spelled "infridgement" here; presumably
# that matches the folder on disk -- confirm before changing it.
pdf_dir = Path('../../Documents/2023-infridgement_profile')

# Sort the matches so the same file is picked on every run (glob order is not
# guaranteed), and fail with a clear message instead of a bare StopIteration
# when the directory is missing or contains no PDFs.
sample_pdf = next(iter(sorted(pdf_dir.glob('*.pdf'))), None)
if sample_pdf is None:
    raise FileNotFoundError(f"No PDF files found in {pdf_dir.resolve()}")

# Extract the raw text, then run the extractor's own cleaning step on it.
# Both `text` and `cleaned_text` are used by later cells.
extractor = PDFExtractor()
text = extractor.extract_text_from_pdf(sample_pdf)
cleaned_text = extractor.clean_text(text)

print(f"Extracted text length: {len(cleaned_text)} characters")
print(f"\nFirst 500 characters:\n{cleaned_text[:500]}")

In [None]:
# Run team detection on the extracted PDF text.
# NOTE(review): the raw `text` is passed here rather than `cleaned_text`, and
# '2023' is passed as the second argument (presumably the season year) --
# confirm both against TeamDetector.detect_team.
detector = TeamDetector('../config/team_mappings.json')
team_result = detector.detect_team(text, '2023')

if not team_result:
    print("No team detected")
else:
    # NOTE: this rebinds the notebook-level `team_name`, so cells that read
    # `team_name` afterwards use the detected team instead of the one chosen
    # in the team-specific analysis section.
    team_name, confidence = team_result
    print(f"Detected team: {team_name}")
    print(f"Confidence: {confidence:.2f}")

In [None]:
# Run entity extraction on the cleaned PDF text, passing the current value of
# the notebook-level `team_name` as the second argument.
entity_extractor = EntityExtractor()
entities = entity_extractor.extract_all_entities(cleaned_text, team_name)

# Pretty-print the result as JSON with two-space indentation.
entities_json = json.dumps(entities, indent=2)
print("Extracted entities:")
print(entities_json)

In [None]:
# Summarise the cleaned PDF text with the extractive summarizer, configured
# with the 'lexrank' method and three sentences.
summarizer = ExtractiveSummarizer(num_sentences=3, method='lexrank')
summary = summarizer.summarize(cleaned_text)

# Heading line followed by the summary on the next line.
print("Extractive Summary:", summary, sep="\n")

## 5. Generate Custom Team Summary

In [None]:
# Build a team/year summary from the loaded records and print its insights.
team_summarizer = TeamYearSummarizer()

# One dict per DataFrame row.
records = records_df.to_dict(orient='records')

# The team and year are fixed here and do not follow the notebook-level
# `team_name`.
# NOTE(review): year is passed as the string '2023'; if the CSV's 'year'
# column loads as integers this may not match any record -- confirm against
# TeamYearSummarizer.generate_team_year_summary.
summary = team_summarizer.generate_team_year_summary(
    records,
    team='Mercedes',
    year='2023',
    style='factual',
    num_insights=5,
)

if not summary:
    print("No summary generated")
else:
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"{summary['team']} - {summary['year']}")
    print(f"{rule}\n")
    for item in summary['insights']:
        print(f"• {item}")

## 6. Visualization: Heatmap of Team Infractions by Year

In [None]:
# Create pivot table
# Rows are teams, columns are years, and each cell holds the count of
# non-null 'filename' values for that team/year pair (0 where there are none).
pivot = records_df.pivot_table(
    index='team',
    columns='year',
    values='filename',
    aggfunc='count',
    fill_value=0
)

# Derive the year range for the title from the data rather than hard-coding
# it, so the title stays correct when the input CSV covers a different
# period. Values that cannot be read as a year are ignored for the label.
years_covered = pd.to_numeric(records_df['year'], errors='coerce').dropna().astype(int)
if years_covered.empty:
    year_span = ''
elif years_covered.min() == years_covered.max():
    year_span = f" ({years_covered.min()})"
else:
    year_span = f" ({years_covered.min()}-{years_covered.max()})"

# Plot heatmap
# fmt='d' renders the annotations as integers.
plt.figure(figsize=(12, 8))
sns.heatmap(pivot, annot=True, fmt='d', cmap='YlOrRd')
plt.title(f'Team Infractions Heatmap{year_span}')
plt.xlabel('Year')
plt.ylabel('Team')
plt.tight_layout()
plt.show()

## 7. Export Custom Analysis

In [None]:
# Create custom summary report
# Writes a short plain-text report (document count, years covered, number of
# teams, and the five most frequent infraction types) to `output_path`.
output_path = '../outputs/custom_analysis.txt'

# Compute the figures up front. Missing values are excluded so that NaN is
# neither listed as a year nor counted as a team.
years_covered = sorted(records_df['year'].dropna().unique())
team_count = records_df['team'].nunique()
top_infractions = records_df['primary_infraction'].value_counts().head(5)

separator = "=" * 60

# Explicit UTF-8: without it the platform's default encoding is used, which
# can fail on (or garble) non-ASCII characters in infraction names.
with open(output_path, 'w', encoding='utf-8') as report_file:
    report_file.write("F1 INFRINGEMENT ANALYSIS - CUSTOM REPORT\n")
    report_file.write(separator + "\n\n")

    # Overall stats
    report_file.write(f"Total Documents: {len(records_df)}\n")
    report_file.write(f"Years Covered: {', '.join(map(str, years_covered))}\n")
    report_file.write(f"Teams: {team_count}\n\n")

    # Top infractions
    report_file.write("Top 5 Infraction Types:\n")
    for rank, (infraction, count) in enumerate(top_infractions.items(), 1):
        report_file.write(f"{rank}. {infraction}: {count}\n")

    report_file.write("\n" + separator + "\n")

print(f"Custom report saved to: {output_path}")

## Conclusion

This notebook provides an interactive way to explore the F1 infringement data and test the analysis pipeline components. Modify the cells above to customize your analysis!