# Football Match Event Analysis
## Professional Data Analysis & Visualization

**Author:** Data Analysis Project  
**Date:** February 2026  
**Dataset:** Match Event Data (3,098 events)

---

### Executive Summary
This notebook presents a comprehensive analysis of football match event data, examining:
- Event distribution and patterns
- Team performance metrics
- Temporal analysis of game flow
- Player contributions and activity levels
- Tactical insights

## 1. Setup and Data Loading

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings

# Silence library warnings so they do not clutter the notebook output.
# NOTE(review): 'ignore' with no category hides every warning, including
# deprecation notices - consider narrowing the filter during development.
warnings.filterwarnings('ignore')

# Set visualization style: global matplotlib theme, seaborn colour cycle,
# and default figure size / font size used by every chart below.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

# The check mark was previously stored as mis-decoded bytes and printed as
# garbage; it is restored to the intended character here.
print("✓ Libraries imported successfully")

In [None]:
# Load the dataset
# The encoding is pinned to UTF-8 (the encoding JSON interchange uses) so the
# file is decoded identically on every platform instead of depending on the
# locale's default encoding.
with open('19736.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Report how many top-level records were read and what container holds them.
print(f"Dataset loaded: {len(data):,} events")
print(f"Data structure: {type(data)}")

## 2. Data Exploration & Preprocessing

In [None]:
# Convert to DataFrame for easier analysis
df = pd.DataFrame(data)


def _nested_name(value):
    """Return ``value['name']`` for a dict, or ``'Unknown'`` otherwise.

    Non-dict cells (for example missing values) and dicts without a
    ``'name'`` key both map to ``'Unknown'``.
    """
    if isinstance(value, dict):
        return value.get('name', 'Unknown')
    return 'Unknown'


# Extract nested fields: each source column holds nested objects; copy their
# 'name' entry into a flat string column that is easy to group and count.
for source_col, target_col in (
    ('type', 'event_type'),
    ('team', 'team_name'),
    ('possession_team', 'possession_team_name'),
    ('play_pattern', 'play_pattern_name'),
):
    df[target_col] = df[source_col].apply(_nested_name)

print("Dataset Overview:")
print("="*60)
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() added a stray "None" line to the output.
df.info()
print("\n" + "="*60)
print("\nFirst few rows:")
# Last expression of the cell: rendered by the notebook as a table.
df.head()

In [None]:
# Basic statistics
# Headline figures for the loaded events: size, variety, teams, periods,
# minute span and mean event duration.
print("DATASET STATISTICS")
print("=" * 60)

minute_values = df['minute']
overview_lines = [
    f"Total Events: {len(df):,}",
    f"Unique Event Types: {df['event_type'].nunique()}",
    f"Teams: {df['team_name'].unique()}",
    f"Match Periods: {sorted(df['period'].unique())}",
    f"Time Range: {minute_values.min()}-{minute_values.max()} minutes",
    f"Average Duration per Event: {df['duration'].mean():.2f} seconds",
]
for overview_line in overview_lines:
    print(overview_line)

## 3. Event Type Analysis

In [None]:
# Event type distribution
# Frequency of every event type, most common first.
event_counts = df['event_type'].value_counts()
top15_events = event_counts.head(15)
top10_events = event_counts.head(10)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, pie_ax = axes
heading_style = {'fontsize': 14, 'fontweight': 'bold'}

# Left panel: horizontal bars for the 15 most frequent event types.
top15_events.plot(kind='barh', ax=bar_ax, color='steelblue')
bar_ax.set_title('Top 15 Event Types', **heading_style)
bar_ax.set_xlabel('Frequency', fontsize=12)
bar_ax.set_ylabel('Event Type', fontsize=12)
bar_ax.grid(axis='x', alpha=0.3)

# Right panel: share of each of the 10 most frequent event types.
colors = plt.cm.Set3(range(10))
top10_events.plot(
    kind='pie',
    ax=pie_ax,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors,
)
pie_ax.set_title('Top 10 Event Types (% Distribution)', **heading_style)
pie_ax.set_ylabel('')  # drop the automatic series-name label beside the pie

plt.tight_layout()
plt.savefig('event_type_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nEvent Type Summary:")
print(top10_events)

## 4. Team Performance Analysis

In [None]:
# Team event distribution
# Rows are teams, columns are event types, cells are occurrence counts
# (0 where a team never produced that event type).
team_events = (
    df.groupby(['team_name', 'event_type'])
    .size()
    .unstack(fill_value=0)
)

# Key performance indicators: keep only those that actually occur in the
# data, so the column selection below cannot fail on a missing label.
key_events = ['Pass', 'Shot', 'Duel', 'Interception', 'Clearance', 'Foul Committed']
present_columns = team_events.columns
available_events = [kpi for kpi in key_events if kpi in present_columns]

if available_events:
    fig, ax = plt.subplots(figsize=(14, 6))
    kpi_table = team_events[available_events]
    kpi_table.plot(kind='bar', ax=ax, width=0.8)
    ax.set_title('Team Performance Comparison - Key Events',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('Team', fontsize=12)
    ax.set_ylabel('Event Count', fontsize=12)
    # Anchor the legend outside the axes so it does not cover the bars.
    ax.legend(title='Event Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(axis='y', alpha=0.3)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('team_performance_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

# Overall team statistics: each team's share of all recorded events.
print("\nTeam Event Statistics:")
print("=" * 60)
team_totals = df['team_name'].value_counts()
all_event_total = len(df)
for team, count in team_totals.items():
    print(f"{team}: {count:,} events ({(count / all_event_total) * 100:.1f}%)")

## 5. Temporal Analysis

In [None]:
# Events over time
# One row per match minute with the number of events recorded in it.
time_df = df.groupby('minute').size().reset_index(name='event_count')
minute_axis = time_df['minute']
events_per_minute = time_df['event_count']

fig, axes = plt.subplots(2, 1, figsize=(14, 10))
intensity_ax, period_ax = axes

# Top panel: event intensity as a line with a shaded area underneath.
intensity_ax.plot(
    minute_axis,
    events_per_minute,
    marker='o',
    linewidth=2,
    markersize=4,
    color='darkblue',
    alpha=0.7,
)
intensity_ax.fill_between(minute_axis, events_per_minute, alpha=0.3)
intensity_ax.set_title('Match Event Intensity Over Time', fontsize=14, fontweight='bold')
intensity_ax.set_xlabel('Minute', fontsize=12)
intensity_ax.set_ylabel('Number of Events', fontsize=12)
intensity_ax.grid(True, alpha=0.3)
# Dashed marker at minute 45, labelled as half-time in the legend.
intensity_ax.axvline(x=45, color='red', linestyle='--', alpha=0.5, label='Half-time')
intensity_ax.legend()

# Bottom panel: total events per period, ordered by period number.
period_counts = df['period'].value_counts().sort_index()
period_palette = ['#3498db', '#e74c3c', '#2ecc71']
period_ax.bar(
    period_counts.index,
    period_counts.values,
    color=period_palette[:len(period_counts)],
)
period_ax.set_title('Events by Match Period', fontsize=14, fontweight='bold')
period_ax.set_xlabel('Period', fontsize=12)
period_ax.set_ylabel('Event Count', fontsize=12)
period_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('temporal_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nPeriod Statistics:")
print(period_counts)

## 6. Pass Analysis

In [None]:
# Detailed pass analysis
# Work on an independent copy so the columns added below do not touch df.
passes = df[df['event_type'] == 'Pass'].copy()


def _pass_length(details):
    """Return the 'length' entry of a pass payload, or NaN if unavailable."""
    if isinstance(details, dict):
        return details.get('length', np.nan)
    return np.nan


def _pass_outcome(details):
    """Return the outcome name of a pass payload.

    A payload that is not a dict, has no 'outcome' entry, or whose outcome
    has no 'name' is labelled 'Complete'.
    """
    if not isinstance(details, dict):
        return 'Complete'
    return details.get('outcome', {}).get('name', 'Complete')


if len(passes):
    print(f"Total Passes: {len(passes):,}")

    # Extract pass outcomes if available
    if 'pass' in passes.columns:
        passes['pass_length'] = passes['pass'].apply(_pass_length)
        passes['pass_outcome'] = passes['pass'].apply(_pass_outcome)

        fig, axes = plt.subplots(1, 2, figsize=(16, 6))
        length_ax, outcome_ax = axes

        # Histogram of pass lengths with the mean marked.
        # NOTE(review): the labels say meters, but nothing in this notebook
        # establishes the unit of 'length' - confirm against the data spec.
        valid_lengths = passes['pass_length'].dropna()
        if len(valid_lengths):
            mean_length = valid_lengths.mean()
            length_ax.hist(valid_lengths, bins=30, color='skyblue',
                           edgecolor='black', alpha=0.7)
            length_ax.axvline(mean_length, color='red', linestyle='--',
                              linewidth=2, label=f'Mean: {mean_length:.1f}m')
            length_ax.set_title('Pass Length Distribution', fontsize=14, fontweight='bold')
            length_ax.set_xlabel('Pass Length (meters)', fontsize=12)
            length_ax.set_ylabel('Frequency', fontsize=12)
            length_ax.legend()
            length_ax.grid(axis='y', alpha=0.3)

        # Pie chart of pass outcomes.
        outcome_counts = passes['pass_outcome'].value_counts()
        outcome_colors = plt.cm.Pastel1(range(len(outcome_counts)))
        outcome_ax.pie(
            outcome_counts.values,
            labels=outcome_counts.index,
            autopct='%1.1f%%',
            startangle=90,
            colors=outcome_colors,
        )
        outcome_ax.set_title('Pass Outcomes', fontsize=14, fontweight='bold')

        plt.tight_layout()
        plt.savefig('pass_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

    # Passes by team, with each team's share of all passes.
    team_passes = passes['team_name'].value_counts()
    pass_total = len(passes)
    print("\nPasses by Team:")
    for team, count in team_passes.items():
        print(f"{team}: {count:,} passes ({(count / pass_total) * 100:.1f}%)")

## 7. Possession Analysis

In [None]:
# Possession statistics
# Number of events recorded while each team is listed as the possession team.
possession_stats = df['possession_team_name'].value_counts()
slice_count = len(possession_stats)

fig, ax = plt.subplots(figsize=(10, 8))

colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']
pie_options = {
    'labels': possession_stats.index,
    'autopct': '%1.1f%%',
    'startangle': 90,
    'colors': colors[:slice_count],
    'explode': [0.05] * slice_count,  # pull every wedge slightly outwards
    'shadow': True,
}
wedges, texts, autotexts = ax.pie(possession_stats.values, **pie_options)

# Make the percentage labels inside the wedges stand out.
for pct_label in autotexts:
    pct_label.set_color('white')
    pct_label.set_fontweight('bold')
    pct_label.set_fontsize(11)

ax.set_title('Possession Distribution', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('possession_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Text version of the chart: share of all events per possession team.
print("\nPossession Statistics:")
print("=" * 60)
event_total = len(df)
for team, count in possession_stats.items():
    print(f"{team}: {count:,} events ({(count / event_total) * 100:.1f}%)")

## 8. Play Pattern Analysis

In [None]:
# Play patterns
# Frequency of each play pattern label, most common first.
play_patterns = df['play_pattern_name'].value_counts()
pattern_frequencies = play_patterns.values

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.barh(play_patterns.index, pattern_frequencies, color='coral')

# Write each count just past the end of its bar; bar k sits at y == k.
for row_pos, frequency in enumerate(pattern_frequencies):
    ax.text(frequency, row_pos, f' {frequency:,}', va='center', fontweight='bold')

ax.set_title('Play Patterns Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Frequency', fontsize=12)
ax.set_ylabel('Play Pattern', fontsize=12)
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('play_patterns.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nPlay Pattern Summary:")
print(play_patterns)

## 9. Player Analysis

In [None]:
# Extract player information from events
# Walk the three needed columns in lockstep rather than calling
# df.iterrows(), which builds a full Series for every row. Only events whose
# 'player' cell is a dict contribute a record; a dataset without a 'player'
# column yields no records at all.
if 'player' in df.columns:
    player_events = [
        {
            'player_name': player.get('name', 'Unknown'),
            'event_type': event_type,
            'team': team,
        }
        for player, event_type, team in zip(
            df['player'], df['event_type'], df['team_name']
        )
        if isinstance(player, dict)
    ]
else:
    player_events = []

if player_events:
    player_df = pd.DataFrame(player_events)

    # Top players by event count (descending).
    top_players = player_df['player_name'].value_counts().head(15)

    fig, ax = plt.subplots(figsize=(12, 8))
    # Bars are placed at integer positions; names are attached as tick labels.
    bars = ax.barh(range(len(top_players)), top_players.values, color='mediumpurple')
    ax.set_yticks(range(len(top_players)))
    ax.set_yticklabels(top_players.index)
    ax.set_xlabel('Number of Events', fontsize=12)
    ax.set_ylabel('Player', fontsize=12)
    ax.set_title('Top 15 Most Active Players', fontsize=14, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

    # Add value labels at the end of each bar.
    for i, value in enumerate(top_players.values):
        ax.text(value, i, f' {value}', va='center', fontweight='bold')

    plt.tight_layout()
    plt.savefig('top_players.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\nTop 10 Most Active Players:")
    print("="*60)
    print(top_players.head(10))
else:
    print("No player information available in the dataset.")

## 10. Heatmap: Events by Period and Type

In [None]:
# Create heatmap of events by period and type
# Limit the chart to the ten most frequent event types for readability.
top_event_types = df['event_type'].value_counts().head(10).index

# Rows are periods, columns are event types, cells are counts (0 if absent);
# only the top event-type columns are kept.
counts_by_period = (
    df.groupby(['period', 'event_type'])
    .size()
    .unstack(fill_value=0)
)
period_event_matrix = counts_by_period[top_event_types]

fig, ax = plt.subplots(figsize=(14, 6))
# Transposed so event types run down the y-axis and periods along the x-axis.
sns.heatmap(
    period_event_matrix.T,
    annot=True,
    fmt='d',
    cmap='YlOrRd',
    cbar_kws={'label': 'Event Count'},
    ax=ax,
    linewidths=0.5,
)
ax.set_title('Event Distribution: Period vs Event Type',
             fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Period', fontsize=12)
ax.set_ylabel('Event Type', fontsize=12)

plt.tight_layout()
plt.savefig('period_event_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

## 11. Summary Statistics & Key Insights

In [None]:
# Text summary of the whole analysis. The section-header emoji were stored as
# mis-decoded bytes and printed as garbage; they are restored here.
# This cell reads `passes`, which is created by the pass-analysis cell.
print("="*80)
print("MATCH ANALYSIS SUMMARY")
print("="*80)

print("\n📊 GENERAL STATISTICS")
print("-" * 80)
print(f"Total Events Recorded: {len(df):,}")
print(f"Unique Event Types: {df['event_type'].nunique()}")
# Reported as the largest value in the 'minute' column.
print(f"Match Duration: {df['minute'].max()} minutes")
print(f"Number of Periods: {df['period'].nunique()}")

print("\n⚽ TOP 5 EVENT TYPES")
print("-" * 80)
for i, (event, count) in enumerate(df['event_type'].value_counts().head(5).items(), 1):
    pct = (count / len(df)) * 100
    print(f"{i}. {event}: {count:,} ({pct:.1f}%)")

print("\n👥 TEAM STATISTICS")
print("-" * 80)
for team, count in df['team_name'].value_counts().items():
    pct = (count / len(df)) * 100
    print(f"{team}: {count:,} events ({pct:.1f}%)")

# The passing section is printed only when pass lengths were extracted
# earlier and at least one of them is not missing.
# NOTE(review): lengths are labelled as meters; the unit is not established
# anywhere in this notebook - confirm against the data specification.
if len(passes) > 0 and 'pass_length' in passes.columns:
    valid_pass_lengths = passes['pass_length'].dropna()
    if len(valid_pass_lengths) > 0:
        print("\n📍 PASSING STATISTICS")
        print("-" * 80)
        print(f"Total Passes: {len(passes):,}")
        print(f"Average Pass Length: {valid_pass_lengths.mean():.2f} meters")
        print(f"Longest Pass: {valid_pass_lengths.max():.2f} meters")
        print(f"Shortest Pass: {valid_pass_lengths.min():.2f} meters")

print("\n⏱️ TEMPORAL INSIGHTS")
print("-" * 80)
for period in sorted(df['period'].unique()):
    period_data = df[df['period'] == period]
    print(f"Period {period}: {len(period_data):,} events")

print("\n" + "="*80)
print("Analysis Complete! All visualizations saved.")
print("="*80)

## 12. Export Processed Data

In [None]:
# Export key dataframes for further analysis
# Flat, analysis-ready subset: raw timing columns plus the name columns
# derived earlier in the notebook.
df_export = df[['index', 'period', 'minute', 'second', 'event_type',
                'team_name', 'possession_team_name', 'play_pattern_name', 'duration']].copy()

df_export.to_csv('processed_match_data.csv', index=False)
# The check mark was stored as mis-decoded bytes; restored to the intended
# character.
print("✓ Processed data exported to 'processed_match_data.csv'")

# Export summary statistics: overall size plus counts per event type, team
# and period.
summary_stats = {
    'total_events': len(df),
    'event_types': df['event_type'].value_counts().to_dict(),
    'team_distribution': df['team_name'].value_counts().to_dict(),
    'period_distribution': df['period'].value_counts().to_dict()
}

# Pin the output encoding so the file is written the same way on every
# platform instead of using the locale default.
with open('summary_statistics.json', 'w', encoding='utf-8') as f:
    json.dump(summary_stats, f, indent=2)

print("✓ Summary statistics exported to 'summary_statistics.json'")

---

## Conclusion

This analysis provides comprehensive insights into the match event data, including:

- **Event Distribution**: Clear patterns in event types with passes being the most common
- **Team Performance**: Comparative analysis of team activities and contributions
- **Temporal Patterns**: How events unfold across different periods of the match
- **Player Contributions**: Individual player activity and impact
- **Tactical Insights**: Play patterns and possession dynamics

All visualizations have been exported as high-resolution PNG files for presentation use.

---

**For questions or further analysis, please refer to the README.md file.**