# Football Event Data Analysis

This notebook provides a comprehensive analysis of football event data from 15946.json, including match statistics, team performance metrics, player contributions, and visual insights.

## Section 1: Load and Explore the Dataset

In [1]:
import json

# Load the football events data.
# NOTE(review): this is an absolute, machine-specific path; point EVENTS_PATH at
# your local copy of 15946.json before running the notebook elsewhere.
EVENTS_PATH = '/home/sohamkc/something/15946.json'

# Decode explicitly as UTF-8 instead of relying on the platform's locale default,
# which can garble accented team/player names on non-UTF-8 systems.
with open(EVENTS_PATH, 'r', encoding='utf-8') as f:
    events_data = json.load(f)

print(f"âœ“ Total events loaded: {len(events_data)}")
# Guard the sample lookup so an empty file reports cleanly instead of raising IndexError.
if events_data:
    print(f"âœ“ Sample event keys: {list(events_data[0].keys())}")
else:
    print("No events found in the file; nothing to sample.")

âœ“ Total events loaded: 3762
âœ“ Sample event keys: ['id', 'index', 'period', 'timestamp', 'minute', 'second', 'type', 'possession', 'possession_team', 'play_pattern', 'team', 'duration', 'tactics']


In [2]:
# Single pass over the raw event dicts (plain Python, no pandas) collecting:
#   event_types -> occurrences of each event-type name
#   teams       -> number of events attributed to each team name
#   players     -> set of distinct player names seen
event_types = {}
teams = {}
players = set()

for record in events_data:
    # 'type' and 'team' are tallied the same way, so share one code path.
    # A field is only counted when it is present and is a dict.
    for field, tally in (('type', event_types), ('team', teams)):
        if field in record and isinstance(record[field], dict):
            label = record[field].get('name', 'Unknown')
            tally[label] = tally.get(label, 0) + 1

    # Players: remember the name, ignoring entries that fall back to 'Unknown'.
    if 'player' in record and isinstance(record['player'], dict):
        who = record['player'].get('name', 'Unknown')
        if who != 'Unknown':
            players.add(who)

# Teams, busiest first, with each team's share of all loaded events.
print("ðŸ“Š TEAMS IN THE DATASET:")
teams_by_volume = sorted(teams.items(), key=lambda kv: kv[1], reverse=True)
for team_label, n in teams_by_volume:
    print(f"  {team_label}: {n} events ({n/len(events_data)*100:.1f}%)")

# Event types: how many distinct ones, then the ten most frequent.
print(f"\nðŸ“Š UNIQUE EVENT TYPES: {len(event_types)}")
print("\nTop 10 Event Types:")
types_by_volume = sorted(event_types.items(), key=lambda kv: kv[1], reverse=True)
for type_label, n in types_by_volume[:10]:
    print(f"  {type_label}: {n}")

print(f"\nðŸ‘¥ TOTAL PLAYERS: {len(players)}")

ðŸ“Š TEAMS IN THE DATASET:
  Barcelona: 2788 events (74.1%)
  Deportivo Alavés: 974 events (25.9%)

ðŸ“Š UNIQUE EVENT TYPES: 24

Top 10 Event Types:
  Pass: 1163
  Ball Receipt*: 1058
  Carry: 890
  Pressure: 212
  Ball Recovery: 89
  Duel: 53
  Clearance: 37
  Goal Keeper: 34
  Block: 32
  Shot: 28

ðŸ‘¥ TOTAL PLAYERS: 28


In [3]:
# Print a status message marking the end of the exploration section.
print("âœ“ Data exploration complete!")

âœ“ Data exploration complete!


## Section 2: Data Cleaning and Preprocessing

In [4]:
# Build one record of counters per team, then fill the counters in a single
# pass over the events.

# Event-type name -> counter it increments. Event types not listed here do not
# touch any counter.
STAT_KEY_BY_EVENT = {
    'Pass': 'passes',
    'Shot': 'shots',
    'Foul Committed': 'fouls',
    'Interception': 'interceptions',
    'Clearance': 'clearances',
}

# 'total' is copied from the per-team event count in `teams`.
team_stats = {
    side: {
        'passes': 0,
        'shots': 0,
        'fouls': 0,
        'interceptions': 0,
        'clearances': 0,
        'total': n_team_events,
    }
    for side, n_team_events in teams.items()
}

for record in events_data:
    # Only events carrying both a dict-valued 'team' and a dict-valued 'type' are counted.
    if not ('team' in record and isinstance(record['team'], dict)):
        continue
    if not ('type' in record and isinstance(record['type'], dict)):
        continue

    side = record['team'].get('name', 'Unknown')
    stat_key = STAT_KEY_BY_EVENT.get(record['type'].get('name', 'Unknown'))
    if side in team_stats and stat_key is not None:
        team_stats[side][stat_key] += 1

print("âœ“ Data cleaning complete!")

âœ“ Data cleaning complete!


## Section 3: Match Statistics Analysis

In [5]:
# Text histogram of the 15 most frequent event types.
rule = "=" * 60
print(rule)
print("EVENT TYPE DISTRIBUTION (Top 15)")
print(rule)

sorted_events = sorted(event_types.items(), key=lambda kv: kv[1], reverse=True)[:15]
for rank, (label, n) in enumerate(sorted_events, start=1):
    # Share of all loaded events, in percent.
    share = (n / len(events_data)) * 100
    # One bar glyph per two percentage points, truncated toward zero.
    blocks = "â–ˆ" * int(share / 2)
    print(f"{rank:2d}. {label:20s} {n:4d} ({share:5.1f}%) {blocks}")

EVENT TYPE DISTRIBUTION (Top 15)
 1. Pass                 1163 ( 30.9%) â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ
 2. Ball Receipt*        1058 ( 28.1%) â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ
 3. Carry                 890 ( 23.7%) â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ
 4. Pressure              212 (  5.6%) â–ˆâ–ˆ
 5. Ball Recovery          89 (  2.4%) â–ˆ
 6. Duel                   53 (  1.4%) 
 7. Clearance              37 (  1.0%) 
 8. Goal Keeper            34 (  0.9%) 
 9. Block                  32 (  0.9%) 
10. Shot                   28 (  0.7%) 
11. Interception           24 (  0.6%) 
12. Dribble                24 (  0.6%) 
13. Foul Committed         23 (  0.6%) 
14. Dispossessed           21 (  0.6%) 
15. Foul Won               21 (  0.6%) 


In [6]:
# Per-team report of the counters accumulated in team_stats.
rule = "=" * 60
print("\n" + rule)
print("TEAM STATISTICS")
print(rule)

for team_label, figures in team_stats.items():
    # Assemble the report for one team, then emit it line by line.
    report = [
        f"\n{team_label}:",
        f"  Total Events: {figures['total']}",
        f"  Passes: {figures['passes']}",
        f"  Shots: {figures['shots']}",
        f"  Fouls: {figures['fouls']}",
        f"  Interceptions: {figures['interceptions']}",
        f"  Clearances: {figures['clearances']}",
    ]
    print("\n".join(report))


TEAM STATISTICS

Barcelona:
  Total Events: 2788
  Passes: 917
  Shots: 25
  Fouls: 7
  Interceptions: 6
  Clearances: 5

Deportivo Alavés:
  Total Events: 974
  Passes: 246
  Shots: 3
  Fouls: 16
  Interceptions: 18
  Clearances: 32


In [7]:
# Possession here is event-based: each team's share of the events that are
# tagged with it as the possession team.
possession_count = {}
for record in events_data:
    # Skip events without a dict-valued 'possession_team'.
    if 'possession_team' not in record or not isinstance(record['possession_team'], dict):
        continue
    holder = record['possession_team'].get('name', 'Unknown')
    possession_count[holder] = possession_count.get(holder, 0) + 1

rule = "=" * 60
print("\n" + rule)
print("POSSESSION ANALYSIS")
print(rule)

# Denominator: every event that carried a possession team.
total_events = sum(possession_count.values())
ranked_holders = sorted(possession_count.items(), key=lambda kv: kv[1], reverse=True)
for holder, n in ranked_holders:
    share = (n / total_events) * 100
    print(f"\n{holder}:")
    print(f"  Possession Events: {n}")
    print(f"  Possession %: {share:.1f}%")


POSSESSION ANALYSIS

Barcelona:
  Possession Events: 3118
  Possession %: 82.9%

Deportivo Alavés:
  Possession Events: 644
  Possession %: 17.1%


## Section 4: Team Performance Metrics

In [8]:
rule = "=" * 60
print("\n" + rule)
print("TOP PLAYERS BY EVENT CONTRIBUTION")
print(rule)

# Tally how many events each named player is attached to.
player_events = {}
for record in events_data:
    if 'player' not in record or not isinstance(record['player'], dict):
        continue
    who = record['player'].get('name', 'Unknown')
    if who == 'Unknown':
        continue  # events without a usable player name are not attributed
    player_events[who] = player_events.get(who, 0) + 1

# Rank by event count, highest first, and keep the top 15.
sorted_players = sorted(player_events.items(), key=lambda kv: kv[1], reverse=True)[:15]
print("\nTop 15 Players:")
for rank, (who, n) in enumerate(sorted_players, start=1):
    print(f"  {rank:2d}. {who:40s} - {n:3d} events")


TOP PLAYERS BY EVENT CONTRIBUTION

Top 15 Players:
   1. Ivan Rakitić                             - 408 events
   2. Jordi Alba Ramos                         - 355 events
   3. Lionel Andrés Messi Cuccittini           - 304 events
   4. Sergi Roberto Carnicer                   - 280 events
   5. Sergio Busquets i Burgos                 - 251 events
   6. Ousmane Dembélé                          - 228 events
   7. Gerard Piqué Bernabéu                    - 215 events
   8. Samuel Yves Umtiti                       - 180 events
   9. Philippe Coutinho Correia                - 169 events
  10. Luis Alberto Suárez Díaz                 - 141 events
  11. Ibai Gómez Pérez                         - 110 events
  12. Nélson Cabral Semedo                     - 105 events
  13. Manuel Alejandro García Sánchez          - 103 events
  14. Mubarak Wakaso                           -  97 events
  15. Jonathan Rodríguez Menéndez              -  92 events


In [9]:
import matplotlib.pyplot as plt

# Side-by-side bar charts comparing the teams on passes and shots.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

teams_list = list(team_stats.keys())
passes = [team_stats[t]['passes'] for t in teams_list]
shots = [team_stats[t]['shots'] for t in teams_list]

# Each panel: (axis, values, y-axis label, title). Both panels share the same
# styling, so they are drawn by one loop.
panels = (
    (axes[0], passes, 'Number of Passes', 'Passes by Team'),
    (axes[1], shots, 'Number of Shots', 'Shots by Team'),
)
for ax, values, y_label, title in panels:
    ax.bar(teams_list, values, color=['#FF6B6B', '#4ECDC4'])
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

print("âœ“ Charts displayed!")

: 

## Section 5: Summary Statistics

In [None]:
# Headline totals for the whole dataset, framed by banner rules.
rule = "=" * 60
summary_lines = [
    "\n" + rule,
    "SUMMARY STATISTICS",
    rule,
    f"\nTotal Events: {len(events_data)}",
    f"Total Teams: {len(teams)}",
    f"Total Players: {len(players)}",
    f"Total Unique Event Types: {len(event_types)}",
    "\n" + rule,
    "âœ“ ANALYSIS COMPLETE!",
    rule,
]
for line in summary_lines:
    print(line)

## Section 6: Advanced Insights and Visualizations

In [None]:
# Timeline analysis: count how many events fall in each match minute.
timeline = {}
for event in events_data:
    # Events with no 'minute' field are bucketed under minute 0.
    minute = event.get('minute', 0)
    timeline[minute] = timeline.get(minute, 0) + 1

print("Match events per minute (sample):")
# Sort first, then slice, so the sample is always the ten earliest minutes.
# Slicing the keys before sorting would instead pick whichever ten minutes
# happened to be inserted into the dict first.
for minute in sorted(timeline)[:10]:
    print(f"  Minute {minute}: {timeline[minute]} events")

In [None]:
print("\n" + "="*60)
print("KEY INSIGHTS")
print("="*60)

# Build the matchup title from the teams found in the data rather than
# hard-coding the two names, so the cell works for any match file.
matchup = " vs ".join(str(name) for name in teams)
print(f"\n{matchup}:")

# Possession is measured from 'possession_team' (the side in possession when the
# event happened), not from 'team' (the side performing the event). Using the
# acting team's event share here would disagree with the Possession Analysis
# figures reported earlier in the notebook.
possession_events = {}
for event in events_data:
    if 'possession_team' in event and isinstance(event['possession_team'], dict):
        holder = event['possession_team'].get('name', 'Unknown')
        possession_events[holder] = possession_events.get(holder, 0) + 1
possession_total = sum(possession_events.values())

for team in teams.keys():
    stats = team_stats[team]
    # Guard both ratios against a zero denominator.
    if possession_total:
        possession_pct = possession_events.get(team, 0) / possession_total * 100
    else:
        possession_pct = 0.0
    if stats['total']:
        pass_ratio = stats['passes'] / stats['total'] * 100
    else:
        pass_ratio = 0.0

    print(f"\n{team}:")
    print(f"  Possession: {possession_pct:.1f}%")
    print(f"  Pass/Event Ratio: {pass_ratio:.1f}%")
    # NOTE(review): this line reports raw shot and pass counts, not a success
    # rate; the "Shot Success" label is kept as-is but may deserve renaming.
    print(f"  Shot Success: {stats['shots']} shots from {stats['passes']} passes")