In [None]:
''' Football Player Performance - Exploratory Data Analysis (EDA)

This notebook provides comprehensive exploratory data analysis of football datasets across multiple leagues and teams.

## Dataset Structure:
- **8 Leagues**: EPL, La Liga, Bundesliga, Serie A, Ligue 1, Eredivisie (folder name in the dataset: `Eredivise`), Segunda Division, Brasil Serie A
- **Teams**: Multiple teams per league
- **CSV Files per Team**: 12 different performance aspects

## Files per Team:
1. `players.csv` - Player information
2. `matches.csv` - Match data
3. `playing_time.csv` - Playing time statistics
4. `shooting.csv` - Shooting statistics
5. `passing.csv` - Passing statistics
6. `defensive_actions.csv` - Defensive performance
7. `possession.csv` - Possession statistics
8. `goalkeepers.csv` - Goalkeeper data
9. `advanced_goalkeeping.csv` - Advanced goalkeeper stats
10. `g_e_s_creation.csv` - Goal, Expected Goals, Shot creation
11. `pass_types.csv` - Different types of passes
12. `miscellaneous_stats.csv` - Other statistics
'''

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import warnings
# Install a blanket "ignore" filter so no warning category is shown
warnings.filterwarnings('ignore')

# Plot appearance: seaborn-flavoured matplotlib style sheet, "husl" colour
# palette, and a 12x8 default figure size
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})


In [None]:
## 1. Dataset Overview and Structure Analysis


In [1]:
# Dataset root folder and the league sub-folders expected inside it
base_path = "archive (3)"
leagues = [
    "EPL",
    "La Liga",
    "Bundesliga",
    "Serie A",
    "Ligue 1",
    "Eredivise",
    "Segunda Division",
    "Brasil Serie A",
]

# Emit the overview banner with a single print call, one item per line
print(
    "=== FOOTBALL PERFORMANCE DATASET OVERVIEW ===",
    f"Base Path: {base_path}",
    f"Number of Leagues: {len(leagues)}",
    f"Leagues: {', '.join(leagues)}",
    sep="\n",
)


=== FOOTBALL PERFORMANCE DATASET OVERVIEW ===
Base Path: archive (3)
Number of Leagues: 8
Leagues: EPL, La Liga, Bundesliga, Serie A, Ligue 1, Eredivise, Segunda Division, Brasil Serie A


In [3]:
# Analyze league and team structure
def analyze_league_structure(base_dir=None, league_names=None):
    """Count and list the team folders found under each league directory.

    Prints a short per-league report (team count plus the first five team
    names) followed by the overall team total.

    Parameters
    ----------
    base_dir : str or path-like, optional
        Dataset root containing one sub-directory per league. Defaults to
        the notebook-level ``base_path``.
    league_names : iterable of str, optional
        League folder names to inspect. Defaults to the notebook-level
        ``leagues`` list.

    Returns
    -------
    dict
        ``{league: {'num_teams': int, 'teams': list[str]}}``. A league whose
        directory is missing is reported and recorded with zero teams.
    """
    # Fall back to the notebook globals only when the caller gave nothing,
    # so existing zero-argument calls keep working.
    if base_dir is None:
        base_dir = base_path
    if league_names is None:
        league_names = leagues

    league_info = {}
    total_teams = 0

    for league in league_names:
        league_path = os.path.join(base_dir, league)

        # isdir rather than exists: a stray *file* carrying a league's name
        # would otherwise reach os.listdir and raise NotADirectoryError.
        if not os.path.isdir(league_path):
            print(f"\n{league}: Directory not found")
            league_info[league] = {'num_teams': 0, 'teams': []}
            continue

        # os.listdir returns entries in arbitrary order; sort
        # case-insensitively (exact name as tie-breaker) so the preview and
        # the returned list are reproducible across platforms.
        team_folders = sorted(
            (
                entry for entry in os.listdir(league_path)
                if os.path.isdir(os.path.join(league_path, entry))
            ),
            key=lambda name: (name.casefold(), name),
        )

        league_info[league] = {
            'num_teams': len(team_folders),
            'teams': team_folders,
        }
        total_teams += len(team_folders)

        # Show at most five names, with an ellipsis when more exist
        preview = ', '.join(team_folders[:5])
        suffix = '...' if len(team_folders) > 5 else ''
        print(f"\n{league}:")
        print(f"  Teams: {len(team_folders)}")
        print(f"  Team names: {preview}{suffix}")

    print("\n=== SUMMARY ===")
    print(f"Total Teams Across All Leagues: {total_teams}")

    return league_info

# Run the scan; the returned dict maps each league name to its team count
# and the list of team folder names.
league_structure = analyze_league_structure()



EPL:
  Teams: 21
  Team names: Arsenal, Aston Villa, Bournemouth, Brentford, Brighton and Hove Albion...

La Liga:
  Teams: 20
  Team names: Alaves, Almeria, Athletic Club, Atletico Madrid, Barcelona...

Bundesliga:
  Teams: 18
  Team names: Augsburg, Bayer Leverkusen, Bayern Munich, Bochum, Darmstadt 98...

Serie A:
  Teams: 20
  Team names: Atalanta, Bologna, Cagliari, Empoli, Fiorentina...

Ligue 1:
  Teams: 18
  Team names: Brest, Clermont Foot, Le Havre, Lens, Lille...

Eredivise:
  Teams: 18
  Team names: Ajax, Almere City, AZ Alkmaar, Excelsior, Feyenoord...

Segunda Division:
  Teams: 22
  Team names: Albacete, Alcorcon, Amorebieta, Burgos, Cartagena...

Brasil Serie A:
  Teams: 20
  Team names: Athletico Paranaense, Atletico Goianiense, Atletico Mineiro, Bahia, Botafogo RJ...

=== SUMMARY ===
Total Teams Across All Leagues: 157


In [4]:
# Analyze file types available across teams
def analyze_file_structure(base_dir=None, league_names=None, expected_files=None):
    """Count how many team folders contain each expected CSV file.

    Prints one line per expected file with its count and percentage across
    all team folders that were checked.

    Parameters
    ----------
    base_dir : str or path-like, optional
        Dataset root containing one sub-directory per league. Defaults to
        the notebook-level ``base_path``.
    league_names : iterable of str, optional
        League folder names to inspect. Defaults to the notebook-level
        ``leagues`` list. Leagues whose directory is missing are skipped.
    expected_files : iterable of str, optional
        File names to look for inside every team folder. Defaults to the
        twelve per-team CSV files of this dataset.

    Returns
    -------
    tuple
        ``(file_availability, total_teams_checked)`` where
        ``file_availability`` maps each expected file name to the number of
        team folders containing it.
    """
    # Fall back to the notebook globals only when the caller gave nothing,
    # so existing zero-argument calls keep working.
    if base_dir is None:
        base_dir = base_path
    if league_names is None:
        league_names = leagues
    if expected_files is None:
        expected_files = [
            "players.csv", "matches.csv", "playing_time.csv", "shooting.csv",
            "passing.csv", "defensive_actions.csv", "possession.csv",
            "goalkeepers.csv", "advanced_goalkeeping.csv", "g_e_s_creation.csv",
            "pass_types.csv", "miscellaneous_stats.csv",
        ]
    else:
        # Materialise once: the names are iterated for every team folder
        expected_files = list(expected_files)

    file_availability = {file_name: 0 for file_name in expected_files}
    total_teams_checked = 0

    for league in league_names:
        league_path = os.path.join(base_dir, league)
        # isdir rather than exists: a stray file with a league's name would
        # otherwise make os.listdir raise NotADirectoryError.
        if not os.path.isdir(league_path):
            continue

        for team in os.listdir(league_path):
            team_path = os.path.join(league_path, team)
            if not os.path.isdir(team_path):
                continue
            total_teams_checked += 1

            for file_name in expected_files:
                # isfile rather than exists: a directory that happens to be
                # named like a CSV is not an available data file.
                if os.path.isfile(os.path.join(team_path, file_name)):
                    file_availability[file_name] += 1

    print("=== FILE AVAILABILITY ANALYSIS ===")
    print(f"Total teams checked: {total_teams_checked}")
    print("\nFile availability across all teams:")

    for file_name, count in file_availability.items():
        # Guard the division so an empty dataset reports 0% instead of raising
        percentage = (count / total_teams_checked) * 100 if total_teams_checked > 0 else 0
        print(f"  {file_name:<25}: {count:3d}/{total_teams_checked} ({percentage:5.1f}%)")

    return file_availability, total_teams_checked

# Run the availability scan; keep the per-file counts and the number of
# team folders that were checked.
file_stats, total_teams = analyze_file_structure()


=== FILE AVAILABILITY ANALYSIS ===
Total teams checked: 157

File availability across all teams:
  players.csv              : 157/157 (100.0%)
  matches.csv              : 157/157 (100.0%)
  playing_time.csv         : 157/157 (100.0%)
  shooting.csv             : 157/157 (100.0%)
  passing.csv              : 157/157 (100.0%)
  defensive_actions.csv    : 157/157 (100.0%)
  possession.csv           : 157/157 (100.0%)
  goalkeepers.csv          : 157/157 (100.0%)
  advanced_goalkeeping.csv : 157/157 (100.0%)
  g_e_s_creation.csv       : 157/157 (100.0%)
  pass_types.csv           : 157/157 (100.0%)
  miscellaneous_stats.csv  : 157/157 (100.0%)
