In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Show the full path of every file found under the Kaggle input folder.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Root folder of the dataset; the read_csv calls in this notebook join file names onto it.
PATH = "/kaggle/input/champions-league-era-stats"

# The country with the most clubs to participate in the Champions League
Let's analyze the AllTimeRankingByClub file, which is the more detailed one. The AllTimeRankingByCountry file is the summary: it drops the club names and just adds up the numbers per country.

You can use this dictionary if you'd like to convert between the country code and country name:

In [None]:
# Lookup table from country code (e.g. "ESP") to the country's English name.
# Spelling of Slovenia, Sweden and Belgium corrected.
country_dict = {
    "ESP": "Spain", "ENG": "England", "ITA": "Italy", "GER": "Germany", "POR": "Portugal",
    "FRA": "France", "NED": "Netherlands", "POL": "Poland", "ROU": "Romania", "RUS": "Russia",
    "AUT": "Austria", "SCO": "Scotland", "SRB": "Serbia", "SUI": "Switzerland", "SVK": "Slovakia",
    "SVN": "Slovenia", "TUR": "Turkey", "NOR": "Norway", "ISR": "Israel", "KAZ": "Kazakhstan",
    "AZE": "Azerbaijan", "HUN": "Hungary", "GRE": "Greece", "FIN": "Finland", "DEN": "Denmark",
    "SWE": "Sweden", "CZE": "Czech Republic", "CYP": "Cyprus", "CRO": "Croatia", "BUL": "Bulgaria",
    "BLR": "Belarus", "BEL": "Belgium", "UKR": "Ukraine",
}

In [None]:
# Load the per-club all-time ranking CSV (read with the UTF-16 codec) and
# show which columns it provides.
club_ranking_csv = os.path.join(PATH, "AllTimeRankingByClub.csv")
AllTimeClub = pd.read_csv(club_ranking_csv, encoding='utf-16')
AllTimeClub.columns

In [None]:
# Count the non-null "Club" entries for each country and keep the 20 largest counts.
df = AllTimeClub.copy(deep=True)
clubs_per_country = (
    df.groupby("Country")["Club"]
    .count()
    .sort_values(ascending=False)
    .head(20)
)

# Draw on an explicit Axes kept in `ax`, so `df` stays a DataFrame instead of
# being rebound to the matplotlib Axes that .plot() returns.
fig, ax = plt.subplots(figsize=(16, 8))
clubs_per_country.plot(kind='barh', ax=ax)
ax.set_title("Top Countries with different clubs participating in competition")
ax.set_xlabel("Number of clubs")
plt.show()

With the domination of the Spanish clubs in the last decade, I expected them to be at the top, but the German clubs are a surprise to me. Even though both leagues have been dominated by a few clubs (2 in Spain and mostly 1 in Germany), it looks like, overall, the competition produces more balanced results for their clubs compared to other leagues, with different clubs being sent to Europe's top club competition.

# Top scoring clubs per game in the competition

In [None]:
# Work on a copy so the added "GPG" column does not end up in AllTimeClub.
df = AllTimeClub.copy(deep=True)
# Goals per game: the "Goals For" column divided by the "Played" column.
df["GPG"] = df["Goals For"] / df["Played"]
top_gpg = (
    df.groupby(["Club", "Country"])["GPG"]
    .max()
    .sort_values(ascending=False)
    .head(20)
)

# Draw on an explicit Axes kept in `ax`, so `df` stays a DataFrame instead of
# being rebound to the matplotlib Axes that .plot() returns.
fig, ax = plt.subplots(figsize=(16, 8))
top_gpg.plot(kind='barh', ax=ax)
ax.set_title("Clubs with highest Goals Per Game average in the competition")
ax.set_xlabel("Goals per game")
plt.show()

This list has more surprising results than the country list. PSG, Manchester City, Hoffenheim & Tottenham were not teams I was expecting to see so high in the list. Obviously, playing fewer games while scoring many goals in those few games skews the results. Let's confirm.

So let's look at those clubs and at how many games they have played and how many times they have participated in the competition.

In [None]:
# Recompute goals per game on a copy, so AllTimeClub keeps its original columns.
df = AllTimeClub.copy(deep=True)
df["GPG"] = df["Goals For"].div(df["Played"])

# Best GPG value per club name, largest first, limited to the first 20 clubs.
gpg_by_club = df.groupby("Club")["GPG"].max()
df = gpg_by_club.sort_values(ascending=False).head(20)

# Show the original ranking rows of exactly those clubs.
is_top_scoring_club = AllTimeClub["Club"].isin(df.index.tolist())
AllTimeClub.loc[is_top_scoring_club]

We can see that at least 3 teams have only played once in the competition, while 3 others have participated 5 times or less.  There are many ways to slice and dice this dataset so please enjoy and share some of the interesting findings that you come up with.

In [None]:
# Drop the scratch name `df` that the preceding cells kept reassigning.
del df

# Coach with the most appearances for a single club
We will analyze the CoachesAppearDetails file, which has the details broken down by each club that the coach represented. The CoachesAppearTotals file is just the sum of this data, without the club represented.

In [None]:
# Read the coach appearance details, using the CSV's "Unnamed: 0" column as
# the index, then display the table.
coach_details_csv = os.path.join(PATH, "CoachesAppearDetails.csv")
CoachAppear = pd.read_csv(coach_details_csv, index_col='Unnamed: 0')
CoachAppear

Let's see which coaches have the most appearances with a single club, and which club it was.

In [None]:
df = CoachAppear.copy(deep=True)
# Highest "Appearance" value for every (coach, club) pair, largest first;
# only the first 10 pairs are kept and displayed.
appearances_by_coach_club = df.groupby(["Coach", "Club"])["Appearance"].max()
df = appearances_by_coach_club.sort_values(ascending=False).head(10)
df

# Coach that represented the most clubs in the competition
And now let's look at which coach has coached the most clubs in the competition. I expected Ancelotti to be up there, but I didn't know that Koeman had coached so many different clubs. I need to brush up on my CL history :)

In [None]:
# Count the non-null "Club" entries per coach and keep the 15 largest counts.
df = CoachAppear.copy(deep=True)
clubs_per_coach = (
    df.groupby(["Coach"])["Club"]
    .count()
    .sort_values(ascending=False)
    .head(15)
)

# Draw on an explicit Axes kept in `ax`, so `df` stays a DataFrame instead of
# being rebound to the matplotlib Axes that .plot() returns. A title and a
# value-axis label are added, as the chart previously had neither.
fig, ax = plt.subplots(figsize=(16, 8))
clubs_per_coach.plot(kind='bar', ax=ax)
ax.set_title("Coaches who represented the most clubs in the competition")
ax.set_ylabel("Number of clubs")
plt.show()

In [None]:
# Drop the scratch name `df` that the preceding cells kept reassigning.
del df

# Analyzing the Player appearances
This dataset looks very similar to the coaches data, so let's repeat the same analysis. The player that played the most games for a single club (note: this player could have played for multiple clubs, but this is the most games for one club) is:

In [None]:
# Read the player appearance details, using the CSV's "Unnamed: 0" column as
# the index, then display the table.
player_details_csv = os.path.join(PATH, "PlayerAppearDetails.csv")
PlayerAppear = pd.read_csv(player_details_csv, index_col='Unnamed: 0')
PlayerAppear

In [None]:
df = PlayerAppear.copy(deep=True)
# Highest "Appearances" value for every (player, club) pair, largest first;
# only the first 15 pairs are kept and displayed.
appearances_by_player_club = df.groupby(["Player", "Club"])["Appearances"].max()
df = appearances_by_player_club.sort_values(ascending=False).head(15)
df

In [None]:
# Count the non-null "Club" entries per player and keep the 20 largest counts.
df = PlayerAppear.copy(deep=True)
clubs_per_player = (
    df.groupby(["Player"])["Club"]
    .count()
    .sort_values(ascending=False)
    .head(20)
)

# Draw on an explicit Axes kept in `ax`, so `df` stays a DataFrame instead of
# being rebound to the matplotlib Axes that .plot() returns. A title and a
# value-axis label are added, as the chart previously had neither.
fig, ax = plt.subplots(figsize=(16, 8))
clubs_per_player.plot(kind='bar', ax=ax)
ax.set_title("Players who appeared for the most clubs in the competition")
ax.set_ylabel("Number of clubs")
plt.show()

# Analyzing the top goal scorers
Let's see which player scored the most goals for a single club by using the PlayerGoalDetails file. The PlayerGoalTotals file is just a summary of this data without the club information.

In [None]:
# Read the player goal details, using the CSV's "Unnamed: 0" column as the
# index, then preview the first 10 rows.
player_goals_csv = os.path.join(PATH, "PlayerGoalDetails.csv")
PlayerGoals = pd.read_csv(player_goals_csv, index_col='Unnamed: 0')
PlayerGoals.head(10)

In [None]:
df = PlayerGoals.copy(deep=True)
# Highest "Goals" value for every (player, club) pair, largest first, limited
# to the first 25 pairs. This cell only displays a table, so no matplotlib
# figure is created (the previous plt.figure call produced an empty figure).
df = df.groupby(["Player", "Club"])["Goals"].max().sort_values(ascending=False)[:25]
df

Once again, let's look at which player scored goals for the most different clubs.

In [None]:
# Count the non-null "Club" entries per player in the goals data and keep the
# 20 largest counts.
df = PlayerGoals.copy(deep=True)
clubs_scored_for = (
    df.groupby("Player")["Club"]
    .count()
    .sort_values(ascending=False)
    .head(20)
)

# Draw on an explicit Axes kept in `ax`, so `df` stays a DataFrame instead of
# being rebound to the matplotlib Axes that .plot() returns. A title and a
# value-axis label are added, as the chart previously had neither.
fig, ax = plt.subplots(figsize=(16, 8))
clubs_scored_for.plot(kind='barh', ax=ax)
ax.set_title("Players who scored for the most clubs in the competition")
ax.set_xlabel("Number of clubs")
plt.show()