In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

---
## Récupération des jeux de données et compréhension

In [124]:
# Load the three semicolon-separated source files: matches, countries, tournament statistics.
df_matches, df_countries, df_stats = (
    pd.read_csv(csv_name, sep=";")
    for csv_name in (
        "Africa Cup of Nations Matches MaJ.csv",
        "African Countries.csv",
        "Tournaments General Statistics MaJ.csv",
    )
)

In [57]:
# Overview of the matches table: columns, non-null counts and dtypes
df_matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 746 entries, 0 to 745
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Year                  746 non-null    int64  
 1   Date                  662 non-null    object 
 2   Time                  333 non-null    object 
 3   HomeTeam              746 non-null    object 
 4   AwayTeam              746 non-null    object 
 5   HomeTeamGoals         742 non-null    float64
 6   AwayTeamGoals         742 non-null    float64
 7   Stage                 746 non-null    object 
 8   SpecialWinConditions  63 non-null     object 
 9   Stadium               742 non-null    object 
 10  City                  742 non-null    object 
 11  Attendance            522 non-null    float64
dtypes: float64(3), int64(1), object(8)
memory usage: 70.1+ KB


In [58]:
# Number of missing values in each column
df_matches.isna().sum()

Year                      0
Date                     84
Time                    413
HomeTeam                  0
AwayTeam                  0
HomeTeamGoals             4
AwayTeamGoals             4
Stage                     0
SpecialWinConditions    683
Stadium                   4
City                      4
Attendance              224
dtype: int64

In [59]:
# Show the matches that have no recorded home score
df_matches.loc[df_matches['HomeTeamGoals'].isna()]

Unnamed: 0,Year,Date,Time,HomeTeam,AwayTeam,HomeTeamGoals,AwayTeamGoals,Stage,SpecialWinConditions,Stadium,City,Attendance
1,1957,10/02/1957,,Ethiopia,South Africa,,,Semifinals,Ethiopia wins due to disqualification of othe...,,,
489,2010,11/01/2010,19:30,Ghana,Togo,,,Group B,Ghana wins due to disqualification of other team,,,
490,2010,15/01/2010,17:00,Burkina Faso,Togo,,,Group B,Burkina Faso wins due to disqualification of ...,,,
493,2010,19/01/2010,17:00,Ivory Coast,Togo,,,Group B,Ivory Coast wins due to disqualification of o...,,,


Nous allons supprimer tous les matchs qui n'ont pas été joués

In [60]:
# Drop the matches that were never played (no recorded score). Both goal columns are
# checked so that the integer conversion applied to them later cannot hit a missing
# value, and the result is reassigned rather than mutated in place.
df_matches = df_matches.dropna(subset=['HomeTeamGoals', 'AwayTeamGoals'])

In [61]:
# Missing values per column once the unplayed matches have been removed
df_matches.isna().sum()

Year                      0
Date                     84
Time                    412
HomeTeam                  0
AwayTeam                  0
HomeTeamGoals             0
AwayTeamGoals             0
Stage                     0
SpecialWinConditions    683
Stadium                   0
City                      0
Attendance              220
dtype: int64

In [62]:
# The goal columns no longer contain missing values, so store them as integers.
# (Attendance is not converted here and keeps its float dtype.)
df_matches = df_matches.astype({'HomeTeamGoals': int, 'AwayTeamGoals': int})

In [63]:
# Re-inspect the dtypes after the integer conversion of the goal columns
df_matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 742 entries, 0 to 745
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Year                  742 non-null    int64  
 1   Date                  658 non-null    object 
 2   Time                  330 non-null    object 
 3   HomeTeam              742 non-null    object 
 4   AwayTeam              742 non-null    object 
 5   HomeTeamGoals         742 non-null    int32  
 6   AwayTeamGoals         742 non-null    int32  
 7   Stage                 742 non-null    object 
 8   SpecialWinConditions  59 non-null     object 
 9   Stadium               742 non-null    object 
 10  City                  742 non-null    object 
 11  Attendance            522 non-null    float64
dtypes: float64(1), int32(2), int64(1), object(8)
memory usage: 69.6+ KB


In [64]:
# Number of distinct home team names before any clean-up
df_matches['HomeTeam'].nunique()

73

In [65]:
# Strip leading and trailing whitespace from the home team names
df_matches['HomeTeam'] = df_matches['HomeTeam'].str.strip()

In [66]:
# Number of distinct home team names once whitespace has been stripped
df_matches['HomeTeam'].nunique()

46

In [67]:
# Strip leading and trailing whitespace from the away team names
df_matches['AwayTeam'] = df_matches['AwayTeam'].str.strip()

In [68]:
# List the distinct home team names
df_matches['HomeTeam'].unique()

array(['Sudan', 'Egypt', 'Ethiopia', 'Tunisia', 'Ghana', 'Senegal',
       'Ivory Coast', 'Algeria', 'Congo-Kinshasa', 'Cameroon',
       'United Arab Rep.', 'Guinea', 'Mali', 'Togo', 'Congo', 'Zaire',
       'Morocco', 'Zambia', 'Nigeria', 'Uganda', 'Libya', 'Malawi',
       'South Africa', 'Angola', 'Sierra Leone', 'Gabon', 'Burkina Faso',
       'DR Congo', 'Liberia', 'Rwanda', 'Kenya', 'Zimbabwe', 'Namibia',
       'Mozambique', 'Guinea-Bissau', 'Madagascar', 'Burundi', 'Tanzania',
       'Mauritania', 'Benin', 'Comoros', 'Equatorial Guinea',
       'Cape Verde', 'Gambia', 'Niger', 'Botswana'], dtype=object)

In [69]:
# List the distinct away team names
df_matches['AwayTeam'].unique()

array(['Egypt', 'Ethiopia', 'Sudan', 'Tunisia', 'Uganda', 'Nigeria',
       'Congo-Léopoldville', 'Ivory Coast', 'Algeria', 'Senegal',
       'Congo-Brazzaville', 'Congo-Kinshasa', 'Ghana', 'Cameroon',
       'Guinea', 'Kenya', 'Togo', 'Mali', 'Morocco', 'Congo', 'Zaire',
       'Zambia', 'Mauritius', 'Upper Volta', 'Tanzania', 'Libya',
       'Malawi', 'Mozambique', 'Gabon', 'Sierra Leone', 'Angola',
       'Burkina Faso', 'Liberia', 'DR Congo', 'Namibia', 'South Africa',
       'Rwanda', 'Zimbabwe', 'Benin', 'Equatorial Guinea', 'Cape Verde',
       'Guinea-Bissau', 'Burundi', 'Madagascar', 'Mauritania', 'Gambia',
       'Comoros', 'Niger', 'Botswana'], dtype=object)

In [70]:
# Map historical country names to the current ones (the two Congos, Burkina Faso, Egypt).
# The result is assigned back to the column: `df[col].replace(..., inplace=True)` mutates a
# temporary Series and leaves the DataFrame unchanged when pandas Copy-on-Write is enabled.
FORMER_TEAM_NAMES = {
    'Congo-Kinshasa': 'DR Congo',
    'Zaire': 'DR Congo',
    'Congo-Léopoldville': 'DR Congo',
    'Congo-Brazzaville': 'Congo',
    'Upper Volta': 'Burkina Faso',
    'United Arab Rep.': 'Egypt',
}

for team_column in ('HomeTeam', 'AwayTeam'):
    df_matches[team_column] = df_matches[team_column].replace(FORMER_TEAM_NAMES)

In [71]:
# Distinct home team names after the historical names have been replaced
df_matches['HomeTeam'].unique()

array(['Sudan', 'Egypt', 'Ethiopia', 'Tunisia', 'Ghana', 'Senegal',
       'Ivory Coast', 'Algeria', 'DR Congo', 'Cameroon', 'Guinea', 'Mali',
       'Togo', 'Congo', 'Morocco', 'Zambia', 'Nigeria', 'Uganda', 'Libya',
       'Malawi', 'South Africa', 'Angola', 'Sierra Leone', 'Gabon',
       'Burkina Faso', 'Liberia', 'Rwanda', 'Kenya', 'Zimbabwe',
       'Namibia', 'Mozambique', 'Guinea-Bissau', 'Madagascar', 'Burundi',
       'Tanzania', 'Mauritania', 'Benin', 'Comoros', 'Equatorial Guinea',
       'Cape Verde', 'Gambia', 'Niger', 'Botswana'], dtype=object)

In [72]:
# Distinct away team names after the historical names have been replaced
df_matches['AwayTeam'].unique()

array(['Egypt', 'Ethiopia', 'Sudan', 'Tunisia', 'Uganda', 'Nigeria',
       'DR Congo', 'Ivory Coast', 'Algeria', 'Senegal', 'Congo', 'Ghana',
       'Cameroon', 'Guinea', 'Kenya', 'Togo', 'Mali', 'Morocco', 'Zambia',
       'Mauritius', 'Burkina Faso', 'Tanzania', 'Libya', 'Malawi',
       'Mozambique', 'Gabon', 'Sierra Leone', 'Angola', 'Liberia',
       'Namibia', 'South Africa', 'Rwanda', 'Zimbabwe', 'Benin',
       'Equatorial Guinea', 'Cape Verde', 'Guinea-Bissau', 'Burundi',
       'Madagascar', 'Mauritania', 'Gambia', 'Comoros', 'Niger',
       'Botswana'], dtype=object)

---
## Feature Engineering pour déterminer si l'équipe a gagné (ou perdu) à domicile (ou à l'extérieur) et déterminer le gagnant du match

In [73]:
# Boolean outcome flags derived from the score of each match.
home_goals = df_matches['HomeTeamGoals']
away_goals = df_matches['AwayTeamGoals']

df_matches['HomeWin'] = home_goals.gt(away_goals)
df_matches['AwayWin'] = away_goals.gt(home_goals)

# A loss for one side is exactly a win for the other side.
df_matches['HomeLoss'] = df_matches['AwayWin']
df_matches['AwayLoss'] = df_matches['HomeWin']

df_matches['Draw'] = home_goals.eq(away_goals)

In [74]:
# Look at five randomly drawn matches (a different sample on every run)
df_matches.sample(5)

Unnamed: 0,Year,Date,Time,HomeTeam,AwayTeam,HomeTeamGoals,AwayTeamGoals,Stage,SpecialWinConditions,Stadium,City,Attendance,HomeWin,AwayWin,HomeLoss,AwayLoss,Draw
5,1959,29/05/1959,,Egypt,Sudan,2,1,Final Tournament,,Prince Farouk Stadium,Cairo,30000.0,True,False,False,True,False
231,1992,15/01/1992,,Ghana,Zambia,1,0,Group D,,Stade Aline Sitoe Diatta,Ziguinchor,5000.0,True,False,False,True,False
30,1968,16/01/1968,,Ivory Coast,Uganda,2,1,Group A,,Hailé Sélassié Stadium,Addis Ababa,15000.0,True,False,False,True,False
64,1972,25/02/1972,,Congo,Morocco,1,1,Group B,,Stade de la Réunification,Douala,,False,False,False,False,True
232,1992,17/01/1992,,Ghana,Egypt,1,0,Group D,,Stade Aline Sitoe Diatta,Ziguinchor,5000.0,True,False,False,True,False


In [75]:
def _team_named_in_condition(condition):
    """Return the text before ' win' in a special win condition, or '' when it is not a string."""
    if not isinstance(condition, str):
        return ''
    return condition.split(' win')[0]


# Matches with a different score: the side flagged as winner.
# Level scores: the team named in SpecialWinConditions, or '' when there is none.
outright_winner = np.where(df_matches['HomeWin'], df_matches['HomeTeam'], df_matches['AwayTeam'])
draw_winner = df_matches['SpecialWinConditions'].apply(_team_named_in_condition)
df_matches['Winner'] = np.where(df_matches['Draw'], draw_winner, outright_winner)

---
## Calcul du total de victoires, défaites et matchs nuls par équipe

In [76]:
# Wins per team: wins as the home side plus wins as the away side.
home_wins = df_matches.groupby('HomeTeam')['HomeWin'].sum()
away_wins = df_matches.groupby('AwayTeam')['AwayWin'].sum()
total_wins = pd.concat([home_wins, away_wins], axis=1, sort=False).sum(axis=1)

In [77]:
# Losses per team: losses as the home side plus losses as the away side.
home_losses = df_matches.groupby('HomeTeam')['HomeLoss'].sum()
away_losses = df_matches.groupby('AwayTeam')['AwayLoss'].sum()
total_losses = pd.concat([home_losses, away_losses], axis=1, sort=False).sum(axis=1)

In [78]:
# Draws per team: draws as the home side plus draws as the away side.
home_draws = df_matches.groupby('HomeTeam')['Draw'].sum()
away_draws = df_matches.groupby('AwayTeam')['Draw'].sum()
total_draws = pd.concat([home_draws, away_draws], axis=1, sort=False).sum(axis=1)

In [79]:
# Games per team: appearances as the home side plus appearances as the away side.
home_games = df_matches['HomeTeam'].value_counts()
away_games = df_matches['AwayTeam'].value_counts()
total_games = pd.concat([home_games, away_games], axis=1, sort=False).sum(axis=1)
total_games

Ghana                102.0
Cameroon              91.0
Egypt                107.0
Nigeria               97.0
Ivory Coast           99.0
Morocco               70.0
Algeria               77.0
Zambia                67.0
Senegal               67.0
Tunisia               80.0
DR Congo              73.0
South Africa          43.0
Burkina Faso          48.0
Mali                  54.0
Gabon                 25.0
Guinea                47.0
Ethiopia              30.0
Congo                 26.0
Sudan                 27.0
Angola                26.0
Libya                 11.0
Uganda                23.0
Equatorial Guinea     15.0
Togo                  25.0
Zimbabwe              15.0
Malawi                10.0
Niger                  6.0
Cape Verde            11.0
Madagascar             5.0
Guinea-Bissau          9.0
Kenya                 17.0
Mauritania             6.0
Namibia                9.0
Benin                 14.0
Rwanda                 3.0
Liberia                5.0
Gambia                 5.0
S

In [80]:
# Turn each per-team series into a {team: value} dictionary
dict_games, dict_wins, dict_losses, dict_draws = (
    totals.to_dict()
    for totals in (total_games, total_wins, total_losses, total_draws)
)

In [81]:
# Display the games-per-team dictionary
dict_games

{'Ghana': 102.0,
 'Cameroon': 91.0,
 'Egypt': 107.0,
 'Nigeria': 97.0,
 'Ivory Coast': 99.0,
 'Morocco': 70.0,
 'Algeria': 77.0,
 'Zambia': 67.0,
 'Senegal': 67.0,
 'Tunisia': 80.0,
 'DR Congo': 73.0,
 'South Africa': 43.0,
 'Burkina Faso': 48.0,
 'Mali': 54.0,
 'Gabon': 25.0,
 'Guinea': 47.0,
 'Ethiopia': 30.0,
 'Congo': 26.0,
 'Sudan': 27.0,
 'Angola': 26.0,
 'Libya': 11.0,
 'Uganda': 23.0,
 'Equatorial Guinea': 15.0,
 'Togo': 25.0,
 'Zimbabwe': 15.0,
 'Malawi': 10.0,
 'Niger': 6.0,
 'Cape Verde': 11.0,
 'Madagascar': 5.0,
 'Guinea-Bissau': 9.0,
 'Kenya': 17.0,
 'Mauritania': 6.0,
 'Namibia': 9.0,
 'Benin': 14.0,
 'Rwanda': 3.0,
 'Liberia': 5.0,
 'Gambia': 5.0,
 'Sierra Leone': 8.0,
 'Botswana': 3.0,
 'Mozambique': 12.0,
 'Burundi': 3.0,
 'Tanzania': 6.0,
 'Comoros': 4.0,
 'Mauritius': 3.0}

In [82]:
# Build every distinct (team, edition) pair, whichever side of the fixture the team was listed on...
unique_home_teams_and_years = (
    df_matches[["HomeTeam", "Year"]]
    .drop_duplicates()
    .rename(columns={'HomeTeam': 'Team'})
)
unique_away_teams_and_years = (
    df_matches[["AwayTeam", "Year"]]
    .drop_duplicates()
    .rename(columns={'AwayTeam': 'Team'})
)

# A team usually appears on both sides within one edition, so de-duplicate again after stacking.
unique_teams_and_years = pd.concat(
    [unique_home_teams_and_years, unique_away_teams_and_years]
).drop_duplicates()
unique_teams_and_years

Unnamed: 0,Team,Year
0,Sudan,1957
2,Egypt,1957
3,Egypt,1959
4,Sudan,1959
6,Ethiopia,1962
...,...,...
495,Benin,2010
501,Tunisia,2010
517,Tunisia,2012
724,Angola,2012


In [83]:
# ... then count the editions each team appears in and convert the counts to a dictionary
dict_tournaments = unique_teams_and_years['Team'].value_counts().to_dict()
dict_tournaments

{'Egypt': 25,
 'Ivory Coast': 24,
 'Ghana': 23,
 'Tunisia': 20,
 'Cameroon': 20,
 'Nigeria': 19,
 'Algeria': 19,
 'DR Congo': 19,
 'Morocco': 18,
 'Zambia': 17,
 'Senegal': 16,
 'Guinea': 13,
 'Mali': 12,
 'Burkina Faso': 12,
 'Ethiopia': 11,
 'South Africa': 10,
 'Sudan': 9,
 'Angola': 8,
 'Gabon': 8,
 'Togo': 8,
 'Uganda': 7,
 'Congo': 7,
 'Kenya': 6,
 'Zimbabwe': 5,
 'Mozambique': 4,
 'Benin': 4,
 'Cape Verde': 3,
 'Equatorial Guinea': 3,
 'Guinea-Bissau': 3,
 'Sierra Leone': 3,
 'Namibia': 3,
 'Malawi': 3,
 'Libya': 3,
 'Tanzania': 2,
 'Mauritania': 2,
 'Liberia': 2,
 'Niger': 2,
 'Rwanda': 1,
 'Madagascar': 1,
 'Burundi': 1,
 'Comoros': 1,
 'Gambia': 1,
 'Botswana': 1,
 'Mauritius': 1}

In [84]:
# Goals scored per team: goals scored as the home side plus goals scored as the away side.
goals_scored_at_home = df_matches.groupby('HomeTeam')['HomeTeamGoals'].sum()
goals_scored_away = df_matches.groupby('AwayTeam')['AwayTeamGoals'].sum()
total_goals_scored = pd.concat([goals_scored_at_home, goals_scored_away], axis=1, sort=False).sum(axis=1)

In [85]:
# Convert the goals-scored series to a {team: goals} dictionary and display it
dict_goals_scored = total_goals_scored.to_dict()
dict_goals_scored

{'Algeria': 94.0,
 'Angola': 30.0,
 'Benin': 7.0,
 'Botswana': 2.0,
 'Burkina Faso': 47.0,
 'Burundi': 0.0,
 'Cameroon': 137.0,
 'Cape Verde': 6.0,
 'Comoros': 4.0,
 'Congo': 27.0,
 'DR Congo': 88.0,
 'Egypt': 168.0,
 'Equatorial Guinea': 11.0,
 'Ethiopia': 31.0,
 'Gabon': 24.0,
 'Gambia': 4.0,
 'Ghana': 133.0,
 'Guinea': 61.0,
 'Guinea-Bissau': 2.0,
 'Ivory Coast': 144.0,
 'Kenya': 11.0,
 'Liberia': 5.0,
 'Libya': 12.0,
 'Madagascar': 7.0,
 'Malawi': 9.0,
 'Mali': 65.0,
 'Mauritania': 1.0,
 'Morocco': 82.0,
 'Mozambique': 4.0,
 'Namibia': 10.0,
 'Niger': 1.0,
 'Nigeria': 137.0,
 'Rwanda': 3.0,
 'Senegal': 78.0,
 'Sierra Leone': 4.0,
 'South Africa': 48.0,
 'Sudan': 29.0,
 'Tanzania': 5.0,
 'Togo': 19.0,
 'Tunisia': 109.0,
 'Uganda': 21.0,
 'Zambia': 81.0,
 'Zimbabwe': 16.0,
 'Mauritius': 2.0}

In [86]:
# Goals conceded per team: the opponent's goals, summed over home and away fixtures.
goals_conceded_at_home = df_matches.groupby('HomeTeam')['AwayTeamGoals'].sum()
goals_conceded_away = df_matches.groupby('AwayTeam')['HomeTeamGoals'].sum()
total_goals_conceded = pd.concat([goals_conceded_at_home, goals_conceded_away], axis=1, sort=False).sum(axis=1)

dict_goals_conceded = total_goals_conceded.to_dict()
dict_goals_conceded

{'Algeria': 89.0,
 'Angola': 39.0,
 'Benin': 24.0,
 'Botswana': 9.0,
 'Burkina Faso': 72.0,
 'Burundi': 4.0,
 'Cameroon': 83.0,
 'Cape Verde': 9.0,
 'Comoros': 7.0,
 'Congo': 40.0,
 'DR Congo': 102.0,
 'Egypt': 90.0,
 'Equatorial Guinea': 14.0,
 'Ethiopia': 67.0,
 'Gabon': 30.0,
 'Gambia': 3.0,
 'Ghana': 87.0,
 'Guinea': 66.0,
 'Guinea-Bissau': 12.0,
 'Ivory Coast': 103.0,
 'Kenya': 31.0,
 'Liberia': 7.0,
 'Libya': 13.0,
 'Madagascar': 7.0,
 'Malawi': 15.0,
 'Mali': 65.0,
 'Mauritania': 11.0,
 'Morocco': 63.0,
 'Mozambique': 26.0,
 'Namibia': 24.0,
 'Niger': 9.0,
 'Nigeria': 102.0,
 'Rwanda': 3.0,
 'Senegal': 56.0,
 'Sierra Leone': 14.0,
 'South Africa': 45.0,
 'Sudan': 42.0,
 'Tanzania': 14.0,
 'Togo': 42.0,
 'Tunisia': 94.0,
 'Uganda': 38.0,
 'Zambia': 69.0,
 'Zimbabwe': 31.0,
 'Mauritius': 8.0}

In [87]:
# One row per team and one column per statistic; rows are aligned on the dictionary keys.
summary_columns = {
    'nb_tournaments': dict_tournaments,
    'nb_games': dict_games,
    'nb_wins': dict_wins,
    'nb_losses': dict_losses,
    'nb_draws': dict_draws,
    'nb_goals_scored': dict_goals_scored,
    'nb_goals_conceded': dict_goals_conceded,
}
df_summary = pd.DataFrame(summary_columns)

df_summary

Unnamed: 0,nb_tournaments,nb_games,nb_wins,nb_losses,nb_draws,nb_goals_scored,nb_goals_conceded
Egypt,25,107.0,60.0,27.0,20.0,168.0,90.0
Ivory Coast,24,99.0,44.0,28.0,27.0,144.0,103.0
Ghana,23,102.0,54.0,27.0,21.0,133.0,87.0
Tunisia,20,80.0,26.0,26.0,28.0,109.0,94.0
Cameroon,20,91.0,45.0,16.0,30.0,137.0,83.0
Nigeria,19,97.0,53.0,23.0,21.0,137.0,102.0
Algeria,19,77.0,28.0,27.0,22.0,94.0,89.0
DR Congo,19,73.0,20.0,29.0,24.0,88.0,102.0
Morocco,18,70.0,27.0,19.0,24.0,82.0,63.0
Zambia,17,67.0,26.0,21.0,20.0,81.0,69.0


In [88]:
# Move the team names out of the index into a regular column (named 'index' at this point)
df_summary = df_summary.reset_index()
df_summary

Unnamed: 0,index,nb_tournaments,nb_games,nb_wins,nb_losses,nb_draws,nb_goals_scored,nb_goals_conceded
0,Egypt,25,107.0,60.0,27.0,20.0,168.0,90.0
1,Ivory Coast,24,99.0,44.0,28.0,27.0,144.0,103.0
2,Ghana,23,102.0,54.0,27.0,21.0,133.0,87.0
3,Tunisia,20,80.0,26.0,26.0,28.0,109.0,94.0
4,Cameroon,20,91.0,45.0,16.0,30.0,137.0,83.0
5,Nigeria,19,97.0,53.0,23.0,21.0,137.0,102.0
6,Algeria,19,77.0,28.0,27.0,22.0,94.0,89.0
7,DR Congo,19,73.0,20.0,29.0,24.0,88.0,102.0
8,Morocco,18,70.0,27.0,19.0,24.0,82.0,63.0
9,Zambia,17,67.0,26.0,21.0,20.0,81.0,69.0


In [89]:
# Give the former index column an explicit name
df_summary = df_summary.rename(columns={'index': 'team'})
df_summary

Unnamed: 0,team,nb_tournaments,nb_games,nb_wins,nb_losses,nb_draws,nb_goals_scored,nb_goals_conceded
0,Egypt,25,107.0,60.0,27.0,20.0,168.0,90.0
1,Ivory Coast,24,99.0,44.0,28.0,27.0,144.0,103.0
2,Ghana,23,102.0,54.0,27.0,21.0,133.0,87.0
3,Tunisia,20,80.0,26.0,26.0,28.0,109.0,94.0
4,Cameroon,20,91.0,45.0,16.0,30.0,137.0,83.0
5,Nigeria,19,97.0,53.0,23.0,21.0,137.0,102.0
6,Algeria,19,77.0,28.0,27.0,22.0,94.0,89.0
7,DR Congo,19,73.0,20.0,29.0,24.0,88.0,102.0
8,Morocco,18,70.0,27.0,19.0,24.0,82.0,63.0
9,Zambia,17,67.0,26.0,21.0,20.0,81.0,69.0


In [90]:
# Share of games won, lost and drawn, expressed as a percentage of the games played.
percentage_sources = {
    'win_percentage': 'nb_wins',
    'loss_percentage': 'nb_losses',
    'draw_percentage': 'nb_draws',
}
for percentage_column, count_column in percentage_sources.items():
    df_summary[percentage_column] = df_summary[count_column].div(df_summary['nb_games']).mul(100)

In [91]:
# Column dtypes of the summary table after adding the percentage columns
df_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   team               44 non-null     object 
 1   nb_tournaments     44 non-null     int64  
 2   nb_games           44 non-null     float64
 3   nb_wins            44 non-null     float64
 4   nb_losses          44 non-null     float64
 5   nb_draws           44 non-null     float64
 6   nb_goals_scored    44 non-null     float64
 7   nb_goals_conceded  44 non-null     float64
 8   win_percentage     44 non-null     float64
 9   loss_percentage    44 non-null     float64
 10  draw_percentage    44 non-null     float64
dtypes: float64(9), int64(1), object(1)
memory usage: 3.9+ KB


In [92]:
# Descriptive statistics of the numeric summary columns
df_summary.describe()

Unnamed: 0,nb_tournaments,nb_games,nb_wins,nb_losses,nb_draws,nb_goals_scored,nb_goals_conceded,win_percentage,loss_percentage,draw_percentage
count,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0
mean,8.636364,33.727273,12.113636,12.113636,9.5,40.431818,40.431818,24.256859,49.188739,26.554402
std,7.514609,32.972408,16.712641,8.890067,9.304912,47.849278,32.502878,17.694798,23.704379,12.552914
min,1.0,3.0,0.0,1.0,0.0,0.0,3.0,0.0,17.582418,0.0
25%,2.75,7.5,1.0,4.0,2.0,4.75,11.75,11.941176,30.91487,19.672897
50%,6.5,20.0,3.5,10.0,5.0,17.5,31.0,25.462963,39.863014,27.922078
75%,13.75,57.25,19.25,19.0,17.5,68.25,66.25,37.549834,66.666667,34.464286
max,25.0,107.0,60.0,29.0,30.0,168.0,103.0,60.0,100.0,54.545455


In [93]:
# The count columns hold whole numbers only: store them as integers.
count_columns = [
    'nb_tournaments',
    'nb_games',
    'nb_wins',
    'nb_draws',
    'nb_losses',
    'nb_goals_scored',
    'nb_goals_conceded',
]
df_summary = df_summary.astype({column: int for column in count_columns})

In [94]:
# Check the dtypes after casting the count columns to int
df_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   team               44 non-null     object 
 1   nb_tournaments     44 non-null     int32  
 2   nb_games           44 non-null     int32  
 3   nb_wins            44 non-null     int32  
 4   nb_losses          44 non-null     int32  
 5   nb_draws           44 non-null     int32  
 6   nb_goals_scored    44 non-null     int32  
 7   nb_goals_conceded  44 non-null     int32  
 8   win_percentage     44 non-null     float64
 9   loss_percentage    44 non-null     float64
 10  draw_percentage    44 non-null     float64
dtypes: float64(3), int32(7), object(1)
memory usage: 2.7+ KB


In [95]:
# Average number of goals scored and conceded per game played.
games_played = df_summary['nb_games']
df_summary['ratio_goals_scored'] = df_summary['nb_goals_scored'].div(games_played)
df_summary['ratio_goals_conceded'] = df_summary['nb_goals_conceded'].div(games_played)

In [96]:
# Check the summary table after adding the per-game goal ratios
df_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   team                  44 non-null     object 
 1   nb_tournaments        44 non-null     int32  
 2   nb_games              44 non-null     int32  
 3   nb_wins               44 non-null     int32  
 4   nb_losses             44 non-null     int32  
 5   nb_draws              44 non-null     int32  
 6   nb_goals_scored       44 non-null     int32  
 7   nb_goals_conceded     44 non-null     int32  
 8   win_percentage        44 non-null     float64
 9   loss_percentage       44 non-null     float64
 10  draw_percentage       44 non-null     float64
 11  ratio_goals_scored    44 non-null     float64
 12  ratio_goals_conceded  44 non-null     float64
dtypes: float64(5), int32(7), object(1)
memory usage: 3.4+ KB


---
## Visualisation

Je vais tracer un graphique à barres pour les 10 équipes ayant participé au plus grand nombre d'éditions de la CAN, car il est difficile de faire figurer toutes les équipes sur le même graphique.

In [103]:
# Ten teams with the highest number of tournament appearances
top_10_teams = df_summary.sort_values(by='nb_tournaments', ascending=False).head(10)

# One coloured bar per team, labelled with its number of tournaments
fig = px.bar(top_10_teams, x='team', y='nb_tournaments',
             title='Nombre de tournois par équipe (Top 10)',
             text='nb_tournaments',
             color='team')
fig.update_traces(textposition='outside')  # put the value labels outside the bars
fig.show()

Je vais tracer un graphique à barres empilées pour les 10 équipes ayant disputé le plus de matchs. Ce graphique présentera, pour chaque équipe, le nombre de victoires, de défaites et de matchs nuls.

In [109]:
# Ten teams with the most games played (the ranking uses nb_games, not nb_wins)
top_10_teams = df_summary.sort_values(by='nb_games', ascending=False).head(10)

# Stacked bars: wins, losses and draws on top of each other for every team
fig = px.bar(top_10_teams, x='team', y=['nb_wins', 'nb_losses', 'nb_draws'],
             title='Statistiques de victoires, défaites et matchs nuls par équipe (Top 10)',
             barmode='stack')
fig.show()

Existe-t-il une corrélation entre le nombre de buts marqués et le nombre de buts encaissés ?

In [115]:
# One point per team, sized by its number of tournament appearances
fig = px.scatter(df_summary, x='nb_goals_scored', y='nb_goals_conceded',
                 size='nb_tournaments',
                 title='Corrélation entre le nombre de buts marqués et concédés')

# Overlay the least-squares regression line so the dashed line reflects the actual trend
# of the points. A segment joining the axis minima to the axis maxima would not: those
# extremes generally belong to different teams.
slope, intercept = np.polyfit(df_summary['nb_goals_scored'], df_summary['nb_goals_conceded'], deg=1)
x_start = float(df_summary['nb_goals_scored'].min())
x_end = float(df_summary['nb_goals_scored'].max())

fig.add_shape(type='line',
              x0=x_start,
              y0=float(slope * x_start + intercept),
              x1=x_end,
              y1=float(slope * x_end + intercept),
              line=dict(color='red', width=2, dash='dash'))

fig.show()

Existe-t-il une corrélation entre le nombre de buts marqués et le nombre de victoires ?

In [116]:
# One point per team, sized by its number of tournament appearances
fig = px.scatter(df_summary, x='nb_goals_scored', y='nb_wins',
                 size='nb_tournaments',
                 title='Corrélation entre le nombre de buts marqués et le nombre de victoires')

# Overlay the least-squares regression line so the dashed line reflects the actual trend
# of the points. A segment joining the axis minima to the axis maxima would not: those
# extremes generally belong to different teams.
slope, intercept = np.polyfit(df_summary['nb_goals_scored'], df_summary['nb_wins'], deg=1)
x_start = float(df_summary['nb_goals_scored'].min())
x_end = float(df_summary['nb_goals_scored'].max())

fig.add_shape(type='line',
              x0=x_start,
              y0=float(slope * x_start + intercept),
              x1=x_end,
              y1=float(slope * x_end + intercept),
              line=dict(color='red', width=2, dash='dash'))

fig.show()

---
## Autres explorations

In [118]:
# Display the country table (country name and its regional federation)
df_countries

Unnamed: 0,Country,Region
0,Algeria,UNAF
1,Angola,COSAFA
2,Benin,UFOA
3,Botswana,COSAFA
4,Burkina Faso,UFOA
5,Burundi,CECAFA
6,Cameroon,UNIFFAC
7,Cape Verde,UFOA
8,Central African Republic,UNIFFAC
9,Chad,UNIFFAC


In [122]:
# Number of countries per regional federation, as a two-column frame
nb_region = (
    df_countries['Region']
    .value_counts()
    .rename_axis('Region')
    .reset_index(name='Number of Countries')
)

# One coloured bar per region, labelled with its number of countries
fig = px.bar(
    nb_region,
    x='Region',
    y='Number of Countries',
    title='Nombre de pays par région',
    labels={'Number of Countries': 'Nombre de pays'},
    text='Number of Countries',
    color='Region',
)

fig.update_traces(textposition='outside')  # put the value labels outside the bars
fig.show()

In [119]:
# Display the per-tournament statistics table
df_stats

Unnamed: 0,Year,Host,Champion(titles),WinningCoach,TopScorer(goals),MostValuablePlayer
0,1957,Sudan,Egypt (1),Mourad Fahmy,Ad-Diba (5),El Gohary
1,1959,Egypt,Egypt (2),Pál Titkos,Mahmoud El Gohary (3),El Deeba
2,1962,Ethiopia,Ethiopia (1),Ydnekatchew Tessema,Mengistu Worku (3) Badawi Abdel Fattah (3),Mengistu Worku
3,1963,Ghana,Ghana (1),Charles Gyamfi,Hassan El Shazly (6),Hassan El Shazly
4,1965,Tunisia,Ghana (2),Charles Gyamfi,Ben Acheampong (3) Osei Kofi (3) Eustache ...,Osei Kofi
5,1968,Ethiopia,Congo-Kinshasa (1),Ferenc Csanádi,Laurent Pokou (6),Kazadi Mwamba
6,1970,Sudan,Sudan (1),Jiří Starosta,Laurent Pokou (8),Laurent Pokou
7,1972,Cameroon,Congo (1),Adolphe Bibanzoulou,Salif Keita (5),François M'Pelé
8,1974,Egypt,Zaire (2),Blagoje Vidinić,Ndaye Mulamba (9),Mohamed Timoumi
9,1976,Ethiopia,Morocco (1),Gheorghe Mărdărescu,Aliou Mamadou Keita (4),Ahmed Faras


In [133]:
# df_stats = pd.read_csv("Tournaments General Statistics MaJ.csv", sep=";")

In [134]:
# Extract the champion's name from "Name (n titles)": keep the text before ' (' and trim it.
# The vectorised .str accessor passes missing values through instead of raising on them.
champion_names = df_stats["Champion(titles)"].str.partition(' (')[0].str.strip()

# DataFrame.insert raises if the column already exists, so overwrite it when the cell is re-run.
if "Champion" in df_stats.columns:
    df_stats["Champion"] = champion_names
else:
    df_stats.insert(2, "Champion", champion_names)
df_stats

Unnamed: 0,Year,Host,Champion,Champion(titles),WinningCoach,TopScorer(goals),MostValuablePlayer
0,1957,Sudan,Egypt,Egypt (1),Mourad Fahmy,Ad-Diba (5),El Gohary
1,1959,Egypt,Egypt,Egypt (2),Pál Titkos,Mahmoud El Gohary (3),El Deeba
2,1962,Ethiopia,Ethiopia,Ethiopia (1),Ydnekatchew Tessema,Mengistu Worku (3) Badawi Abdel Fattah (3),Mengistu Worku
3,1963,Ghana,Ghana,Ghana (1),Charles Gyamfi,Hassan El Shazly (6),Hassan El Shazly
4,1965,Tunisia,Ghana,Ghana (2),Charles Gyamfi,Ben Acheampong (3) Osei Kofi (3) Eustache ...,Osei Kofi
5,1968,Ethiopia,Congo-Kinshasa,Congo-Kinshasa (1),Ferenc Csanádi,Laurent Pokou (6),Kazadi Mwamba
6,1970,Sudan,Sudan,Sudan (1),Jiří Starosta,Laurent Pokou (8),Laurent Pokou
7,1972,Cameroon,Congo,Congo (1),Adolphe Bibanzoulou,Salif Keita (5),François M'Pelé
8,1974,Egypt,Zaire,Zaire (2),Blagoje Vidinić,Ndaye Mulamba (9),Mohamed Timoumi
9,1976,Ethiopia,Morocco,Morocco (1),Gheorghe Mărdărescu,Aliou Mamadou Keita (4),Ahmed Faras


In [135]:
# Map the historical names of DR Congo to its current name. The result is assigned back to
# the column: `df[col].replace(..., inplace=True)` mutates a temporary Series and leaves
# the DataFrame unchanged when pandas Copy-on-Write is enabled.
df_stats['Champion'] = df_stats['Champion'].replace({"Congo-Kinshasa": "DR Congo",
                                                     "Zaire": "DR Congo"})

In [136]:
# Look up the regional federation of each host and of each champion.
# A Country -> Region lookup is used rather than two left merges: it can never duplicate
# tournament rows, leaves no helper Country columns to drop, and gives the same result
# when the cell is run again. Names absent from df_countries get a missing region.
region_by_country = df_countries.drop_duplicates(subset='Country').set_index('Country')['Region']

df_stats['HostRegion'] = df_stats['Host'].map(region_by_country)
df_stats['ChampionRegion'] = df_stats['Champion'].map(region_by_country)
df_stats

Unnamed: 0,Year,Host,Champion,Champion(titles),WinningCoach,TopScorer(goals),MostValuablePlayer,HostRegion,ChampionRegion
0,1957,Sudan,Egypt,Egypt (1),Mourad Fahmy,Ad-Diba (5),El Gohary,CECAFA,UNAF
1,1959,Egypt,Egypt,Egypt (2),Pál Titkos,Mahmoud El Gohary (3),El Deeba,UNAF,UNAF
2,1962,Ethiopia,Ethiopia,Ethiopia (1),Ydnekatchew Tessema,Mengistu Worku (3) Badawi Abdel Fattah (3),Mengistu Worku,CECAFA,CECAFA
3,1963,Ghana,Ghana,Ghana (1),Charles Gyamfi,Hassan El Shazly (6),Hassan El Shazly,UFOA,UFOA
4,1965,Tunisia,Ghana,Ghana (2),Charles Gyamfi,Ben Acheampong (3) Osei Kofi (3) Eustache ...,Osei Kofi,UNAF,UFOA
5,1968,Ethiopia,DR Congo,Congo-Kinshasa (1),Ferenc Csanádi,Laurent Pokou (6),Kazadi Mwamba,CECAFA,UNIFFAC
6,1970,Sudan,Sudan,Sudan (1),Jiří Starosta,Laurent Pokou (8),Laurent Pokou,CECAFA,CECAFA
7,1972,Cameroon,Congo,Congo (1),Adolphe Bibanzoulou,Salif Keita (5),François M'Pelé,UNIFFAC,UNIFFAC
8,1974,Egypt,DR Congo,Zaire (2),Blagoje Vidinić,Ndaye Mulamba (9),Mohamed Timoumi,UNAF,UNIFFAC
9,1976,Ethiopia,Morocco,Morocco (1),Gheorghe Mărdărescu,Aliou Mamadou Keita (4),Ahmed Faras,CECAFA,UNAF


In [144]:
# Host values that name two countries at once: set their region by hand.
co_host_regions = {
    "Ghana  Nigeria": "UFOA",
    "Gabon  Equatorial Guinea": "UNIFFAC",
}
for co_hosts, region in co_host_regions.items():
    df_stats.loc[df_stats['Host'] == co_hosts, "HostRegion"] = region

df_stats

Unnamed: 0,Year,Host,Champion,Champion(titles),WinningCoach,TopScorer(goals),MostValuablePlayer,HostRegion,ChampionRegion
0,1957,Sudan,Egypt,Egypt (1),Mourad Fahmy,Ad-Diba (5),El Gohary,CECAFA,UNAF
1,1959,Egypt,Egypt,Egypt (2),Pál Titkos,Mahmoud El Gohary (3),El Deeba,UNAF,UNAF
2,1962,Ethiopia,Ethiopia,Ethiopia (1),Ydnekatchew Tessema,Mengistu Worku (3) Badawi Abdel Fattah (3),Mengistu Worku,CECAFA,CECAFA
3,1963,Ghana,Ghana,Ghana (1),Charles Gyamfi,Hassan El Shazly (6),Hassan El Shazly,UFOA,UFOA
4,1965,Tunisia,Ghana,Ghana (2),Charles Gyamfi,Ben Acheampong (3) Osei Kofi (3) Eustache ...,Osei Kofi,UNAF,UFOA
5,1968,Ethiopia,DR Congo,Congo-Kinshasa (1),Ferenc Csanádi,Laurent Pokou (6),Kazadi Mwamba,CECAFA,UNIFFAC
6,1970,Sudan,Sudan,Sudan (1),Jiří Starosta,Laurent Pokou (8),Laurent Pokou,CECAFA,CECAFA
7,1972,Cameroon,Congo,Congo (1),Adolphe Bibanzoulou,Salif Keita (5),François M'Pelé,UNIFFAC,UNIFFAC
8,1974,Egypt,DR Congo,Zaire (2),Blagoje Vidinić,Ndaye Mulamba (9),Mohamed Timoumi,UNAF,UNIFFAC
9,1976,Ethiopia,Morocco,Morocco (1),Gheorghe Mărdărescu,Aliou Mamadou Keita (4),Ahmed Faras,CECAFA,UNAF


In [145]:
# Current column order of the statistics table
df_stats.columns.to_list()

['Year',
 'Host',
 'Champion',
 'Champion(titles)',
 'WinningCoach',
 'TopScorer(goals)',
 'MostValuablePlayer',
 'HostRegion',
 'ChampionRegion']

In [146]:
# Final column layout: each region sits next to the country it describes, and the raw
# 'Champion(titles)' column is left out.
new_col_order = ['Year',
                 'Host',
                 'HostRegion',
                 'Champion',
                 'ChampionRegion',
                 'WinningCoach',
                 'TopScorer(goals)',
                 'MostValuablePlayer']
# .copy() makes df_stats an independent frame, so adding columns to it afterwards does not
# trigger SettingWithCopyWarning.
df_stats = df_stats[new_col_order].copy()

In [147]:
# Display the statistics table with its new column order
df_stats

Unnamed: 0,Year,Host,HostRegion,Champion,ChampionRegion,WinningCoach,TopScorer(goals),MostValuablePlayer
0,1957,Sudan,CECAFA,Egypt,UNAF,Mourad Fahmy,Ad-Diba (5),El Gohary
1,1959,Egypt,UNAF,Egypt,UNAF,Pál Titkos,Mahmoud El Gohary (3),El Deeba
2,1962,Ethiopia,CECAFA,Ethiopia,CECAFA,Ydnekatchew Tessema,Mengistu Worku (3) Badawi Abdel Fattah (3),Mengistu Worku
3,1963,Ghana,UFOA,Ghana,UFOA,Charles Gyamfi,Hassan El Shazly (6),Hassan El Shazly
4,1965,Tunisia,UNAF,Ghana,UFOA,Charles Gyamfi,Ben Acheampong (3) Osei Kofi (3) Eustache ...,Osei Kofi
5,1968,Ethiopia,CECAFA,DR Congo,UNIFFAC,Ferenc Csanádi,Laurent Pokou (6),Kazadi Mwamba
6,1970,Sudan,CECAFA,Sudan,CECAFA,Jiří Starosta,Laurent Pokou (8),Laurent Pokou
7,1972,Cameroon,UNIFFAC,Congo,UNIFFAC,Adolphe Bibanzoulou,Salif Keita (5),François M'Pelé
8,1974,Egypt,UNAF,DR Congo,UNIFFAC,Blagoje Vidinić,Ndaye Mulamba (9),Mohamed Timoumi
9,1976,Ethiopia,CECAFA,Morocco,UNAF,Gheorghe Mărdărescu,Aliou Mamadou Keita (4),Ahmed Faras


Nombre de victoires des pays organisateurs

In [152]:
# Flag the editions in which the Host value is exactly equal to the Champion value.
# NOTE(review): co-hosted editions store both hosts in a single string (see the manual
# HostRegion fix-ups), so they can never match a champion here — confirm this is intended.
df_stats['HostWinner'] = df_stats['Host'] == df_stats['Champion']

# Number of titles won at home, per host country
host_wins_count = df_stats[df_stats['HostWinner']].groupby('Host').size().reset_index(name='Wins')

# Keep at most the ten hosts with the most home titles
data = host_wins_count.sort_values(by='Wins', ascending=False).head(10)

# Bar chart: one coloured bar per host country
fig = px.bar(data, x='Host', y='Wins', color='Host',
             title='Nombre de victoires des pays organisateurs',
             labels={'Host': 'Pays organisateur', 'Wins': 'Nombre de victoires'})

fig.show()

Nombre de victoires des sous-confédérations organisatrices

In [153]:
# Flag the editions whose champion belongs to the same region as the host
df_stats['SameRegionWinner'] = df_stats['HostRegion'] == df_stats['ChampionRegion']

# Number of such editions, counted per champion country
same_region_wins_count = df_stats[df_stats['SameRegionWinner']].groupby('Champion').size().reset_index(name='Wins')

# Keep at most the ten champions with the most titles won in their own region
data = same_region_wins_count.sort_values(by='Wins', ascending=False).head(10)

# Bar chart: one coloured bar per champion country
fig = px.bar(data, x='Champion', y='Wins', color='Champion',
             title='Nombre de victoires des pays champions dans la même région que l\'organisateur',
             labels={'Champion': 'Pays champion', 'Wins': 'Nombre de victoires'})

fig.show()

In [156]:
# Flag the editions whose champion belongs to the same region as the host
df_stats['SameRegionWinner'] = df_stats['HostRegion'] == df_stats['ChampionRegion']

# Number of such editions, counted per regional federation
same_region_wins_count = df_stats[df_stats['SameRegionWinner']].groupby('ChampionRegion').size().reset_index(name='Wins')

# Keep at most the ten regions with the most titles won on home soil
data = same_region_wins_count.sort_values(by='Wins', ascending=False).head(10)

# Bar chart: one coloured bar per regional federation
fig = px.bar(data, x='ChampionRegion', y='Wins', color='ChampionRegion',
             title='Nombre de victoires (groupées) des pays champions dans la même sous-confédération que l\'organisateur',
             labels={'ChampionRegion': 'Sous-confédération', 'Wins': 'Nombre de victoires'})

fig.show()

In [160]:
# Flag the editions whose champion's region differs from the host's region.
# NOTE(review): a missing region on either side also compares as "different" — confirm
# that every host and champion has a region before relying on these counts.
df_stats['DifferentRegionWinner'] = df_stats['HostRegion'] != df_stats['ChampionRegion']

# Number of such titles per champion, most frequent first. Naming the index before
# reset_index keeps the team column labelled 'Champion' on every pandas version
# (older releases would otherwise label it 'index').
different_region_wins = (
    df_stats.loc[df_stats['DifferentRegionWinner'], 'Champion']
    .value_counts()
    .rename_axis('Champion')
    .reset_index(name='Wins')
)

# Five champions with the most titles won in a region other than their own
top_different_region_winners = different_region_wins.head(5)

top_different_region_winners

      index  Wins
0     Egypt     4
1  Cameroon     4
2     Ghana     2
3  DR Congo     2
4   Nigeria     2
