In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, filename) for filename in filenames):
        print(input_path)

# Any results you write to the current directory are saved as output.

# Notebook for studies on data of the Campeonato Brasileiro (Brazilian Soccer Championship) for the period from 2000 to 2019.
### By: Marcus Nudelman Trugilho - October 2020
## Analysis of the goals scored by each team and, at the end, an animated chart showing how the goal totals evolve through time.
### Rmk: you will need the FFmpeg encoder installed in the system to be able to generate the video. It can be found in the address: https://www.ffmpeg.org/download.html.

## Importing the necessary libraries.

In [1]:
import colorsys
from random import randint
import matplotlib.pyplot as plt
import matplotlib.colors as mc
import matplotlib.animation as ani
from matplotlib.animation import FuncAnimation
from matplotlib import rc
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

### Importing the dataset.


In [1]:
# Location of the match-level CSV inside the Kaggle input directory.
CSV_PATH = '/kaggle/input/campeonato-brasileiro-de-futebol/campeonato-brasileiro-full.csv'
df = pd.read_csv(CSV_PATH)

### Adjusting team names to avoid duplicated columns and defining the format of the dates.

In [1]:
# Normalise the casing of both team columns so the same club is not counted
# under several spellings (e.g. "FLAMENGO" vs "Flamengo").
for team_col in ('Clube 1', 'Clube 2'):
    df[team_col] = df[team_col].str.capitalize()

# Forward-fill missing kick-off times. Assign the result back instead of
# calling fillna(..., inplace=True) on the column selection: that chained
# in-place form uses the deprecated `method=` argument and does not reliably
# write through to `df` when pandas copy-on-write is active.
df['Horário'] = df['Horário'].ffill()

# Parse the match date (ISO formatted in the CSV) into datetime64 values.
df['Data'] = pd.to_datetime(df['Data'], format='%Y-%m-%d')

### The column Rodada (round) holds string values. We want to extract the round numbers. In the process, we need to understand and change some values so that we can assign numbers to them. In some seasons the championship ended with playoffs, and we will assign sequential round numbers to those playoff stages.

In [1]:
# Pull the first run of digits out of the round label. A raw string is used so
# that `\d` is not treated as an (invalid) Python string escape sequence.
df['Rodada#'] = df['Rodada'].str.extract(r'(\d+)')

# Date windows (start exclusive, end inclusive) that contain the 2001 and 2002 playoffs.
data1 = pd.to_datetime('2001-12-04', format='%Y-%m-%d')
data2 = pd.to_datetime('2001-12-23', format='%Y-%m-%d')
data3 = pd.to_datetime('2002-11-23', format='%Y-%m-%d')
data4 = pd.to_datetime('2002-12-15', format='%Y-%m-%d')

# Upper-case phase labels map to a fixed round number regardless of the date.
_FIXED_PHASE_ROUNDS = {
    'SEGUNDA FASE': 35,
    'QUARTAS DE FINAL': 36,
    'SEMI FINAL': 37,
    'FINAL': 38,
}

# Title-case playoff labels are ambiguous between seasons, so they are resolved
# through the date window the match falls in.
_PLAYOFF_ROUNDS = (
    (data1, data2, {'Quartas de Final': 28, 'Semi Final': 29, 'Final': 30}),
    (data3, data4, {'Quartas de Final': 30, 'Semi Final': 31, 'Final': 32}),
)


def rvalue(value):
    """Return the numeric round for one match row.

    Parameters
    ----------
    value : mapping (e.g. a DataFrame row)
        Must provide 'Rodada' (round label), 'Data' (match date) and
        'Rodada#' (digits previously extracted from the label).

    Returns
    -------
    The sequential round number assigned to a playoff phase, or the row's
    existing 'Rodada#' value when the label is not a recognised playoff phase
    (or its date is outside both playoff windows).
    """
    rodada = value['Rodada']

    if rodada in _FIXED_PHASE_ROUNDS:
        return _FIXED_PHASE_ROUNDS[rodada]

    for inicio, fim, rounds in _PLAYOFF_ROUNDS:
        # The previous form `Data > inicio <= fim` is a chained comparison
        # equal to `Data > inicio and inicio <= fim`; it never tested the upper
        # bound, so the 2002 playoffs were numbered with the 2001 values.
        if rodada in rounds and inicio < value['Data'] <= fim:
            return rounds[rodada]

    return value['Rodada#']
    
# Resolve the round number for every match and store it as a float column.
df['Rodada#'] = df.apply(rvalue, axis=1).astype('float64')

### Let's find out when the home team won, creating a categorical column. The home team is the one in the 'Clube 1' column.

In [1]:
def mandante(value):
    """Return 1 when the home team ('Clube 1') won the match, otherwise 0.

    'Clube 1' is normalised with ``str.capitalize()`` earlier in the notebook
    while 'Vencedor' keeps the casing found in the CSV, so the two names are
    compared case-insensitively. Non-string values (e.g. NaN for a missing
    winner) fall back to plain equality.
    """
    vencedor = value['Vencedor']
    casa = value['Clube 1']
    if isinstance(vencedor, str) and isinstance(casa, str):
        return 1 if vencedor.casefold() == casa.casefold() else 0
    return 1 if vencedor == casa else 0
# Flag (1/0) for each match telling whether the home side won.
df['Mandante Vence'] = df.apply(mandante, axis='columns')


### To be able to analyse the number of goals of each team, let's create a column for each team. These columns will be used for the matrix and animated charts. We will also define the goals by cumulative sum.

In [1]:
# Teams are taken from the home column; a club that never played at home in
# the dataset would not get a column.
clubes = df['Clube 1'].unique()

# One column per team holding its running goal total in file order.
# For each match the team is credited with 'Clube 1 Gols' when it is the home
# side, 'Clube 2 Gols' when it is the away side and 0 when it did not play.
# np.select evaluates this for the whole column at once, which avoids
# redefining a closure and running a row-wise df.apply for every team.
for clube in clubes:
    gols_na_partida = np.select(
        [df['Clube 1'] == clube, df['Clube 2'] == clube],
        [df['Clube 1 Gols'], df['Clube 2 Gols']],
        default=0,
    )
    df[clube] = pd.Series(gols_na_partida, index=df.index).cumsum()

### Let's prepare a dataframe to analyse the evolution of the goals scored by each of the 9 best teams (highest scores in the period).

In [1]:
# Keep the date column and the per-team columns, one row per match date.
# NOTE(review): the positional drops (0:2, then 1:15) assume the CSV column
# layout; confirm against the dataset if its columns ever change.
df_y = df.drop(df.iloc[:, 0:2], axis=1)
df_y = df_y.drop(df_y.iloc[:, 1:15], axis=1)
df_y = df_y.drop_duplicates(subset=["Data"]).reset_index()

# One row per calendar year. The team columns are cumulative sums, so the
# yearly maximum is the running total reached by the end of that year.
df_y2 = (
    df_y.set_index('Data')
    .drop(columns='index')
    .groupby(pd.Grouper(freq="Y"))
    .max()
)

# Rank teams by the total in the last year (descending) and keep the top 9.
df_y3 = df_y2.reset_index()
s = df_y3.iloc[-1]
# argsort of the negated totals gives the descending order of the team
# columns; +1 skips the leading 'Data' column, which reindex pins at position 0.
ordem_colunas = ((-s[1:]).argsort() + 1).reindex(df_y3.columns, fill_value=0)
df_y3 = df_y3.iloc[:, ordem_colunas].set_index('Data')
df_y3 = df_y3[df_y3.columns[:9]].dropna(axis=0, how='any')

# For the matrix chart we will need an 'x' column with sequential numbers.
df_y3['x'] = range(1, 1+len(df_y3))

### Now we create the matrix chart to show the goals evolution of each team isolated, but keeping the others in the background. (Rmk: the original code can be found at: https://python-graph-gallery.com/125-small-multiples-for-line-chart/)

In [1]:
# Initialize the figure. The seaborn style sheets were renamed with a
# 'seaborn-v0_8-' prefix in newer matplotlib releases, so use the new name
# when it is available and fall back to the old one otherwise.
style_name = 'seaborn-v0_8-darkgrid'
if style_name not in plt.style.available:
    style_name = 'seaborn-darkgrid'
plt.style.use(style_name)

# create a color palette
palette = plt.get_cmap('winter', lut=18)

# setting the figure size.
fig_grid = plt.figure(figsize=(10, 10), dpi=144, tight_layout=True)

# Team columns to draw and the calendar year of each row (the index holds the
# year-end dates produced by the yearly grouping).
team_columns = df_y3.drop('x', axis=1)
year_labels = df_y3.index.year

# multiple line plot: one panel per team, all other teams in grey behind it.
num = 0
for column in team_columns:
    num += 1

    plt.subplot(3, 3, num)
    for v in team_columns:
        plt.plot(df_y3['x'], df_y3[v], marker='', color='grey', linewidth=0.6, alpha=0.3)
    plt.plot(df_y3['x'], df_y3[column], marker='', color=palette(num), linewidth=2.4, alpha=0.9, label=column)
    plt.xlim(0, len(df_y3))
    plt.ylim(0, 1200)
    # Ticks go where the data is (x starts at 1). Placing them at 0..19 put
    # every year label one position to the left of its data point.
    plt.xticks(df_y3['x'], labels=year_labels, fontsize=6, rotation=45.0)
    # tick_params expects booleans; the string 'off' is truthy and hides nothing.
    if num in range(7):
        plt.tick_params(labelbottom=False)
    if num not in [1, 4, 7]:
        plt.tick_params(labelleft=False)
    plt.title(column, loc='left', fontsize=12, fontweight=0, color=palette(num))

# general title
plt.suptitle("Evolução de gols no Brasileirão por time.  2000 - 2019", fontsize=13, fontweight=0, color='black', style='italic', y=1.02)

# Axis titles, placed in figure coordinates. plt.text would draw them in the
# data coordinates of the last subplot. The labels now describe the plotted
# quantities instead of the sample text ('Time' / 'Note') left over from the
# gallery example.
fig_grid.text(0.5, 0.02, 'Ano', ha='center', va='center')
fig_grid.text(0.06, 0.5, 'Gols acumulados', ha='center', va='center', rotation='vertical')

 
## Preparing the animated chart (bar_chart_race).

### We need a dataframe with a specific format for this chart to work. Only the teams and index will be used.

In [1]:
# Eliminating unwanted columns and duplicated rows, then indexing by date.
# NOTE(review): the positional drops (0:2, then 1:15) assume the CSV column
# layout; confirm against the dataset if its columns ever change.
df1 = df.drop(df.iloc[:, 0:2], axis=1)
df1 = df1.drop(df1.iloc[:, 1:15], axis=1)
# set_index replaces the old row index directly, so the former
# reset_index / drop('index') round trip is not needed. The discarded
# `df1.index.astype(str)` call was removed as well: it never changed df1.
df1 = df1.drop_duplicates(subset=["Data"]).set_index('Data')

# Grouping dates by month and excluding the months where there were no games.
# The team columns are cumulative sums, so max() is the month-end total.
df1 = df1.groupby(pd.Grouper(freq="M")).max()
df1 = df1.dropna(axis=0, how='any')

# Expanding the dataframe for the smooth transitions of ranks with 5 steps:
# real rows land on index 0, 5, 10, ... and the rows in between are left empty
# (NaN) to be interpolated when the frames are drawn.
steps = 5
df1 = df1.reset_index()
df1.index = df1.index * steps
last_idx = df1.index[-1] + 1
df_expanded = df1.reindex(range(last_idx))
# ffill() replaces the deprecated fillna(method='ffill') spelling.
df_expanded['Data'] = df_expanded['Data'].ffill()

In [1]:
def transform_color(color, amount = 0.5):
    """Return ``color`` as an RGB tuple with its lightness rescaled.

    Parameters
    ----------
    color : str or RGB(A) sequence
        A matplotlib colour name, a hex string, or anything accepted by
        ``matplotlib.colors.to_rgb``.
    amount : float, default 0.5
        Scale applied to the distance from white: the new lightness is
        ``1 - amount * (1 - lightness)``. Values below 1 lighten the colour,
        1 leaves it unchanged, values above 1 darken it.

    Returns
    -------
    tuple of float
        ``(r, g, b)`` with every component inside ``[0, 1]``.
    """
    # Named colours are looked up first; any other input (hex string, tuple,
    # unhashable sequence) is handed to to_rgb unchanged.
    try:
        c = mc.cnames[color]
    except (KeyError, TypeError):
        c = color
    # The HLS conversion must run for BOTH paths. It used to sit inside the
    # except branch, so a named colour reached the return as a hex string.
    hue, lightness, saturation = colorsys.rgb_to_hls(*mc.to_rgb(c))
    # Clamp so that amount > 1 on a dark colour cannot produce a negative
    # lightness (and therefore RGB components outside the valid range).
    new_lightness = min(1.0, max(0.0, 1 - amount * (1 - lightness)))
    return colorsys.hls_to_rgb(hue, new_lightness, saturation)

# One random hex colour per team column (every column after 'Data').
all_names = df_expanded.columns[1:].tolist()
random_hex_colors = ['#' + '%06X' % randint(0, 0xFFFFFF) for _name in all_names]

# Base RGB colours, the same colours with 50% alpha (bar fill) and a darker
# variant of each (bar edge).
rgb_colors = [transform_color(hex_color, 1) for hex_color in random_hex_colors]
rgb_colors_opacity = [rgb + (0.5,) for rgb in rgb_colors]
rgb_colors_dark = [transform_color(hex_color, 1.12) for hex_color in random_hex_colors]

# Figure and single axes used by the animation (size, resolution, layout).
fig = plt.Figure(figsize=(6, 3), dpi=144, tight_layout=True)
ax = fig.add_subplot()


def nice_axes(ax):
    """Apply the bar-chart-race look to ``ax``: grey background, white vertical
    grid lines drawn below the bars, small tick labels without tick marks and
    no visible spines."""
    plt.rcParams["axes.axisbelow"] = "line"
    ax.set_facecolor('.8')
    ax.tick_params(labelsize=8, length=0)
    ax.grid(True, axis='x', color='white')
    ax.set_axisbelow(True)
    for spine in ax.spines.values():
        spine.set_visible(False)

### Now we will use the bar_chart_race code, created by Ted Petrou, who made a great step by step article to explain how the chart works. The details can be found here: https://medium.com/dunder-data/create-a-bar-chart-race-animation-in-python-with-matplotlib-477ed1590096

In [1]:
# Init routine to clean the axes and avoid noise.
def init():
    """Reset the shared axes before the animation starts: clear them, re-apply
    the styling and fix the axis limits used by every frame."""
    ax.clear()
    nice_axes(ax)
    ax.set_xlim(0, 1200)
    ax.set_ylim(.2, 10.8)
    ax.grid(False, axis='y')

# Creating the animation frames.
def update(i):
    """Draw frame ``i`` of the bar chart race on the shared ``ax``.

    ``i`` is a row position in ``df_expanded``. The ten teams shown are the
    leaders at the real (non-interpolated) row that carries the same date as
    row ``i``; bar lengths and bar positions for the in-between rows are
    linearly interpolated.
    """
    # Iterate over a copy: removing a container also takes it out of
    # ax.containers, and mutating a list while looping over it skips entries.
    for bar in list(ax.containers):
        bar.remove()

    # No defensive copy is needed: every step below builds a new frame and
    # df_expanded itself is never modified.
    df_e = df_expanded

    # Find the real data row whose date matches this frame's (forward-filled) date.
    idata = df_e.loc[i, 'Data']
    coluna1 = df_e.columns[1]
    id_data = df_e.loc[df_e[coluna1].notnull()]
    valid_index = id_data.loc[id_data['Data'] == idata]
    vi = valid_index.index
    s = df_e.iloc[vi[0]]

    # Order the team columns by that row's totals (descending), keeping 'Data'
    # first, then keep only the ten leading teams.
    df_e = df_e.iloc[:, ((-s[1:]).argsort() + 1).reindex(df_e.columns, fill_value=0)]
    df_e = df_e.set_index('Data')
    df_e = df_e[df_e.columns[:10]]

    # Ranks give the vertical bar positions; interpolate both values and ranks
    # so bars grow and swap places smoothly between real rows.
    df_rank_e = df_e.rank(axis=1, method='first')
    df_e = df_e.interpolate()
    df_rank_e = df_rank_e.interpolate()

    labels = df_e.columns
    y = df_rank_e.iloc[i]
    width = df_e.iloc[i]
    normal_colors = dict(zip(df_expanded.columns[1:], rgb_colors_opacity))
    dark_colors = dict(zip(df_expanded.columns[1:], rgb_colors_dark))
    ax.barh(y=y, width=width, color=[normal_colors[x] for x in df_e.columns], edgecolor =([dark_colors[x] for x in df_e.columns]), tick_label=labels)
    date_str = df_e.index[i].strftime('%B, %Y')
    ax.set_title(f'Evolução de Nº de gols no Brasileirão - {date_str}', fontsize='small')

### Creating the animation sequence.

In [1]:
# One frame per row of the expanded frame, 100 ms apart, played once.
anim = FuncAnimation(
    fig=fig,
    func=update,
    init_func=init,
    frames=len(df_expanded),
    interval=100,
    repeat=False,
)

### Generating the HTML chart.

In [1]:
from IPython.display import HTML

# Render animations as interactive JavaScript/HTML players.
rc('animation', html='jshtml')

# Last expression of the cell: displays the animation inline.
# (The stray bare `rc` expression that used to sit here did nothing and was removed.)
HTML(anim.to_jshtml())