In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the read-only input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

 # We aim to cluster the teams from different leagues according to their play style. 

To do that, we will use parameters such as events (fouls, attempts, ...), their locations (wings, midfield, ...), assist methods (short passes, crosses, ...), situations (open play and different set pieces) and the body parts involved (feet or head).

Credit to the IBM Machine Learning course on Coursera for the idea of using the agglomerative algorithm.

We will count the occurrences of every required parameter value and group the counts by team, which gives 46 features in total.

In [None]:
df = pd.read_csv('/kaggle/input/football-events/events.csv', delimiter=',')

# Keep the team identifier plus the categorical columns whose values will be counted.
df1 = df[['event_team','event_type', 'event_type2','location', 'assist_method', 'situation', 'bodypart']]

df2 = df1.groupby('event_team', group_keys=False)

# For each categorical column, count how often every value occurs per team and
# pivot the values into columns named "<column>_<value>".
appended_data = []
for col in df1.columns[df1.columns != 'event_team']:
    counts = df2[col].value_counts(dropna=True).sort_index().unstack(col).add_prefix(col + '_')
    appended_data.append(counts)

df3 = pd.concat(appended_data, axis=1)

# The concatenated frame is indexed by team, so take the label straight from the
# index. Assigning a separately sorted array positionally would misalign (or fail
# on a length mismatch) whenever its order or length differs from the index.
df3['Event team'] = df3.index
df3.head()

As different teams can have a different number of games, we will count games for each team (home and away) and normalize all features by the number of games.

In [None]:
ginf = pd.read_csv('/kaggle/input/football-events/ginf.csv')

# Games per team = appearances in 'ht' + appearances in 'at'.
# fill_value=0 keeps a team that appears in only one of the two columns from
# becoming NaN, which a plain `+` between the two value_counts would produce.
gam = ginf['ht'].value_counts().add(ginf['at'].value_counts(), fill_value=0)

# Aligned on the team index of df3.
df3['games'] = gam

# Divide every numeric column by the team's number of games (row-wise alignment on
# the team index). Selecting by dtype avoids probing the first row with an integer
# key on a label-indexed Series. Note that 'games' itself is numeric and is therefore
# divided as well, exactly as before.
numeric_cols = df3.select_dtypes(include='number').columns
df3[numeric_cols] = df3[numeric_cols].div(gam, axis=0)

Let's get rid of all decimals in column names.

In [None]:
# Some column names end in ".0" (e.g. "location_10.0"). Remove exactly that
# two-character suffix. str.strip('.0') must not be used for this: it strips any run
# of '.' and '0' characters from both ends, so "location_10.0" would collapse to
# "location_1" and need a hard-coded special case.
df3 = df3.rename(
    columns=lambda name: name[:-2] if isinstance(name, str) and name.endswith('.0') else name
)

print(df3.columns)

Now we will use the dictionary to decode all parameters and give columns user-friendly names.

In [None]:
# Parse dictionary.txt into [column_name, label] pairs stored in `k`.
# A line holding a single token starts a new section (its token becomes the prefix);
# a line with several tokens is "<code> <label words...>" and yields
# ["<prefix>_<code>", "<label words joined by single spaces>"].
# The context manager guarantees the file handle is closed.
with open("/kaggle/input/football-events/dictionary.txt", "r") as f:
    x = [raw_line.split() for raw_line in f]

# Initial prefix: first token of the first line. Guarded so that an empty file or a
# blank first line no longer raises IndexError.
line_base = x[0][0] if x and x[0] else ''

k = []
for line in x:
    if len(line) > 1:
        k.append([line_base + '_' + line[0], ' '.join(line[1:])])
    elif len(line) == 1:
        line_base = line[0]
print(k)

In [None]:
# Replace coded column names with the labels parsed into `k`, using one rename call
# instead of copying the frame once per column.
# setdefault keeps the first label seen for a given column name, matching the
# previous one-at-a-time renaming where a later duplicate entry had no effect.
# NOTE(review): different codes may map to the same label, which would leave
# duplicate column names in df3 — confirm against the dictionary file.
label_by_column = {}
for column_name, label in k:
    if column_name in df3.columns:
        label_by_column.setdefault(column_name, label)
df3 = df3.rename(columns=label_by_column)

print(df3.columns)

In [None]:
# Remove the 'Not recorded' column from the feature table (modifies df3 in place).
df3.drop(columns=['Not recorded'], inplace=True)

When comparing playing styles, it doesn't make sense to distinguish right from left, so we will merge corresponding columns.

In [None]:
def column_sum(a, b):
    """Return the row-wise sum of columns `a` and `b` of the global `df3`.

    Side effect: both source columns are removed from `df3` in place, so the
    caller is expected to store the returned Series under a new column name.
    """
    pair = [a, b]
    total = df3[pair].sum(axis=1)
    df3.drop(columns=pair, inplace=True)
    return total

# Merged column name -> the two side-specific columns it replaces.
# Insertion order is kept so the new columns are appended in the same order as before.
merged_sides = {
    'Foot': ('right foot', 'left foot'),
    'Wing': ('Right wing', 'Left wing'),
    'Difficult angle': ('Difficult angle on the left', 'Difficult angle on the right'),
    'Side of the box': ('Left side of the box', 'Right side of the box'),
    'Side of the six yard box': ('Left side of the six yard box', 'Right side of the six yard box'),
}
for merged_name, (first_side, second_side) in merged_sides.items():
    df3[merged_name] = column_sum(first_side, second_side)


We do some cleaning and look at the data frame.

In [None]:
# Capitalise 'head' to match the other labels, then remove the columns that are
# not used as features. Both operations modify df3 in place.
df3.rename(columns={'head': 'Head'}, inplace=True)
df3.drop(columns=['games', 'Second yellow card'], inplace=True)

In [None]:
# Discard every row (team) that has at least one missing value, in place.
df3.dropna(axis='index', how='any', inplace=True)
df3.head()

Now let's apply SciPy's hierarchical agglomerative clustering to the data frame, after scaling the features with scikit-learn.

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import euclidean_distances

# Every column except the team label is a feature. 'cluster_' is excluded too, so
# re-running this cell after the clustering cells does not feed the cluster labels
# back in as a feature; errors='ignore' covers the first run, when it does not exist.
featureset = df3.drop(columns=['Event team', 'cluster_'], errors='ignore')
x = featureset.values

# Rescale each feature with MinMaxScaler before measuring distances.
min_max_scaler = MinMaxScaler()
feature_mtx = min_max_scaler.fit_transform(x)

# Square matrix of pairwise Euclidean distances between the scaled rows.
dist_matrix = euclidean_distances(feature_mtx, feature_mtx)


In [None]:
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

print(dist_matrix)

# linkage() treats a 2-D array as raw observation vectors, not as distances, so the
# square distance matrix must first be converted to condensed (1-D) form.
# checks=False skips the symmetry/zero-diagonal validation, which tiny floating-point
# noise in the computed distances could otherwise trip.
condensed_dist = squareform(dist_matrix, checks=False)
Z_using_dist_matrix = hierarchy.linkage(condensed_dist, 'complete')

On the dendrogram, we can see 3 main clusters: English Premier League teams; the top teams (Barcelona, Real Madrid, Bayern Munich); and the rest of the teams, among which we can see local similarities.

In [None]:
import pylab
from matplotlib import pyplot as plt

fig = pylab.figure(figsize=(18,50))

def llf(id):
    """Leaf-label callback for the dendrogram: '[<team>]' for the row at position `id`.

    `id` is an integer position, while df3 is indexed by team name, so the lookup
    must be positional (.iloc) rather than a plain `[id]` on the Series.
    """
    return '[%s]' % (df3['Event team'].iloc[id])

dendro = hierarchy.dendrogram(Z_using_dist_matrix,  leaf_label_func=llf, leaf_rotation=0, leaf_font_size =12, orientation = 'right')
plt.savefig('dendrogram.png')

Now we will apply scikit-learn's agglomerative clustering.

In [None]:
from sklearn.cluster import AgglomerativeClustering

agglom = AgglomerativeClustering(n_clusters = 6, linkage = 'complete')

# Fit on the scaled feature vectors. With the default (Euclidean) metric the estimator
# computes the pairwise distances itself; passing the square distance matrix here would
# make it treat each row of distances as a feature vector.
agglom.fit(feature_mtx)

agglom.labels_

In [None]:
# Store each team's cluster label next to its features (row order matches the fitted data).
df3['cluster_'] = pd.Series(agglom.labels_, index=df3.index)

We can also create a labelled plot to see the spatial positions of the clusters. Let's plot attempts against fouls per game, as the most basic features, with the number of substitutions as the point size.

In [None]:
import matplotlib.cm as cm

# One colour per cluster label, spread evenly over the rainbow colormap.
n_clusters = max(agglom.labels_)+1
colors = cm.rainbow(np.linspace(0, 1, n_clusters))
cluster_labels = list(range(0, n_clusters))

plt.figure(figsize=(16,14))

for color, label in zip(colors, cluster_labels):
    subset = df3[df3.cluster_ == label]
    # Write each team's name at its (Attempt, Foul) position.
    for team, attempts, fouls in zip(subset['Event team'], subset.Attempt, subset.Foul):
        plt.text(attempts, fouls, str(team), rotation=25)
    # Pass the colour as a one-row 2-D sequence: a bare RGBA array of 4 floats is
    # ambiguous for `c` and is colormapped as 4 values when the cluster has 4 teams.
    plt.scatter(subset.Attempt, subset.Foul, s=subset.Substitution*10, c=[color], label='cluster'+str(label), alpha=0.5)

plt.legend()
plt.title('Clusters')
plt.xlabel('Attempt')
plt.ylabel('Foul')

Although we defined more clusters here, we can still see the 3 main clusters that we found previously. The similarity among the top teams (Real Madrid, Barcelona and Bayern) may result from their dominant style of play in their national leagues, which implies higher ball possession, more attempts, more passes, etc. The similarity among the English teams is more likely explained by peculiarities of how statistics are gathered in that league than by a shared play style, as the cluster contains both top and non-top teams.