In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Read data

In [None]:
# Load the processed data CSV and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../inputs/final_processed_data.csv')
df.head()

# Print general statistics

In [None]:
# print statistics: row count, then describe() for every column of interest
QUARTILES = [.25, .50, .75]
summary_columns = [
    'height',
    'matches_year1',
    'goals_year1',
    'goals_per_match_year1',
    'matches_year2',
    'goals_year2',
    'goals_per_match_year2',
    'goals_per_match_year3',
]

print("Data shape:", len(df), "values", "\n")
for column in summary_columns:
    print(df[column].describe(percentiles=QUARTILES), "\n")

In [None]:
# print statistics by field position: number of distinct player names per flag
# each pair is (indicator column in df, label used in the printed message)
position_flags = (
    ('center', 'centre'),
    ('back', 'back'),
    ('wing', 'wing'),
    ('line', 'line'),
)
for flag, label in position_flags:
    distinct_names = df.loc[df[flag] == 1, 'name'].nunique()
    print("Number of {} players:".format(label), distinct_names)

In [None]:
# separating players by position: one frame per indicator column set to 1
centre_df, back_df, wing_df, line_df = (
    df.loc[df[flag] == 1] for flag in ('center', 'back', 'wing', 'line')
)

# Density plots

In [None]:
# density plots for all players (parametric distribution)
#
# bars       : histogram normalised to a density
# blue line  : Gaussian kernel density estimate (KDE)
# black line : PDF of a gamma distribution fitted to the same values
#              (a skewed parametric fit, not a normal distribution)

def _plot_density(values, title, xlabel):
    """Show a density histogram of `values` with a KDE and a fitted gamma PDF.

    Stands in for `sns.distplot(values, kde=True, fit=stats.gamma)`, which
    seaborn has deprecated, using only matplotlib and scipy.

    Parameters
    ----------
    values : pandas.Series
        Numeric observations; missing values are dropped before fitting.
    title : str
        Title of the figure.
    xlabel : str
        Label of the x axis (the y axis is always labelled 'density').
    """
    sample = np.asarray(values.dropna(), dtype=float)
    _, ax = plt.subplots()
    ax.hist(sample, bins='auto', density=True, alpha=0.4, color='C0')
    # evaluate both curves over the x-range the histogram occupies
    grid = np.linspace(*ax.get_xlim(), 200)
    ax.plot(grid, stats.gaussian_kde(sample)(grid), color='C0')
    ax.plot(grid, stats.gamma.pdf(grid, *stats.gamma.fit(sample)), color='black')
    ax.set(title=title, xlabel=xlabel, ylabel='density')
    plt.show()


# matches played
_plot_density(df['matches_year1'], 'Matches played density plot', 'matches played')

# goals scored (only rows with goals_year1 > 0)
_plot_density(
    df['goals_year1'][df['goals_year1'] > 0],
    'Goals scored per season density plot',
    'goals scored',
)

# goals scored per match
_plot_density(
    df['goals_per_match_year1'],
    'Goals scored per match density plot',
    'goals scored per match',
)

# Box plots

In [None]:
def compute_position(row):
    """Return the position label of a row based on its indicator columns.

    The flags are checked in the order center, back, wing, line and the label
    of the first one equal to 1 is returned ('centre', 'back', 'wing' or
    'line'). Returns None when none of the checked flags equals 1.
    """
    flag_to_label = (
        ('center', 'centre'),
        ('back', 'back'),
        ('wing', 'wing'),
        ('line', 'line'),
    )
    for flag, label in flag_to_label:
        if row[flag] == 1:
            return label
    return None
    
# Work on a copy so df itself does not gain the extra 'position' column.
copy_df = df.copy()
copy_df['position'] = copy_df.apply(compute_position, axis=1)

# (column to plot, title, y-axis limits or None to keep the automatic range)
boxplot_specs = (
    ('goals_year1', 'Goals scored during a season boxplot by position', None),
    ('goals_per_match_year1', 'Goals scored per match boxplot by position', (0, 10)),
)
for column, title, y_limits in boxplot_specs:
    copy_df.boxplot(column=column, by='position')
    plt.title(title)
    plt.suptitle('')  # blank out the figure-level super-title
    if y_limits is not None:
        plt.ylim(*y_limits)
    plt.show()

# mean goals and mean goals per match for each position (only means are printed)
position_frames = (
    ('centre', centre_df),
    ('back', back_df),
    ('wing', wing_df),
    ('line', line_df),
)
for label, frame in position_frames:
    print('Average goals {} players:'.format(label), frame['goals_year1'].mean())
print()
for label, frame in position_frames:
    print('Average goals/match {} players:'.format(label), frame['goals_per_match_year1'].mean())

# Cat plots of height to goals per match

In [None]:
# cat plot of height to goals per game - all players
# No plt.figure() here: sns.catplot is figure-level and creates its own figure,
# so a preceding plt.figure() only adds an empty figure to the cell output.
sns.catplot(x="goals_per_match_year1", y="height", data=df[["height", "goals_per_match_year1"]])
plt.title('Catplot - height to goals per match - all players')
# NOTE(review): catplot places x values as categories, so the log scale acts on
# category positions rather than on goals per match - confirm this is intended.
plt.xscale('log')
plt.show()

In [None]:
# cat plot of height to goals per game - centre players
# No plt.figure() here: sns.catplot is figure-level and creates its own figure,
# so a preceding plt.figure() only adds an empty figure to the cell output.
sns.catplot(x="goals_per_match_year1", y="height", data=centre_df[["height", "goals_per_match_year1"]])
plt.title('Catplot - height to goals per match - centre players')
# NOTE(review): catplot places x values as categories, so the log scale acts on
# category positions rather than on goals per match - confirm this is intended.
plt.xscale('log')
plt.show()

In [None]:
# cat plot of height to goals per game - back players
# No plt.figure() here: sns.catplot is figure-level and creates its own figure,
# so a preceding plt.figure() only adds an empty figure to the cell output.
sns.catplot(x="goals_per_match_year1", y="height", data=back_df[["height", "goals_per_match_year1"]])
plt.title('Catplot - height to goals per match - back players')
# NOTE(review): catplot places x values as categories, so the log scale acts on
# category positions rather than on goals per match - confirm this is intended.
plt.xscale('log')
plt.show()

In [None]:
# cat plot of height to goals per game - wing players
# No plt.figure() here: sns.catplot is figure-level and creates its own figure,
# so a preceding plt.figure() only adds an empty figure to the cell output.
sns.catplot(x="goals_per_match_year1", y="height", data=wing_df[["height", "goals_per_match_year1"]])
plt.title('Catplot - height to goals per match - wing players')
# NOTE(review): catplot places x values as categories, so the log scale acts on
# category positions rather than on goals per match - confirm this is intended.
plt.xscale('log')
plt.show()

In [None]:
# cat plot of height to goals per game - line players
# No plt.figure() here: sns.catplot is figure-level and creates its own figure,
# so a preceding plt.figure() only adds an empty figure to the cell output.
sns.catplot(x="goals_per_match_year1", y="height", data=line_df[["goals_per_match_year1", "height"]])
plt.title('Catplot - height to goals per match - line players')
# NOTE(review): catplot places x values as categories, so the log scale acts on
# category positions rather than on goals per match - confirm this is intended.
plt.xscale('log')
plt.show()

# Cat plots of matches played to goals per match

In [None]:
# cat plot of matches played to goals per match - all players
# No plt.figure() here: sns.catplot is figure-level and creates its own figure,
# so a preceding plt.figure() only adds an empty figure to the cell output.
sns.catplot(x="matches_year1", y="goals_per_match_year1", data=df[["matches_year1", "goals_per_match_year1"]])
plt.title('Catplot - matches played to goals per match - all players')
# NOTE(review): catplot places x values as categories, so the log scale and the
# x limits act on category positions, not on match counts - confirm intended.
plt.xscale('log')
plt.xlim(0.6, 60)
plt.ylim(0, 10)
plt.show()

In [None]:
# cat plot of matches played to goals per match - centre players
# No plt.figure() here: sns.catplot is figure-level and creates its own figure,
# so a preceding plt.figure() only adds an empty figure to the cell output.
sns.catplot(x="matches_year1", y="goals_per_match_year1", data=centre_df[["matches_year1", "goals_per_match_year1"]])
plt.title('Catplot - matches played to goals per match - centre players')
# NOTE(review): catplot places x values as categories, so the log scale and the
# x limits act on category positions, not on match counts - confirm intended.
plt.xscale('log')
plt.xlim(0.6, 60)
plt.ylim(0, 10)
plt.show()

In [None]:
# cat plot of matches played to goals per match - back players
# No plt.figure() here: sns.catplot is figure-level and creates its own figure,
# so a preceding plt.figure() only adds an empty figure to the cell output.
sns.catplot(x="matches_year1", y="goals_per_match_year1", data=back_df[["matches_year1", "goals_per_match_year1"]])
plt.title('Catplot - matches played to goals per match - back players')
# NOTE(review): catplot places x values as categories, so the log scale and the
# x limits act on category positions, not on match counts - confirm intended.
plt.xscale('log')
plt.xlim(0.6, 60)
plt.ylim(0, 10)
plt.show()

In [None]:
# cat plot of matches played to goals per match - wing players
# No plt.figure() here: sns.catplot is figure-level and creates its own figure,
# so a preceding plt.figure() only adds an empty figure to the cell output.
sns.catplot(x="matches_year1", y="goals_per_match_year1", data=wing_df[["matches_year1", "goals_per_match_year1"]])
plt.title('Catplot - matches played to goals per match - wing players')
# NOTE(review): catplot places x values as categories, so the log scale and the
# x limits act on category positions, not on match counts - confirm intended.
plt.xscale('log')
plt.xlim(0.6, 60)
plt.ylim(0, 10)
plt.show()

In [None]:
# cat plot of matches played to goals per match - line players
# No plt.figure() here: sns.catplot is figure-level and creates its own figure,
# so a preceding plt.figure() only adds an empty figure to the cell output.
sns.catplot(x="matches_year1", y="goals_per_match_year1", data=line_df[["matches_year1", "goals_per_match_year1"]])
plt.title('Catplot - matches played to goals per match - line players')
# NOTE(review): catplot places x values as categories, so the log scale and the
# x limits act on category positions, not on match counts - confirm intended.
plt.xscale('log')
plt.xlim(0.6, 60)
plt.ylim(0, 10)
plt.show()

# Correlation matrices

In [None]:
# correlation matrix - all players (position flags included)
all_players_columns = [
    'center', 'back', 'wing', 'line', 'height',
    'matches_year1', 'goals_year1', 'goals_per_match_year1',
    'matches_year2', 'goals_year2', 'goals_per_match_year2',
    'goals_per_match_year3',
]
all_players_corr = df[all_players_columns].corr()

plt.figure(figsize=[9,7])
plt.title('Correlation matrix - all players')
sns.heatmap(all_players_corr, annot=True)
plt.show()

In [None]:
# correlation matrix - centre players
centre_columns = [
    'height',
    'matches_year1', 'goals_year1', 'goals_per_match_year1',
    'matches_year2', 'goals_year2', 'goals_per_match_year2',
    'goals_per_match_year3',
]
centre_corr = centre_df[centre_columns].corr()

plt.figure(figsize=[8,6])
plt.title('Correlation matrix - centre players')
sns.heatmap(centre_corr, annot=True)
plt.show()

In [None]:
# correlation matrix - back players
back_columns = [
    'height',
    'matches_year1', 'goals_year1', 'goals_per_match_year1',
    'matches_year2', 'goals_year2', 'goals_per_match_year2',
    'goals_per_match_year3',
]
back_corr = back_df[back_columns].corr()

plt.figure(figsize=[8,6])
plt.title('Correlation matrix - back players')
sns.heatmap(back_corr, annot=True)
plt.show()

In [None]:
# correlation matrix - wing players
wing_columns = [
    'height',
    'matches_year1', 'goals_year1', 'goals_per_match_year1',
    'matches_year2', 'goals_year2', 'goals_per_match_year2',
    'goals_per_match_year3',
]
wing_corr = wing_df[wing_columns].corr()

plt.figure(figsize=[8,6])
plt.title('Correlation matrix - wing players')
sns.heatmap(wing_corr, annot=True)
plt.show()

In [None]:
# correlation matrix - line players
line_columns = [
    'height',
    'matches_year1', 'goals_year1', 'goals_per_match_year1',
    'matches_year2', 'goals_year2', 'goals_per_match_year2',
    'goals_per_match_year3',
]
line_corr = line_df[line_columns].corr()

plt.figure(figsize=[8,6])
plt.title('Correlation matrix - line players')
sns.heatmap(line_corr, annot=True)
plt.show()