In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read the EPL 20/21 CSV from the Kaggle input directory into the working DataFrame.
EPL_CSV_PATH = '../input/english-premier-league202021/EPL_20_21.csv'
df = pd.read_csv(EPL_CSV_PATH)

In [3]:
# Preview the first ten rows of the freshly loaded table.
df.iloc[:10]

In [4]:
# Summarise the frame: column names, dtypes, non-null counts and memory usage.
df.info()

In [5]:
# Derive two per-match columns from the existing season totals.

# Minutes per match; the int cast truncates the fractional part.
df['MinPerMatch'] = df['Mins'].div(df['Matches']).astype(int)
# Goals per match, kept as a float ratio.
df['GoalsPerMatch'] = df['Goals'].div(df['Matches']).astype(float)
                     

In [6]:
# Preview the first ten rows again, now including the derived per-match columns.
df[:10]

In [7]:
# Total goals scored in the season (sum of every player's 'Goals' value).
total_goals = df['Goals'].sum()
# Print the result so the cell actually reports the number it computes.
print(total_goals)

In [8]:
# Season-wide penalty totals: goals scored from penalties, then penalties attempted.
total_penalty_scored = df['Penalty_Goals'].sum()
penalty_attempted = df['Penalty_Attempted'].sum()

# One value per line, scored first and attempted second.
print(total_penalty_scored, penalty_attempted, sep='\n')

In [9]:
# Pie chart comparing penalty kicks missed with penalty kicks scored.

# Missed = attempted minus scored.
not_scored = penalty_attempted - total_penalty_scored
data = [not_scored, total_penalty_scored]
labels = ['Penalty Kicks Missed', 'Penalty Kicks Scored']
color = sns.color_palette("hls", 8)

plt.figure(figsize=(15, 8))
plt.pie(data, labels=labels, colors=color, autopct='%.0f%%')
plt.show()

In [10]:
# Show every row whose Position is 'GK' (the goalkeepers in the league).
df.loc[df['Position'].eq('GK')]

In [11]:
# Number of distinct nationalities represented among the players.
# dropna=False keeps a missing value counted as its own entry, matching unique().

df['Nationality'].nunique(dropna=False)

In [12]:
# Bar chart of the 15 most represented nationalities in the league.

players_per_nation = df.groupby('Nationality').size()
nationality = players_per_nation.sort_values(ascending=False)
top_nations = nationality.iloc[:15]
top_nations.plot.bar(figsize=(15, 8), color=sns.color_palette("tab10"))

In [13]:
# Split the players into age brackets and print how many fall into each one.
age = df['Age']

# Bracket bounds are exclusive below and inclusive above,
# so 'under18' also contains players who are exactly 18.
under18 = df[age.le(18)]
age18to25 = df[age.gt(18) & age.le(25)]
age25to30 = df[age.gt(25) & age.le(30)]
age30to35 = df[age.gt(30) & age.le(35)]
above35 = df[age.gt(35)]

# count() on 'Name' tallies the rows with a non-null name in each bracket.
under18_count = under18['Name'].count()
age18to25_count = age18to25['Name'].count()
age25to30_count = age25to30['Name'].count()
age30to35_count = age30to35['Name'].count()
above35_count = above35['Name'].count()

# Print the counts from the youngest bracket to the oldest, one per line.
for bracket_count in (under18_count, age18to25_count, age25to30_count,
                      age30to35_count, above35_count):
    print(bracket_count)

In [14]:
# Pie chart showing how the players are distributed over the age brackets.
mylabels = ["Age <= 18", "Age > 18 &  <=25", "Age > 25 &  <=30", "Age > 30 &  <=35", "Age > 35"]
x = np.array([
    under18_count,
    age18to25_count,
    age25to30_count,
    age30to35_count,
    above35_count,
])

plt.pie(x, labels=mylabels, autopct="%.0f%%")
plt.title('Total Players in Each Age-Group', fontsize=23)
plt.show()

In [15]:
# Bar chart of how many players younger than 20 each club has.

players_20 = df.loc[df['Age'].lt(20)]
under20_per_club = players_20['Club'].value_counts()
under20_per_club.plot.bar(figsize=(15, 8), color=sns.color_palette("tab10"))

In [16]:
# Age distribution of players in each club.
# A box plot shows the median, quartiles and outliers per club, not the mean.
plt.figure(figsize=(15,8))
sns.boxplot(x='Club', y='Age', data=df)
plt.title('Age Distribution of Players by Club')
plt.xticks(rotation=90)
# Render explicitly so the cell shows only the figure, not the tick-label repr.
plt.show()

In [17]:
# Total number of assists by each club, drawn as a bar chart sorted ascending.

# as_index=False already returns a DataFrame with 'Club' and 'Assists' columns,
# so no extra pd.DataFrame(...) wrapper is needed.
assists_club = df.groupby('Club', as_index=False)['Assists'].sum()
sns.set_theme(style="whitegrid", color_codes=True)

# The figure size has to be fixed BEFORE anything is drawn. Setting rcParams
# after the plot (as this cell used to) only affects figures created later,
# so the first run rendered at the default size and a re-run looked different.
plt.figure(figsize=(25, 8))
# Kept so that figures created after this cell still default to the same size.
plt.rcParams["figure.figsize"] = (25,8)

ac = sns.barplot(x='Club', y='Assists', data=assists_club.sort_values(by="Assists"), palette="tab10")
ac.set_xlabel("Club", fontsize=30)
ac.set_ylabel("Assists", fontsize=25)
plt.xticks(rotation=75)
plt.title('Bar Chart of Clubs and Total Assists by its Players', fontsize=25)
# Render explicitly so the cell shows only the figure, not the Text repr of the title.
plt.show()

In [18]:
# Five players with the most assists, shown with their club and matches played.

summary_columns = ['Name', 'Club', 'Assists', 'Matches']
top5_assists = df.nlargest(5, 'Assists')[summary_columns]
top5_assists

In [19]:
# Total goals scored by each club, drawn as a bar chart sorted ascending.

# as_index=False already returns a DataFrame with 'Club' and 'Goals' columns,
# so no extra pd.DataFrame(...) wrapper is needed.
goals_club = df.groupby('Club', as_index=False)['Goals'].sum()
sns.set_theme(style="whitegrid", color_codes=True)

# The figure size has to be fixed BEFORE anything is drawn. Setting rcParams
# after the plot (as this cell used to) only affects figures created later,
# so this chart's size depended on which cells had been run beforehand.
plt.figure(figsize=(25, 8))
# Kept so that figures created after this cell still default to the same size.
plt.rcParams["figure.figsize"] = (25,8)

ac = sns.barplot(x='Club', y='Goals', data=goals_club.sort_values(by="Goals"), palette="tab10")
ac.set_xlabel("Club", fontsize=30)
ac.set_ylabel("Goals", fontsize=25)
plt.xticks(rotation=75)
plt.title('Bar Chart of Clubs and Total Goals by its Players', fontsize=25)
# Render explicitly so the cell shows only the figure, not the Text repr of the title.
plt.show()

In [20]:
# Ten highest goal scorers, shown with their club and matches played.

scorer_columns = ['Name', 'Club', 'Goals', 'Matches']
top10_goals = df.nlargest(10, 'Goals')[scorer_columns]
top10_goals

In [21]:
# Top 10 players with the most yellow cards.

# nlargest matches the other top-N cells in this notebook and breaks ties by
# row order; sort_values with the default sort gives no guaranteed tie order,
# so the players at the cut-off could vary.
yellow = df.nlargest(10, 'Yellow_Cards')
plt.figure(figsize=(25,8))
plt.title("Top 10 Players with Most Yellow Cards")
# No legend is drawn, so the bars do not need a label.
z = sns.barplot(x=yellow['Name'], y=yellow['Yellow_Cards'], color='Yellow')
plt.ylabel('Number of Yellow Cards')
# Tilt the player names so they do not overlap.
plt.xticks(rotation=45)
# Render explicitly so the cell shows only the figure, not the Axes repr.
plt.show()