In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the daily match data CSV from the Kaggle input dataset into a DataFrame.
match_data_path = '../input/dream11-ipl2020-live/Daily_matchdata.csv'
daily_match_df = pd.read_csv(match_data_path)

In [None]:
daily_match_df.head()

In [None]:
sns.heatmap(daily_match_df.isna())

In [None]:
# A missing 'Dismissal' value is taken to mean the player did NOT bat in that innings,
# so those nulls are labelled 'DNB' (Did Not Bat).
dismissal_filled = daily_match_df['Dismissal'].fillna('DNB')
daily_match_df = daily_match_df.assign(Dismissal=dismissal_filled)

In [None]:
# A missing 'Runs' / 'Wickets' value is taken to mean the player did not bat / bowl in
# that innings; mark those entries with the sentinel -1 so they can be filtered out later.
for stat_column in ('Runs', 'Wickets'):
    daily_match_df[stat_column] = daily_match_df[stat_column].fillna(-1)

In [None]:
# Remove the row labelled 1320, a null row that ended up in the dataset by mistake.
# Reassigning (instead of inplace=True) and passing errors='ignore' keeps this cell
# re-runnable: a second execution is a no-op rather than a KeyError for the missing label.
daily_match_df = daily_match_df.drop(index=[1320], errors='ignore')

In [None]:
# Normalise the dismissal label 'BOLD' to 'BOWLED'.
dismissal_normalised = daily_match_df['Dismissal'].replace({'BOLD': 'BOWLED'})
daily_match_df = daily_match_df.assign(Dismissal=dismissal_normalised)

In [None]:
sns.heatmap(daily_match_df.isna())
plt.show()

All missing values have now been imputed.

## Players

### Number of players who played in IPL 2020

In [None]:
print("Total number of players that has been in playing 11 atleast once: ",len(daily_match_df['Player'].unique()))

### Number of players used by each team in IPL 2020

In [None]:
# Count the distinct players fielded by each team and show the teams from fewest to most.
players_per_team = daily_match_df.groupby('Team')['Player'].unique().map(len)
temp_df = players_per_team.to_frame(name='No. of Players Used')
temp_df.sort_values('No. of Players Used')

In [None]:
# Bar colours for the per-team charts, applied in the order the bars are drawn.
# NOTE(review): presumably one colour per team in alphabetical team order — confirm.
colors = [
    'yellow',
    'deepskyblue',
    'indigo',
    'indianred',
    'blue',
    'red',
    'violet',
    'orangered',
]

In [None]:
# Bar chart of the number of players used by each team, teams ordered by index (team name).
players_by_team = temp_df.sort_index()
plt.figure(figsize=(8,6))
sns.barplot(
    x=players_by_team.index,
    y=players_by_team['No. of Players Used'],
    palette=colors,
)
plt.title("Number players used by each Team")
plt.xlabel('Team')
plt.show()

#### MI used the least number of players whereas SRH used the most number of players.

## Batting

In [None]:
# Split the match data into batting and bowling records. -1 is the sentinel written
# earlier for "did not bat" / "did not bowl", so rows carrying it are excluded.
# .copy() makes each subset an independent frame: later cells assign into bowling_df,
# and writing to a filtered slice of daily_match_df triggers SettingWithCopyWarning.
batting_df = daily_match_df[daily_match_df['Runs'] != -1].copy()  # all players who batted in IPL 2020
bowling_df = daily_match_df[daily_match_df['Wickets'] != -1].copy()  # all players who bowled in IPL 2020

In [None]:
# Per-batsman season summary, collected as parallel lists (one entry per player, in order
# of first appearance in batting_df) that the next cell assembles into batting_stats_df.
batsman = []               # player name
total_runs = []            # sum of 'Runs' over the player's batting rows
batsman_team = []          # 'Team' value from the player's first batting row
no_of_matches_batted = []  # number of batting rows for the player
batting_avg = []           # runs per dismissal, rounded to 2 decimals
no_of_NO = []              # batting rows whose 'Dismissal' is 'NO'
freq_out = []              # most frequent dismissal type, or 'Never Out'
for player in batting_df['Player'].unique():
    player_innings = batting_df[batting_df['Player'] == player]
    dismissed_innings = player_innings[player_innings['Dismissal'] != 'NO']

    runs = player_innings['Runs'].sum()
    innings_batted = len(player_innings)
    times_out = len(dismissed_innings)

    batsman.append(player)
    total_runs.append(runs)
    no_of_matches_batted.append(innings_batted)
    no_of_NO.append(innings_batted - times_out)

    # Batting average is runs divided by dismissals; for a player who was never
    # dismissed the division is undefined, so the run total itself is recorded.
    batting_avg.append(round(runs / times_out, 2) if times_out else runs)

    # Read the team by column label (as the bowling summary does) rather than by
    # column position, so the lookup does not depend on the CSV's column order.
    batsman_team.append(player_innings['Team'].iloc[0])

    freq_out.append(dismissed_innings['Dismissal'].mode()[0] if times_out else 'Never Out')
    
    

In [None]:
# Assemble the per-player lists built by the batting loop into one summary table.
batting_columns = {
    'Player': batsman,
    'Team': batsman_team,
    'Number of matches batted': no_of_matches_batted,
    'Not Outs': no_of_NO,
    'Total Runs': total_runs,
    'Batting Avg': batting_avg,
    'Most frequent dissmisal type': freq_out,
}
batting_stats_df = pd.DataFrame(batting_columns)

In [None]:
# Make the player name the row index (this also removes 'Player' from the columns).
batting_stats_df = batting_stats_df.set_index('Player')

### Top 5 Players with Highest Batting Average in IPL 2020

In [None]:
batting_stats_df.sort_values('Batting Avg',ascending=False).head(5)

In [None]:
batting_stats_df.sort_values('Batting Avg',ascending=False).head(10)['Batting Avg'].plot(kind='bar')
plt.title('Top 10 Batting Average')
plt.ylabel('Batting Average')
plt.show()

### Top 5 Run Scorers in IPL 2020

In [None]:
batting_stats_df.sort_values('Total Runs',ascending=False).head(5)

In [None]:
batting_stats_df.sort_values('Total Runs',ascending=False).head(10)['Total Runs'].plot(kind='bar')
plt.title('Top 10 Run Scorers')
plt.ylabel('Runs')
plt.show()

### Top 5 Players with Most Number of Not Outs

In [None]:
batting_stats_df.sort_values('Not Outs',ascending=False).head()

In [None]:
batting_stats_df.sort_values('Not Outs',ascending=False).head(10)['Not Outs'].plot(kind='bar')
plt.title('Top 10 with most Not Outs')
plt.ylabel('Number of Not Outs')
plt.show()

### Total Runs Scored by each team

In [None]:
pd.DataFrame(batting_df.groupby(['Team'])['Runs'].sum())

In [None]:
batting_df.groupby(['Team'])['Runs'].sum().plot(kind='bar')
plt.title('Runs Scored by Each Team')
plt.ylabel('Runs Scored')
plt.show()

### Runs Scored by Left- and Right-handed players in each role

In [None]:
temp_df = batting_df.groupby(['Role','RH/LH'])['Runs'].sum()

In [None]:
temp_df.unstack()

In [None]:
temp_df.unstack().plot(kind='bar',stacked=True)
plt.title('Runs Scored Left and Right handed player in each role')
plt.ylabel('Runs Scored')
plt.show()

### Runs Scored by players having different roles for each team

In [None]:
temp_df = batting_df.groupby(['Team','Role'])['Runs'].sum()
temp_df.unstack()

In [None]:
temp_df.unstack().plot(kind='bar',stacked=False,figsize=(16,8))
plt.title('Runs Scored by players having different roles for each team')
plt.ylabel('Runs Scored')
plt.show()

### Top Run Scorers While Chasing a Target

In [None]:
temp_df = batting_df.groupby(['Player','Match_Type'])['Runs'].sum()

In [None]:
temp_df.unstack().sort_values('Chasing',ascending=False).head(5)

In [None]:
temp_df.unstack().sort_values('Chasing',ascending=False)['Chasing'].head(10).plot(kind='bar')
plt.title('Top 10 Run Scorer while Chasing a Target')
plt.ylabel('Runs')
plt.show()

### Top Run Scorers while Setting a Target

In [None]:
temp_df.unstack().sort_values('Defending',ascending=False).head(5)

In [None]:
# Bar chart of the ten players with the most runs in 'Defending' matches.
# temp_df here is the (Player, Match_Type) run totals computed a few cells earlier.
temp_df.unstack().sort_values('Defending',ascending=False)['Defending'].head(10).plot(kind='bar')
plt.title('Top 10 Run Scorer while Setting a Target')  # title typo 'Seting' corrected
plt.ylabel('Runs')
plt.show()

### Top Run Scorers in Each Stadium

In [None]:
temp_df = batting_df.groupby(['Player','Ground'])['Runs'].sum()
temp_df = temp_df.unstack()

In [None]:
temp_df.head()

### Top Run Scorers in Dubai Stadium

In [None]:
temp_df.sort_values('Dubai',ascending=False).head()

In [None]:
temp_df.sort_values('Dubai',ascending=False)['Dubai'].head(10).plot(kind='bar')
plt.title('Top 10 Run Scorers in Dubai Stadium')
plt.ylabel('Runs')
plt.show()

### Top Run Scorers in Sharjah Stadium

In [None]:
temp_df.sort_values('Sharjah',ascending=False).head()

In [None]:
temp_df.sort_values('Sharjah',ascending=False)['Sharjah'].head(10).plot(kind='bar')
plt.title('Top 10 Run Scorers in Sharjah Stadium')
plt.ylabel('Runs')
plt.show()

### Top Run Scorers in Sheikh Zayed Stadium

In [None]:
temp_df.sort_values('Sheikh Zayed',ascending=False).head()

In [None]:
# Bar chart of the ten players with the most runs at the 'Sheikh Zayed' ground.
temp_df.sort_values('Sheikh Zayed',ascending=False)['Sheikh Zayed'].head(10).plot(kind='bar')
plt.title('Top 10 Run Scorers in Sheikh Zayed Stadium')
# The y-axis shows runs; it was mislabelled 'Dubai' (copy-paste slip from the Dubai chart).
plt.ylabel('Runs')
plt.show()

### Players Who Scored 100 or More Runs in All 3 Stadiums

In [None]:
temp_df[(temp_df['Dubai']>99) & (temp_df['Sharjah']>99) & (temp_df['Sheikh Zayed']>99)]

### Most Consistent Players in IPL 2020 who have a batting average above 35 runs and batted in at least 7 matches

In [None]:
# Spread of each batsman's scores: np.std of the player's 'Runs' values, listed in the
# same order as batting_stats_df's index so it can be attached as a column afterwards.
std = [
    np.std(batting_df.loc[batting_df['Player'] == name, 'Runs'])
    for name in batting_stats_df.index
]

In [None]:
df1 = batting_stats_df.copy()

In [None]:
df1['Standard Deviation'] = std

In [None]:
df1[(df1['Batting Avg'] > 35) & (df1['Number of matches batted'] > 6)].sort_values('Standard Deviation')

### Top Run Scorers of Each Team

In [None]:
batting_stats_df

In [None]:
# Within each team, order the batsmen by total runs (highest first), then keep the
# first three 'Total Runs' entries per team.
runs_sorted_within_team = batting_stats_df.groupby(['Team']).apply(
    lambda team_rows: team_rows.sort_values(['Total Runs'], ascending=False)
)
top_3_run_getters_of_each_team = (
    runs_sorted_within_team['Total Runs'].groupby('Team').head(3).to_frame()
)

In [None]:
top_3_run_getters_of_each_team

## Bowling

In [None]:
bowling_df

In [None]:
bowling_df.info()

In [None]:
bowling_df['Wickets'].value_counts()

In [None]:
# Replace the blank ' ' entries in 'Wickets' with 3.
# A single .loc[row_mask, column] assignment is required here: the chained form
# bowling_df[mask]['Wickets'] = 3 assigns into a temporary copy and leaves bowling_df unchanged.
bowling_df.loc[bowling_df['Wickets'] == ' ', 'Wickets'] = 3

In [None]:
bowling_df[bowling_df['Wickets']==' ']

In [None]:
# Set 'Wickets' to 3 for the row labelled 1317.
# NOTE(review): 1317 is a hard-coded row label, presumably the row holding the blank ' '
# entry inspected just above — confirm against the data. If that label were absent,
# .loc assignment would append a new row instead of correcting an existing one.
bowling_df.loc[1317,'Wickets'] = 3

In [None]:
bowling_df['Wickets'] = bowling_df['Wickets'].astype('int')

In [None]:
bowling_df.info()

In [None]:
# Per-bowler wicket tallies, collected as parallel lists (one entry per bowler, in order
# of first appearance in bowling_df) for the bowling summary table built in the next cell.
player = []
no_of_matches = []
tot_wicks = []
wicks_1st_inns = []
wicks_2nd_inns = []
wicks_sharjah = []
wicks_dubai = []
wicks_sheik = []
team = []


def _sum_wickets(rows, column, value):
    """Return the sum of 'Wickets' over the rows of `rows` whose `column` equals `value`."""
    return np.sum(rows[rows[column] == value]['Wickets'])


for bowler in bowling_df['Player'].unique():
    bowler_df = bowling_df[bowling_df['Player'] == bowler]

    player.append(bowler)
    team.append(bowler_df['Team'].values[0])
    no_of_matches.append(bowler_df.shape[0])

    # 'Chasing' rows are counted as 1st-innings wickets and 'Defending' rows as
    # 2nd-innings wickets; the season total is the sum of the two.
    first_inns_wickets = _sum_wickets(bowler_df, 'Match_Type', 'Chasing')
    second_inns_wickets = _sum_wickets(bowler_df, 'Match_Type', 'Defending')
    wicks_1st_inns.append(first_inns_wickets)
    wicks_2nd_inns.append(second_inns_wickets)
    tot_wicks.append(first_inns_wickets + second_inns_wickets)

    # Wickets broken down by ground.
    wicks_sharjah.append(_sum_wickets(bowler_df, 'Ground', 'Sharjah'))
    wicks_dubai.append(_sum_wickets(bowler_df, 'Ground', 'Dubai'))
    wicks_sheik.append(_sum_wickets(bowler_df, 'Ground', 'Sheikh Zayed'))
    
    

In [None]:
# Assemble the per-bowler lists into one summary table indexed by bowler name.
bowling_columns = {
    'Team': team,
    'No. of matches bowled': no_of_matches,
    'Total Wickets': tot_wicks,
    'Wickets in 1st inns': wicks_1st_inns,
    'Wickets in 2nd inns': wicks_2nd_inns,
    'Wickets in Sharjah': wicks_sharjah,
    'Wickets in Dubai': wicks_dubai,
    'Wickets in Sheikh Zayed': wicks_sheik,
}
bowling_stats_df = pd.DataFrame(bowling_columns, index=player)

In [None]:
bowling_stats_df = bowling_stats_df.sort_values('Total Wickets',ascending = False)

In [None]:
bowling_stats_df.head()

### Top 5 wicket takers in IPL 2020

In [None]:
# Bar chart of the first five rows of bowling_stats_df by 'Total Wickets'
# (the frame was sorted by that column, descending, in an earlier cell).
plt.figure(figsize=(8,5))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=bowling_stats_df.index[:5], y=bowling_stats_df['Total Wickets'][:5])
plt.xlabel('Bowler')
plt.title('Top 5 wicket takers in IPL 2020')
plt.show()

### Top 5 wicket takers in the 1st innings

In [None]:
# Bar chart of the five bowlers with the most 'Wickets in 1st inns'.
temp_df = bowling_stats_df.sort_values('Wickets in 1st inns',ascending = False)
plt.figure(figsize=(8,5))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=temp_df.index[:5], y=temp_df['Wickets in 1st inns'][:5])
plt.xlabel('Bowler')
plt.title('Top 5 wicket takers in the 1st innings')
plt.show()

### Top 5 wicket takers in the 2nd innings

In [None]:
# Bar chart of the five bowlers with the most 'Wickets in 2nd inns'.
temp_df = bowling_stats_df.sort_values('Wickets in 2nd inns',ascending = False)
plt.figure(figsize=(8,5))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=temp_df.index[:5], y=temp_df['Wickets in 2nd inns'][:5])
plt.xlabel('Bowler')
plt.title('Top 5 wicket takers in the 2nd innings')
plt.show()

### Top Wicket Takers in each Stadium

In [None]:
# Bar chart of the five bowlers with the most 'Wickets in Sharjah'.
plt.figure(figsize=(8,5))
temp_df = bowling_stats_df.sort_values('Wickets in Sharjah',ascending = False)
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=temp_df.index[:5], y=temp_df['Wickets in Sharjah'][:5])
plt.xlabel('Bowler')
plt.title('Top 5 wicket takers in Sharjah')
plt.show()

#### All the top wicket takers in Sharjah are pace bowlers

In [None]:
# Bar chart of the five bowlers with the most 'Wickets in Dubai'.
plt.figure(figsize=(8,5))
temp_df = bowling_stats_df.sort_values('Wickets in Dubai',ascending = False)
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=temp_df.index[:5], y=temp_df['Wickets in Dubai'][:5])
plt.xlabel('Bowler')
plt.title('Top 5 wicket takers in Dubai')
plt.show()

#### 4 out of 5 top wicket takers in Dubai are pace bowlers.

In [None]:
# Bar chart of the five bowlers with the most 'Wickets in Sheikh Zayed'.
plt.figure(figsize=(8,5))
temp_df = bowling_stats_df.sort_values('Wickets in Sheikh Zayed',ascending = False)
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=temp_df.index[:5], y=temp_df['Wickets in Sheikh Zayed'][:5])
plt.xlabel('Bowler')
plt.title('Top 5 wicket takers in Sheikh Zayed')
plt.show()

#### 3 out of top 5 wicket takers in Sheikh Zayed stadium are spin bowlers

### Total Wickets Taken by each Team

In [None]:
pd.DataFrame(bowling_stats_df.groupby('Team')['Total Wickets'].sum())

In [None]:
# Bar chart of the summed 'Total Wickets' of each team's bowlers.
team_total_wickets = bowling_stats_df.groupby('Team')['Total Wickets'].sum()
plt.figure(figsize=(8,5))
sns.barplot(x=team_total_wickets.index, y=team_total_wickets, palette=colors)
plt.title('Total Wickets Taken by each Team')
plt.show()

### Wickets Taken by each Team in 1st Innings

In [None]:
pd.DataFrame(bowling_stats_df.groupby('Team')['Wickets in 1st inns'].sum())

In [None]:
# Bar chart of the summed 'Wickets in 1st inns' of each team's bowlers.
team_first_inns_wickets = bowling_stats_df.groupby('Team')['Wickets in 1st inns'].sum()
plt.figure(figsize=(8,5))
sns.barplot(x=team_first_inns_wickets.index, y=team_first_inns_wickets, palette=colors)
plt.title('Total Wickets Taken by each Team in 1st Inns')
plt.show()

### Wickets Taken by each Team in 2nd Innings

In [None]:
pd.DataFrame(bowling_stats_df.groupby('Team')['Wickets in 2nd inns'].sum())

In [None]:
# Bar chart of the summed 'Wickets in 2nd inns' of each team's bowlers.
team_second_inns_wickets = bowling_stats_df.groupby('Team')['Wickets in 2nd inns'].sum()
plt.figure(figsize=(8,5))
sns.barplot(x=team_second_inns_wickets.index, y=team_second_inns_wickets, palette=colors)
plt.title('Total Wickets Taken by each Team in 2nd Inns')
plt.show()

### Top 3 Wicket Takers of each Team

In [None]:
# Within each team, order the bowlers by total wickets (highest first), then keep the
# first three 'Total Wickets' entries per team and display them.
wickets_sorted_within_team = bowling_stats_df.groupby('Team').apply(
    lambda team_rows: team_rows.sort_values(['Total Wickets'], ascending=False)
)
top_3_wick_takers_of_each_team = (
    wickets_sorted_within_team['Total Wickets'].groupby('Team').head(3).to_frame()
)
top_3_wick_takers_of_each_team

### Total Wickets taken in IPL 2020

In [None]:
print('Total Wickets Taken in IPL 2020: ',bowling_stats_df['Total Wickets'].sum())

In [None]:
print('Total Wickets Taken in 1st Innings of IPL 2020: ',bowling_stats_df['Wickets in 1st inns'].sum())

In [None]:
print('Total Wickets Taken in 2nd Innings of IPL 2020: ',bowling_stats_df['Wickets in 2nd inns'].sum())