In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# os.walk() expects a directory: pointed at the CSV file itself it yields
# nothing, so walk the dataset folder to actually list the available files.
for dirname, _, filenames in os.walk('../input/ipl-complete-dataset-20082020'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#  **Analyzing the IPL 2008-2020 Dataset**
* **Cleaning the data**
  * Handling missing values 
* **Analyzing and Visualizing**
  * Most Wins
  * Player with the most Man of the Match (MOM) awards
  * Toss Winners and Match winners/losers
  * No. of times the teams won both the toss and match
  * Venues preferable for chasing or defending

In [None]:
# Load the match-level IPL data and preview the first rows
matches_csv = '../input/ipl-complete-dataset-20082020/IPL Matches 2008-2020.csv'
df = pd.read_csv(matches_csv)
df.head()

# **Data Cleaning**

In [None]:
# Inspect each column's dtype and non-null count to see which columns need cleaning
df.info()

In [None]:
# Count the missing entries in every column
df.isna().sum()

In [None]:
# Visualise where the missing values sit: one heatmap cell per DataFrame cell,
# coloured by whether that entry is null
import seaborn as sns
sns.heatmap(data=df.isna(), cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# 'method' has only 19 non-null entries; a non-null entry records that the
# D/L method was applied to that match
methods = df['method'].to_frame()
methods.dropna()

In [None]:
# Replace the missing 'method' entries with the string 'None',
# then re-check the per-column null counts
df['method'] = df['method'].fillna(value='None')
df.isna().sum()

In [None]:
# Show the matches whose 'result' is missing
df.loc[df['result'].isna()]

In [None]:
# A missing 'result' means the match was called off, so there is no winner:
# mark these four columns with the string 'None' wherever they are null
no_result_cols = ['result', 'eliminator', 'player_of_match', 'winner']
for col in no_result_cols:
    df[col] = df[col].fillna('None')
df.isna().sum()

In [None]:
# Show the matches whose 'result_margin' is missing
# (the corresponding matches were tied)
df.loc[df['result_margin'].isna()]

In [None]:
# Fill the missing 'result_margin' values with the number 0.0.
# The fill value must be a float, not the string '0.0': a string would turn the
# whole column into object dtype and break numeric operations on the margin.
df['result_margin'] = df['result_margin'].fillna(0.0)
df.isnull().sum()

In [None]:
# Show the matches with no city recorded
# (matches held in the Sharjah and Dubai stadiums have an empty city column)
df.loc[df['city'].isna()]

In [None]:
# Fill the missing cities with Sharjah / Dubai according to the match venue.
df['venue'] = df['venue'].astype(str)
venue_to_city = {
    'Sharjah Cricket Stadium': 'Sharjah',
    'Dubai International Cricket Stadium': 'Dubai',
}
# Assign (not compare) the looked-up city, and only where the city is missing.
# 'city' is deliberately not cast to str first: that would turn NaN into the
# literal text 'nan' and hide any remaining gap from isnull(). Venues outside
# the mapping stay NaN so they still show up in the null counts.
df['city'] = df['city'].fillna(df['venue'].map(venue_to_city))

In [None]:
# Final check: every column should now report zero nulls
df.isna().sum()

# **Data Visualization**

# Most Wins

In [None]:
# Number of matches won by each team.
# 'None' is the placeholder written into 'winner' for matches without a winner;
# exclude it by value instead of slicing off the last row, which is only correct
# while 'None' happens to be the least frequent entry.
team_wins = df.loc[df['winner'] != 'None', 'winner']
most_wins = pd.DataFrame(team_wins.value_counts())
most_wins

In [None]:
# Donut chart of each team's share of all wins
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(13, 9))
ax = fig.gca()
ax.pie(
    most_wins.iloc[:, 0],
    labels=most_wins.index,
    autopct='%1.0f%%',
    pctdistance=0.85,
)
# A white disc over the centre turns the pie into a donut
circle = plt.Circle((0, 0), 0.7, color='white')
ax.add_artist(circle)
ax.set_title('Most wins', fontsize=23)
plt.show()
# Mumbai Indians hold the largest share of wins, followed by Chennai Super Kings and Kolkata Knight Riders.

# Player with most MOM

In [None]:
# Top 10 players by number of Man of the Match (MOM) awards.
# 'None' is the placeholder for matches with no award, so it is excluded;
# head(10) replaces the tuple-style slice `[:10,]`.
awarded = df.loc[df['player_of_match'] != 'None', 'player_of_match']
most_mom = awarded.value_counts().head(10)
plt.figure(figsize=(10,7))
sns.set_style('darkgrid')
# x/y must be passed by keyword: recent seaborn releases reject positional data
sns.barplot(x=most_mom.index, y=most_mom.values)
plt.title('Players with most MOM (top 10)')
plt.xlabel('Players')
plt.ylabel('Number of times')
plt.xticks(rotation=90)
plt.show()
#AB de Villiers has the most Man of the Match awards.

# Toss Impact

In [None]:
# How many tosses each team has won
toss = df.toss_winner.value_counts()
toss

In [None]:
# New column: the team's name when the toss winner also won the match, NaN otherwise
won_toss_and_match = df['toss_winner'].eq(df['winner'])
df['toss_&_actual_winner'] = df['winner'].where(won_toss_and_match)
df.head()


In [None]:
# Share of matches in which the toss winner did / did not also win the match
toss_viz = df['toss_winner'].eq(df['winner'])
tosss = toss_viz.value_counts().to_frame()
fig = plt.figure(figsize=(12, 8))
ax = fig.gca()
ax.pie(
    tosss.iloc[:, 0],
    labels=tosss.index,
    autopct='%1.2f%%',
    pctdistance=0.85,
)
ax.set_title('Toss Winners and Match winners/losers')
plt.show()
# True  : the toss winner also won the match
# False : the toss winner did not win the match

In [None]:
# Keep only the columns needed for the toss analysis
toss_columns = ['venue', 'toss_winner', 'toss_decision', 'winner', 'toss_&_actual_winner']
toss_actual_win = df.loc[:, toss_columns].copy()
toss_actual_win

In [None]:
# Display the rows that have no null in any column
# (the result is only shown here; toss_actual_win itself is left unchanged)
toss_actual_win.loc[toss_actual_win.notna().all(axis=1)]

In [None]:
# Does winning the toss have any effect on winning the match?
# Count, per team, the matches in which it won both the toss and the match.
plt.figure(figsize=(12, 8))
ax = sns.countplot(data=toss_actual_win, x='toss_&_actual_winner')
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_xlabel('Toss & Actual Winners')
ax.set_title('No. of times teams won both toss and actual Match')
plt.show()
# Mumbai Indians and Chennai Super Kings have won the most matches after winning the toss.

In [None]:
# Number of times a team elected to bat/field at a venue AND went on to win.
# Restrict to matches where the toss winner also won: without this filter the
# plot would count every toss decision, including those that ended in a loss.
toss_and_match_wins = toss_actual_win.dropna(subset=['toss_&_actual_winner'])
plt.figure(figsize=(18,8))
sns.countplot(x='venue',hue='toss_decision',data=toss_and_match_wins)
plt.xticks(rotation=90)
plt.title('Venues favourable for chasing or defending?')
plt.show()
#Clearly,at M. Chinnaswamy Stadium it is highly favourable to chase down the target.
# NOTE(review): the observation above was written for the unfiltered plot - re-check it against this output.