In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
import os
print(os.listdir("../input"))

In [None]:
df=pd.read_csv('../input/ipl-data-set/matches.csv')
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
# Drop the 'umpire3' column. Rebinding with errors='ignore' (instead of an
# in-place drop) keeps this cell re-runnable: a second execution is a no-op
# rather than a KeyError.
df = df.drop(columns=['umpire3'], errors='ignore')

In [None]:
df.columns

In [None]:
# Lookup table: full franchise name as it appears in the data -> short code.
# Both spellings of the Pune Supergiant(s) franchise share the code 'RPS'.
Teams = {
    'Chennai Super Kings': 'CSK',
    'Deccan Chargers': 'DC',
    'Delhi Daredevils': 'DD',
    'Gujarat Lions': 'GL',
    'Kings XI Punjab': 'KXIP',
    'Kochi Tuskers Kerala': 'KTK',
    'Kolkata Knight Riders': 'KKR',
    'Mumbai Indians': 'MI',
    'Pune Warriors': 'PW',
    'Rajasthan Royals': 'RR',
    'Rising Pune Supergiant': 'RPS',
    'Rising Pune Supergiants': 'RPS',
    'Royal Challengers Bangalore': 'RCB',
    'Sunrisers Hyderabad': 'SRH',
}

In [None]:
# Abbreviate the team names. `replace` (rather than `map`) leaves any name that
# is missing from `Teams` untouched instead of silently turning it into NaN,
# and makes the cell safe to re-run (already-abbreviated values are kept).
df['team1']=df['team1'].replace(Teams)
df['team2']=df['team2'].replace(Teams)

In [None]:
df.head()

#### Some Basic Analysis

In [None]:
# Quick overview: number of rows (one row per match), the distinct values of
# the 'city' column, and the distinct team codes seen in 'team1'.
# NOTE(review): the second label says "Venues" but the values come from the
# 'city' column, not 'venue' — confirm which one is intended.
print('Total Matches Played:',df.shape[0])
print(' \n Venues Played At:',df['city'].unique())     
print(' \n Teams :',df['team1'].unique())

##### most man of the match awards

In [None]:
# Show only the ten players with the most awards; the section is about the
# *most* awards, so the full distribution is noise. A bare last expression
# gives the notebook's rich display instead of plain printed text.
df['player_of_match'].value_counts().head(10)

##### maximum win by runs

In [None]:
# Row(s) whose winning margin in runs equals the largest margin in the data.
largest_run_margin = df['win_by_runs'].max()
df.loc[df['win_by_runs'].eq(largest_run_margin)]

##### Mumbai Indians(MI) defeated Delhi Daredevils(DD) with the highest run difference

#### highest wins by wickets

In [None]:
# Row(s) whose winning margin in wickets equals the largest margin in the data.
largest_wicket_margin = df['win_by_wickets'].max()
df.loc[df['win_by_wickets'].eq(largest_wicket_margin)]

##### Toss Decisions across Seasons

In [None]:
# Matches per season, split by the toss winner's decision.
sns.countplot(data=df, hue='toss_decision', x='Season')

##### In some seasons, toss winners are more likely to opt for batting, while in 2016 the majority of toss winners opted for fielding.

### Maximum Toss Winners

In [None]:
# 'toss_winner' still holds full franchise names while the other team columns
# were abbreviated; apply the same lookup so the bar labels match the rest of
# the notebook. `replace` keeps any name that is not in `Teams` unchanged.
df['toss_winner'].replace(Teams).value_counts().plot(kind='bar')

##### Total Matches vs Wins for Teams 

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df['team2']

In [None]:
# Two-column frame holding both sides of every fixture.
matches_played_byteams = pd.concat(
    [df[side] for side in ('team1', 'team2')],
    axis=1,
)
matches_played_byteams

In [None]:
# Matches played per team = appearances as team1 + appearances as team2.
# Series.add with fill_value=0 treats a team that appears in only one of the
# two columns as having 0 in the other; plain `+` would yield NaN for it
# (and the team would get no match count at all).
teams=(
    matches_played_byteams['team1'].value_counts()
    .add(matches_played_byteams['team2'].value_counts(), fill_value=0)
    .astype(int)  # fill_value promotes to float; counts are whole numbers
    .reset_index()
)
teams.columns=['team_name','Matches_played']
teams

In [None]:
# Abbreviate the winner column. `replace` keeps names absent from `Teams`
# (and existing NaN values) as they are, and is a no-op on re-run, whereas
# `map` would turn every unmapped or already-abbreviated value into NaN.
df['winner']=df['winner'].replace(Teams)

In [None]:
# Number of wins per team as a two-column frame: team_name, wins.
wins = (
    df['winner']
    .value_counts()
    .rename_axis('team_name')
    .reset_index(name='wins')
)
wins

In [None]:
# Combine matches played and wins; only teams present in both tables are kept.
player = teams.merge(wins, how='inner', on='team_name')

In [None]:
# Normalise the column labels of the combined table.
player = player.set_axis(['team', 'matches_played', 'wins'], axis=1)
player

In [None]:
# Win percentage = wins as a share of matches played, scaled to 0-100.
win_ratio = player['wins'].div(player['matches_played'])
player['%win'] = win_ratio.mul(100)
player

In [None]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [None]:
# One bar series per metric, both plotted against the same team labels.
team_labels = player['team']
trace1 = go.Bar(x=team_labels, y=player['matches_played'], name='Total Matches')
trace2 = go.Bar(x=team_labels, y=player['wins'], name='Matches Won')



In [None]:
data = [trace1, trace2]
py.iplot(data)

##### Matches played across each season

In [None]:
# Count of matches per season. The column is passed by keyword: current
# seaborn treats the first positional argument as `data`, not `x`, so a bare
# positional Series is no longer interpreted as the variable to count.
sns.countplot(x='Season', data=df)

##### Runs Across the Seasons

In [None]:
df.head()

In [None]:
df2=pd.read_csv('../input/ipl-data-set/deliveries.csv')
df2.head()


In [None]:
# Attach the season label of each match to its ball-by-ball rows
# (left join on match id); the duplicate 'id' key is dropped afterwards.
match_seasons = df[['id', 'Season']]
season = match_seasons.merge(
    df2,
    how='left',
    left_on='id',
    right_on='match_id',
).drop(columns='id')
season

In [None]:
# Sum the runs of every delivery per season, then draw them as a line chart.
season = season.groupby('Season')['total_runs'].sum().reset_index()
runs_axis = season.set_index('Season').plot()
runs_axis.set_title('Total Runs Across the Seasons')
plt.show()

##### There was a decline in total runs from 2008 to 2009. Thereafter, total runs increased every season until 2013, and then slumped from the following season. However, the number of matches is not the same in every season, so we should check the average runs per match in each season.

In [None]:
# Number of matches per season (count of match ids), columns: Season, matches.
avg_runs = (
    df.groupby('Season')['id']
    .count()
    .rename('matches')
    .reset_index()
)
avg_runs

In [None]:
season

In [None]:
# Join match counts and run totals on the season label. A keyed merge cannot
# pair a season with another season's total, whereas a positional concat only
# works while both frames happen to be in the same row order.
final=avg_runs.merge(season[['Season','total_runs']],on='Season',how='left')
final

In [None]:
# Average runs per match for each season, then index the table by season.
final['per_match_runs'] = final['total_runs'].div(final['matches'])
final = final.set_index('Season')

In [None]:
final

In [None]:
final['per_match_runs'].plot()

##### most lucky grounds for teams

In [None]:
def lucky(df,team_name):
    """Return the five venues where `team_name` has won most often.

    Parameters
    ----------
    df : DataFrame with at least the columns 'winner' and 'venue'.
    team_name : value to look for in the 'winner' column.

    Returns
    -------
    Series of win counts indexed by venue, largest first, at most five rows.
    """
    won_by_team = df['winner'].eq(team_name)
    venue_wins = df.loc[won_by_team, 'venue'].value_counts()
    return venue_wins.nlargest(5)

In [None]:
lucky(df,'MI').plot(kind='bar')

##### comparison between 2 teams on the basis of their wins

In [None]:
def comparison(team1,team2):
    """Plot head-to-head results between two teams, season by season.

    Reads the notebook-level `df`, keeps the rows in which `team1` and
    `team2` each appear in either the 'team1' or the 'team2' column, and
    draws a count plot of the 'winner' column per 'Season'.
    """
    sides = df[['team1', 'team2']]
    first_plays = sides.eq(team1).any(axis=1)
    second_plays = sides.eq(team2).any(axis=1)
    compare = df.loc[first_plays & second_plays]
    sns.countplot(data=compare, x='Season', hue='winner')

In [None]:
comparison('MI','CSK')

#### Here we can see that MI and CSK have played against each other in more than 2 matches, and MI dominates

### Now, let's Analyse Deliveries.csv and get some Insights about it

In [None]:
import os
print(os.listdir("../input"))

In [None]:
# Load the ball-by-ball deliveries file.
# NOTE: this rebinds `df`, which held the matches table in the cells above;
# from here on `df` refers to deliveries. Re-running an earlier matches cell
# after this point will therefore operate on the wrong table.
df=pd.read_csv('../input/ipl-data-set/deliveries.csv')
df.head()


#### indepth Analysis of David Warner(Australian Batsman) Performance

In [None]:
# Keep only the deliveries where the batsman column is 'DA Warner'.
filt = df['batsman'].eq('DA Warner')
df_warner = df.loc[filt]
df_warner.head()

In [None]:
df_warner['dismissal_kind'].value_counts().plot.pie()

#### total fours by da warner


In [None]:
len(df_warner[df_warner['batsman_runs']==4])

##### total 6s by da warner

In [None]:
len(df_warner[df_warner['batsman_runs']==6])

##### total runs made by DA warner

In [None]:
# Runs credited to the batsman. 'batsman_runs' is the column used for batsman
# totals everywhere else in this notebook (per-batsman sums, 4s/6s counts).
# NOTE(review): 'total_runs' presumably also includes extras, which are not
# scored by the batsman — confirm against the dataset description.
df_warner['batsman_runs'].sum()

##### pie chart of Runs contribution of David Warner

In [None]:
def count(df,runs):
    """Return the total runs scored through shots worth exactly `runs`.

    Counts the rows of `df` whose 'batsman_runs' equals `runs` and multiplies
    that count by `runs` (e.g. 3 fours -> 12).

    The frame passed in is the one that is used; the earlier version ignored
    this parameter and always read the global `df_warner`.

    Parameters
    ----------
    df : DataFrame with a 'batsman_runs' column.
    runs : run value of a single shot (1, 2, 3, 4, 6, ...).

    Returns
    -------
    int
    """
    shots = int((df['batsman_runs'] == runs).sum())
    return shots * runs

In [None]:
count(df_warner,1)

In [None]:
count(df_warner,2)

In [None]:
count(df_warner,3)

In [None]:
count(df_warner,4)

In [None]:
count(df_warner,6)

In [None]:
# Share of Warner's runs coming from 1s, 2s, 3s, 4s and 6s.
# The slice sizes are computed from the data rather than typed in from the
# outputs of the cells above, so the chart cannot drift out of date when the
# input file changes.
labels=[1,2,3,4,6]
slices=[count(df_warner,run_value) for run_value in labels]
explode=[0,0,0,0,0.1]  # pull the last slice (6s) slightly out of the pie
plt.pie(slices,labels=labels,autopct='%1.1f%%',explode=explode)
plt.title("DA Warner total runs contribution")
plt.tight_layout()
plt.show()

In [None]:
df.head()

In [None]:
df['bowling_team'].unique()

##### Replacing the Team Names with their abbreviations

In [None]:


# Full franchise name -> abbreviation lookup, used below for the
# batting_team / bowling_team columns of the deliveries table.
# NOTE: this is a verbatim repeat of the `Teams` dict defined near the top of
# the notebook; it is re-declared here so this section can run on its own.
Teams={
    'Royal Challengers Bangalore':'RCB', 
    'Sunrisers Hyderabad':'SRH',
       'Rising Pune Supergiant':'RPS',
    'Mumbai Indians':'MI',
       'Kolkata Knight Riders':'KKR', 
    'Gujarat Lions':'GL',
    'Kings XI Punjab':'KXIP',
       'Delhi Daredevils':'DD',
    'Chennai Super Kings':'CSK',
    'Rajasthan Royals':'RR',
       'Deccan Chargers':'DC',
    'Kochi Tuskers Kerala':'KTK',
    'Pune Warriors':'PW',
       'Rising Pune Supergiants':'RPS'
}

In [None]:
# Abbreviate both team columns of the deliveries table. `replace` keeps names
# that are not in `Teams` instead of converting them to NaN, and re-running
# the cell leaves already-abbreviated values intact.
df['batting_team']=df['batting_team'].replace(Teams)
df['bowling_team']=df['bowling_team'].replace(Teams)

In [None]:
df.head()

In [None]:
df.columns

#### Score Distribution For Teams by Innings

In [None]:
# Innings totals: sum of total_runs per (match, inning, batting side).
# match_id is only needed for grouping and is dropped from the result.
runs = (
    df.groupby(['match_id', 'inning', 'batting_team'])[['total_runs']]
    .sum()
    .reset_index()
    .drop(columns='match_id')
)

In [None]:
runs.head()

In [None]:
# Split the innings totals into first-innings and second-innings rows.
inning1 = runs.loc[runs['inning'].eq(1)]
inning2 = runs.loc[runs['inning'].eq(2)]

In [None]:
sns.boxplot(x='batting_team',y='total_runs',data=inning1)

##### According to this plot, the batting by CSK & RCB seems best. 

In [None]:
sns.boxplot(x='batting_team',y='total_runs',data=inning2)

##### This figure conveys the same story. Here we also see a point near 0 for RCB, which may look like an outlier, but it is there because that match was disrupted.

##### how many times teams scores more than 200

In [None]:
# Innings totals, this time keeping the bowling side alongside the batting side.
innings_keys = ['match_id', 'inning', 'batting_team', 'bowling_team']
high_scores = df.groupby(innings_keys)['total_runs'].sum().reset_index()
high_scores

In [None]:
score_200=high_scores[high_scores['total_runs']>=200]
score_200

In [None]:
# How often each batting side appears among the 200+ innings. The column is
# passed via x=/data= because current seaborn does not interpret a bare
# positional Series as the variable to count.
plt.figure(figsize=(10,6))
sns.countplot(x='batting_team',data=score_200)


In [None]:
# How often each bowling side appears among the 200+ innings (i.e. conceded
# such a total). Passed via x=/data= for the same seaborn-signature reason as
# the batting-side plot.
sns.countplot(x='bowling_team',data=score_200)

##### The 1st graph shows the number of times a team has scored 200 or more runs. The 2nd graph shows the number of times a bowling team has conceded 200 or more runs.

##### How many times each team has scored 200 or more runs against each bowling team

In [None]:
# Innings totals per (match, inning, batting side, bowling side), indexed by match id.
high_200 = (
    df.groupby(['match_id', 'inning', 'batting_team', 'bowling_team'])['total_runs']
    .sum()
    .reset_index()
    .set_index('match_id')
)


In [None]:
high_200['total_runs'].max()

In [None]:
high_200.columns

In [None]:
high_200.head()

In [None]:
# Number of innings of 200 or more, per inning / batting side / bowling side.
high = high_200.rename(columns={'total_runs': 'count'})
reached_200 = high['count'].ge(200)
high = high.loc[reached_200].groupby(['inning', 'batting_team', 'bowling_team']).count()
high

##### Batsman Comparison

In [None]:
# Number of delivery rows recorded against each batsman (columns: batsman, ball).
balls = df.groupby('batsman', as_index=False)['ball'].count()
balls

In [None]:
# Total batsman_runs per batsman (columns: batsman, runs).
runs = df.groupby('batsman')['batsman_runs'].sum().reset_index(name='runs')
runs

Equivalent alternative: `runs=df.groupby(['batsman'])['batsman_runs'].agg(sum).reset_index()`

In [None]:
# Same per-batsman total via the built-in groupby reduction. Passing the
# Python builtin `sum` to `.agg` is deprecated in recent pandas; `.sum()`
# returns the same Series without the warning.
df.groupby(['batsman'])['batsman_runs'].sum()

In [None]:
# Add the run totals next to the delivery counts (outer join on batsman).
balls = balls.merge(runs, how='outer', on='batsman')

In [None]:
four=df[df['batsman_runs']==4]

In [None]:
# Alternative formulation, kept for reference:
#   df.groupby('batsman')['batsman_runs'].agg(lambda x: (x==4).sum()).reset_index()

# Number of fours per batsman; only batsmen with at least one four appear,
# because `four` was pre-filtered to batsman_runs == 4.
runs_4 = four.groupby('batsman')['batsman_runs'].count().reset_index(name='4s')
runs_4


In [None]:
# Number of sixes per batsman; every batsman gets a row (0 when none were hit).
is_six = df['batsman_runs'].eq(6)
six = is_six.groupby(df['batsman']).sum().reset_index()
six.columns = ['batsman', '6s']
six

In [None]:
# Assemble the per-batsman table by joining every statistic on the batsman
# name. `runs_4` only contains batsmen who hit at least one four, so it has
# fewer rows than the other tables; gluing it on by row position (as a
# column-wise concat does) attaches the 4s counts to the wrong batsmen.
# Left joins keep one row per batsman in `runs`; batsmen without a four get
# NaN in '4s', which the fillna cell below turns into 0.
player=(
    runs
    .merge(balls[['batsman','ball']],on='batsman',how='left')
    .merge(runs_4,on='batsman',how='left')
    .merge(six,on='batsman',how='left')
)
player

In [None]:
# Strike rate is conventionally quoted as runs per 100 balls, so the ratio is
# scaled by 100 (the unscaled value is just runs per ball).
# NOTE(review): 'ball' counts every delivery row for the batsman; if the data
# also records wides against the batsman they inflate the denominator — confirm.
player['strike_rate']=player['runs']/player['ball']*100
player.head()

In [None]:
player.isna().sum()

In [None]:
# Batsmen with no fours have NaN in '4s'; record them as 0.
# Assigning the result back is required: an in-place fillna on the selected
# column acts on an intermediate object and, under pandas Copy-on-Write, never
# reaches `player` (pandas 2.x already warns about this chained pattern).
player['4s']=player['4s'].fillna(0)

In [None]:
player.isnull().values.any()

In [None]:
df.columns

In [None]:
# Runs scored by each batsman in each match (one row per match/batsman/team).
grp = df.groupby(
    ['match_id', 'batsman', 'batting_team'],
    as_index=False,
)['batsman_runs'].sum()
grp

In [None]:
# Highest single-match score of each batsman (columns: batsman, max_runs).
# NOTE(review): the name `max` shadows Python's builtin max() for the rest of
# the session. It is kept because a later cell reads `max`; rename both
# together if this is cleaned up.
max=grp.groupby(['batsman'])['batsman_runs'].max().reset_index()
max.columns=['batsman','max_runs']
max

In [None]:
# Add each batsman's best single-match score to the summary table. Joining on
# the batsman name guarantees the score lands on the right row; a column-wise
# concat relies on both tables sharing the same row order and length.
player2=player.merge(max[['batsman','max_runs']],on='batsman',how='left')
player2

In [None]:
player2.shape

#### Top 10 Batsman

In [None]:
# Career run totals per batsman; plot the ten highest as bars.
max_runs = df.groupby('batsman')['batsman_runs'].sum()
top_ten_totals = max_runs.sort_values(ascending=False).head(10)
top_ten_totals.plot.bar()

#### SK Raina has the highest total runs across all seasons, whereas Virat Kohli is just a few runs behind in second place.

#### Top Individual Scores

In [None]:
# Ten highest scores by a single batsman in a single match.
(
    df.groupby(['match_id', 'batsman', 'batting_team'])['batsman_runs']
    .sum()
    .reset_index()
    .sort_values(by='batsman_runs', ascending=False)
    .head(10)
)

#### Not only Gayle but there are many RCB players on the top scores list. Looks like RCB is a very formidable batting side.

##### Highest Wicket Taker

In [None]:
df['dismissal_kind'].unique()

In [None]:
# Dismissal types counted as wickets for the bowler in the analysis below.
dismissal_kinds = [
    'caught',
    'bowled',
    'lbw',
    'caught and bowled',
    'stumped',
    'hit wicket',
]

In [None]:
# Deliveries that ended in one of the dismissal kinds listed in `dismissal_kinds`.
is_bowler_wicket = df['dismissal_kind'].isin(dismissal_kinds)
hwt = df.loc[is_bowler_wicket]
hwt.head()

In [None]:
# Ten bowlers with the most wickets, as a bar chart.
wickets_per_bowler = hwt['bowler'].value_counts()
wickets_per_bowler.head(10).plot.bar()

##### Lasith Malinga leads the chart.