In [None]:
#INSY 5376 Big Data Analytics - Project - IPL Player Performance Analysis
#Team Members :
# Amuluru, Sriram Sai
# Grandhi, Anish
# Potukuchi, Sameer Kumar
# Thanikonda, Pruthvi Sai Kumar
#Reference : https://www.kaggle.com/ash316/cricket-exploration-interactive-plots


#import necessary packages
import csv

import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

#Initialize Spark Configuration, Spark Context, SQL Context Objects
# getOrCreate() hands back the SparkContext that is already running in this kernel
# (if any) instead of raising, so this cell can safely be executed more than once.
conf = SparkConf().setMaster("local[*]").setAppName("IPL Data Analysis")
sc = SparkContext.getOrCreate(conf = conf)
sqlContext = SQLContext(sc)






In [None]:
#Read CSV files into data frames
from collections import namedtuple
# Column names of deliveries.csv, in file order; also the attribute names of each parsed record.
deliveriesFields = ('matchID','inning','battingTeam','bowlingTeam','over','ball','batsman','nonStriker','bowler','isSuperOver','wideRuns','byeRuns','legByeRuns','noballRuns','penaltyRuns','batsmanRuns','extraRuns','totalRuns','playerDismissed','dismissalKind','fielder')
deliveriesColumns = namedtuple('deliveries',deliveriesFields)
def parse(line):
    """Convert one data row of deliveries.csv into a ``deliveriesColumns`` record.

    Non-ASCII characters are dropped from the line before it is split. The
    columns ``over``, ``ball``, ``batsmanRuns``, ``extraRuns`` and ``totalRuns``
    are converted to ``int``; every other column is kept as text. Columns
    beyond the 21 expected ones are ignored.

    Raises:
        IndexError: if the row has fewer columns than ``deliveriesFields``.
        ValueError: if one of the integer columns is not a valid integer.
    """
    # encode() yields bytes on Python 3 (and a native str on Python 2); decode
    # when needed so the csv module always receives the interpreter's text type.
    ascii_line = line.encode('ascii','ignore')
    if not isinstance(ascii_line, str):
        ascii_line = ascii_line.decode('ascii')
    # csv.reader honours quoting, so a quoted value containing a comma stays one field.
    fields = next(csv.reader([ascii_line]), [])
    record = {name: fields[position] for position, name in enumerate(deliveriesFields)}
    for name in ('over', 'ball', 'batsmanRuns', 'extraRuns', 'totalRuns'):
        record[name] = int(record[name])
    return deliveriesColumns(**record)
    
    



# Load deliveries.csv, discard every line that contains the text 'inning'
# (this is how the header row is skipped) and turn the remaining lines into records.
deliveriesRDD = (
    sc.textFile('deliveries.csv')
      .filter(lambda row : 'inning' not in row)
      .map(parse)
)
deliveriesDF = sqlContext.createDataFrame(deliveriesRDD)
# Cell output: how many distinct matches the deliveries cover.
deliveriesDF.select('matchID').distinct().count()

In [None]:
# Column names of matches.csv, in file order; also the attribute names of each parsed record.
matchesFields = ('matchID','season','city','date','team1','team2','tossWinner','tossDecision','result','dlApplied','winner','winByRuns','winByWickets','playerOfMatch','venue','umpire1','umpire2','umpire3')
matchesColumns = namedtuple('matches',matchesFields)
def parseMatches(line):
    """Convert one data row of matches.csv into a ``matchesColumns`` record.

    Non-ASCII characters are dropped from the line before it is split. Every
    column is kept as text. Columns beyond the 18 expected ones are ignored.

    Raises:
        IndexError: if the row has fewer columns than ``matchesFields``.
    """
    # encode() yields bytes on Python 3 (and a native str on Python 2); decode
    # when needed so the csv module always receives the interpreter's text type.
    ascii_line = line.encode('ascii','ignore')
    if not isinstance(ascii_line, str):
        ascii_line = ascii_line.decode('ascii')
    # csv.reader honours quoting: a quoted venue that contains a comma must stay
    # a single field, otherwise every column after it shifts by one.
    fields = next(csv.reader([ascii_line]), [])
    return matchesColumns(*[fields[position] for position in range(len(matchesFields))])
    
    



# Load matches.csv, discard every line that contains the text 'season'
# (this is how the header row is skipped) and turn the remaining lines into records.
matchesRDD = (
    sc.textFile('matches.csv')
      .filter(lambda row : 'season' not in row)
      .map(parseMatches)
)
# The umpire3 column is dropped from the resulting data frame.
matchesDF = sqlContext.createDataFrame(matchesRDD).drop('umpire3')


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#Converting to pandas dataframe to make it compatible for plotting with matplotlib
# toPandas() brings all rows of matchesDF to the driver as a single in-memory pandas DataFrame.
matches = matchesDF.toPandas()


In [None]:
#Teams with highest number of wins across seasons
# Matches whose winner field is empty are shown under the label 'No Result'.
matches['winner'] = matches['winner'].replace('','No Result')
win_counts = matches['winner'].value_counts()
fig, ax = plt.subplots(figsize=(10,6))
win_counts.plot.bar(width=0.8, ax=ax)
# Write each bar's value just above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(format(bar_height), (bar.get_x()+0.15, bar_height+1))
plt.show()

In [None]:
#Converting to pandas dataframe to make it compatible for plotting with matplotlib
# toPandas() brings all rows of deliveriesDF to the driver as a single in-memory pandas DataFrame.
deliveries = deliveriesDF.toPandas()


In [None]:
#Top 10 batsmen by number of runs plotted using matplotlib
# Every statistic is computed per (batsman, battingTeam) pair. Building the table
# from Series that share that index avoids the chain of merges, which produced
# clashing batsmanRuns_x / batsmanRuns_y columns, and avoids assigning a nested
# list to .columns, which creates a MultiIndex in current pandas.
per_batsman_team = deliveries.groupby(['batsman','battingTeam'])
runs_per_delivery = per_batsman_team['batsmanRuns']
# Runs scored by each batsman in each match; the per-pair maximum is the highest score.
runs_per_match = deliveries.groupby(['matchID','batsman','battingTeam'])['batsmanRuns'].sum()
balls = pd.DataFrame({
    'balls': per_batsman_team['ball'].count(),
    'runs': runs_per_delivery.sum(),
    "6's": runs_per_delivery.agg(lambda scored: (scored==6).sum()),
    "4's": runs_per_delivery.agg(lambda scored: (scored==4).sum()),
    'Highest_score': runs_per_match.groupby(level=['batsman','battingTeam']).max(),
})
balls['strike_rate'] = (balls['runs']/balls['balls']*100).round()
balls = balls.reset_index().rename(columns={'battingTeam': 'battingteam'})
# Fix the column order explicitly (dict-built frames do not guarantee it on old Pythons).
balls = balls[['batsman','battingteam','balls','runs','strike_rate',"6's","4's",'Highest_score']]
balls1 = balls.loc[balls['balls'] > 0]
balls1.to_csv('batsmen.csv')
# A batsman may have played for several teams: add the runs up across teams.
batsmenRuns = balls1.groupby(['batsman'])['runs'].sum().reset_index()
batsmenRuns = batsmenRuns.sort_values(by='runs',ascending=False)
topTenBatsmen = batsmenRuns[0:10]
plt.subplots(figsize=(10,6))
ax=topTenBatsmen['runs'].plot.bar(width=0.8)
# Tick labels are the same for every bar, so set them once rather than per patch.
ax.set_xticklabels(topTenBatsmen['batsman'])
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x()+0.15, p.get_height()+1))
plt.show()

In [None]:
#Number Of 4's and sixes for each team plotted using plotly
import plotly.graph_objs as go
import plotly.offline as py
import plotly
plotly.offline.init_notebook_mode()
# Name the index and the count column explicitly: the column names produced by
# value_counts().reset_index() differ between pandas versions, so relying on a
# column called 'index' is fragile.
ax=deliveries.loc[deliveries['batsmanRuns']==6, 'battingTeam'].value_counts().rename_axis('team').reset_index(name="6's")
ax2=deliveries.loc[deliveries['batsmanRuns']==4, 'battingTeam'].value_counts().rename_axis('team').reset_index(name="4's")
# Left merge: one row per team that hit at least one six, ordered by number of sixes.
ax=ax.merge(ax2,on='team',how='left')

trace1 = go.Bar(
    x=ax.team.values, y=ax["6's"],
    name="6's",
    orientation = 'v',
    marker = dict(color = 'rgba(205,12,28, 0.6)',
        line = dict(color = 'rgba(205,12,28, 0.6)',
            width = 2),
    )
)
trace2 = go.Bar(
    x=ax.team.values, y=ax["4's"],
    name="4's",
    orientation = 'v',
    marker = dict(color = 'rgba(9,234,227, 0.6)',
        line = dict(color = 'rgba(9, 227, 227, 1.0)',
            width = 2),
    )
)

data = [trace1, trace2]
# 'hist' is not a valid barmode; 'group' (plotly's default) draws the two series
# side by side. A plain dict replaces the deprecated go.Margin helper.
layout = go.Layout( margin=dict(
        b=135,
        r=135
    ),
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='marker-h-bar')


In [None]:
#Top 10 bowlers by number of wickets plotted using matplotlib
per_bowler = deliveries.groupby('bowler')
# Sum only totalRuns; summing every column would also try to "add up" the text columns.
bowlers = per_bowler['totalRuns'].sum().rename('runs_given').to_frame()
# Completed overs, counting every delivery row recorded for the bowler.
bowlers['overs'] = per_bowler.size()//6
# Dismissal kinds counted as a wicket for the bowler (run outs are not in this list).
dismissal_kinds = ["bowled", "caught", "lbw", "stumped", "caught and bowled", "hit wicket"]
ct=deliveries[deliveries["dismissalKind"].isin(dismissal_kinds)]
# Aligned on the bowler name; bowlers without any wicket get NaN and are dropped below.
bowlers['wickets'] = ct['bowler'].value_counts()
bowlers = bowlers.reset_index().dropna()
# NOTE: a bowler with fewer than six deliveries has overs == 0, so economy is inf.
bowlers['economy']=(bowlers['runs_given']/bowlers['overs'])
bowlersWickets = bowlers.sort_values(by='wickets', ascending=False)
bowlersWickets.to_csv('bowlers.csv')
topTenBowlersByWickets = bowlersWickets[0:10]
plt.subplots(figsize=(10,6))
ax=topTenBowlersByWickets['wickets'].plot.bar(width=0.8)
# Tick labels are the same for every bar, so set them once rather than per patch.
ax.set_xticklabels(topTenBowlersByWickets['bowler'])
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x()+0.15, p.get_height()+1))
plt.show()

In [None]:
#Top 10 players with most number of man of the match awards plotted using matplotlib
playerOfMatchDF = matchesDF.groupBy('playerOfMatch').count()
# limit(10) keeps the ten largest counts inside Spark; converting straight to pandas
# avoids collecting the rows and then rebuilding a Spark DataFrame from them.
top10PlayerOfMatch = playerOfMatchDF.orderBy('count', ascending=False).limit(10).toPandas()
plt.subplots(figsize=(12,6))
ax=top10PlayerOfMatch['count'].plot.bar(width=0.8)
# Tick labels are the same for every bar, so set them once rather than per patch.
ax.set_xticklabels(top10PlayerOfMatch['playerOfMatch'])
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x()+0.25, p.get_height()+0.1))
plt.show()

In [None]:
#Percentage of different dismissal kinds plotted using plotly.
dismiss=["run out","bowled", "caught", "lbw", "stumped", "caught and bowled", "hit wicket"]
ct=deliveries[deliveries["dismissalKind"].isin(dismiss)]
# Number of dismissals per kind, most frequent first.
bx=ct.dismissalKind.value_counts()[:10]


# The pie trace's "text" attribute expects one entry per slice; the scalar
# placeholder that used to be here has been removed because plotly's figure
# validation rejects it. "hole" turns the pie into a donut.
fig = {
  "data": [
    {
      "values": bx.values ,
      "labels": bx.index,
      "textposition":"inside",
      "domain": {"x": [0, 0.48]},
      "name": "Wickets",
      "hoverinfo":"label+percent+name",
      "hole": .5,
      "type": "pie"
    }],
  "layout": {
        "title":"Percentage of Types Of Dismissals",
        "annotations": [
            {
                "font": {
                    "size": 15
                },
                "showarrow": False,
                "text": "Dismissals",
                "x": 0.18,
                "y": 0.51
            }
        ]
    }
}
py.iplot(fig, filename='donut')

In [None]:
#Different ways in which top 10 batsmen score plotted using plotly
# Total runs per batsman, aggregated in Spark. GroupedData.sum() is used so that
# pyspark's `sum` function is not imported over Python's builtin sum(), and the
# result goes straight to pandas without a collect()/createDataFrame() round trip.
max_runs = (deliveriesDF.groupBy('batsman')
            .sum('batsmanRuns')
            .withColumnRenamed('sum(batsmanRuns)', 'batsmanRuns')
            .toPandas())
# Joining on the column name keeps a single matchID column in the result
# (joining on an equality expression keeps one copy from each side).
batsmen = (matchesDF.select('matchID', 'season')
           .join(deliveriesDF, on='matchID', how='inner')
           .toPandas())
top_batsmen = max_runs.sort_values('batsmanRuns',ascending=False)[:10]['batsman']
# Deliveries on which one of the ten highest scorers made exactly 1, 2, 4 or 6 runs.
scoring_shots = batsmen[batsmen['batsman'].isin(top_batsmen) & batsmen['batsmanRuns'].isin([1, 2, 4, 6])]
# One row per batsman, one column per shot value (1, 2, 4, 6), cells hold the delivery count.
c = scoring_shots.groupby(['batsman','batsmanRuns']).size().unstack('batsmanRuns')

# (runs scored off the ball, legend label, RGB colour) for each stacked bar segment.
shot_styles = [
    (6, "6's", '241, 169, 115'),
    (4, "4's", '14, 124, 98'),
    (2, "2's", '23, 116, 255'),
    (1, "1's", '97, 30, 124'),
]
data = [
    go.Bar(
        y=c.index, x=c[runs_scored],
        name=label,
        orientation = 'h',
        marker = dict(color = 'rgba(%s, 0.5)' % rgb,
            line = dict(color = 'rgba(%s, 1.0)' % rgb,
                width = 3)
        )
    )
    for runs_scored, label, rgb in shot_styles
]
layout = go.Layout(
    barmode='stack'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='marker-h-bar')