In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
from pprint import pprint

In [None]:
# Match-level table (teams, toss, result, winner, ... are read from it below).
data1 = pd.read_csv("../input/ipldatasets/matches.csv")
# Ball-by-ball table: one row per delivery, linked to a match via `match_id`.
data2 = pd.read_csv("../input/ipldatasets/deliveries.csv")

# Understanding the structure of the data and fixing some errors

In [None]:
# Quick size check of both tables: (rows in matches, rows in deliveries).
print(len(data1), len(data2))

# Franchises that appear under more than one name are collapsed to a single
# label so that per-team counts are not split across two keys.
team_name_fixes = {
    "Rising Pune Supergiant": "Rising Pune Supergiants",
    "Delhi Daredevils": "Delhi Capitals",
}

# `toss_winner` is normalised together with the other team columns: it is
# later compared against `winner`, and that comparison silently fails for the
# renamed teams when only one side of it has been rewritten.
for team_column in ["team1", "team2", "winner", "toss_winner"]:
    data1[team_column] = data1[team_column].replace(team_name_fixes)

In [None]:
# List the match-table columns available to the analysis below.
data1.columns
# data2[data2["is_super_over"] == 1].head(30)

In [None]:
# Peek at the matches whose `result` is anything other than "normal".
unusual_result = data1["result"].ne("normal")
data1[unusual_result].head()

In [None]:
# Matches with no recorded winner. NaN never compares equal to anything
# (itself included), so `== np.nan` always selects zero rows; isna() is the
# correct missing-value test.
data1[data1["winner"].isna()]

In [None]:
# Every franchise that appears in either team column. Sorting makes the order
# reproducible across kernel restarts (set iteration order for strings varies
# with Python's per-process hash seed), so any table built by looping over
# `teams` keeps a stable row order. Missing team names are left out.
teams = sorted(set(data1["team1"].dropna()) | set(data1["team2"].dropna()))
teams

In [None]:
# Distinct values stored in the `winner` column.
data1["winner"].unique()

# Some Generic exploration

Here we do some generic data exploration, such as overall win and loss patterns that are not specific to any team or player.

In [None]:
# Distribution of the winning margin, in wickets, over matches won by wickets.
won_by_wickets = data1.loc[data1["win_by_wickets"] > 0]
wicket_margin_labels = {
    "win_by_wickets": "Wickets left",
    "count": "Number of matches won",
}
px.histogram(
    won_by_wickets,
    x="win_by_wickets",
    labels=wicket_margin_labels,
    title="Counts of matches won by wickets vs Win margin",
)

In [None]:
# Distribution of the winning margin, in runs, over matches won by runs.
won_by_runs = data1.loc[data1["win_by_runs"] > 0]
run_margin_labels = {
    "win_by_runs": "Runs",
    "count": "Number of matches won",
}
px.histogram(
    won_by_runs,
    x="win_by_runs",
    labels=run_margin_labels,
    title="Counts of matches won by runs vs Win margin",
)

# Check if toss or some other statistic affects the wins:



In [None]:
# Share of all matches (floored to a whole percent) that meet each condition.
total_matches = len(data1)
toss_winner_won = int((data1["toss_winner"] == data1["winner"]).sum())
won_by_wickets_count = int((data1["win_by_wickets"] > 0).sum())
team1_won_count = int((data1["team1"] == data1["winner"]).sum())

print(f"Chances that toss winner will win the match = {toss_winner_won * 100 // total_matches}%")
print(f"Chances that fielding first will win the match = {won_by_wickets_count * 100 // total_matches}%")
print(f"Chances that a team will win on home ground = {team1_won_count * 100 // total_matches}%")

## Inference:

From this, we can infer that teams that won while chasing most frequently won by around 6 wickets. While defending a target, teams mostly won by fewer than 30 runs.

Also, the toss result and decision don't affect the chances of winning significantly.

Surprisingly, a team is less likely to win on its home ground than in an away match (treating `team1` as the home side, as the calculation above does), probably because a team plays more away matches on average than home matches.

# Objective 1a: Deciding the best teams in IPL history.

In this regard, we can model a reasonable assumption: "The teams with more wins are more likely to be the best teams, as they are more likely to perform better." 

In [None]:
# Tally, per team, the matches won, lost and left without a recorded winner.
#
# Missing winners are detected with notna()/isna(): comparing against np.nan
# with == or != never matches, which previously left `team_ties` permanently
# empty and let a match without a winner re-count the previous match's loser.
has_winner = data1["winner"].notna()
decided = data1[has_winner]
undecided = data1[~has_winner]

team1_won = decided["team1"] == decided["winner"]
team2_won = decided["team2"] == decided["winner"]

# Wins: one per decided match, credited to the recorded winner.
team_wins = decided["winner"].value_counts().to_dict()

# Losses: the side that is not the winner. Rows whose winner matches neither
# team column are ignored rather than guessed at.
losers = pd.concat(
    [
        decided.loc[team2_won, "team1"],
        decided.loc[team1_won & ~team2_won, "team2"],
    ]
)
team_losses = losers.value_counts().to_dict()

# "Ties" here means matches with no recorded winner; both sides get one each.
team_ties = pd.concat([undecided["team1"], undecided["team2"]]).value_counts().to_dict()

print("Win counts:\n")
pprint(sorted(team_wins.items(), key=lambda item: item[1], reverse=True))
print("----------------------------------\n")
print("Loss counts:\n")
pprint(sorted(team_losses.items(), key=lambda item: item[1], reverse=True))
print("----------------------------------\n")
print("Tie counts:\n")
pprint(sorted(team_ties.items(), key=lambda item: item[1], reverse=True))
print("----------------------------------\n")

In [None]:
# One row per team with its win / loss / no-winner tallies.
# The rows are collected first and the frame is built once: DataFrame.append
# was removed in pandas 2.0, and growing a frame row by row is quadratic.
# Teams missing from the loss or tie tallies get an explicit 0 instead of NaN
# (the previous fillna(0) result was never assigned back).
team_stats = pd.DataFrame(
    [
        {
            "team": team,
            "wins": won,
            "losses": team_losses.get(team, 0),
            "ties": team_ties.get(team, 0),
        }
        for team, won in team_wins.items()
    ],
    columns=["team", "wins", "losses", "ties"],
)

team_stats

In [None]:
# Wins/losses table ordered from most to fewest wins.
team_stats_disp = team_stats.loc[:, ["team", "wins", "losses"]].sort_values(
    by="wins", ascending=False
)
team_stats_disp

In [None]:
# Wins and losses per team, drawn in the row order of `team_stats_disp`.
team_bar_labels = {
    "team": "Team Name",
    "value": "Match Results",
}
team_bar_fig = px.bar(
    team_stats_disp,
    x="team",
    y=["wins", "losses"],
    title="Team Stats, ranked by success in IPL (2008-2019)",
    labels=team_bar_labels,
)
team_bar_fig.show()

# Inference 1a:

From the above graph, we can infer that the top 3 successful teams in IPL are:
- Mumbai Indians (109 wins, 78 losses)
- Chennai Super Kings(100 wins, 64 losses)
- Kolkata Knight Riders(92 wins, 87 losses)

# Objective 1b: To find the top-performing players in IPL history.



## Finding the top-performing batsmen

In [None]:
# All batsmen, alphabetically.
batsmen = sorted(data2["batsman"].dropna().unique())

# Per-batsman aggregates, computed with a single groupby pass instead of
# re-filtering the whole deliveries table once per batsman. groupby sorts its
# keys, so each dict below is in the same alphabetical order as `batsmen`.
runs_by_batsman = data2.groupby("batsman")["batsman_runs"]

# Number of delivery rows on which the player is listed as the batsman.
balls_faced_index = runs_by_batsman.size().to_dict()

# Total of `batsman_runs` over those rows.
runs_index = runs_by_batsman.sum().to_dict()

# (name, runs) pairs, highest run total first.
top_batsmen = sorted(runs_index.items(), key=lambda item: item[1], reverse=True)
# pprint(top_batsmen[:20])

# Strike rate = runs per 100 balls faced. Every batsman has at least one row,
# so the denominator is never zero.
strike_rate_index = {
    name: runs_index[name] / balls_faced_index[name] * 100 for name in runs_index
}

# strike_rate_index

# strike_rate_index



In [None]:
# Number of distinct matches in which each batsman appears, from one groupby
# pass rather than one full-table filter per batsman.
batsmen_matches_index = data2.groupby("batsman")["match_id"].nunique().to_dict()

In [None]:
# The 20 batsmen with the most matches, as an ordered {name: matches} dict.
most_matches_batsmen = sorted(
    batsmen_matches_index.items(), key=lambda item: item[1], reverse=True
)
dict(most_matches_batsmen[:20])

In [None]:
# Batting summary table, one row per batsman, highest run total first.
# The rows are gathered in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
batsmen_df = pd.DataFrame(
    [
        {
            "Name": name,
            "Matches": batsmen_matches_index[name],
            "Runs": runs_index[name],
            "Balls faced": balls_faced_index[name],
            "Strike Rate": strike_rate_index[name],
        }
        for name in batsmen
    ],
    columns=["Name", "Matches", "Runs", "Balls faced", "Strike Rate"],
)

batsmen_df = batsmen_df.sort_values("Runs", ascending=False)

In [None]:
# First 20 rows of the batting table (already sorted by runs).
batsmen_df.head(20)

## Find the top performing bowlers

In [None]:
# All bowlers, alphabetically.
bowlers = sorted(data2["bowler"].dropna().unique())

# Number of delivery rows attributed to each bowler, counted in one pass
# instead of one full-table filter per bowler. sort_index keeps the dict in
# the same alphabetical order as `bowlers`.
deliveries_index = data2["bowler"].value_counts().sort_index().to_dict()


In [None]:
# Sanity check: the per-bowler delivery counts should add up to the number of
# rows in deliveries.csv.

sumd = sum(deliveries_index[name] for name in bowlers)

sumd

In [None]:
# pprint(deliveries_index)
# Ten bowlers with the most delivery rows, as (name, count) pairs.
busiest_bowlers = sorted(
    deliveries_index.items(), key=lambda item: item[1], reverse=True
)
pprint(busiest_bowlers[:10])

In [None]:
# Show every dismissal type present in the data.
print(data2["dismissal_kind"].unique())
# Dismissal types credited to the bowler. "hit wicket" is included because the
# Laws of Cricket credit that dismissal to the bowler as well; dismissal kinds
# not listed here (for example run outs) are not counted as bowler wickets.
kinds = ["caught", "bowled", "lbw", "caught and bowled", "stumped", "hit wicket"]

In [None]:
# Wickets credited to each bowler: rows where a known batsman was dismissed by
# one of the bowler-credited dismissal `kinds`.
# The row filter does not depend on the bowler, so it is evaluated once and
# the matching rows are counted per bowler in a single pass.
credited_dismissal = data2["player_dismissed"].isin(batsmen) & data2["dismissal_kind"].isin(kinds)
wickets_per_bowler = data2.loc[credited_dismissal, "bowler"].value_counts()

# reindex so that bowlers without any credited wicket still appear, with 0,
# and so that the dict follows the order of `bowlers`.
wickets_index = wickets_per_bowler.reindex(bowlers, fill_value=0).to_dict()



In [None]:
# Ten bowlers with the most credited wickets, as (name, wickets) pairs.
leading_wicket_takers = sorted(
    wickets_index.items(), key=lambda item: item[1], reverse=True
)
pprint(leading_wicket_takers[:10])

In [None]:
# Sum of `total_runs` over each bowler's deliveries, from one groupby pass
# instead of one full-table filter per bowler.
# NOTE(review): `total_runs` presumably includes extras that are not normally
# charged to the bowler (byes / leg-byes) - confirm against the column set.
runs_by_bowler = data2.groupby("bowler")["total_runs"].sum()

# The values are deliberately kept as NumPy integers (as before), so code that
# divides them by a zero wicket count keeps yielding inf/nan rather than
# raising ZeroDivisionError.
runs_conceded_index = dict(zip(runs_by_bowler.index, runs_by_bowler.to_numpy()))

In [None]:
# The 20 bowlers who conceded the most runs, as an ordered {name: runs} dict.
most_runs_conceded = sorted(
    runs_conceded_index.items(), key=lambda item: item[1], reverse=True
)
dict(most_runs_conceded[:20])

In [None]:
# Bowling average = runs conceded per wicket taken.
# The average is undefined for a bowler with no wickets, so those entries are
# stored as NaN instead of dividing by zero (which produced inf with a
# RuntimeWarning for NumPy integers and raises for plain Python integers).
average_index = {
    name: (runs_conceded_index[name] / wickets_index[name] if wickets_index[name] else np.nan)
    for name in bowlers
}

In [None]:
# Spot-check the computed bowling average for one bowler.
average_index["SL Malinga"]

In [None]:
# Number of distinct matches in which each bowler appears, from one groupby
# pass rather than one full-table filter per bowler.
bowlers_matches_index = data2.groupby("bowler")["match_id"].nunique().to_dict()

In [None]:
# Bowling summary table, one row per bowler, most wickets first.
# The rows are gathered in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
bowlers_df = pd.DataFrame(
    [
        {
            "Name": name,
            "Matches": bowlers_matches_index[name],
            "Wickets": wickets_index[name],
            "Runs conceded": runs_conceded_index[name],
            "Average": average_index[name],
        }
        for name in bowlers
    ],
    columns=["Name", "Matches", "Wickets", "Runs conceded", "Average"],
)

bowlers_df = bowlers_df.sort_values("Wickets", ascending=False)

In [None]:
# Display the bowling table (sorted by wickets in the previous cell).
bowlers_df

In [None]:
# Ten lowest bowling averages among bowlers with at least 50 matches.
regular_bowlers = bowlers_df.loc[bowlers_df["Matches"] >= 50]
regular_bowlers.sort_values("Average", ascending=True).head(10)

In [None]:
# Names in the first ten rows of the bowling table.
pprint(bowlers_df["Name"].head(10))

# Inference 1b: 

The top 10 batsmen in IPL history (going by runs) are:

- V Kohli
- SK Raina
- RG Sharma
- DA Warner
- S Dhawan
- CH Gayle
- MS Dhoni
- RV Uthappa
- AB de Villiers
- G Gambhir

The top 10 bowlers in IPL history (going by wickets) are:

- SL Malinga
- A Mishra
- Harbhajan Singh
- PP Chawla
- DJ Bravo
- B Kumar
- R Ashwin
- SP Narine
- UT Yadav
- RA Jadeja

# Objective 2: To find the most favourable players to be ambassadors of different companies for promotions

The top-performing player in each match is regarded as the Man of the Match (MoM), and popular players make the most favourable ambassadors. So a reasonable argument can be modelled: "The players with more MoM awards are more likely to be popular."

In [None]:
# Raw list of player-of-the-match entries, one per match.
MoM_names = list(data1.player_of_match)

# Awards per player. value_counts skips missing entries, so matches without a
# player of the match no longer show up as a NaN "player".
MoM_index = data1["player_of_match"].value_counts().to_dict()

# print(len(MoM_index.keys()))
# Built in one go: DataFrame.append was removed in pandas 2.0.
top_MoM = pd.DataFrame(list(MoM_index.items()), columns=["Name", "MoM awards"])

top_MoM = top_MoM.sort_values("MoM awards", ascending=False)

top_MoM.iloc[:20, :]

# Head-to-Head (H2H) matchups

In [None]:
# Head-to-head table: one row per ordered (team1, team2) pairing, counting only
# the matches in which the first team is listed as `team1`.
# reset_index returns a fresh frame, which avoids mutating a slice of `data1`
# in place.
h2h = data1[["team1", "team2", "winner", "venue"]].reset_index(drop=True)

h2h_columns = [
    "team1",
    "team2",
    "matchups",
    "wins for team 1",
    "wins for team 2",
    "tie(s)",
    "probability of team1 winning",
    "probability of team2 winning",
    "probability of tie",
    "venue",
]

# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
h2h_rows = []
for first_team in teams:
    for second_team in teams:
        if first_team == second_team:
            continue

        fixtures = h2h.loc[(h2h["team1"] == first_team) & (h2h["team2"] == second_team)]
        played = len(fixtures)
        first_wins = int((fixtures["winner"] == first_team).sum())
        second_wins = int((fixtures["winner"] == second_team).sum())
        # Anything not won by either side (including a missing winner).
        no_winner = played - first_wins - second_wins

        # Pairings that never met get probabilities of 0 rather than 0/0.
        denominator = played if played else 1

        h2h_rows.append(
            {
                "team1": first_team,
                "team2": second_team,
                "matchups": played,
                "wins for team 1": first_wins,
                "wins for team 2": second_wins,
                "tie(s)": no_winner,
                "probability of team1 winning": first_wins / denominator,
                "probability of team2 winning": second_wins / denominator,
                "probability of tie": no_winner / denominator,
                "venue": list(fixtures.venue),
            }
        )

h2h_data = pd.DataFrame(h2h_rows, columns=h2h_columns)

h2h_data

### Verifying that the data filtering is correct

In [None]:
# should be same as total matches played (756)
outcome_columns = ["wins for team 2", "wins for team 1", "tie(s)"]
sum(h2h_data[column].sum() for column in outcome_columns)

### Finding tied matches:

In [None]:
# Pairings with at least one match counted in the "tie(s)" column.
has_tie = h2h_data["tie(s)"].gt(0.0)
h2h_data.loc[has_tie]

In [None]:
# data1[(data1["team1"] == "Kings XI Punjab") & (data1["team2"] == "Kolkata Knight Riders")] # Verifying correctness of findings

In [None]:
# Pairings ordered from most to fewest matches played.
h2h_data.sort_values(by="matchups", ascending=False)

## Performance on home ground

In [None]:
# Wins of the `team1` side, one bar per team, coloured by opponent.
home_win_labels = {
    "team1": "Team",
    "wins for team 1": "Wins",
    "team2": "Opponent",
}
fig1a = px.bar(
    h2h_data,
    x="team1",
    y="wins for team 1",
    text="wins for team 1",
    color="team2",
    labels=home_win_labels,
    title="Stacked head-to-head wins at home",
    width=1000,
    height=800,
)
fig1a.update_layout(uniformtext_minsize=8)
fig1a.show()

In [None]:
# Losses of the `team1` side (= wins of `team2`), coloured by opponent.
home_loss_labels = {
    "team1": "Team",
    "wins for team 2": "Losses",
    "team2": "Opponent",
}
fig2a = px.bar(
    h2h_data,
    x="team1",
    y="wins for team 2",
    text="wins for team 2",
    color="team2",
    labels=home_loss_labels,
    title="Stacked head-to-head losses at home",
    width=1000,
    height=800,
)
fig2a.update_layout(uniformtext_minsize=8)
fig2a.show()

In [None]:
# Wins and losses of the `team1` side in one chart, coloured by opponent.
home_performance_labels = {
    "team1": "Team",
    "value": "Stats (wins | losses)",
    "team2": "Opponent",
}
figA = px.bar(
    h2h_data,
    x="team1",
    y=["wins for team 1", "wins for team 2"],
    color="team2",
    labels=home_performance_labels,
    title="Stacked head-to-head performance at home",
    width=1000,
    height=800,
)

figA.show()

## Performance in away matches

In [None]:
# Wins of the `team2` side, one bar per team, coloured by opponent.
away_win_labels = {
    "team2": "Team",
    "wins for team 2": "Wins",
    "team1": "Opponent",
}
fig1b = px.bar(
    h2h_data,
    x="team2",
    y="wins for team 2",
    text="wins for team 2",
    color="team1",
    labels=away_win_labels,
    title="Stacked head-to-head wins at away matches",
    width=1000,
    height=800,
)
fig1b.update_layout(uniformtext_minsize=8)
fig1b.show()

In [None]:
# Losses of the `team2` side (= wins of `team1`), coloured by opponent.
away_loss_labels = {
    "team2": "Team",
    "wins for team 1": "Losses",
    "team1": "Opponent",
}
fig2b = px.bar(
    h2h_data,
    x="team2",
    y="wins for team 1",
    text="wins for team 1",
    color="team1",
    labels=away_loss_labels,
    title="Stacked head-to-head losses at away matches",
    width=1000,
    height=800,
)
fig2b.update_layout(uniformtext_minsize=8)
fig2b.show()

In [None]:
# Wins and losses of the `team2` side in one chart, coloured by opponent.
away_performance_labels = {
    "team2": "Team",
    "value": "Stats (wins | losses)",
    "team1": "Opponent",
}
figB = px.bar(
    h2h_data,
    x="team2",
    y=["wins for team 2", "wins for team 1"],
    color="team1",
    labels=away_performance_labels,
    title="Stacked head-to-head performance at away matches",
    width=1000,
    height=800,
)

figB.show()

In [None]:
# Matches where Gujarat Lions are listed as `team1` and also won.
gujarat_is_team1 = data1["team1"] == "Gujarat Lions"
gujarat_won = data1["winner"] == "Gujarat Lions"
data1.loc[gujarat_is_team1 & gujarat_won]

In [None]:
# Number of matches in which Gujarat Lions are listed as `team1`.
int((data1["team1"] == "Gujarat Lions").sum())

# Stitching ```winner``` column from match-wise data to delivery-wise data

In [None]:
print(data2.columns)

# Match id -> recorded winner, built in one pass instead of filtering the
# whole match table once per id. If an id were ever repeated, the first row
# wins, matching the previous lookup of `.values[0]`.
first_row_per_match = data1.drop_duplicates(subset="id", keep="first")
winners = dict(zip(first_row_per_match["id"].tolist(), first_row_per_match["winner"].tolist()))

# print(winners)

In [None]:
# Number of delivery rows per match. A single groupby replaces the row-by-row
# `iloc` loop over every delivery, which is very slow on a table of this kind.
# sort=False keeps the keys in order of first appearance in `data2`, as the
# loop did.
balls_match_index = data2.groupby("match_id", sort=False).size().to_dict()

print(balls_match_index)

# data3.iloc[10000].match_id

In [None]:
# Winner of the match each delivery belongs to, aligned row by row with
# `data2`. Mapping on `match_id` does not depend on the deliveries being stored
# in the same match order as the match table, and a match id without a known
# winner simply yields NaN instead of raising KeyError.
winner_lst = data2["match_id"].map(winners).tolist()

In [None]:
# Attach the per-delivery winner as a new `winner` column; `winner_lst` must
# have exactly one entry per row of `data2`.
data2 = data2.assign(winner=winner_lst)

In [None]:
# Confirm the new `winner` column is present on the first few deliveries.
print(data2.head())

In [None]:
# data2.to_csv("deliveries2.csv", index=False)  # (save to file for faster lookups)

In [None]:
# Show a single match row for reference.
data1.head(1)

# Generate over-wise data

In [None]:
# Distinct match ids present in the deliveries table, ascending.
match_ids = sorted(set(data2.match_id))
print(match_ids)

## Filtering over-wise data without super over

In [None]:
# over_data = pd.DataFrame({"match_id": [], "inning": [], "batting_team": [], "bowling_team": [], "over": [], "runs":[], "wickets": [], "winner": []})

# overs = list(range(1,21))
# print(overs)

# buf = None

# innings = [1,2]

# for i in match_ids:
#     j = 1
#     for j in innings:
#         k = 1
#         for k in overs:
#             buf = data2[(data2["match_id"] == i) & (data2["inning"] == j) & (data2["over"] == k) & (data2["is_super_over"] == 0)]

#             if len(buf.batting_team.values) > 0:
#                 bat = buf.batting_team.values[0]
#             else:
#                 bat = np.nan
#             if len(buf.bowling_team.values) > 0:
#                 bowl = buf.bowling_team.values[0]
#             else:
#                 bowl = np.nan
#             # bowl = list(set(buf.bowling_team.values))
#             ballrun = (buf.total_runs.sum())
#             wicket = buf.player_dismissed.count()
#             # winner_match = list(set(data1[data1["id"] == i].winner.values))

#             if len(data1[data1["id"] == i].winner.values) > 0:
#                 winner_match = data1[data1["id"] == i].winner.values[0]
#             else:
#                 winner_match = np.nan

#             over_data = over_data.append({"match_id": i, "inning": j, "batting_team": bat, "bowling_team": bowl, "over": k, "runs": ballrun, "wickets": wicket, "winner": winner_match}, ignore_index=True)

In [None]:
# Load the pre-computed over-wise table instead of rebuilding it with the
# (commented-out) loop above.
over_data = pd.read_csv("../input/ipldatasets/Overwise statistics without super over.csv") # Reading uploaded data for quick reuse

In [None]:
# Inspect the `winner` column of the over-wise table.
print(over_data["winner"]) # .iloc[35:50,]

In [None]:
# Deliveries of one specific over: match 1, innings 2, over 3, no super over.
in_match_1 = data2["match_id"] == 1
in_innings_2 = data2["inning"] == 2
in_over_3 = data2["over"] == 3
not_super_over = data2["is_super_over"] == 0
data2[in_match_1 & in_innings_2 & in_over_3 & not_super_over]

## Filtering over-wise data with super over

In [None]:
# super_over_data = pd.DataFrame({"match_id": [], "inning": [], "batting_team": [], "bowling_team": [], "over": [], "runs":[], "wickets": [], "winner": []})

# buf = None
# overs = [1]
# innings = [3,4]

# data3 = data2[data2["is_super_over"] == 1]
# for i in match_ids:
#     j = 3
#     for j in innings:
#         k = 1
#         # for k in overs:
#         buf = data3[(data3["match_id"] == i) & (data3["inning"] == j)] # & (data3["over"] == k)]

#         if len(buf.batting_team.values) > 0:
#             bat = buf.batting_team.values[0]
#         else:
#             bat = np.nan
#         if len(buf.bowling_team.values) > 0:
#             bowl = buf.bowling_team.values[0]
#         else:
#             bowl = np.nan

#         ballrun = (buf.total_runs.sum())
#         wicket = buf.player_dismissed.count()

#         if len(data1[data1["id"] == i].winner.values) > 0:
#             winner_match = data1[data1["id"] == i].winner.values[0]
#         else:
#             winner_match = np.nan

#         super_over_data = super_over_data.append({"match_id": i, "inning": j, "batting_team": bat, "bowling_team": bowl, "over": k, "runs": ballrun, "wickets": wicket, "winner": winner_match}, ignore_index=True)

In [None]:
# First 12 deliveries flagged as super-over balls.
is_super_over_ball = data2["is_super_over"].eq(1)
data2.loc[is_super_over_ball].head(12)

# Cells under work below (please don't judge me for this... it's a work under construction)

## Analyzing super over data (Under work)

In [None]:
# super_over_data.loc[super_over_data["batting_team"] != np.nan]

In [None]:
# super_over_data.bowling_team.count()

In [None]:
# over_data.to_csv("Overwise statistics without super over.csv", index=False)

# len(over_data)*6 - len(data2.loc[data2["is_super_over"] == 0]) # Just a random check 

## Analysing over-wise (without super over) data (Under work)

In [None]:
# Re-read the pre-computed over-wise table (same file as loaded earlier) so
# this section can be run on its own.
over_data = pd.read_csv("../input/ipldatasets/Overwise statistics without super over.csv")

In [None]:
avg = {}

buf = None
for i in teams:
    buf = over_data[over_data["batting_team"] == i]