In [7]:
import json
import os
import pandas as pd

DATA_PATH = "C:\\Users\\STAR\\Downloads\\ipl-next-over-runs\\Data\\Raw\\ipl_json"
# Folder the later cells read their CSV inputs from; the flattened table is
# written here so that a fresh Restart & Run All finds it.
PROCESSED_PATH = "C:\\Users\\STAR\\Downloads\\ipl-next-over-runs\\Data\\processed"


def _match_to_rows(match, match_id):
    """Flatten one parsed match file into a list of per-delivery records.

    Args:
        match: Parsed JSON object with ``info`` and ``innings`` entries.
        match_id: Identifier stored on every record of this match.

    Returns:
        A list with one dict per delivery, in innings / over / ball order.
    """
    info = match["info"]
    venue = info.get("venue", None)
    # NOTE(review): assumes ``info["teams"]`` lists the two competing sides
    # (Cricsheet JSON layout) -- confirm against the raw files.
    teams = info.get("teams") or []

    rows = []
    for innings_idx, innings in enumerate(match["innings"], start=1):
        batting_team = innings["team"]

        # The bowling side is whichever listed team is not batting. Only used
        # when exactly one candidate remains; otherwise the value stays None.
        opponents = [team for team in teams if team != batting_team]
        fallback_bowling_team = opponents[0] if len(opponents) == 1 else None

        for over in innings["overs"]:
            over_number = over["over"]

            for ball_idx, delivery in enumerate(over["deliveries"], start=1):
                runs = delivery["runs"]
                rows.append({
                    "match_id": match_id,
                    "innings": innings_idx,
                    "over": over_number,
                    "ball": ball_idx,
                    "batting_team": batting_team,
                    # An explicit per-delivery value still wins if present.
                    "bowling_team": delivery.get("bowling_team", fallback_bowling_team),
                    "batsman_runs": runs["batter"],
                    "extras": runs["extras"],
                    "total_runs": runs["total"],
                    # Flag only: 1 when the delivery has a "wickets" entry.
                    "is_wicket": 1 if "wickets" in delivery else 0,
                    "venue": venue
                })
    return rows


all_balls = []

# Sorted so the row order does not depend on the file system's listing order.
for file in sorted(os.listdir(DATA_PATH)):
    if not file.endswith(".json"):
        continue

    with open(os.path.join(DATA_PATH, file), "r", encoding="utf-8") as f:
        match = json.load(f)

    all_balls.extend(_match_to_rows(match, file.replace(".json", "")))

df = pd.DataFrame(all_balls)
print("Total balls:", len(df))

# Write next to the other processed files (where the following cell loads it
# from) and without the positional index, which would otherwise come back as
# an extra "Unnamed: 0" column on read.
os.makedirs(PROCESSED_PATH, exist_ok=True)
df.to_csv(os.path.join(PROCESSED_PATH, "ipl_ball_by_ball.csv"), index=False)

# Last expression of the cell so the preview is actually rendered.
df.head()

Total balls: 278205


In [8]:
import pandas as pd

# Load ball-by-ball data
df = pd.read_csv('C:\\Users\\STAR\\Downloads\\ipl-next-over-runs\\Data\\processed\\ipl_ball_by_ball.csv')

# Aggregate to over level: one row per (match, innings, over).
# dropna=False keeps groups whose key contains a missing value. The extraction
# step stores venue as None when the match file has none, and with the default
# (dropna=True) every over of such a match would be silently discarded.
over_df = (
    df
    .groupby(
        ["match_id", "innings", "over", "batting_team", "venue"],
        dropna=False,
    )
    .agg(
        runs_in_over=("total_runs", "sum"),
        wickets_in_over=("is_wicket", "sum")
    )
    .reset_index()
)

# Sort chronologically within each innings and renumber the rows.
over_df = over_df.sort_values(
    by=["match_id", "innings", "over"]
).reset_index(drop=True)

over_df.head()


Unnamed: 0,match_id,innings,over,batting_team,venue,runs_in_over,wickets_in_over
0,335982,1,0,Kolkata Knight Riders,M Chinnaswamy Stadium,3,0
1,335982,1,1,Kolkata Knight Riders,M Chinnaswamy Stadium,18,0
2,335982,1,2,Kolkata Knight Riders,M Chinnaswamy Stadium,6,0
3,335982,1,3,Kolkata Knight Riders,M Chinnaswamy Stadium,23,0
4,335982,1,4,Kolkata Knight Riders,M Chinnaswamy Stadium,10,0


In [9]:
# Save into the processed-data folder, which is where the next cell loads
# ipl_over_level.csv from; a bare file name would land in the current working
# directory instead.
over_df.to_csv(
    "C:\\Users\\STAR\\Downloads\\ipl-next-over-runs\\Data\\processed\\ipl_over_level.csv",
    index=False,
)

In [10]:
import pandas as pd

# Read the over-level table back from disk.
over_level_csv = "C:\\Users\\STAR\\Downloads\\ipl-next-over-runs\\Data\\processed\\ipl_over_level.csv"
over_df = pd.read_csv(over_level_csv)

# Rows must be in over order within each innings for the shift below.
ordering = ["match_id", "innings", "over"]
over_df = over_df.sort_values(by=ordering).reset_index(drop=True)

# Target column: the runs scored in the following row of the same innings.
runs_by_innings = over_df.groupby(["match_id", "innings"])["runs_in_over"]
over_df["runs_next_over"] = runs_by_innings.shift(-1)

# The last row of every innings has no successor (NaN target), so remove it.
over_df = over_df.dropna(subset=["runs_next_over"])


Unnamed: 0,match_id,innings,over,batting_team,venue,runs_in_over,wickets_in_over,runs_next_over
0,335982,1,0,Kolkata Knight Riders,M Chinnaswamy Stadium,3,0,18.0
1,335982,1,1,Kolkata Knight Riders,M Chinnaswamy Stadium,18,0,6.0
2,335982,1,2,Kolkata Knight Riders,M Chinnaswamy Stadium,6,0,23.0
3,335982,1,3,Kolkata Knight Riders,M Chinnaswamy Stadium,23,0,10.0
4,335982,1,4,Kolkata Knight Riders,M Chinnaswamy Stadium,10,0,1.0
...,...,...,...,...,...,...,...,...
44981,1485779,2,14,Delhi Capitals,"Sawai Mansingh Stadium, Jaipur",9,1,4.0
44982,1485779,2,15,Delhi Capitals,"Sawai Mansingh Stadium, Jaipur",4,0,14.0
44983,1485779,2,16,Delhi Capitals,"Sawai Mansingh Stadium, Jaipur",14,0,12.0
44984,1485779,2,17,Delhi Capitals,"Sawai Mansingh Stadium, Jaipur",12,0,14.0


In [12]:
# Over numbers start at 0 in this table, so over k means k + 1 overs bowled.
over_df["overs_completed"] = 1 + over_df["over"]

# Running total of runs within each innings, up to and including this over.
innings_runs = over_df.groupby(["match_id", "innings"])["runs_in_over"]
over_df["total_runs_so_far"] = innings_runs.cumsum()

# Runs per over so far.
cumulative_runs = over_df["total_runs_so_far"]
overs_bowled = over_df["overs_completed"]
over_df["current_run_rate"] = cumulative_runs / overs_bowled

In [13]:
# Running count of wickets within each innings, up to and including this over.
innings_wickets = over_df.groupby(["match_id", "innings"])["wickets_in_over"]
over_df["total_wickets"] = innings_wickets.cumsum()

# Wickets still in hand out of 10.
wickets_lost = over_df["total_wickets"]
over_df["wickets_remaining"] = 10 - wickets_lost


In [14]:
innings_keys = ["match_id", "innings"]

# Trailing window of up to three overs per innings (fewer at the start of an
# innings because min_periods=1).
# NOTE(review): despite its name, runs_last_3_overs is the window MEAN, while
# wickets_last_3_overs below is the window SUM -- confirm this is intended.
runs_window = (
    over_df.groupby(innings_keys)["runs_in_over"].rolling(3, min_periods=1)
)
recent_runs = runs_window.mean()
# Drop the two group levels so the result lines up with over_df's own index.
over_df["runs_last_3_overs"] = recent_runs.reset_index(level=[0,1], drop=True)

wickets_window = (
    over_df.groupby(innings_keys)["wickets_in_over"].rolling(3, min_periods=1)
)
recent_wickets = wickets_window.sum()
over_df["wickets_last_3_overs"] = recent_wickets.reset_index(level=[0,1], drop=True)

In [15]:
def over_phase(over):
    """Return the phase code for an over number.

    Args:
        over: Over number as stored in the ``over`` column.

    Returns:
        0 (powerplay) when ``over <= 5``, 1 (middle) when ``over <= 14``,
        otherwise 2 (death).
    """
    last_powerplay_over = 5
    last_middle_over = 14

    if over <= last_powerplay_over:
        return 0
    return 1 if over <= last_middle_over else 2

# Encode each row's over number as its phase code (0 / 1 / 2).
over_numbers = over_df["over"]
over_df["over_phase"] = over_numbers.apply(over_phase)

In [16]:
# Summary statistics for the target and a few key features.
summary_columns = [
    "runs_in_over",
    "runs_next_over",
    "current_run_rate",
    "wickets_remaining",
]
summary_stats = over_df[summary_columns].describe()
print(summary_stats)

       runs_in_over  runs_next_over  current_run_rate  wickets_remaining
count  42622.000000    42622.000000      42622.000000       42622.000000
mean       8.229975        8.434846          7.669938           7.574304
std        4.594119        4.693827          2.151044           2.012468
min        0.000000        0.000000          0.000000           1.000000
25%        5.000000        5.000000          6.416667           6.000000
50%        8.000000        8.000000          7.666667           8.000000
75%       11.000000       11.000000          8.909091           9.000000
max       37.000000       37.000000         27.000000          10.000000


In [17]:
# Write the finished feature table (features + runs_next_over target) to disk,
# without the row index.
ml_dataset_csv = "C:\\Users\\STAR\\Downloads\\ipl-next-over-runs\\Data\\processed\\ipl_next_over_ml.csv"
over_df.to_csv(ml_dataset_csv, index=False)
