In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
from yaml import safe_load
import os
from tqdm import tqdm


In [20]:
filenames = []

base_path = "/content/drive/MyDrive/Ball to ball"

for file in os.listdir(base_path):
    if file.endswith(".yaml"):
        filenames.append(os.path.join(base_path, file))

len(filenames)

4983

In [21]:
filenames[0:5]

['/content/drive/MyDrive/Ball to ball/1321258.yaml',
 '/content/drive/MyDrive/Ball to ball/1373569.yaml',
 '/content/drive/MyDrive/Ball to ball/1486095.yaml',
 '/content/drive/MyDrive/Ball to ball/1031431.yaml',
 '/content/drive/MyDrive/Ball to ball/1311744.yaml']

In [22]:
if os.path.exists("final.csv"):
    os.remove("final.csv")

In [40]:
all_cols = set()

for file in tqdm(filenames):
    with open(file, "r") as f:
        df = pd.json_normalize(safe_load(f))
        all_cols.update(df.columns)

all_cols = list(all_cols)
len(all_cols)

100%|██████████| 4983/4983 [13:19<00:00,  6.23it/s]


7872

In [57]:
processed_data_for_df = []
counter = 1

for file_path in tqdm(filenames):
    with open(file_path, "r") as f:
        raw_data = safe_load(f)

        info = raw_data.get('info', {})
        innings = raw_data.get('innings', [])
        temp_df = pd.json_normalize({"info": info}, sep='.')

        # Add match_id
        temp_df['match_id'] = counter

        # Assign the 'innings' list wrapped in another list to store it as a single object
        temp_df['innings'] = [innings]

        # Convert the 1-row DataFrame to a dictionary and append
        processed_data_for_df.append(temp_df.iloc[0].to_dict())

        counter += 1

# Create the final DataFrame from the processed data
final_df = pd.DataFrame(processed_data_for_df)

# Removed the df.to_csv("final.csv") part from here to prevent incorrect serialization
# The final_df will now retain its complex object types in memory for pickling.

100%|██████████| 4983/4983 [14:00<00:00,  5.93it/s]


In [58]:
# This cell is no longer needed as final_df is generated directly in memory by 7SprWWsgjH_t
# and we do not want to overwrite it by reading from an potentially outdated final.csv.
# final_df = pd.read_csv("final.csv")

final_df.shape
final_df.head()

Unnamed: 0,info.balls_per_over,info.city,info.dates,info.gender,info.match_type,info.match_type_number,info.outcome.winner,info.outcome.by.wickets,info.overs,info.player_of_match,...,info.registry.people.Mohammed Aslam,info.registry.people.Mukhtiar Singh,info.registry.people.Marija Mratinkovic,info.registry.people.Adnan Mufti,info.registry.people.D Bau,info.registry.people.L Sreekumar,info.registry.people.Mubeen Tariq,info.registry.people.F Pieters,info.registry.people.Aarsheya Sharma,info.registry.people.S Vaidyanathan
0,6,Kerava,[2022-07-12],male,T20,1639.0,Isle of Man,8.0,20,[G Burrows],...,,,,,,,,,,
1,6,Durban,[2023-09-01],male,T20,2228.0,Australia,8.0,20,[SA Abbott],...,,,,,,,,,,
2,6,Rome,[2025-05-26],female,T20,2311.0,Italy,10.0,20,[D Nanayakkara],...,,,,,,,,,,
3,6,Southampton,[2017-06-21],male,T20,,England,9.0,20,[JM Bairstow],...,,,,,,,,,,
4,6,Ajman,[2022-04-27],female,T20,1064.0,United Arab Emirates,7.0,20,[CR Seneviratna],...,,,,,,,,,,


In [59]:
required_cols = [
    "innings",
    "info.dates",
    "info.gender",
    "info.match_type",
    "info.outcome.winner",
    "info.overs",
    "info.player_of_match",
    "info.teams",
    "info.toss.decision",
    "info.toss.winner",
    "info.umpires",
    "info.venue",
    "match_id",
    "info.city"
]

final_df = final_df.loc[:, required_cols]

In [60]:
final_df

Unnamed: 0,innings,info.dates,info.gender,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.city
0,"[{'1st innings': {'team': 'Cyprus', 'deliverie...",[2022-07-12],male,T20,Isle of Man,20,[G Burrows],"[Cyprus, Isle of Man]",field,Isle of Man,"[S Kuchimanchi, A Paleker]",Kerava National Cricket Ground,1,Kerava
1,"[{'1st innings': {'team': 'South Africa', 'del...",[2023-09-01],male,T20,Australia,20,[SA Abbott],"[South Africa, Australia]",field,Australia,"[A Paleker, Stephen Harris]","Kingsmead, Durban",2,Durban
2,"[{'1st innings': {'team': 'Sweden', 'deliverie...",[2025-05-26],female,T20,Italy,20,[D Nanayakkara],"[Sweden, Italy]",bat,Sweden,"[DH Mclean, M Prabhudesa]","Simar Cricket Ground, Rome",3,Rome
3,"[{'1st innings': {'team': 'South Africa', 'del...",[2017-06-21],male,T20,England,20,[JM Bairstow],"[England, South Africa]",bat,South Africa,"[RJ Bailey, RT Robinson]",The Rose Bowl,4,Southampton
4,"[{'1st innings': {'team': 'Hong Kong', 'delive...",[2022-04-27],female,T20,United Arab Emirates,20,[CR Seneviratna],"[Hong Kong, United Arab Emirates]",bat,Hong Kong,"[Aasif Iqbal, Shameem Abdul Basheer]","Malek Cricket Ground, Ajman",5,Ajman
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4978,"[{'1st innings': {'team': 'Indonesia', 'delive...",[2025-07-10],male,T20,Philippines,20,[KD Kesuma],"[Indonesia, Philippines]",bat,Indonesia,"[F Shony, S Hawoe]",Udayana Cricket Ground,4979,Bali
4979,"[{'1st innings': {'team': 'Philippines', 'deli...",[2024-10-05],male,T20,Philippines,20,[KDA Lukies],"[Philippines, South Korea]",bat,Philippines,"[Suresh Subramanian, Tanvir Ahmed]","Yeonhui Cricket Ground, Incheon",4980,Incheon
4980,"[{'1st innings': {'team': 'England', 'deliveri...",[2024-06-15],male,T20,England,20,[HC Brook],"[England, Namibia]",field,Namibia,"[AT Holdstock, L Rusere]","Sir Vivian Richards Stadium, North Sound, Antigua",4981,North Sound
4981,"[{'1st innings': {'team': 'Singapore', 'delive...",[2024-12-27],female,T20,Philippines,20,[AR Valdez],"[Singapore, Philippines]",field,Philippines,"[KK Ghosh, KSVP Venu Madhav]",Singapore National Cricket Ground,4982,Singapore


In [61]:
final_df['info.gender'].value_counts()

Unnamed: 0_level_0,count
info.gender,Unnamed: 1_level_1
male,3140
female,1843


In [62]:
final_df = final_df[final_df['info.gender'] == 'male']
final_df.drop(columns=['info.gender'],inplace=True)
final_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(columns=['info.gender'],inplace=True)


Unnamed: 0,innings,info.dates,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.city
0,"[{'1st innings': {'team': 'Cyprus', 'deliverie...",[2022-07-12],T20,Isle of Man,20,[G Burrows],"[Cyprus, Isle of Man]",field,Isle of Man,"[S Kuchimanchi, A Paleker]",Kerava National Cricket Ground,1,Kerava
1,"[{'1st innings': {'team': 'South Africa', 'del...",[2023-09-01],T20,Australia,20,[SA Abbott],"[South Africa, Australia]",field,Australia,"[A Paleker, Stephen Harris]","Kingsmead, Durban",2,Durban
3,"[{'1st innings': {'team': 'South Africa', 'del...",[2017-06-21],T20,England,20,[JM Bairstow],"[England, South Africa]",bat,South Africa,"[RJ Bailey, RT Robinson]",The Rose Bowl,4,Southampton
7,"[{'1st innings': {'team': 'Pakistan', 'deliver...",[2010-09-07],T20,England,20,[TT Bresnan],"[England, Pakistan]",bat,Pakistan,"[IJ Gould, RK Illingworth]",Sophia Gardens,8,Cardiff
8,"[{'1st innings': {'team': 'Australia', 'delive...",[2011-01-12],T20,England,20,[SR Watson],"[Australia, England]",bat,Australia,"[SD Fry, BNJ Oxenford]",Adelaide Oval,9,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4976,"[{'1st innings': {'team': 'South Africa', 'del...",[2021-04-10],T20,Pakistan,20,[Mohammad Rizwan],"[South Africa, Pakistan]",bat,South Africa,"[AT Holdstock, A Paleker]","The Wanderers Stadium, Johannesburg",4977,Johannesburg
4977,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",[2022-10-13],T20,Pakistan,20,[Mohammad Rizwan],"[Bangladesh, Pakistan]",bat,Bangladesh,"[SB Haig, WR Knights]","Hagley Oval, Christchurch",4978,Christchurch
4978,"[{'1st innings': {'team': 'Indonesia', 'delive...",[2025-07-10],T20,Philippines,20,[KD Kesuma],"[Indonesia, Philippines]",bat,Indonesia,"[F Shony, S Hawoe]",Udayana Cricket Ground,4979,Bali
4979,"[{'1st innings': {'team': 'Philippines', 'deli...",[2024-10-05],T20,Philippines,20,[KDA Lukies],"[Philippines, South Korea]",bat,Philippines,"[Suresh Subramanian, Tanvir Ahmed]","Yeonhui Cricket Ground, Incheon",4980,Incheon


In [63]:
final_df['info.match_type'].value_counts()

Unnamed: 0_level_0,count
info.match_type,Unnamed: 1_level_1
T20,3140


In [64]:
final_df['info.overs'].value_counts()

Unnamed: 0_level_0,count
info.overs,Unnamed: 1_level_1
20,3132
50,8


In [65]:
final_df = final_df[final_df['info.overs'] == 20]
final_df.drop(columns=['info.overs','info.match_type'],inplace=True)
final_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(columns=['info.overs','info.match_type'],inplace=True)


Unnamed: 0,innings,info.dates,info.outcome.winner,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.city
0,"[{'1st innings': {'team': 'Cyprus', 'deliverie...",[2022-07-12],Isle of Man,[G Burrows],"[Cyprus, Isle of Man]",field,Isle of Man,"[S Kuchimanchi, A Paleker]",Kerava National Cricket Ground,1,Kerava
1,"[{'1st innings': {'team': 'South Africa', 'del...",[2023-09-01],Australia,[SA Abbott],"[South Africa, Australia]",field,Australia,"[A Paleker, Stephen Harris]","Kingsmead, Durban",2,Durban
3,"[{'1st innings': {'team': 'South Africa', 'del...",[2017-06-21],England,[JM Bairstow],"[England, South Africa]",bat,South Africa,"[RJ Bailey, RT Robinson]",The Rose Bowl,4,Southampton
7,"[{'1st innings': {'team': 'Pakistan', 'deliver...",[2010-09-07],England,[TT Bresnan],"[England, Pakistan]",bat,Pakistan,"[IJ Gould, RK Illingworth]",Sophia Gardens,8,Cardiff
8,"[{'1st innings': {'team': 'Australia', 'delive...",[2011-01-12],England,[SR Watson],"[Australia, England]",bat,Australia,"[SD Fry, BNJ Oxenford]",Adelaide Oval,9,
...,...,...,...,...,...,...,...,...,...,...,...
4976,"[{'1st innings': {'team': 'South Africa', 'del...",[2021-04-10],Pakistan,[Mohammad Rizwan],"[South Africa, Pakistan]",bat,South Africa,"[AT Holdstock, A Paleker]","The Wanderers Stadium, Johannesburg",4977,Johannesburg
4977,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",[2022-10-13],Pakistan,[Mohammad Rizwan],"[Bangladesh, Pakistan]",bat,Bangladesh,"[SB Haig, WR Knights]","Hagley Oval, Christchurch",4978,Christchurch
4978,"[{'1st innings': {'team': 'Indonesia', 'delive...",[2025-07-10],Philippines,[KD Kesuma],"[Indonesia, Philippines]",bat,Indonesia,"[F Shony, S Hawoe]",Udayana Cricket Ground,4979,Bali
4979,"[{'1st innings': {'team': 'Philippines', 'deli...",[2024-10-05],Philippines,[KDA Lukies],"[Philippines, South Korea]",bat,Philippines,"[Suresh Subramanian, Tanvir Ahmed]","Yeonhui Cricket Ground, Incheon",4980,Incheon


In [66]:
import pickle

with open("dataset_level1.pkl", "wb") as f:
    pickle.dump(final_df, f)

In [67]:
with open("dataset_level1.pkl", "rb") as f:
    matches = pickle.load(f)
matches.iloc[0]['innings'][0]['1st innings']['deliveries']

[{0.1: {'bowler': 'CJ Langford',
   'runs': {'extras': 0, 'total': 0, 'batsman': 0},
   'non_striker': 'AUKDS Kalugala',
   'batsman': 'Muhammad Shoaib Ahmed'}},
 {0.2: {'bowler': 'CJ Langford',
   'runs': {'extras': 0, 'total': 0, 'batsman': 0},
   'non_striker': 'AUKDS Kalugala',
   'batsman': 'Muhammad Shoaib Ahmed'}},
 {0.3: {'bowler': 'CJ Langford',
   'runs': {'extras': 0, 'total': 4, 'batsman': 4},
   'non_striker': 'AUKDS Kalugala',
   'batsman': 'Muhammad Shoaib Ahmed'}},
 {0.4: {'bowler': 'CJ Langford',
   'runs': {'extras': 0, 'total': 1, 'batsman': 1},
   'non_striker': 'AUKDS Kalugala',
   'batsman': 'Muhammad Shoaib Ahmed'}},
 {0.5: {'non_striker': 'Muhammad Shoaib Ahmed',
   'bowler': 'CJ Langford',
   'runs': {'extras': 0, 'total': 0, 'batsman': 0},
   'batsman': 'AUKDS Kalugala'}},
 {0.6: {'non_striker': 'Muhammad Shoaib Ahmed',
   'bowler': 'CJ Langford',
   'runs': {'extras': 0, 'total': 0, 'batsman': 0},
   'batsman': 'AUKDS Kalugala'}},
 {1.1: {'non_striker': 'AUKD

In [68]:
count = 1
delivery_df = pd.DataFrame()
for index, row in matches.iterrows():
    # The problematic match IDs below seem to cause issues due to inconsistent structure.
    # Skipping these specific matches to avoid errors and proceed with the rest of the data.
    if count in [75,108,150,180,268,360,443,458,584,748,982,1052,1111,1226,1345]:
        count+=1
        continue

    ball_of_match = []
    batsman = []
    bowler = []
    runs = []
    player_of_dismissed = []
    teams = []
    batting_team = []
    match_id = []
    city = []
    venue = []

    # Accessing the first innings deliveries. Ensure 'innings' is a list of dicts.
    first_innings_data = row['innings'][0]
    deliveries = first_innings_data['1st innings']['deliveries']

    for ball in deliveries:
        for key in ball.keys():
            match_id.append(count)
            batting_team.append(first_innings_data['1st innings']['team'])
            teams.append(row['info.teams'])
            ball_of_match.append(key)
            batsman.append(ball[key]['batsman'])
            bowler.append(ball[key]['bowler'])
            runs.append(ball[key]['runs']['total'])
            city.append(row['info.city'])
            venue.append(row['info.venue'])
            try:
                player_of_dismissed.append(ball[key]['wicket']['player_out'])
            except:
                player_of_dismissed.append('0')
    loop_df = pd.DataFrame({
            'match_id':match_id,
            'teams':teams,
            'batting_team':batting_team,
            'ball':ball_of_match,
            'batsman':batsman,
            'bowler':bowler,
            'runs':runs,
            'player_dismissed':player_of_dismissed,
            'city':city,
            'venue':venue
        })
    # Using pd.concat for efficiency instead of append in loop
    delivery_df = pd.concat([delivery_df, loop_df], ignore_index=True)
    count+=1


In [69]:
def bowl(row):
    for team in row['teams']:
        if team != row['batting_team']:
            return team

In [70]:
delivery_df['bowling_team'] = delivery_df.apply(bowl,axis=1)

In [71]:
delivery_df

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,1,"[Cyprus, Isle of Man]",Cyprus,0.1,Muhammad Shoaib Ahmed,CJ Langford,0,0,Kerava,Kerava National Cricket Ground,Isle of Man
1,1,"[Cyprus, Isle of Man]",Cyprus,0.2,Muhammad Shoaib Ahmed,CJ Langford,0,0,Kerava,Kerava National Cricket Ground,Isle of Man
2,1,"[Cyprus, Isle of Man]",Cyprus,0.3,Muhammad Shoaib Ahmed,CJ Langford,4,0,Kerava,Kerava National Cricket Ground,Isle of Man
3,1,"[Cyprus, Isle of Man]",Cyprus,0.4,Muhammad Shoaib Ahmed,CJ Langford,1,0,Kerava,Kerava National Cricket Ground,Isle of Man
4,1,"[Cyprus, Isle of Man]",Cyprus,0.5,AUKDS Kalugala,CJ Langford,0,0,Kerava,Kerava National Cricket Ground,Isle of Man
...,...,...,...,...,...,...,...,...,...,...,...
376869,3132,"[England, Namibia]",England,9.2,MM Ali,R Trumpelmann,0,MM Ali,North Sound,"Sir Vivian Richards Stadium, North Sound, Antigua",Namibia
376870,3132,"[England, Namibia]",England,9.3,LS Livingstone,R Trumpelmann,6,0,North Sound,"Sir Vivian Richards Stadium, North Sound, Antigua",Namibia
376871,3132,"[England, Namibia]",England,9.4,LS Livingstone,R Trumpelmann,6,0,North Sound,"Sir Vivian Richards Stadium, North Sound, Antigua",Namibia
376872,3132,"[England, Namibia]",England,9.5,LS Livingstone,R Trumpelmann,2,0,North Sound,"Sir Vivian Richards Stadium, North Sound, Antigua",Namibia


In [72]:
delivery_df.drop(columns=['teams'],inplace=True)

In [73]:
delivery_df['batting_team'].value_counts()

Unnamed: 0_level_0,count
batting_team,Unnamed: 1_level_1
Pakistan,18158
India,17887
New Zealand,16059
Sri Lanka,14036
South Africa,13627
...,...
Ivory Coast,310
Chile,261
Turks and Caicos Island,254
China,218


In [75]:
top20_teams = delivery_df['batting_team'].value_counts().head(20)

In [80]:
top20_teams

['Pakistan',
 'India',
 'New Zealand',
 'Sri Lanka',
 'South Africa',
 'West Indies',
 'England',
 'Bangladesh',
 'Zimbabwe',
 'Australia',
 'Ireland',
 'Hong Kong',
 'Netherlands',
 'United Arab Emirates',
 'Malaysia',
 'Nigeria',
 'Uganda',
 'Bahrain',
 'Scotland',
 'Nepal']

In [81]:
teams=['Pakistan',
 'India',
 'New Zealand',
 'Sri Lanka',
 'South Africa',
 'West Indies',
 'England',
 'Bangladesh',
 'Zimbabwe',
 'Australia',
 'Ireland',
 'Hong Kong',
 'Netherlands',
 'United Arab Emirates',
 'Malaysia',
 'Nigeria',
 'Uganda',
 'Bahrain',
 'Scotland',
 'Nepal']

In [82]:
delivery_df = delivery_df[delivery_df['batting_team'].isin(teams)]
delivery_df = delivery_df[delivery_df['bowling_team'].isin(teams)]

In [83]:
delivery_df

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
125,2,South Africa,0.1,RR Hendricks,AM Hardie,1,0,Durban,"Kingsmead, Durban",Australia
126,2,South Africa,0.2,T Bavuma,AM Hardie,4,0,Durban,"Kingsmead, Durban",Australia
127,2,South Africa,0.3,T Bavuma,AM Hardie,0,0,Durban,"Kingsmead, Durban",Australia
128,2,South Africa,0.4,T Bavuma,AM Hardie,0,0,Durban,"Kingsmead, Durban",Australia
129,2,South Africa,0.5,T Bavuma,AM Hardie,4,0,Durban,"Kingsmead, Durban",Australia
...,...,...,...,...,...,...,...,...,...,...
376563,3129,Bangladesh,19.2,Nurul Hasan,Mohammad Wasim,0,0,Christchurch,"Hagley Oval, Christchurch",Pakistan
376564,3129,Bangladesh,19.3,Nurul Hasan,Mohammad Wasim,0,0,Christchurch,"Hagley Oval, Christchurch",Pakistan
376565,3129,Bangladesh,19.4,Nurul Hasan,Mohammad Wasim,0,Afif Hossain,Christchurch,"Hagley Oval, Christchurch",Pakistan
376566,3129,Bangladesh,19.5,Mohammad Saifuddin,Mohammad Wasim,1,0,Christchurch,"Hagley Oval, Christchurch",Pakistan


In [84]:
output = delivery_df[['match_id','batting_team','bowling_team','ball','runs','player_dismissed','city','venue']]

In [85]:
output

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
125,2,South Africa,Australia,0.1,1,0,Durban,"Kingsmead, Durban"
126,2,South Africa,Australia,0.2,4,0,Durban,"Kingsmead, Durban"
127,2,South Africa,Australia,0.3,0,0,Durban,"Kingsmead, Durban"
128,2,South Africa,Australia,0.4,0,0,Durban,"Kingsmead, Durban"
129,2,South Africa,Australia,0.5,4,0,Durban,"Kingsmead, Durban"
...,...,...,...,...,...,...,...,...
376563,3129,Bangladesh,Pakistan,19.2,0,0,Christchurch,"Hagley Oval, Christchurch"
376564,3129,Bangladesh,Pakistan,19.3,0,0,Christchurch,"Hagley Oval, Christchurch"
376565,3129,Bangladesh,Pakistan,19.4,0,Afif Hossain,Christchurch,"Hagley Oval, Christchurch"
376566,3129,Bangladesh,Pakistan,19.5,1,0,Christchurch,"Hagley Oval, Christchurch"


In [86]:
import pickle

with open("dataset_level2.pkl", "wb") as f:
    pickle.dump(output, f)

In [87]:
teams

['Pakistan',
 'India',
 'New Zealand',
 'Sri Lanka',
 'South Africa',
 'West Indies',
 'England',
 'Bangladesh',
 'Zimbabwe',
 'Australia',
 'Ireland',
 'Hong Kong',
 'Netherlands',
 'United Arab Emirates',
 'Malaysia',
 'Nigeria',
 'Uganda',
 'Bahrain',
 'Scotland',
 'Nepal']

In [91]:
import pandas as pd
pd.Series(city).unique()

array(['North Sound'], dtype=object)

To access files in your Google Drive, you first need to mount your Drive to this Colab environment. This will allow you to navigate your Drive folders and files.

After executing the above cell and following the authentication steps, your Google Drive will be mounted at `/content/drive`. You can then list the contents of your Drive using shell commands like `!ls /content/drive/MyDrive`.