In [1]:
import numpy as np
import pandas as pd
from yaml import safe_load
import os
from tqdm import tqdm
import json

In [2]:
folder_path = 'Data'
filenames = []

for file in os.listdir(folder_path):
    filenames.append(os.path.join(folder_path, file))

In [3]:
filenames[0:5]

['Data\\.ipynb_checkpoints',
 'Data\\1001349.yaml',
 'Data\\1001351.yaml',
 'Data\\1001353.yaml',
 'Data\\1004729.yaml']

In [7]:
len(filenames)

4343

In [8]:
final_df_list = []
counter = 1

for file in tqdm(filenames):
    try:
        with open(file, 'r', encoding='utf-8') as f:
            data = safe_load(f)
            if isinstance(data, dict):  # make sure it's a valid dict
                df = pd.json_normalize(data)
                df['match_id'] = counter
                final_df_list.append(df)
                counter += 1
            else:
                print(f"Skipping unsupported data type in file: {file}")
    except Exception as e:
        print(f"Skipping file due to error: {file} ({e})")

  0%|                                                                                         | 0/4343 [00:00<?, ?it/s]

Skipping file due to error: Data\.ipynb_checkpoints ([Errno 13] Permission denied: 'Data\\.ipynb_checkpoints')


100%|██████████████████████████████████████████████████████████████████████████████| 4343/4343 [14:17<00:00,  5.06it/s]


In [9]:
batch_size = 100  # adjust this (e.g., 500 or 1000)

batches = [final_df_list[i:i+batch_size] for i in range(0, len(final_df_list), batch_size)]

dfs = []
for batch in tqdm(batches, desc="Concatenating batches"):
    dfs.append(pd.concat(batch, ignore_index=True))

# Final concat of all batches
final_df = pd.concat(dfs, ignore_index=True)
print("✅ Final dataframe shape:", final_df.shape)

Concatenating batches: 100%|███████████████████████████████████████████████████████████| 44/44 [01:05<00:00,  1.50s/it]


✅ Final dataframe shape: (4342, 7434)


In [10]:
final_df

Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.balls_per_over,info.dates,info.gender,info.match_type,info.outcome.by.wickets,info.outcome.winner,...,info.registry.people.Muhammad Kaleem,info.registry.people.Usman Mushtaq,info.registry.people.Mohammad Shahid,info.registry.people.Muktar Ali,info.registry.people.Mohammad Usman,info.registry.people.VS Wategaonkar,info.registry.people.Farhan Ahmed,info.registry.people.Khurram Manzoor,info.registry.people.P Negi,info.registry.people.Hafiz Qaleem
0,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-02-18,2,6,[2017-02-17],male,T20,5.0,Sri Lanka,...,,,,,,,,,,
1,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-02-19,2,6,[2017-02-19],male,T20,2.0,Sri Lanka,...,,,,,,,,,,
2,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-02-23,1,6,[2017-02-22],male,T20,,Australia,...,,,,,,,,,,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",0.91,2016-09-12,1,6,[2016-09-05],male,T20,,Hong Kong,...,,,,,,,,,,
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",0.91,2016-06-19,1,6,[2016-06-18],male,T20,,Zimbabwe,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4337,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",0.91,2016-03-05,2,6,[2016-03-04],male,T20,6.0,Pakistan,...,,,,,,,,,,
4338,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",0.91,2016-03-08,1,6,[2016-03-06],male,T20,8.0,India,...,,,,,,,,,,
4339,"[{'1st innings': {'team': 'Netherlands', 'deli...",0.91,2016-02-03,1,6,[2016-02-03],male,T20,,Netherlands,...,,,,,cee89f44,,927694f7,,,f566cd7d
4340,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2016-09-12,1,6,[2016-09-06],male,T20,,Australia,...,,,,,,,,,,


In [12]:
backup = final_df.copy()

In [14]:
final_df.drop(columns=[
    'meta.data_version',
    'meta.created',
    'meta.revision',
    'info.outcome.bowl_out',
    'info.bowl_out',
    'info.supersubs.South Africa',
    'info.supersubs.New Zealand',
    'info.outcome.eliminator',
    'info.outcome.result',
    'info.outcome.method',
    'info.neutral_venue',
    'info.match_type_number',
    'info.outcome.by.runs',
    'info.outcome.by.wickets'
],inplace=True)

In [15]:
final_df.columns

Index(['innings', 'info.balls_per_over', 'info.dates', 'info.gender',
       'info.match_type', 'info.outcome.winner', 'info.overs',
       'info.player_of_match', 'info.players.Australia',
       'info.players.Sri Lanka',
       ...
       'info.registry.people.Muhammad Kaleem',
       'info.registry.people.Usman Mushtaq',
       'info.registry.people.Mohammad Shahid',
       'info.registry.people.Muktar Ali',
       'info.registry.people.Mohammad Usman',
       'info.registry.people.VS Wategaonkar',
       'info.registry.people.Farhan Ahmed',
       'info.registry.people.Khurram Manzoor', 'info.registry.people.P Negi',
       'info.registry.people.Hafiz Qaleem'],
      dtype='object', length=7420)

In [16]:
final_df.shape

(4342, 7420)

In [18]:
final_df['info.gender'].value_counts()

info.gender
male      2761
female    1581
Name: count, dtype: int64

In [19]:
final_df = final_df[final_df['info.gender'] == 'male']
final_df.drop(columns=['info.gender'],inplace=True)
final_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(columns=['info.gender'],inplace=True)


Unnamed: 0,innings,info.balls_per_over,info.dates,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.players.Australia,info.players.Sri Lanka,info.registry.people.A Zampa,...,info.registry.people.Muhammad Kaleem,info.registry.people.Usman Mushtaq,info.registry.people.Mohammad Shahid,info.registry.people.Muktar Ali,info.registry.people.Mohammad Usman,info.registry.people.VS Wategaonkar,info.registry.people.Farhan Ahmed,info.registry.people.Khurram Manzoor,info.registry.people.P Negi,info.registry.people.Hafiz Qaleem
0,"[{'1st innings': {'team': 'Australia', 'delive...",6,[2017-02-17],T20,Sri Lanka,20,[DAS Gunaratne],"[AJ Finch, M Klinger, TM Head, MC Henriques, A...","[N Dickwella, WU Tharanga, EMDY Munaweera, DAS...",14f96089,...,,,,,,,,,,
1,"[{'1st innings': {'team': 'Australia', 'delive...",6,[2017-02-19],T20,Sri Lanka,20,[DAS Gunaratne],"[M Klinger, AJ Finch, BR Dunk, MC Henriques, T...","[N Dickwella, WU Tharanga, EMDY Munaweera, BKG...",,...,,,,,,,,,,
2,"[{'1st innings': {'team': 'Australia', 'delive...",6,[2017-02-22],T20,Australia,20,[A Zampa],"[AJ Finch, M Klinger, BR Dunk, TM Head, MC Hen...","[EMDY Munaweera, WU Tharanga, BKG Mendis, DAS ...",14f96089,...,,,,,,,,,,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",6,[2016-09-05],T20,Hong Kong,20,,,,,...,,,,,,,,,,
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",6,[2016-06-18],T20,Zimbabwe,20,[E Chigumbura],,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4337,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",6,[2016-03-04],T20,Pakistan,20,[Umar Akmal],,"[LD Chandimal, TM Dilshan, GSNFG Jayasuriya, C...",,...,,,,,,,,,,
4338,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",6,[2016-03-06],T20,India,20,[S Dhawan],,,,...,,,,,,,,,,
4339,"[{'1st innings': {'team': 'Netherlands', 'deli...",6,[2016-02-03],T20,Netherlands,20,[Mudassar Bukhari],,,,...,,,,,cee89f44,,927694f7,,,f566cd7d
4340,"[{'1st innings': {'team': 'Australia', 'delive...",6,[2016-09-06],T20,Australia,20,[GJ Maxwell],"[GJ Maxwell, DA Warner, UT Khawaja, TM Head, G...","[MDKJ Perera, TM Dilshan, DM de Silva, LD Chan...",14f96089,...,,,,,,,,,,


In [23]:
cols=['innings', 'info.dates'	,'info.match_type'	,'info.outcome.winner',	'info.overs',	'info.player_of_match',	'info.teams'	,'info.toss.decision'	,'info.toss.winner',	'info.umpires'	,'info.venue'	,'match_id'	,'info.city']

In [24]:
len(cols)

13

In [25]:
final_df = final_df[cols]

In [26]:
final_df

Unnamed: 0,innings,info.dates,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.city
0,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-17],T20,Sri Lanka,20,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Melbourne Cricket Ground,1,
1,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-19],T20,Sri Lanka,20,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[SD Fry, SJ Nogajski]","Simonds Stadium, South Geelong",2,Victoria
2,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-22],T20,Australia,20,[A Zampa],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Adelaide Oval,3,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",[2016-09-05],T20,Hong Kong,20,,"[Ireland, Hong Kong]",bat,Hong Kong,"[R Black, AJ Neill]","Bready Cricket Club, Magheramason",4,Londonderry
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",[2016-06-18],T20,Zimbabwe,20,[E Chigumbura],"[Zimbabwe, India]",field,India,"[TJ Matibiri, RB Tiffin]",Harare Sports Club,5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4337,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",[2016-03-04],T20,Pakistan,20,[Umar Akmal],"[Pakistan, Sri Lanka]",field,Pakistan,"[AK Chaudhary, Enamul Haque]",Shere Bangla National Stadium,4338,Mirpur
4338,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",[2016-03-06],T20,India,20,[S Dhawan],"[Bangladesh, India]",field,India,"[RSA Palliyaguruge, Shozab Raza]",Shere Bangla National Stadium,4339,Mirpur
4339,"[{'1st innings': {'team': 'Netherlands', 'deli...",[2016-02-03],T20,Netherlands,20,[Mudassar Bukhari],"[United Arab Emirates, Netherlands]",field,United Arab Emirates,"[CK Nandan, Sarika Prasad]",ICC Academy,4340,Dubai
4340,"[{'1st innings': {'team': 'Australia', 'delive...",[2016-09-06],T20,Australia,20,[GJ Maxwell],"[Sri Lanka, Australia]",field,Sri Lanka,"[REJ Martinesz, RR Wimalasiri]",Pallekele International Cricket Stadium,4341,


In [27]:
final_df['info.match_type'].value_counts()

info.match_type
T20    2761
Name: count, dtype: int64

In [28]:
final_df['info.overs'].value_counts()

info.overs
20    2755
50       6
Name: count, dtype: int64

In [29]:
final_df = final_df[final_df['info.overs'] == 20]
final_df.drop(columns=['info.overs','info.match_type'],inplace=True)
final_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(columns=['info.overs','info.match_type'],inplace=True)


Unnamed: 0,innings,info.dates,info.outcome.winner,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.city
0,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-17],Sri Lanka,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Melbourne Cricket Ground,1,
1,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-19],Sri Lanka,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[SD Fry, SJ Nogajski]","Simonds Stadium, South Geelong",2,Victoria
2,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-22],Australia,[A Zampa],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Adelaide Oval,3,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",[2016-09-05],Hong Kong,,"[Ireland, Hong Kong]",bat,Hong Kong,"[R Black, AJ Neill]","Bready Cricket Club, Magheramason",4,Londonderry
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",[2016-06-18],Zimbabwe,[E Chigumbura],"[Zimbabwe, India]",field,India,"[TJ Matibiri, RB Tiffin]",Harare Sports Club,5,
...,...,...,...,...,...,...,...,...,...,...,...
4337,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",[2016-03-04],Pakistan,[Umar Akmal],"[Pakistan, Sri Lanka]",field,Pakistan,"[AK Chaudhary, Enamul Haque]",Shere Bangla National Stadium,4338,Mirpur
4338,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",[2016-03-06],India,[S Dhawan],"[Bangladesh, India]",field,India,"[RSA Palliyaguruge, Shozab Raza]",Shere Bangla National Stadium,4339,Mirpur
4339,"[{'1st innings': {'team': 'Netherlands', 'deli...",[2016-02-03],Netherlands,[Mudassar Bukhari],"[United Arab Emirates, Netherlands]",field,United Arab Emirates,"[CK Nandan, Sarika Prasad]",ICC Academy,4340,Dubai
4340,"[{'1st innings': {'team': 'Australia', 'delive...",[2016-09-06],Australia,[GJ Maxwell],"[Sri Lanka, Australia]",field,Sri Lanka,"[REJ Martinesz, RR Wimalasiri]",Pallekele International Cricket Stadium,4341,


In [30]:
import pickle
pickle.dump(final_df,open('dataset_level1.pkl','wb'))

In [31]:
final_df.to_csv("final.csv")

In [37]:
#matches = pickle.load(open('dataset_level1.pkl','rb'))
matches.iloc[0]['innings'][0]['1st innings']['deliveries']

[{0.1: {'batsman': 'AJ Finch',
   'bowler': 'SL Malinga',
   'non_striker': 'M Klinger',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.2: {'batsman': 'AJ Finch',
   'bowler': 'SL Malinga',
   'non_striker': 'M Klinger',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.3: {'batsman': 'AJ Finch',
   'bowler': 'SL Malinga',
   'non_striker': 'M Klinger',
   'runs': {'batsman': 1, 'extras': 0, 'total': 1}}},
 {0.4: {'batsman': 'M Klinger',
   'bowler': 'SL Malinga',
   'non_striker': 'AJ Finch',
   'runs': {'batsman': 2, 'extras': 0, 'total': 2}}},
 {0.5: {'batsman': 'M Klinger',
   'bowler': 'SL Malinga',
   'non_striker': 'AJ Finch',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.6: {'batsman': 'M Klinger',
   'bowler': 'SL Malinga',
   'non_striker': 'AJ Finch',
   'runs': {'batsman': 3, 'extras': 0, 'total': 3}}},
 {1.1: {'batsman': 'M Klinger',
   'bowler': 'KMDN Kulasekara',
   'non_striker': 'AJ Finch',
   'runs': {'batsman': 0, 'extras': 0, 'total': 

In [43]:
print(matches['info.city'].iloc[8])

Mount Maunganui


In [47]:
skip_counts = {75, 108, 150, 180, 268, 360, 443, 458, 584, 748, 982, 1052, 1111, 1226, 1345}

all_rows = []

for count, (_, row) in enumerate(matches.iterrows(), start=1):
    if count in skip_counts:
        continue

    try:
        innings = row['innings'][0]['1st innings']
        deliveries = innings['deliveries']
        batting_team = innings['team']
        teams = row['info.teams']

        city = row['info.city'] if pd.notna(row['info.city']) else None
        venue = row['info.venue'] if pd.notna(row['info.venue']) else None

        for delivery in deliveries:
            for ball_number, ball_data in delivery.items():
                # handle wicket info safely
                wicket_info = ball_data.get('wicket', None)
                if isinstance(wicket_info, dict):
                    player_out = wicket_info.get('player_out', '0')
                else:
                    player_out = '0'

                all_rows.append({
                    'match_id': count,
                    'teams': teams,
                    'batting_team': batting_team,
                    'ball': ball_number,
                    'batsman': ball_data['batsman'],
                    'bowler': ball_data['bowler'],
                    'runs': ball_data['runs']['total'],
                    'player_dismissed': player_out,
                    'city': city,
                    'venue': venue
                })
    except Exception as e:
        print(f"⚠️ Skipping match_id={count} due to error: {e}")
        continue

# create the final DataFrame once
delivery_df = pd.DataFrame(all_rows)
print(f"✅ delivery_df shape: {delivery_df.shape}")


✅ delivery_df shape: (331544, 10)


In [48]:
delivery_df

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue
0,1,"[Australia, Sri Lanka]",Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground
1,1,"[Australia, Sri Lanka]",Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground
2,1,"[Australia, Sri Lanka]",Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground
3,1,"[Australia, Sri Lanka]",Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground
4,1,"[Australia, Sri Lanka]",Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground
...,...,...,...,...,...,...,...,...,...,...
331539,2755,"[Sri Lanka, Australia]",Sri Lanka,19.3,SMSM Senanayake,MA Starc,1,0,Colombo,R Premadasa Stadium
331540,2755,"[Sri Lanka, Australia]",Sri Lanka,19.4,DM de Silva,MA Starc,0,0,Colombo,R Premadasa Stadium
331541,2755,"[Sri Lanka, Australia]",Sri Lanka,19.5,DM de Silva,MA Starc,0,DM de Silva,Colombo,R Premadasa Stadium
331542,2755,"[Sri Lanka, Australia]",Sri Lanka,19.6,SMSM Senanayake,MA Starc,2,0,Colombo,R Premadasa Stadium


In [49]:
def bowl(row):
    for i in row['teams']:
        if(i != row['batting_team']):
            return i

In [52]:
delivery_df['bowling_team']=delivery_df.apply(bowl,axis=1)

In [53]:
delivery_df.head()

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,1,"[Australia, Sri Lanka]",Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
1,1,"[Australia, Sri Lanka]",Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
2,1,"[Australia, Sri Lanka]",Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground,Sri Lanka
3,1,"[Australia, Sri Lanka]",Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground,Sri Lanka
4,1,"[Australia, Sri Lanka]",Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka


In [55]:
delivery_df.drop(columns=['teams'],inplace=True)

In [56]:
delivery_df.head()

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,1,Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
1,1,Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
2,1,Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground,Sri Lanka
3,1,Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground,Sri Lanka
4,1,Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka


In [58]:
delivery_df['batting_team'].unique()

array(['Australia', 'Hong Kong', 'Zimbabwe', 'India', 'Bangladesh',
       'New Zealand', 'South Africa', 'England', 'West Indies',
       'Pakistan', 'Scotland', 'Oman', 'Ireland', 'Papua New Guinea',
       'United Arab Emirates', 'Sri Lanka', 'Netherlands', 'Nepal',
       'Vanuatu', 'Philippines', 'United States of America', 'Germany',
       'Ghana', 'Uganda', 'Kenya', 'Namibia', 'Nigeria', 'Botswana',
       'Guernsey', 'Denmark', 'Jersey', 'Italy', 'Norway', 'Thailand',
       'Malaysia', 'Maldives', 'Singapore', 'Kuwait', 'Bermuda', 'Canada',
       'Cayman Islands', 'Portugal', 'Gibraltar', 'Spain', 'Bhutan',
       'Qatar', 'Iran', 'Belgium', 'Isle of Man', 'Bulgaria', 'Romania',
       'Luxembourg', 'Austria', 'Czech Republic', 'Greece', 'Serbia',
       'Malta', 'France', 'Sweden', 'Rwanda', 'Finland', 'Hungary',
       'Estonia', 'Cyprus', 'Switzerland', 'Seychelles', 'Malawi',
       'Lesotho', 'Swaziland', 'Tanzania', 'Mozambique', 'Sierra Leone',
       'Cameroon', 'Bah

In [59]:
teams = [
    'Australia',
    'India',
    'Bangladesh',
    'New Zealand',
    'South Africa',
    'England',
    'West Indies',
    'Afghanistan',
    'Pakistan',
    'Sri Lanka',
    'Zimbabwe'
]

In [60]:
delivery_df = delivery_df[delivery_df['batting_team'].isin(teams)]
delivery_df = delivery_df[delivery_df['bowling_team'].isin(teams)]

In [64]:
delivery_df

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,1,Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
1,1,Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
2,1,Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground,Sri Lanka
3,1,Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground,Sri Lanka
4,1,Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...
331539,2755,Sri Lanka,19.3,SMSM Senanayake,MA Starc,1,0,Colombo,R Premadasa Stadium,Australia
331540,2755,Sri Lanka,19.4,DM de Silva,MA Starc,0,0,Colombo,R Premadasa Stadium,Australia
331541,2755,Sri Lanka,19.5,DM de Silva,MA Starc,0,DM de Silva,Colombo,R Premadasa Stadium,Australia
331542,2755,Sri Lanka,19.6,SMSM Senanayake,MA Starc,2,0,Colombo,R Premadasa Stadium,Australia


In [63]:
output = delivery_df[['match_id','batting_team','bowling_team','ball','runs','player_dismissed','city','venue']]

In [65]:
output

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,1,Australia,Sri Lanka,0.1,0,0,,Melbourne Cricket Ground
1,1,Australia,Sri Lanka,0.2,0,0,,Melbourne Cricket Ground
2,1,Australia,Sri Lanka,0.3,1,0,,Melbourne Cricket Ground
3,1,Australia,Sri Lanka,0.4,2,0,,Melbourne Cricket Ground
4,1,Australia,Sri Lanka,0.5,0,0,,Melbourne Cricket Ground
...,...,...,...,...,...,...,...,...
331539,2755,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium
331540,2755,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium
331541,2755,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium
331542,2755,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium


In [66]:
pickle.dump(output,open('dataset_level2.pkl','wb'))

In [67]:
output.to_csv("dataset.csv")