In [66]:
import numpy as np
import pandas as pd
from yaml import safe_load
import os
from tqdm import tqdm

In [67]:
filenames = []
for file in os.listdir('ipl'):
    filenames.append(os.path.join('ipl',file))

In [68]:
filenames[0:5]

['ipl\\1082591.yaml',
 'ipl\\1082592.yaml',
 'ipl\\1082593.yaml',
 'ipl\\1082594.yaml',
 'ipl\\1082595.yaml']

In [69]:
from yaml import safe_load, safe_load_all
import json
def load_records_from_text(text: str):
    text = (text or "").strip()
    if not text:
        return []

    # Try JSON Lines (NDJSON)
    try:
        lines = [json.loads(line) for line in text.splitlines() if line.strip()]
        if lines and all(isinstance(x, dict) for x in lines):
            return lines
    except Exception:
        pass

    # Try JSON array/object
    try:
        obj = json.loads(text)
        if isinstance(obj, list):
            return obj
        if isinstance(obj, dict):
            return [obj]
    except Exception:
        pass

    # Try YAML (single/multi-doc)
    try:
        docs = list(safe_load_all(text))  # works for 1 or many docs
        out = []
        for d in docs:
            if d is None:
                continue
            if isinstance(d, list):
                out.extend(d)
            elif isinstance(d, dict):
                out.append(d)
        if out:
            return out
    except Exception:
        pass

    return []

dfs = []
counter = 0

for file in tqdm(filenames):
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
    records = load_records_from_text(text)
    if not records:
        # Agar file empty ya unparsable ho, skip kar do
        continue

    df = pd.json_normalize(records)  # optionally: sep='.' for nested keys
    df['match_id'] = counter
    dfs.append(df)
    counter += 1

final_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

100%|██████████| 1170/1170 [04:42<00:00,  4.13it/s]


In [70]:
backup = final_df.copy()

In [71]:
final_df

Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.balls_per_over,info.city,info.competition,info.dates,info.gender,info.match_type,...,info.registry.people.B Aparajith,info.registry.people.GS Sandhu,info.players.Rising Pune Supergiants,info.registry.people.P Sahu,info.registry.people.KJ Abbott,info.registry.people.PSP Handscomb,info.registry.people.SM Boland,info.registry.people.UT Khawaja,info.registry.people.F Behardien,info.registry.people.ER Dwivedi
0,[{'1st innings': {'team': 'Sunrisers Hyderabad...,0.91,2017-04-06,1,6,Hyderabad,IPL,[2017-04-05],male,T20,...,,,,,,,,,,
1,"[{'1st innings': {'team': 'Mumbai Indians', 'd...",0.91,2017-04-07,1,6,Pune,IPL,[2017-04-06],male,T20,...,,,,,,,,,,
2,"[{'1st innings': {'team': 'Gujarat Lions', 'de...",0.91,2017-04-07,2,6,Rajkot,IPL,[2017-04-07],male,T20,...,,,,,,,,,,
3,[{'1st innings': {'team': 'Rising Pune Supergi...,0.91,2017-04-08,1,6,Indore,IPL,[2017-04-08],male,T20,...,,,,,,,,,,
4,[{'1st innings': {'team': 'Royal Challengers B...,0.91,2017-04-08,2,6,Bengaluru,IPL,[2017-04-08],male,T20,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1164,"[{'1st innings': {'team': 'Delhi Daredevils', ...",0.91,2016-05-23,1,6,Raipur,IPL,[2016-05-22],male,T20,...,,,,,,,,,,
1165,"[{'1st innings': {'team': 'Gujarat Lions', 'de...",0.91,2016-05-24,1,6,Bangalore,IPL,[2016-05-24],male,T20,...,,,,,,,,,,b274dbbd
1166,[{'1st innings': {'team': 'Sunrisers Hyderabad...,0.91,2016-05-25,1,6,Delhi,IPL,[2016-05-25],male,T20,...,,,,,,,,,,
1167,"[{'1st innings': {'team': 'Gujarat Lions', 'de...",0.91,2016-05-28,1,6,Delhi,IPL,[2016-05-27],male,T20,...,,,,,,,,,,b274dbbd


In [None]:
# 1. Define specific administrative columns to drop
cols_to_drop = [
    'meta.data_version', 
    'meta.created', 
    'meta.revision',
    'info.balls_per_over',
    'info.gender',
    'info.match_type',
    'info.competition'
]

# 2. Automatically find all 'registry' and 'players' columns (these are mostly NaN)
registry_cols = [col for col in final_df.columns if 'info.registry.people' in col]
player_list_cols = [col for col in final_df.columns if 'info.players' in col]

# 3. Combine and drop everything in one command
final_df.drop(columns=cols_to_drop + registry_cols + player_list_cols, inplace=True, errors='ignore')

# 4. Optional: View the clean columns remaining
print(final_df.columns)

Index(['innings', 'info.city', 'info.dates', 'info.outcome.by.runs',
       'info.outcome.winner', 'info.overs', 'info.player_of_match',
       'info.teams', 'info.toss.decision', 'info.toss.winner', 'info.umpires',
       'info.venue', 'match_id', 'info.outcome.by.wickets',
       'info.outcome.eliminator', 'info.outcome.result', 'info.outcome.method',
       'info.neutral_venue'],
      dtype='object')


In [73]:
final_df

Unnamed: 0,innings,info.city,info.dates,info.outcome.by.runs,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.outcome.by.wickets,info.outcome.eliminator,info.outcome.result,info.outcome.method,info.neutral_venue
0,[{'1st innings': {'team': 'Sunrisers Hyderabad...,Hyderabad,[2017-04-05],35.0,Sunrisers Hyderabad,20,[Yuvraj Singh],"[Sunrisers Hyderabad, Royal Challengers Bangal...",field,Royal Challengers Bangalore,"[AY Dandekar, NJ Llong]","Rajiv Gandhi International Stadium, Uppal",0,,,,,
1,"[{'1st innings': {'team': 'Mumbai Indians', 'd...",Pune,[2017-04-06],,Rising Pune Supergiant,20,[SPD Smith],"[Rising Pune Supergiant, Mumbai Indians]",field,Rising Pune Supergiant,"[A Nand Kishore, S Ravi]",Maharashtra Cricket Association Stadium,1,7.0,,,,
2,"[{'1st innings': {'team': 'Gujarat Lions', 'de...",Rajkot,[2017-04-07],,Kolkata Knight Riders,20,[CA Lynn],"[Gujarat Lions, Kolkata Knight Riders]",field,Kolkata Knight Riders,"[Nitin Menon, CK Nandan]",Saurashtra Cricket Association Stadium,2,10.0,,,,
3,[{'1st innings': {'team': 'Rising Pune Supergi...,Indore,[2017-04-08],,Kings XI Punjab,20,[GJ Maxwell],"[Kings XI Punjab, Rising Pune Supergiant]",field,Kings XI Punjab,"[AK Chaudhary, C Shamshuddin]",Holkar Cricket Stadium,3,6.0,,,,
4,[{'1st innings': {'team': 'Royal Challengers B...,Bengaluru,[2017-04-08],15.0,Royal Challengers Bangalore,20,[KM Jadhav],"[Royal Challengers Bangalore, Delhi Daredevils]",bat,Royal Challengers Bangalore,"[S Ravi, VK Sharma]",M.Chinnaswamy Stadium,4,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1164,"[{'1st innings': {'team': 'Delhi Daredevils', ...",Raipur,[2016-05-22],,Royal Challengers Bangalore,20,[V Kohli],"[Delhi Daredevils, Royal Challengers Bangalore]",field,Royal Challengers Bangalore,"[A Nand Kishore, BNJ Oxenford]",Shaheed Veer Narayan Singh International Stadium,1164,6.0,,,,
1165,"[{'1st innings': {'team': 'Gujarat Lions', 'de...",Bangalore,[2016-05-24],,Royal Challengers Bangalore,20,[AB de Villiers],"[Gujarat Lions, Royal Challengers Bangalore]",field,Royal Challengers Bangalore,"[AK Chaudhary, HDPK Dharmasena]",M Chinnaswamy Stadium,1165,4.0,,,,
1166,[{'1st innings': {'team': 'Sunrisers Hyderabad...,Delhi,[2016-05-25],22.0,Sunrisers Hyderabad,20,[MC Henriques],"[Sunrisers Hyderabad, Kolkata Knight Riders]",field,Kolkata Knight Riders,"[M Erasmus, C Shamshuddin]",Feroz Shah Kotla,1166,,,,,
1167,"[{'1st innings': {'team': 'Gujarat Lions', 'de...",Delhi,[2016-05-27],,Sunrisers Hyderabad,20,[DA Warner],"[Gujarat Lions, Sunrisers Hyderabad]",field,Sunrisers Hyderabad,"[M Erasmus, CK Nandan]",Feroz Shah Kotla,1167,4.0,,,,


In [74]:

final_df.drop(columns=[
    'info.outcome.eliminator',
    'info.outcome.result',
    'info.outcome.method',
    'info.neutral_venue'
], inplace=True, errors='ignore')

print(final_df.columns)

Index(['innings', 'info.city', 'info.dates', 'info.outcome.by.runs',
       'info.outcome.winner', 'info.overs', 'info.player_of_match',
       'info.teams', 'info.toss.decision', 'info.toss.winner', 'info.umpires',
       'info.venue', 'match_id', 'info.outcome.by.wickets'],
      dtype='object')


In [75]:
final_df

Unnamed: 0,innings,info.city,info.dates,info.outcome.by.runs,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.outcome.by.wickets
0,[{'1st innings': {'team': 'Sunrisers Hyderabad...,Hyderabad,[2017-04-05],35.0,Sunrisers Hyderabad,20,[Yuvraj Singh],"[Sunrisers Hyderabad, Royal Challengers Bangal...",field,Royal Challengers Bangalore,"[AY Dandekar, NJ Llong]","Rajiv Gandhi International Stadium, Uppal",0,
1,"[{'1st innings': {'team': 'Mumbai Indians', 'd...",Pune,[2017-04-06],,Rising Pune Supergiant,20,[SPD Smith],"[Rising Pune Supergiant, Mumbai Indians]",field,Rising Pune Supergiant,"[A Nand Kishore, S Ravi]",Maharashtra Cricket Association Stadium,1,7.0
2,"[{'1st innings': {'team': 'Gujarat Lions', 'de...",Rajkot,[2017-04-07],,Kolkata Knight Riders,20,[CA Lynn],"[Gujarat Lions, Kolkata Knight Riders]",field,Kolkata Knight Riders,"[Nitin Menon, CK Nandan]",Saurashtra Cricket Association Stadium,2,10.0
3,[{'1st innings': {'team': 'Rising Pune Supergi...,Indore,[2017-04-08],,Kings XI Punjab,20,[GJ Maxwell],"[Kings XI Punjab, Rising Pune Supergiant]",field,Kings XI Punjab,"[AK Chaudhary, C Shamshuddin]",Holkar Cricket Stadium,3,6.0
4,[{'1st innings': {'team': 'Royal Challengers B...,Bengaluru,[2017-04-08],15.0,Royal Challengers Bangalore,20,[KM Jadhav],"[Royal Challengers Bangalore, Delhi Daredevils]",bat,Royal Challengers Bangalore,"[S Ravi, VK Sharma]",M.Chinnaswamy Stadium,4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1164,"[{'1st innings': {'team': 'Delhi Daredevils', ...",Raipur,[2016-05-22],,Royal Challengers Bangalore,20,[V Kohli],"[Delhi Daredevils, Royal Challengers Bangalore]",field,Royal Challengers Bangalore,"[A Nand Kishore, BNJ Oxenford]",Shaheed Veer Narayan Singh International Stadium,1164,6.0
1165,"[{'1st innings': {'team': 'Gujarat Lions', 'de...",Bangalore,[2016-05-24],,Royal Challengers Bangalore,20,[AB de Villiers],"[Gujarat Lions, Royal Challengers Bangalore]",field,Royal Challengers Bangalore,"[AK Chaudhary, HDPK Dharmasena]",M Chinnaswamy Stadium,1165,4.0
1166,[{'1st innings': {'team': 'Sunrisers Hyderabad...,Delhi,[2016-05-25],22.0,Sunrisers Hyderabad,20,[MC Henriques],"[Sunrisers Hyderabad, Kolkata Knight Riders]",field,Kolkata Knight Riders,"[M Erasmus, C Shamshuddin]",Feroz Shah Kotla,1166,
1167,"[{'1st innings': {'team': 'Gujarat Lions', 'de...",Delhi,[2016-05-27],,Sunrisers Hyderabad,20,[DA Warner],"[Gujarat Lions, Sunrisers Hyderabad]",field,Sunrisers Hyderabad,"[M Erasmus, CK Nandan]",Feroz Shah Kotla,1167,4.0


In [76]:
final_df['info.outcome.by.runs'].isnull().sum()

np.int64(638)

In [77]:
final_df['info.outcome.by.wickets'].isnull().sum()

np.int64(554)

In [78]:
final_df['info.outcome.by.runs'] = final_df['info.outcome.by.runs'].fillna(0)
final_df['info.outcome.by.wickets'] = final_df['info.outcome.by.wickets'].fillna(0)

In [79]:
final_df

Unnamed: 0,innings,info.city,info.dates,info.outcome.by.runs,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.outcome.by.wickets
0,[{'1st innings': {'team': 'Sunrisers Hyderabad...,Hyderabad,[2017-04-05],35.0,Sunrisers Hyderabad,20,[Yuvraj Singh],"[Sunrisers Hyderabad, Royal Challengers Bangal...",field,Royal Challengers Bangalore,"[AY Dandekar, NJ Llong]","Rajiv Gandhi International Stadium, Uppal",0,0.0
1,"[{'1st innings': {'team': 'Mumbai Indians', 'd...",Pune,[2017-04-06],0.0,Rising Pune Supergiant,20,[SPD Smith],"[Rising Pune Supergiant, Mumbai Indians]",field,Rising Pune Supergiant,"[A Nand Kishore, S Ravi]",Maharashtra Cricket Association Stadium,1,7.0
2,"[{'1st innings': {'team': 'Gujarat Lions', 'de...",Rajkot,[2017-04-07],0.0,Kolkata Knight Riders,20,[CA Lynn],"[Gujarat Lions, Kolkata Knight Riders]",field,Kolkata Knight Riders,"[Nitin Menon, CK Nandan]",Saurashtra Cricket Association Stadium,2,10.0
3,[{'1st innings': {'team': 'Rising Pune Supergi...,Indore,[2017-04-08],0.0,Kings XI Punjab,20,[GJ Maxwell],"[Kings XI Punjab, Rising Pune Supergiant]",field,Kings XI Punjab,"[AK Chaudhary, C Shamshuddin]",Holkar Cricket Stadium,3,6.0
4,[{'1st innings': {'team': 'Royal Challengers B...,Bengaluru,[2017-04-08],15.0,Royal Challengers Bangalore,20,[KM Jadhav],"[Royal Challengers Bangalore, Delhi Daredevils]",bat,Royal Challengers Bangalore,"[S Ravi, VK Sharma]",M.Chinnaswamy Stadium,4,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1164,"[{'1st innings': {'team': 'Delhi Daredevils', ...",Raipur,[2016-05-22],0.0,Royal Challengers Bangalore,20,[V Kohli],"[Delhi Daredevils, Royal Challengers Bangalore]",field,Royal Challengers Bangalore,"[A Nand Kishore, BNJ Oxenford]",Shaheed Veer Narayan Singh International Stadium,1164,6.0
1165,"[{'1st innings': {'team': 'Gujarat Lions', 'de...",Bangalore,[2016-05-24],0.0,Royal Challengers Bangalore,20,[AB de Villiers],"[Gujarat Lions, Royal Challengers Bangalore]",field,Royal Challengers Bangalore,"[AK Chaudhary, HDPK Dharmasena]",M Chinnaswamy Stadium,1165,4.0
1166,[{'1st innings': {'team': 'Sunrisers Hyderabad...,Delhi,[2016-05-25],22.0,Sunrisers Hyderabad,20,[MC Henriques],"[Sunrisers Hyderabad, Kolkata Knight Riders]",field,Kolkata Knight Riders,"[M Erasmus, C Shamshuddin]",Feroz Shah Kotla,1166,0.0
1167,"[{'1st innings': {'team': 'Gujarat Lions', 'de...",Delhi,[2016-05-27],0.0,Sunrisers Hyderabad,20,[DA Warner],"[Gujarat Lions, Sunrisers Hyderabad]",field,Sunrisers Hyderabad,"[M Erasmus, CK Nandan]",Feroz Shah Kotla,1167,4.0


In [80]:
final_df.isnull().sum()

innings                     0
info.city                  51
info.dates                  0
info.outcome.by.runs        0
info.outcome.winner        23
info.overs                  0
info.player_of_match        8
info.teams                  0
info.toss.decision          0
info.toss.winner            0
info.umpires                0
info.venue                  0
match_id                    0
info.outcome.by.wickets     0
dtype: int64

In [81]:
final_df['info.city'] = final_df['info.city'].fillna('Unknown')
final_df['info.outcome.winner'] = final_df['info.outcome.winner'].fillna('No Result')
final_df['info.player_of_match'] = final_df['info.player_of_match'].fillna('No Player of match')
print(final_df.isnull().sum())

innings                    0
info.city                  0
info.dates                 0
info.outcome.by.runs       0
info.outcome.winner        0
info.overs                 0
info.player_of_match       0
info.teams                 0
info.toss.decision         0
info.toss.winner           0
info.umpires               0
info.venue                 0
match_id                   0
info.outcome.by.wickets    0
dtype: int64


In [82]:
import pickle
pickle.dump(final_df,open('dataset_level1.pkl','wb'))

In [83]:
# matches = pickle.load(open('dataset_level1.pkl','rb'))
matches.iloc[0]['innings'][0]['1st innings']['deliveries']

[{0.1: {'batsman': 'DA Warner',
   'bowler': 'TS Mills',
   'non_striker': 'S Dhawan',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.2: {'batsman': 'DA Warner',
   'bowler': 'TS Mills',
   'non_striker': 'S Dhawan',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.3: {'batsman': 'DA Warner',
   'bowler': 'TS Mills',
   'non_striker': 'S Dhawan',
   'runs': {'batsman': 4, 'extras': 0, 'total': 4}}},
 {0.4: {'batsman': 'DA Warner',
   'bowler': 'TS Mills',
   'non_striker': 'S Dhawan',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.5: {'batsman': 'DA Warner',
   'bowler': 'TS Mills',
   'extras': {'wides': 2},
   'non_striker': 'S Dhawan',
   'runs': {'batsman': 0, 'extras': 2, 'total': 2}}},
 {0.6: {'batsman': 'S Dhawan',
   'bowler': 'TS Mills',
   'non_striker': 'DA Warner',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.7: {'batsman': 'S Dhawan',
   'bowler': 'TS Mills',
   'extras': {'legbyes': 1},
   'non_striker': 'DA Warner',
   'runs': 

In [84]:
count = 1
delivery_df = pd.DataFrame()
for index, row in matches.iterrows():
    if count in [75,108,150,180,268,360,443,458,584,748,982,1052,1111,1226,1345]:
        count+=1
        continue
    count+=1
    ball_of_match = []
    batsman = []
    bowler = []
    runs = []
    player_of_dismissed = []
    teams = []
    batting_team = []
    match_id = []
    city = []
    venue = []
    for ball in row['innings'][0]['1st innings']['deliveries']:
        for key in ball.keys():
            match_id.append(count)
            batting_team.append(row['innings'][0]['1st innings']['team'])
            teams.append(row['info.teams'])
            ball_of_match.append(key)
            batsman.append(ball[key]['batsman'])
            bowler.append(ball[key]['bowler'])
            runs.append(ball[key]['runs']['total'])
            city.append(row['info.city'])
            venue.append(row['info.venue'])
            try:
                player_of_dismissed.append(ball[key]['wicket']['player_out'])
            except:
                player_of_dismissed.append('0')
    loop_df = pd.DataFrame({
            'match_id':match_id,
            'teams':teams,
            'batting_team':batting_team,
            'ball':ball_of_match,
            'batsman':batsman,
            'bowler':bowler,
            'runs':runs,
            'player_dismissed':player_of_dismissed,
            'city':city,
            'venue':venue
        })
    delivery_df = pd.concat([delivery_df, loop_df], ignore_index=True)

In [85]:
delivery_df

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue
0,2,"[Sunrisers Hyderabad, Royal Challengers Bangal...",Sunrisers Hyderabad,0.1,DA Warner,TS Mills,0,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal"
1,2,"[Sunrisers Hyderabad, Royal Challengers Bangal...",Sunrisers Hyderabad,0.2,DA Warner,TS Mills,0,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal"
2,2,"[Sunrisers Hyderabad, Royal Challengers Bangal...",Sunrisers Hyderabad,0.3,DA Warner,TS Mills,4,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal"
3,2,"[Sunrisers Hyderabad, Royal Challengers Bangal...",Sunrisers Hyderabad,0.4,DA Warner,TS Mills,0,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal"
4,2,"[Sunrisers Hyderabad, Royal Challengers Bangal...",Sunrisers Hyderabad,0.5,DA Warner,TS Mills,2,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal"
...,...,...,...,...,...,...,...,...,...,...
142540,1170,"[Royal Challengers Bangalore, Sunrisers Hydera...",Sunrisers Hyderabad,19.2,BCJ Cutting,SR Watson,6,0,Bangalore,M Chinnaswamy Stadium
142541,1170,"[Royal Challengers Bangalore, Sunrisers Hydera...",Sunrisers Hyderabad,19.3,BCJ Cutting,SR Watson,6,0,Bangalore,M Chinnaswamy Stadium
142542,1170,"[Royal Challengers Bangalore, Sunrisers Hydera...",Sunrisers Hyderabad,19.4,BCJ Cutting,SR Watson,1,0,Bangalore,M Chinnaswamy Stadium
142543,1170,"[Royal Challengers Bangalore, Sunrisers Hydera...",Sunrisers Hyderabad,19.5,B Kumar,SR Watson,1,0,Bangalore,M Chinnaswamy Stadium


In [86]:
def bowl(row):
    for team in row['teams']:
        if team != row['batting_team']:
            return team

In [87]:
delivery_df['bowling_team'] = delivery_df.apply(bowl,axis=1)

In [88]:
delivery_df

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,2,"[Sunrisers Hyderabad, Royal Challengers Bangal...",Sunrisers Hyderabad,0.1,DA Warner,TS Mills,0,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore
1,2,"[Sunrisers Hyderabad, Royal Challengers Bangal...",Sunrisers Hyderabad,0.2,DA Warner,TS Mills,0,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore
2,2,"[Sunrisers Hyderabad, Royal Challengers Bangal...",Sunrisers Hyderabad,0.3,DA Warner,TS Mills,4,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore
3,2,"[Sunrisers Hyderabad, Royal Challengers Bangal...",Sunrisers Hyderabad,0.4,DA Warner,TS Mills,0,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore
4,2,"[Sunrisers Hyderabad, Royal Challengers Bangal...",Sunrisers Hyderabad,0.5,DA Warner,TS Mills,2,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore
...,...,...,...,...,...,...,...,...,...,...,...
142540,1170,"[Royal Challengers Bangalore, Sunrisers Hydera...",Sunrisers Hyderabad,19.2,BCJ Cutting,SR Watson,6,0,Bangalore,M Chinnaswamy Stadium,Royal Challengers Bangalore
142541,1170,"[Royal Challengers Bangalore, Sunrisers Hydera...",Sunrisers Hyderabad,19.3,BCJ Cutting,SR Watson,6,0,Bangalore,M Chinnaswamy Stadium,Royal Challengers Bangalore
142542,1170,"[Royal Challengers Bangalore, Sunrisers Hydera...",Sunrisers Hyderabad,19.4,BCJ Cutting,SR Watson,1,0,Bangalore,M Chinnaswamy Stadium,Royal Challengers Bangalore
142543,1170,"[Royal Challengers Bangalore, Sunrisers Hydera...",Sunrisers Hyderabad,19.5,B Kumar,SR Watson,1,0,Bangalore,M Chinnaswamy Stadium,Royal Challengers Bangalore


In [89]:
delivery_df.drop(columns=['teams'],inplace=True)

In [90]:
delivery_df

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,2,Sunrisers Hyderabad,0.1,DA Warner,TS Mills,0,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore
1,2,Sunrisers Hyderabad,0.2,DA Warner,TS Mills,0,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore
2,2,Sunrisers Hyderabad,0.3,DA Warner,TS Mills,4,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore
3,2,Sunrisers Hyderabad,0.4,DA Warner,TS Mills,0,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore
4,2,Sunrisers Hyderabad,0.5,DA Warner,TS Mills,2,0,Hyderabad,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore
...,...,...,...,...,...,...,...,...,...,...
142540,1170,Sunrisers Hyderabad,19.2,BCJ Cutting,SR Watson,6,0,Bangalore,M Chinnaswamy Stadium,Royal Challengers Bangalore
142541,1170,Sunrisers Hyderabad,19.3,BCJ Cutting,SR Watson,6,0,Bangalore,M Chinnaswamy Stadium,Royal Challengers Bangalore
142542,1170,Sunrisers Hyderabad,19.4,BCJ Cutting,SR Watson,1,0,Bangalore,M Chinnaswamy Stadium,Royal Challengers Bangalore
142543,1170,Sunrisers Hyderabad,19.5,B Kumar,SR Watson,1,0,Bangalore,M Chinnaswamy Stadium,Royal Challengers Bangalore


In [91]:
delivery_df['batting_team'].unique()

array(['Sunrisers Hyderabad', 'Mumbai Indians', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Royal Challengers Bangalore',
       'Kolkata Knight Riders', 'Delhi Daredevils', 'Kings XI Punjab',
       'Rajasthan Royals', 'Chennai Super Kings', 'Delhi Capitals',
       'Punjab Kings', 'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Rising Pune Supergiants'],
      dtype=object)

In [93]:
team_mapping = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Kings XI Punjab': 'Punjab Kings',
    'Royal Challengers Bangalore': 'Royal Challengers Bengaluru',
    'Deccan Chargers': 'Sunrisers Hyderabad'
}
delivery_df['batting_team'] = delivery_df['batting_team'].replace(team_mapping)
print("Updated Team List:")
print(delivery_df['batting_team'].unique())

Updated Team List:
['Sunrisers Hyderabad' 'Mumbai Indians' 'Gujarat Lions'
 'Rising Pune Supergiant' 'Royal Challengers Bengaluru'
 'Kolkata Knight Riders' 'Delhi Capitals' 'Punjab Kings'
 'Rajasthan Royals' 'Chennai Super Kings' 'Lucknow Super Giants'
 'Gujarat Titans' 'Kochi Tuskers Kerala' 'Pune Warriors'
 'Rising Pune Supergiants']


In [94]:
teams=[
    'Sunrisers Hyderabad',
    'Mumbai Indians', 
    'Royal Challengers Bengaluru',
    'Kolkata Knight Riders', 
    'Delhi Capitals',
    'Punjab Kings',
    'Rajasthan Royals', 
    'Chennai Super Kings', 
    'Lucknow Super Giants',
    'Gujarat Titans'  
]

In [95]:

delivery_df = delivery_df[delivery_df['batting_team'].isin(teams)]
delivery_df = delivery_df[delivery_df['bowling_team'].isin(teams)]

In [96]:

delivery_df

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
741,8,Kolkata Knight Riders,0.1,G Gambhir,SL Malinga,1,0,Mumbai,Wankhede Stadium,Mumbai Indians
742,8,Kolkata Knight Riders,0.2,CA Lynn,SL Malinga,1,0,Mumbai,Wankhede Stadium,Mumbai Indians
743,8,Kolkata Knight Riders,0.3,CA Lynn,SL Malinga,1,0,Mumbai,Wankhede Stadium,Mumbai Indians
744,8,Kolkata Knight Riders,0.4,G Gambhir,SL Malinga,0,0,Mumbai,Wankhede Stadium,Mumbai Indians
745,8,Kolkata Knight Riders,0.5,G Gambhir,SL Malinga,1,0,Mumbai,Wankhede Stadium,Mumbai Indians
...,...,...,...,...,...,...,...,...,...,...
142289,1168,Sunrisers Hyderabad,19.2,Bipul Sharma,M Morkel,0,0,Delhi,Feroz Shah Kotla,Kolkata Knight Riders
142290,1168,Sunrisers Hyderabad,19.3,Bipul Sharma,M Morkel,6,0,Delhi,Feroz Shah Kotla,Kolkata Knight Riders
142291,1168,Sunrisers Hyderabad,19.4,Bipul Sharma,M Morkel,1,0,Delhi,Feroz Shah Kotla,Kolkata Knight Riders
142292,1168,Sunrisers Hyderabad,19.5,B Kumar,M Morkel,0,B Kumar,Delhi,Feroz Shah Kotla,Kolkata Knight Riders


In [97]:
output = delivery_df[['match_id','batting_team','bowling_team','ball','runs','player_dismissed','city','venue']]

In [98]:
output

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
741,8,Kolkata Knight Riders,Mumbai Indians,0.1,1,0,Mumbai,Wankhede Stadium
742,8,Kolkata Knight Riders,Mumbai Indians,0.2,1,0,Mumbai,Wankhede Stadium
743,8,Kolkata Knight Riders,Mumbai Indians,0.3,1,0,Mumbai,Wankhede Stadium
744,8,Kolkata Knight Riders,Mumbai Indians,0.4,0,0,Mumbai,Wankhede Stadium
745,8,Kolkata Knight Riders,Mumbai Indians,0.5,1,0,Mumbai,Wankhede Stadium
...,...,...,...,...,...,...,...,...
142289,1168,Sunrisers Hyderabad,Kolkata Knight Riders,19.2,0,0,Delhi,Feroz Shah Kotla
142290,1168,Sunrisers Hyderabad,Kolkata Knight Riders,19.3,6,0,Delhi,Feroz Shah Kotla
142291,1168,Sunrisers Hyderabad,Kolkata Knight Riders,19.4,1,0,Delhi,Feroz Shah Kotla
142292,1168,Sunrisers Hyderabad,Kolkata Knight Riders,19.5,0,B Kumar,Delhi,Feroz Shah Kotla


In [99]:
pickle.dump(output,open('dataset_level2.pkl','wb'))