In [1]:
import pandas as pd
import yaml
import glob

### Reading the ODI match data from https://cricsheet.org/, downloaded locally into the `odis` folder

In [2]:
path = r'odis' # use your path
allFiles = glob.glob(path + "/*.yaml")

dfs = []

# Flattened columns that are dropped once team1/team2 have been derived.
columns = ['info.teams','info.overs','info.match_type']
# Cities to keep; compared against the upper-cased city name.
cities = ['HYDERABAD', 'NAGPUR', 'RANCHI', 'MOHALI', 'DELHI']
# Both sides of a match must belong to this set (upper-cased team names).
series_teams = ['INDIA', 'AUSTRALIA']

# Newer pandas exposes json_normalize at the top level and no longer offers it
# under pd.io.json; older releases only have the pd.io.json spelling.
json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

for file_ in allFiles:
    with open(file_, 'r') as f:
        # safe_load only builds plain Python objects, so a downloaded file
        # cannot trigger arbitrary object construction (and, unlike a bare
        # yaml.load(f), it does not need an explicit Loader argument).
        match = yaml.safe_load(f)

    df = json_normalize(match)
    if 'info.city' not in df:
        continue

    # Derive every helper column on the unfiltered frame first, so nothing is
    # ever assigned into a filtered slice (avoids SettingWithCopyWarning).
    df['team1'] = df['info.teams'].apply(lambda teams : teams[0].upper())
    df['team2'] = df['info.teams'].apply(lambda teams : teams[1].upper())
    df['info.city'] = df['info.city'].str.upper()

    keep = (
        df['team1'].isin(series_teams)
        & df['team2'].isin(series_teams)
        & df['info.city'].isin(cities)
        & (df['info.gender'] == 'male')
    )
    # Frames filtered down to zero rows are still appended on purpose: their
    # columns take part in the union built by the concat below.
    dfs.append(df[keep].drop(columns, axis=1))

# sort=True keeps the alphabetically ordered column union that the implicit
# (now deprecated) default used to produce, without the FutureWarning.
frame = pd.concat(dfs, axis = 0, ignore_index = True, sort = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [4]:
# Shape of the combined frame as (rows, columns). Only the final expression of
# a cell is rendered, so a row preview such as frame.head() belongs in a cell
# of its own.
frame.shape

(7, 28)

### Raw DataFrame : Without Cleanup

In [5]:
# Column names, non-null counts and dtypes of the combined frame before cleanup.
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 28 columns):
info.city                      7 non-null object
info.dates                     7 non-null object
info.gender                    7 non-null object
info.neutral_venue             0 non-null float64
info.outcome.by.runs           4 non-null float64
info.outcome.by.wickets        2 non-null float64
info.outcome.method            0 non-null object
info.outcome.result            1 non-null object
info.outcome.winner            6 non-null object
info.player_of_match           6 non-null object
info.supersubs.Australia       0 non-null object
info.supersubs.Bangladesh      0 non-null object
info.supersubs.India           0 non-null object
info.supersubs.New Zealand     0 non-null object
info.supersubs.Pakistan        0 non-null object
info.supersubs.South Africa    0 non-null object
info.supersubs.Sri Lanka       0 non-null object
info.supersubs.West Indies     0 non-null object
info.toss.deci

### Cleaning data : Removing Columns which are not required

In [6]:
# Columns removed before the analysis: the per-team supersubs columns, the
# meta.* fields, info.neutral_venue, info.outcome.method and info.gender.
columns_not_required = [
    'info.supersubs.Australia', 'info.supersubs.Bangladesh', 'info.supersubs.India',
    'info.supersubs.New Zealand', 'info.supersubs.Pakistan', 'info.supersubs.South Africa',
    'info.supersubs.Sri Lanka', 'info.supersubs.West Indies',
    'meta.created', 'meta.data_version', 'meta.revision',
    'info.neutral_venue', 'info.outcome.method', 'info.gender',
]
# errors='ignore' keeps the cell re-runnable: on a second run (or with a data
# folder that never produced some of these columns) the missing labels are
# skipped instead of raising KeyError.
frame = frame.drop(columns=columns_not_required, errors='ignore')


### DataFrame after cleanup

In [7]:
# Column names, non-null counts and dtypes after the unneeded columns were dropped.
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 14 columns):
info.city                  7 non-null object
info.dates                 7 non-null object
info.outcome.by.runs       4 non-null float64
info.outcome.by.wickets    2 non-null float64
info.outcome.result        1 non-null object
info.outcome.winner        6 non-null object
info.player_of_match       6 non-null object
info.toss.decision         7 non-null object
info.toss.winner           7 non-null object
info.umpires               7 non-null object
info.venue                 7 non-null object
innings                    7 non-null object
team1                      7 non-null object
team2                      7 non-null object
dtypes: float64(2), object(12)
memory usage: 864.0+ bytes


### Solution Starts from here

In [8]:
# Display every row of the cleaned frame.
frame

Unnamed: 0,info.city,info.dates,info.outcome.by.runs,info.outcome.by.wickets,info.outcome.result,info.outcome.winner,info.player_of_match,info.toss.decision,info.toss.winner,info.umpires,info.venue,innings,team1,team2
0,HYDERABAD,[2009-11-05],3.0,,,Australia,[SR Tendulkar],bat,Australia,"[EAR de Silva, SK Tarapore]","Rajiv Gandhi International Stadium, Uppal","[{'1st innings': {'team': 'Australia', 'delive...",INDIA,AUSTRALIA
1,NAGPUR,[2009-10-28],99.0,,,India,[MS Dhoni],field,Australia,"[AM Saheba, SK Tarapore]","Vidarbha Cricket Association Stadium, Jamtha","[{'1st innings': {'team': 'India', 'deliveries...",INDIA,AUSTRALIA
2,NAGPUR,[2007-10-14],18.0,,,Australia,[A Symonds],bat,Australia,"[Aleem Dar, AM Saheba]",Vidarbha Cricket Association Ground,"[{'1st innings': {'team': 'Australia', 'delive...",INDIA,AUSTRALIA
3,HYDERABAD,[2007-10-05],47.0,,,Australia,[A Symonds],bat,Australia,"[SA Bucknor, SL Shastri]","Rajiv Gandhi International Stadium, Uppal","[{'1st innings': {'team': 'Australia', 'delive...",INDIA,AUSTRALIA
4,NAGPUR,[2013-10-30],,6.0,,India,[V Kohli],field,India,"[NJ Llong, S Ravi]","Vidarbha Cricket Association Stadium, Jamtha","[{'1st innings': {'team': 'Australia', 'delive...",INDIA,AUSTRALIA
5,RANCHI,[2013-10-23],,,no result,,,field,India,"[RA Kettleborough, VA Kulkarni]",JSCA International Stadium Complex,"[{'1st innings': {'team': 'Australia', 'delive...",INDIA,AUSTRALIA
6,DELHI,[2009-10-31],,6.0,,India,[Yuvraj Singh],bat,Australia,"[SS Hazare, AM Saheba]",Feroz Shah Kotla,"[{'1st innings': {'team': 'Australia', 'delive...",INDIA,AUSTRALIA


### From above DataFrame we can first conclude following
- Hyderabad: two matches were played and Australia won both => **Australia is more likely to win in Hyderabad**
- Nagpur: three matches were played; India won two and Australia won one => **India is more likely to win in Nagpur**
- Delhi: one match was played and India won it => **India is more likely to win in Delhi**
- Ranchi: the only match ended with no result, so it gives no winner to go on => **50-50 chances for India and Australia in Ranchi**
- Mohali: there is no data => **50-50 chances for India and Australia in Mohali**

## Problem Statement\[1.\] Winner of the series: India will be winner of Series

## Problem Statement\[2.\] Series output : India will win 4 matches and Australia will win 1 match

#### For highest run scorer, wicket taker and sixes first getting the exploded data from INNINGS column of dataframe

In [23]:
# Innings whose deliveries are tallied; these are the only two keys the
# analysis reads from each match's innings list.
_REGULAR_INNINGS = ('1st innings', '2nd innings')


def _iter_balls(matches):
    """Yield the per-ball record of every delivery in the regular innings of every match."""
    for match_innings in matches:
        for innings_entry in match_innings:
            for innings_name in _REGULAR_INNINGS:
                if innings_name not in innings_entry:
                    continue
                for delivery in innings_entry[innings_name]['deliveries']:
                    # Each delivery is a one-entry mapping {ball number: record}.
                    for ball in delivery.values():
                        yield ball


def _add_to_tally(tally, positions, name_key, name, increments):
    """Add `increments` to `name`'s row in the parallel lists of `tally`, creating the row on first sight."""
    pos = positions.get(name)
    if pos is None:
        positions[name] = len(tally[name_key])
        tally[name_key].append(name)
        for column, amount in increments.items():
            tally[column].append(amount)
    else:
        for column, amount in increments.items():
            tally[column][pos] += amount


def getBatsmanAndBowler(in_frame, verbose=True):
    """Aggregate per-batsman runs/sixes and per-bowler wickets over all matches.

    Parameters
    ----------
    in_frame : object exposing an ``innings`` attribute (e.g. the match DataFrame)
        Iterating ``in_frame.innings`` must yield, per match, a list of
        one-key dicts such as ``{'1st innings': {'deliveries': [...]}}``.
        Every delivery is a ``{ball: record}`` dict whose record carries
        ``'batsman'``, ``'bowler'``, ``record['runs']['batsman']`` and, when a
        dismissal happened on that ball, a ``'wicket'`` key.
    verbose : bool, default True
        When true, print both result dicts (the historical behaviour).

    Returns
    -------
    tuple(dict, dict)
        ``({'batsman': [...], 'runs': [...], 'sixes': [...]},
        {'bowler': [...], 'wickets': [...]})`` as parallel lists, ordered by
        first appearance of each player.

    Notes
    -----
    * A match that lacks a second innings is tallied with what it has, and an
      empty frame yields empty lists, instead of raising.
    * A six is counted when the batsman's runs on a ball equal 6.
    * NOTE(review): any ball carrying a ``'wicket'`` key is credited to the
      bowler, whatever the kind of dismissal - confirm this is intended.
    """
    innings1_batsman = {'batsman': [], 'runs': [], 'sixes': []}
    innings1_bowler = {'bowler': [], 'wickets': []}
    # name -> row position in the parallel lists; replaces repeated list.index scans.
    batsman_pos = {}
    bowler_pos = {}

    for ball in _iter_balls(in_frame.innings):
        batsman_run = ball['runs']['batsman']
        _add_to_tally(
            innings1_batsman, batsman_pos, 'batsman', ball['batsman'],
            {'runs': batsman_run, 'sixes': 1 if batsman_run == 6 else 0},
        )
        _add_to_tally(
            innings1_bowler, bowler_pos, 'bowler', ball['bowler'],
            {'wickets': 1 if 'wicket' in ball else 0},
        )

    if verbose:
        print(innings1_batsman)
        print(innings1_bowler)
    return (innings1_batsman,innings1_bowler)

#### Creating Dataframe for batsman with runs and sixes, named as batsman_df
#### Creating Dataframe for bowlers with wickets, named as bowler_df

In [24]:
# Tally every delivery once, then turn the two column-oriented dicts
# (batting first, bowling second) into DataFrames.
output = getBatsmanAndBowler(frame)
batting_totals, bowling_totals = output
batsman_df = pd.DataFrame.from_dict(batting_totals)
bowler_df = pd.DataFrame.from_dict(bowling_totals)

{'batsman': ['SR Watson', 'SE Marsh', 'RT Ponting', 'CL White', 'MEK Hussey', 'V Sehwag', 'SR Tendulkar', 'G Gambhir', 'Yuvraj Singh', 'MS Dhoni', 'SK Raina', 'Harbhajan Singh', 'RA Jadeja', 'P Kumar', 'A Nehra', 'MM Patel', 'TD Paine', 'AC Voges', 'MG Johnson', 'NM Hauritz', 'PM Siddle', 'BW Hilfenhaus', 'AC Gilchrist', 'MJ Clarke', 'BJ Hodge', 'A Symonds', 'BJ Haddin', 'JR Hopes', 'B Lee', 'SC Ganguly', 'IK Pathan', 'R Dravid', 'RV Uthappa', 'Z Khan', 'ML Hayden', 'RG Sharma', 'S Sreesanth', 'PJ Hughes', 'AJ Finch', 'GJ Bailey', 'GJ Maxwell', 'S Dhawan', 'V Kohli', 'JP Faulkner', 'CJ McKay', 'MC Henriques'], 'runs': [269, 133, 190, 80, 165, 89, 326, 96, 237, 285, 146, 26, 23, 10, 1, 2, 8, 104, 56, 30, 3, 16, 80, 59, 23, 196, 28, 50, 17, 86, 32, 7, 44, 6, 60, 89, 1, 24, 25, 254, 101, 114, 115, 23, 7, 12], 'sixes': [6, 3, 2, 5, 3, 2, 5, 0, 6, 4, 4, 1, 0, 1, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 1, 0, 1, 2, 2, 0, 2, 0, 0, 3, 0, 0, 0, 9, 5, 0, 1, 0, 0, 0]}
{'bowler': ['P Kumar', 'A Nehra',

## Problem Statement\[3.\] Highest Run Scorer: MS Dhoni (Shown below)
#### Because SR Tendulkar no longer plays

In [25]:
# Top 20 batsmen by total runs, highest first.
batsman_df.sort_values('runs', ascending=False).head(20)

Unnamed: 0,batsman,runs,sixes
6,SR Tendulkar,326,5
9,MS Dhoni,285,4
0,SR Watson,269,6
39,GJ Bailey,254,9
8,Yuvraj Singh,237,6
25,A Symonds,196,9
2,RT Ponting,190,2
4,MEK Hussey,165,3
10,SK Raina,146,4
1,SE Marsh,133,3


## Problem Statement\[4.\] Maximum Sixes: GJ Maxwell (Shown below)
#### Other players who hit more sixes have now retired

In [26]:
# Top 20 batsmen by number of sixes, highest first.
batsman_df.sort_values(by='sixes', ascending=False).head(n=20)

Unnamed: 0,batsman,runs,sixes
25,A Symonds,196,9
39,GJ Bailey,254,9
0,SR Watson,269,6
8,Yuvraj Singh,237,6
3,CL White,80,5
6,SR Tendulkar,326,5
40,GJ Maxwell,101,5
9,MS Dhoni,285,4
10,SK Raina,146,4
4,MEK Hussey,165,3


## Problem Statement\[5.\] Highest Wicket Taker: RA Jadeja (Shown below)
#### MG Johnson has now retired

In [20]:
# Top 20 bowlers by wickets, highest first.
bowler_df.sort_values('wickets', ascending=False).head(20)

Unnamed: 0,bowler,wickets
14,MG Johnson,11
3,RA Jadeja,9
26,GB Hogg,7
17,Z Khan,6
0,P Kumar,4
4,Harbhajan Singh,4
32,R Ashwin,4
31,Mohammed Shami,4
8,CJ McKay,4
9,SR Watson,4


In [73]:
# Inspect the raw record of the second entry in the innings list of the match
# labelled 0 (useful for checking the per-delivery structure).
frame.innings[0][1]

{'2nd innings': {'team': 'India',
  'deliveries': [{0.1: {'batsman': 'V Sehwag',
     'bowler': 'BW Hilfenhaus',
     'non_striker': 'SR Tendulkar',
     'runs': {'batsman': 2, 'extras': 0, 'total': 2}}},
   {0.2: {'batsman': 'V Sehwag',
     'bowler': 'BW Hilfenhaus',
     'non_striker': 'SR Tendulkar',
     'runs': {'batsman': 1, 'extras': 0, 'total': 1}}},
   {0.3: {'batsman': 'SR Tendulkar',
     'bowler': 'BW Hilfenhaus',
     'non_striker': 'V Sehwag',
     'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
   {0.4: {'batsman': 'SR Tendulkar',
     'bowler': 'BW Hilfenhaus',
     'non_striker': 'V Sehwag',
     'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
   {0.5: {'batsman': 'SR Tendulkar',
     'bowler': 'BW Hilfenhaus',
     'non_striker': 'V Sehwag',
     'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
   {0.6: {'batsman': 'SR Tendulkar',
     'bowler': 'BW Hilfenhaus',
     'non_striker': 'V Sehwag',
     'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
   {1.1: {