In [1]:
import pandas as pd
import numpy as np
import plotly
import plotly.graph_objects as go

In [2]:
def rolling_window(all_overs, window=6):
    """Join the names seen across each trailing window of overs.

    For every position ``i`` the collections at positions
    ``i - window + 1 .. i`` are unioned and rendered as one comma-separated
    string.  The first ``window - 1`` positions do not have a full window
    behind them yet and yield ``np.nan`` instead, which is the same warm-up
    pattern ``Series.rolling(window)`` produces.

    Parameters
    ----------
    all_overs : sequence or pandas.Series of iterables of str
        One collection of names per over, in playing order.
    window : int, default 6
        Number of consecutive overs merged into each output entry.

    Returns
    -------
    list
        Same length as ``all_overs``; each item is either ``np.nan`` or the
        unique names of the window, sorted and joined with ``","``.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    # Materialise once: iterating yields the values in order, so positions are
    # never mistaken for index labels when a labelled Series is passed in.
    overs = list(all_overs)

    batsman = []
    for index in range(len(overs)):
        if index < window - 1:
            # Not enough history yet for a full window
            batsman.append(np.nan)
            continue
        names = set()
        for over in overs[index - window + 1:index + 1]:
            names.update(over)
        # Set iteration order is arbitrary; sort so the output is reproducible
        batsman.append(",".join(sorted(names)))
    return batsman

In [3]:
def value_consistency(col, existing_values, unik_value, data):
    """Collapse several spellings found in one column to a single value.

    Every row of ``data`` whose ``col`` entry is listed in ``existing_values``
    is overwritten with ``unik_value``.  ``data`` is modified in place and
    nothing is returned.
    """
    # Boolean row selector: True where the current entry is a known variant
    needs_rewrite = data[col].isin(existing_values)
    data.loc[needs_rewrite, col] = unik_value


In [4]:
def clean_df(df_raw):
    """Normalise venue and team spellings of a raw match frame, in place.

    Uses ``value_consistency`` to rewrite known spelling variants in the
    ``venue``, ``batting_team`` and ``bowling_team`` columns to one canonical
    value each, and converts the ``season`` column to strings.  ``df_raw`` is
    mutated; nothing is returned.
    """
    # (spelling variants, canonical spelling) for each ground
    venue_aliases = [
        (['M Chinnaswamy Stadium', 'M.Chinnaswamy Stadium'],
         'M Chinnaswamy Stadium'),
        (['MA Chidambaram Stadium', 'MA Chidambaram Stadium, Chepauk',
          'MA Chidambaram Stadium, Chepauk, Chennai'],
         'MA Chidambaram Stadium, Chepauk'),
        (['Punjab Cricket Association IS Bindra Stadium',
          'Punjab Cricket Association IS Bindra Stadium, Mohali',
          'Punjab Cricket Association Stadium, Mohali'],
         'Punjab Cricket Association Stadium, Mohali'),
        (['Rajiv Gandhi International Stadium',
          'Rajiv Gandhi International Stadium, Uppal'],
         'Rajiv Gandhi International Stadium, Uppal'),
        (['Wankhede Stadium', 'Wankhede Stadium, Mumbai'],
         'Wankhede Stadium'),
    ]
    for variants, canonical in venue_aliases:
        value_consistency('venue', variants, canonical, df_raw)

    df_raw.season = df_raw.season.map(str)

    # The same franchise aliases apply to both team columns
    team_aliases = [
        (['Delhi Capitals', 'Delhi Daredevils'], 'Delhi Capitals'),
        (['Pune Warriors', 'Rising Pune Supergiant', 'Rising Pune Supergiants'],
         'Rising Pune Supergiant'),
    ]
    for team_col in ('batting_team', 'bowling_team'):
        for variants, canonical in team_aliases:
            value_consistency(team_col, variants, canonical, df_raw)
    
    

In [5]:
def _innings_features(df, innings_no, feats):
    """Build the per-over rolling feature table for one innings of ``df``."""
    per_over = df[df['innings'] == innings_no].groupby(feats)

    # Several columns must be selected with a list; the bare-tuple form
    # (groupby(...)['a', 'b']) was removed in pandas 2.0.
    batsmen_per_over = per_over[['striker', 'non_striker']].apply(lambda x: set(np.unique(x)))
    bowlers_per_over = per_over['bowler'].apply(lambda x: set(np.unique(x)))
    # Runs scored over the current and the five preceding overs
    runs = per_over['score'].sum().rolling(6).sum()

    # Plain lists are handed to rolling_window so it walks the overs by
    # position, independent of the (integer-led) group index labels.
    return pd.DataFrame({
        'batsmen': rolling_window(list(batsmen_per_over)),
        'bowlers': rolling_window(list(bowlers_per_over)),
        'runs': runs}).reset_index()


def transformFile(filename):
    """Convert one ball-by-ball match file into rolling per-over features.

    Reads ``dataset/individual_match/<filename>``, normalises it with
    ``clean_df``, derives ``score`` (``runs_off_bat + extras``) and ``over``
    (integer part of ``ball``), and for innings 1 and 2 separately groups the
    deliveries per over to produce the rolling batsmen, bowlers and run totals.
    The two innings tables are stacked, written tab-separated to
    ``dataset/individual_match_mod/mod_<filename>`` (missing values as
    ``"NaN"``) and returned.

    Parameters
    ----------
    filename : str
        File name (not a full path) inside ``dataset/individual_match/``.

    Returns
    -------
    pandas.DataFrame
        One row per (innings, over) with the grouping columns plus
        ``batsmen``, ``bowlers`` and ``runs``.
    """
    df_raw = pd.read_csv('dataset/individual_match/'+filename)

    # Clean
    clean_df(df_raw)

    df = df_raw[['match_id', 'venue', 'innings', 'batting_team', 'bowling_team', 'ball', 'striker', 'non_striker',
         'bowler', 'runs_off_bat', 'extras']].copy()

    # Score
    df['score'] = df['runs_off_bat'] + df['extras']
    # Over
    df['over'] = df['ball'].map(np.floor).round(0).astype('int')

    feats = ['match_id','over','venue', 'innings', 'batting_team', 'bowling_team']

    fd_inn1 = _innings_features(df, 1, feats)
    fd_inn2 = _innings_features(df, 2, feats)

    # DataFrame.append was removed in pandas 2.0; concat is the supported form
    fd_all = pd.concat([fd_inn1, fd_inn2], ignore_index=True)
    fd_all.to_csv('dataset/individual_match_mod/mod_'+filename, sep='\t', index=False, na_rep="NaN")
    return fd_all
    
    

## Actual Transformation

In [7]:
# `assert False` makes this cell fail immediately, so a "Run All" does not
# regenerate (and overwrite) every file under dataset/individual_match_mod.
# Presumably a deliberate guard: remove the line to re-run the batch job.
assert False, "guard: remove this line to re-run the per-match transformation"
import os
input_files = os.listdir('dataset/individual_match')
count = 0
for fileName in input_files:
    try:
        fo = transformFile(fileName)
        # Read the written file back to confirm it parses
        f_reload = pd.read_csv('dataset/individual_match_mod/mod_'+fileName, delimiter='\t')
    except Exception as e:
        # Bad files surface as ValueError / KeyError / OSError rather than
        # RuntimeError, so catch broadly; otherwise one bad file aborts the
        # loop and the tally below never counts anything.
        print(fileName, ":", repr(e))
        count += 1
print("Total files failed ", count)

Total files failed  0


### Merge all into One

In [8]:
# `assert False` makes this cell fail immediately, so a "Run All" does not
# rebuild the merged frame. Presumably a deliberate guard: remove the line
# to re-run the merge.
assert False, "guard: remove this line to rebuild the merged frame"
import os

# os.listdir returns names in arbitrary order; sort so the row order of the
# merged frame (and of the file later written from it) is reproducible.
mod_files = sorted(os.listdir('dataset/individual_match_mod'))
li = []
for filename in mod_files:
    df = pd.read_csv('dataset/individual_match_mod/'+filename, delimiter='\t', index_col=None, header=0)
    li.append(df)

# Stack every per-match table into one frame with a fresh 0..n-1 index
frame = pd.concat(li, axis=0, ignore_index=True)
print(frame.shape)

(31481, 9)


In [9]:
# Persist the merged frame. Despite the .csv extension the file is
# tab-separated, and missing values are written as the literal "NaN".
# NOTE: `frame` is only defined by the merge cell above, which must have run first.
frame.to_csv('dataset/mod_all_matches.csv', sep='\t', index=False, na_rep="NaN")

## Validation

In [10]:
# Imported here as well: the earlier `import os` lines sit behind
# `assert False` guards, so this cell cannot rely on them having run.
import os

# Sanity check on the transformed files: count how many have at least 10
# missing `runs` values and how many have fewer. The 6-over rolling sum leaves
# up to 5 NaN rows at the start of each of the two innings.
# Sorted so the printed order does not depend on directory listing order.
mod_files = sorted(os.listdir('dataset/individual_match_mod'))
no_of_10 = 0
less_than_10 = 0
for filename in mod_files:
    df = pd.read_csv('dataset/individual_match_mod/'+filename, delimiter='\t', index_col=None, header=0)
    missing_runs = df['runs'].isnull().sum()
    if missing_runs < 10:
        less_than_10 += 1
        # Show which match falls short and by how much
        print(df['match_id'].unique(), missing_runs)
    else:
        no_of_10 += 1

print(no_of_10)
print(less_than_10)

[829763.] 5
[501265.] 5
[1178424] 9
[829813] 7
815
4
