### Import Libraries

In [5]:
import pandas as pd
import numpy as np
from collections import Counter
import os.path

### Loading data and previously calculated stats

In [6]:
# Load the raw match-level and ball-by-ball tables from the notebook's parent directory.
# os.path.join / os.pardir are used instead of literal "\\" separators so the
# paths resolve on any operating system, not just Windows.
match_data = pd.read_csv(os.path.join(os.path.abspath(''), os.pardir, "matches.csv"))
delivery_data = pd.read_csv(os.path.join(os.path.abspath(''), os.pardir, "deliveries.csv"))

In [322]:
# Show the first five matches as a quick sanity check of the loaded table.
match_data.head(5)

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,2,2017,Pune,2017-04-06,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,
2,3,2017,Rajkot,2017-04-07,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,
3,4,2017,Indore,2017-04-08,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,
4,5,2017,Bangalore,2017-04-08,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,,


In [7]:
# Load the previously derived per-player running averages.
# Paths are assembled with os.path.join so they work on every OS (the directory
# name 'Derived stats' contains a space and is kept verbatim).
# NOTE(review): unpickling can execute arbitrary code - only load files that
# were produced by this project.
df_avg_batsmen = pd.read_pickle(
    os.path.join(os.path.abspath(''), os.pardir, 'Derived stats', 'batsmen_average_data.pkl'))
df_avg_bowler = pd.read_pickle(
    os.path.join(os.path.abspath(''), os.pardir, 'Derived stats', 'bowlers_average_data.pkl'))

In [8]:
def is_nan(x):
    """Tell whether ``x`` is a NaN value.

    Returns ``True`` immediately when ``x`` is the ``np.nan`` singleton;
    otherwise returns the result of ``x != x``, which is true only for
    values that do not compare equal to themselves (i.e. NaN).
    """
    if x is np.nan:
        return True
    return x != x

### Calculating Match Stats

In [13]:
# Column layout of one innings-level feature row:
#   8 match descriptors, then an (average, current form) pair for each of the
#   11 batting slots, then 5 opposition bowler economies, then the innings total.
# The spelling 'Innnings' is intentional here: it is the column name used at runtime.
match_stats_columns = (
    ['Match ID', 'Date', 'Team Name', 'Opp Team', 'Innnings', 'City', 'Past lead', 'Target']
    + ['{} Bat{}'.format(kind, slot) for slot in range(1, 12) for kind in ('Avg', 'Curr')]
    + ['Bowl{}'.format(slot) for slot in range(1, 6)]
    + ['Runs']
)

# Function to get the latest average economy of a bowler before a given date
def get_bowler_economy(name, date):
    """Return the bowler's most recent 'Average economy' recorded strictly before ``date``.

    Parameters
    ----------
    name : value compared against the 'Name' column of ``df_avg_bowler``.
    date : value compared against the 'Date' column; only rows with
        ``Date < date`` are considered.

    Returns
    -------
    The 'Average economy' of the chronologically last matching row, or ``0``
    when the bowler has no record before ``date``.
    """
    bowler_past = df_avg_bowler[(df_avg_bowler['Name'] == name) & (df_avg_bowler['Date'] < date)]
    if bowler_past.empty:
        return 0
    # Sort by date before taking the last entry so the result does not depend on
    # the row order of the stored table. A stable sort keeps the stored order
    # for rows that share a date.
    latest_first_to_last = bowler_past.sort_values('Date', kind='mergesort')
    return latest_first_to_last['Average economy'].values[-1]

def get_total_bowls(over_list):
    """Count the balls represented by a sequence of per-delivery over numbers.

    Every over before the highest-numbered one is counted as six balls; for the
    highest-numbered over, each of its entries in ``over_list`` counts as one ball.
    The formula assumes overs are numbered starting at 1.

    Parameters
    ----------
    over_list : iterable of over numbers, one entry per delivery (list, NumPy
        array, or any other iterable).

    Returns
    -------
    The ball count, or ``0`` when ``over_list`` is empty.
    """
    # Materialise once: supports one-shot iterators and avoids the ambiguous
    # truth value of NumPy arrays in the emptiness check below.
    overs = list(over_list)
    if not overs:
        return 0
    last_over = max(overs)
    return (last_over - 1) * 6 + overs.count(last_over)

def _distinct_batting_averages(batsman, match_date):
    """Return the batsman's batting averages up to ``match_date``, oldest first,
    with consecutive repeats collapsed; an empty list when there is no history."""
    history = df_avg_batsmen[(df_avg_batsmen['Name'] == batsman) & (df_avg_batsmen['Date'] <= match_date)]
    # NOTE(review): this filter is inclusive ('<=') while get_bowler_economy uses a
    # strict '<' - TODO confirm that records dated on the match day do not already
    # contain the outcome of this match.
    averages = history.sort_values('Date')['Batting Average'].values  # read once, not per element
    return [avg for pos, avg in enumerate(averages) if pos == 0 or avg != averages[pos - 1]]


def _pad_batting_order(values):
    """Pad ``values`` in place to 11 slots: 10 for missing slots up to the 7th, 0 for the rest."""
    values.extend([10] * (7 - len(values)))
    values.extend([0] * (11 - len(values)))
    return values


def _best_opposition_economies(bowlers, match_date):
    """Return the five lowest non-zero economies of the given bowlers, padded with 8."""
    economies = sorted(get_bowler_economy(bowler, match_date) for bowler in set(bowlers))
    # get_bowler_economy returns 0 for bowlers without history; drop those.
    economies = [economy for economy in economies if economy != 0][:5]
    economies.extend([8] * (5 - len(economies)))
    return economies


def get_innings_stats(match_id, match_date, team_name1, team_name2, innings, city, past_lead, target, total_match_score, match_stats):
    """Build the feature row for one innings of a match.

    Parameters
    ----------
    match_id, match_date, team_name1, team_name2, innings, city, past_lead, target :
        copied unchanged into the first eight fields of the row
        (``team_name1`` -> 'Team Name', ``team_name2`` -> 'Opp Team').
    total_match_score : copied into the final 'Runs' field.
    match_stats : DataFrame of the innings' deliveries; its 'batsman' and
        'bowler' columns are read.

    Returns
    -------
    pandas.Series indexed by ``match_stats_columns``. For each batsman, in order
    of first appearance, it holds the latest distinct batting average and a
    "current form" value. Batsmen without history get 10 for both, and the
    batting order is padded to 11 slots. Five opposition bowler economies follow.

    Current form equals the latest average when at most five distinct averages
    exist. Otherwise, with ``n`` distinct averages ``a``, it is
    ``(a[-1] * n - a[-6] * (n - 5)) / 5``.
    """
    # Batsmen in order of first appearance (dict.fromkeys keeps insertion order).
    in_batsmen = list(dict.fromkeys(match_stats['batsman'].values))
    in_bat_avg = []
    in_bat_curr_form = []
    for batsman in in_batsmen:
        bat_avg_list = _distinct_batting_averages(batsman, match_date)
        if not bat_avg_list:
            # No record for this batsman: impute 10 for both features.
            in_bat_avg.append(10)
            in_bat_curr_form.append(10)
            continue

        in_bat_avg.append(bat_avg_list[-1])
        if len(bat_avg_list) <= 5:
            in_bat_curr_form.append(bat_avg_list[-1])
            continue

        total_runs_till_now = bat_avg_list[-1] * len(bat_avg_list)
        total_runs_except_5 = bat_avg_list[-6] * (len(bat_avg_list) - 5)
        curr_avg = (total_runs_till_now - total_runs_except_5) / 5
        in_bat_curr_form.append(curr_avg)
        if curr_avg < 0:
            # Diagnostic output for a negative form value.
            print(batsman, match_date)
            print(bat_avg_list)
            print("\n"*3)

    _pad_batting_order(in_bat_avg)
    _pad_batting_order(in_bat_curr_form)

    sorted_stats = _best_opposition_economies(match_stats['bowler'].values, match_date)

    # Interleave (average, form) for the first 11 batting slots.
    batting_features = [
        value
        for pair in zip(in_bat_avg[:11], in_bat_curr_form[:11])
        for value in pair
    ]
    row = (
        [match_id, match_date, team_name1, team_name2, innings, city, past_lead, target]
        + batting_features
        + sorted_stats
        + [total_match_score]
    )
    return pd.Series(row, index=match_stats_columns)

# Get the innings-level feature rows of one match
def get_match_stats(match_id, all_innings=True):
    """Build the feature rows for a match.

    Parameters
    ----------
    match_id : value looked up in ``match_data['id']`` and
        ``delivery_data['match_id']``.
    all_innings : when true (default) a row is produced for both innings,
        otherwise only for the first innings.

    Returns
    -------
    list of the objects returned by ``get_innings_stats`` (one or two entries).
    The first-innings row gets target 0; the second-innings row gets the
    first-innings total as its target.

    Raises
    ------
    IndexError : ``match_id`` is not present in ``match_data``.
    ValueError : the match has no deliveries for innings 1 or innings 2.

    "Past lead" is 1 for the team that won the most recent earlier meeting of
    the two sides and 0 otherwise. If that meeting had no winner among the two
    teams (e.g. a missing winner), neither side gets the lead.
    """
    match_row = match_data[match_data["id"] == match_id]
    match_date = match_row['date'].values[0]
    city = match_row['city'].values[0]

    match_deliveries = delivery_data[delivery_data['match_id'] == match_id]
    match_data1 = match_deliveries[match_deliveries["inning"] == 1]
    match_data2 = match_deliveries[match_deliveries["inning"] == 2]
    if match_data1.empty or match_data2.empty:
        # Both batting teams are read from the deliveries, so both innings are required.
        raise ValueError(
            "match {} has no deliveries recorded for innings 1 and/or innings 2".format(match_id))

    total_match_score1 = match_data1['total_runs'].sum()
    team_name1 = match_data1['batting_team'].values[0]
    total_match_score2 = match_data2['total_runs'].sum()
    team_name2 = match_data2['batting_team'].values[0]

    # Earlier meetings of the same two teams, in either team1/team2 order.
    same_fixture = (
        ((match_data['team1'] == team_name1) & (match_data['team2'] == team_name2))
        | ((match_data['team1'] == team_name2) & (match_data['team2'] == team_name1))
    )
    earlier_meetings = match_data[same_fixture & (match_data['date'] < match_date)].sort_values('date')

    team_lead1 = 0
    team_lead2 = 0
    if len(earlier_meetings):
        last_winner = earlier_meetings['winner'].values[-1]
        if last_winner == team_name1:
            team_lead1 = 1
        elif last_winner == team_name2:
            # Checked explicitly so a meeting without a winner does not hand the lead to team 2.
            team_lead2 = 1

    return_rows = [
        get_innings_stats(match_id, match_date, team_name1, team_name2, 1, city, team_lead1, 0, total_match_score1, match_data1)
    ]
    if all_innings:
        return_rows.append(
            get_innings_stats(match_id, match_date, team_name2, team_name1, 2, city, team_lead2, total_match_score1, total_match_score2, match_data2))

    return return_rows

In [14]:
def _collect_match_stats(matches):
    """Return one frame holding the innings rows of every match in ``matches``.

    Rows are gathered in a list and the frame is built once: this avoids
    ``DataFrame.append`` (removed in pandas 2.0) and the quadratic cost of
    growing a frame row by row.
    """
    rows = []
    for match_id in matches['id']:
        rows.extend(get_match_stats(match_id, True))
    return pd.DataFrame(rows, columns=match_stats_columns)


# Seasons 2010-2016 form the training set; season 2017 is held out as the test set.
match_stats_train = _collect_match_stats(
    match_data[(match_data['season'] > 2009) & (match_data['season'] <= 2016)])
match_stats_test = _collect_match_stats(match_data[match_data['season'] == 2017])

### Save calculated stats

In [422]:
# Persist both feature tables as pickles in the current working directory
# (test set first, then training set).
for stats_frame, pickle_name in (
    (match_stats_test, 'match_stats_test.pkl'),
    (match_stats_train, 'match_stats_train.pkl'),
):
    stats_frame.to_pickle(pickle_name)