Use keras model to generate scores

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm
import dateutil
from sklearn.linear_model import LinearRegression

In [2]:
#!pip install python-dateutil

In [3]:
#!ls csv_data/

In [4]:
custom_date_parser = lambda x: datetime.strptime(x, "%Y-%m-%d")

In [5]:
cutoff_start_year = '2011'
cutoff_start_date = datetime.strptime(cutoff_start_year+'-01-01','%Y-%m-%d')

In [6]:
match_summary_df = pd.read_csv('csv_data/match_list.csv',parse_dates=['date'],date_parser=custom_date_parser)
#country_rank_df = pd.read_csv('csv_data/')

In [7]:
recent_match_summary_df = match_summary_df[match_summary_df['date']>=cutoff_start_date]
recent_match_summary_df.shape

(969, 11)

In [8]:
recent_match_summary_df

Unnamed: 0,match_id,date,location,first_innings,second_innings,winner,win_by,win_dif,toss_winner,player_of_match,train_data
408,463150,2011-01-12,Durban,South Africa,India,South Africa,runs,135,South Africa,LL Tsotsobe,True
409,463151,2011-01-15,Johannesburg,India,South Africa,India,runs,1,India,MM Patel,True
410,446962,2011-01-16,Melbourne Cricket Ground,England,Australia,Australia,wickets,6,England,SR Watson,True
411,463152,2011-01-18,Cape Town,South Africa,India,India,wickets,2,South Africa,YK Pathan,True
412,446963,2011-01-21,Hobart,Australia,England,Australia,runs,46,England,SE Marsh,True
...,...,...,...,...,...,...,...,...,...,...,...
1372,1198240,2020-09-16,Manchester,England,Australia,Australia,wickets,3,England,GJ Maxwell,False
1373,1233461,2020-10-30,Rawalpindi Cricket Stadium,Pakistan,Zimbabwe,Pakistan,runs,26,Pakistan,BRM Taylor,False
1374,1233462,2020-11-01,Rawalpindi Cricket Stadium,Zimbabwe,Pakistan,Pakistan,wickets,6,Zimbabwe,Iftikhar Ahmed,False
1375,1223955,2020-11-27,Sydney Cricket Ground,Australia,India,Australia,runs,66,Australia,SPD Smith,False


In [9]:
match_id_list = list(recent_match_summary_df['match_id'])

# Feature Engineering from a single match

In [10]:
team_global_cache = {}
batsman_global_cache = {}
bowler_global_cache = {}

In [11]:
def get_trend(input_df,team_opponent,team_name,target_field):
    input_df.rename(columns={'winner':'winning_team'},inplace=True)

    selected_match_id_list = list(input_df['match_id'])
    match_detail_list = []
    for match_id in selected_match_id_list:
        if input_df[input_df['match_id']==match_id]['train_data'].values[0]==True:
            match_info = pd.read_csv('csv_data/train/'+str(match_id)+'.csv')
        else:
            match_info = pd.read_csv('csv_data/test/'+str(match_id)+'.csv')
        match_detail_list.append(match_info)
    match_detail_df = pd.concat(match_detail_list)
    match_detail_df.fillna('NA',inplace=True)

    match_detail_df = input_df.merge(match_detail_df,how='inner',on='match_id')


    sorted_df = match_detail_df[match_detail_df[team_opponent].isin(team_name)].groupby('match_id').agg({'date': 'min', target_field: 'sum'}).reset_index()
    sorted_df.sort_values('date',inplace=True)

    y=np.array(sorted_df[target_field])
    x=np.array(range(sorted_df.shape[0])).reshape(-1,1)+1
    linear_trend_model = LinearRegression()
    linear_trend_model.fit(x,y)
    next_instance_num = x.shape[0]+1
    
    base = linear_trend_model.intercept_
    trend = linear_trend_model.coef_[0]
    trend_predict = linear_trend_model.predict(np.array([next_instance_num]).reshape(-1,1))[0]
    mean = sorted_df[target_field].mean()
    
    return base,trend,trend_predict,mean


    
    

In [12]:
def find_similar_team(search_country,country_rank_df):
    search_rank = country_rank_df[country_rank_df['country']==search_country]['rank'].values[0]
    if search_rank == 1:
        similar_rank_list = [2]
    elif search_rank == country_rank_df['rank'].max():
        similar_rank_list = [country_rank_df['rank'].max()-1]
    else:
        similar_rank_list = [search_rank-1,search_rank+1]
    return list(country_rank_df[country_rank_df['rank'].isin(similar_rank_list)]['country'])
    

In [13]:
def get_recent_team_performance(match_summary_df,team,match_date,global_cache={}):
    ts = (match_date - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
    match_dt = datetime.utcfromtimestamp(ts)
    
    two_months = dateutil.relativedelta.relativedelta(months=2)
    a_year = dateutil.relativedelta.relativedelta(years=1)
    check_range = match_dt - two_months
    if team in global_cache and\
    global_cache[team]['date']>=check_range and\
    global_cache[team]['date']<=match_dt:
        return global_cache[team]['performance'],global_cache
    else:
        cutoff_date = match_dt-a_year
        win_count = match_summary_df[(match_summary_df['date']>=cutoff_date) & \
                                     (match_summary_df['date']<match_date) &\
                                     (match_summary_df['winner']==team)].shape[0]
        total_win_by_runs = match_summary_df[(match_summary_df['date']>=cutoff_date) &\
                                             (match_summary_df['date']<match_date) &\
                                             (match_summary_df['winner']==team) &\
                                             (match_summary_df['win_by']=='runs')]['win_dif'].sum()
        total_win_by_wickets = match_summary_df[(match_summary_df['date']>=cutoff_date) & \
                                                (match_summary_df['date']<match_date) &\
                                                (match_summary_df['winner']==team) & \
                                                (match_summary_df['win_by']=='wickets')]['win_dif'].sum()
        matches_played = match_summary_df[(match_summary_df['date']>=cutoff_date) &\
                                          (match_summary_df['date']<match_date) &\
                                          ( \
                                           (match_summary_df['first_innings']==team) | \
                                           (match_summary_df['second_innings']==team)\
                                          )].shape[0]
        if matches_played != 0:
            win_ratio = win_count/matches_played
        else:
            win_ratio =0
        
        total_loss_by_runs = match_summary_df[\
                                              (match_summary_df['date']>=cutoff_date) &\
                                              (match_summary_df['date']<match_date) &\
                                              (match_summary_df['second_innings']==team)&\
                                              (match_summary_df['winner']!=team) &\
                                              (match_summary_df['win_by']=='runs')\
                                             ]['win_dif'].sum()
        
        total_loss_by_wickets = match_summary_df[\
                                              (match_summary_df['date']>=cutoff_date) &\
                                              (match_summary_df['date']<match_date) &\
                                              (match_summary_df['first_innings']==team)&\
                                              (match_summary_df['winner']!=team) &\
                                              (match_summary_df['win_by']=='wickets')\
                                             ]['win_dif'].sum()
        effective_win_by_runs = total_win_by_runs-total_loss_by_runs
        effective_win_by_wickets = total_win_by_wickets-total_loss_by_wickets
        performance = {
                'country':team,
                 'win_ratio':win_ratio,
                 'effective_win_by_runs':effective_win_by_runs,
                 'effective_win_by_wickets':effective_win_by_wickets,
                 'matches_played':matches_played,
                 'win_count':win_count
            }
        
        global_cache[team]={
            'date':match_dt,
            'performance':performance
            
        }
        
    return performance,global_cache
        
        
    

In [14]:
def get_recent_batsman_performance(match_summary_df,team,batsman,match_date,global_cache={}):
#     ts = (match_date - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
#     match_dt = datetime.utcfromtimestamp(ts)
    match_dt = match_date
    
    two_months = dateutil.relativedelta.relativedelta(months=2)
    a_year = dateutil.relativedelta.relativedelta(years=1)
    check_range = match_dt - two_months
    if batsman in global_cache and \
    global_cache[batsman]['date']>=check_range and \
    global_cache[batsman]['date']<=match_dt:
        return global_cache[batsman]['performance'],global_cache
    else:
        cutoff_date = match_dt-a_year
        country_games = match_summary_df[(match_summary_df['date']>=cutoff_date) 
                             & (match_summary_df['date']<match_date)
                             & ( (match_summary_df['first_innings']==team)
                                |(match_summary_df['second_innings']==team)
                             )]
        match_id_list = list(country_games['match_id'])
        
        if len(match_id_list) == 0:
            return None, global_cache
        
        match_stat_list = []
        for match_id in match_id_list:
            if country_games[country_games['match_id']==match_id]['train_data'].values[0] == True:
                match_df = pd.read_csv('csv_data/train/'+str(match_id)+'.csv')
            else:
                match_df = pd.read_csv('csv_data/test/'+str(match_id)+'.csv')
            match_stat_list.append(match_df)


        match_stat_df = pd.concat(match_stat_list)
        match_stat_df.fillna('NA',inplace=True)
        
        batsman_df = match_stat_df[match_stat_df['batsman']==batsman]
        if batsman_df.shape[0]==0:
            return None,global_cache
        
        total_runs = batsman_df['scored_runs'].sum()
        run_rate = batsman_df['scored_runs'].sum()/match_stat_df[match_stat_df['batsman']==batsman].shape[0]
        
        #opponent_variability
        #batsman_df.rename(columns={'opponent':'country'},inplace=True)
        opponent_variability = batsman_df['opponent'].nunique()
          
        #matches_played = len(list(batsman_df['match_id'].unique()))
        player_of_the_match = country_games[country_games['player_of_match']==batsman].shape[0]

        #winning contribution(effectiveness)-% of winning score
        country_win_list = list(country_games[country_games['winner']==team]['match_id'])
        winning_match_df = match_stat_df[match_stat_df['match_id'].isin(country_win_list)]
        winning_contribution = winning_match_df[winning_match_df['batsman']==batsman]['scored_runs'].sum()/winning_match_df[winning_match_df['team']==team]['scored_runs'].sum()

        #run_rate_effectiveness
        country_run_rate = winning_match_df[winning_match_df['team']==team]['scored_runs'].sum()/winning_match_df[winning_match_df['team']==team].shape[0]
        batsman_run_rate = winning_match_df[winning_match_df['batsman']==batsman]['scored_runs'].sum()/winning_match_df[winning_match_df['batsman']==batsman].shape[0]

        run_rate_effectiveness = batsman_run_rate/country_run_rate
        
        #batting_std = batsman_df.groupby(['match_id'])['scored_runs'].sum().reset_index()['scored_runs'].std()
        
        #consistency = 1/batting_std if batting_std!=0 else 1
        average_score = batsman_df.groupby(['match_id'])['scored_runs'].sum().reset_index()['scored_runs'].mean()

        performance = {
            'batsman':batsman,
            'country':team,
            'total_runs':total_runs,
            'run_rate':run_rate,
            'average_score':average_score,
            'opponent_variability':opponent_variability,
            #'matches_played':matches_played,
            'player_of_the_match':player_of_the_match,
            'winning_contribution':winning_contribution,
            'run_rate_effectiveness':run_rate_effectiveness,
            #'consistency':consistency
        }
        
        global_cache[batsman]={
            'date':match_dt,
            'performance':performance
        }
        
    return performance,global_cache

In [15]:
def get_recent_bowler_performance(match_summary_df,team,bowler,match_date,global_cache={}):
    ts = (match_date - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
    match_dt = datetime.utcfromtimestamp(ts)
    
    two_months = dateutil.relativedelta.relativedelta(months=2)
    a_year = dateutil.relativedelta.relativedelta(years=1)
    check_range = match_dt - two_months
    if bowler in global_cache and \
    global_cache[bowler]['date']>=check_range and \
    global_cache[bowler]['date']<=match_dt:
        
        return global_cache[bowler]['performance'],global_cache
    else:
        cutoff_date = match_dt-a_year
        country_games = match_summary_df[(match_summary_df['date']>=cutoff_date) 
                             & (match_summary_df['date']<match_date)
                             & ( (match_summary_df['first_innings']==team)
                                |(match_summary_df['second_innings']==team)
                             )]
        match_id_list = list(country_games['match_id'])
        
        if len(match_id_list) == 0:
            return None, global_cache
        
        match_stat_list = []
        for match_id in match_id_list:
            if country_games[country_games['match_id']==match_id]['train_data'].values[0] == True:
                match_df = pd.read_csv('csv_data/train/'+str(match_id)+'.csv')
            else:
                match_df = pd.read_csv('csv_data/test/'+str(match_id)+'.csv')
            match_stat_list.append(match_df)


        match_stat_df = pd.concat(match_stat_list)
        match_stat_df.fillna('NA',inplace=True)
        
        bowler_df = match_stat_df[match_stat_df['bowler']==bowler]
        if bowler_df.shape[0]==0:
            return None,global_cache
        
        total_runs = bowler_df['total'].sum()
        run_rate = total_runs/bowler_df.shape[0]
        negative_rate = -run_rate

        # no_of_wickets,wicket_rate,wicket_per_runs
        no_of_wickets = bowler_df['wicket'].sum()-bowler_df[bowler_df['wicket_type']=='run out'].shape[0]
        wickets_per_match = no_of_wickets/len(list(bowler_df['match_id'].unique()))
        wickets_per_run = no_of_wickets/total_runs


        
        #opponent_variability
        opponent_variability = bowler_df['team'].nunique()

        
        matches_played = len(list(bowler_df['match_id'].unique()))
        #player_of_the_match = country_games[country_games['player_of_match']==selected_bowler].shape[0]

        #winning contribution(effectiveness)-% of wickets taken in winning matches
        country_win_list = list(country_games[country_games['winner']==team]['match_id'])
        winning_match_df = match_stat_df[match_stat_df['match_id'].isin(country_win_list)]

        if winning_match_df['wicket'].sum() !=0:
            winning_contribution = winning_match_df[winning_match_df['bowler']==bowler]['wicket'].sum()/winning_match_df['wicket'].sum()
        else:
            winning_contribution = 0
        
        #winning_wicket_per_run rate contribution
        #winning wicket_per_match contirbution
 
        team_wickets_per_run = winning_match_df[winning_match_df['opponent']==team]['wicket'].sum()/winning_match_df[winning_match_df['opponent']==team]['total'].sum()
        bowler_wicket_per_run = winning_match_df[winning_match_df['bowler']==bowler]['wicket'].sum()/winning_match_df[winning_match_df['bowler']==bowler]['total'].sum()
        winning_wicket_per_run_rate_contribution = bowler_wicket_per_run/team_wickets_per_run

        team_wicket_per_match = winning_match_df[winning_match_df['opponent']==team]['wicket'].sum()/winning_match_df['match_id'].nunique()
        bowler_wicket_per_match = winning_match_df[winning_match_df['bowler']==bowler]['wicket'].sum()/winning_match_df[winning_match_df['bowler']==bowler]['match_id'].nunique()
        winning_wicket_per_match_contribution = bowler_wicket_per_match/team_wicket_per_match
        
        no_of_wins=winning_match_df[winning_match_df['bowler']==bowler]['match_id'].nunique()
        #consistency
        #consistency = 1/match_stat_df[match_stat_df['bowler']==selected_bowler].groupby(['match_id'])['wicket'].sum().reset_index()['wicket'].std()

        performance = {
            'bowler':bowler,
            'country':team,
            'negative_rate':negative_rate,
            'no_of_wickets':no_of_wickets,
            'wickets_per_match':wickets_per_match,
            'wickets_per_run':wickets_per_run,
            'no_of_wins':no_of_wins,
            #'team_score':team_score,
            'opponent_variability':opponent_variability,
            'winning_contribution':winning_contribution,
            'winning_wicket_rate_contribution':winning_wicket_per_match_contribution,
            
            
        }
        
        global_cache[bowler]={
            'date':match_dt,
            'performance':performance
        }
        
    return performance,global_cache

In [16]:
# a_month = dateutil.relativedelta.relativedelta(months=1)
# a_year = dateutil.relativedelta.relativedelta(years=1)
# cutoff_start_date-a_year
#type(cutoff_start_date)
#datetime.utcfromtimestamp(match_date)
# match_date
# ts = (match_date - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
# some_date = datetime.utcfromtimestamp(ts)
# recent_match_summary_df[recent_match_summary_df['date']>some_date]

In [17]:
#selected_match_id = match_id_list[0]
#selected_match_id=463150

In [18]:
performance_year = '2018'
cutoff_start_date = datetime.strptime(performance_year+'-01-01','%Y-%m-%d')
cutoff_end_date = datetime.strptime(performance_year+'-12-31','%Y-%m-%d')

In [19]:
selected_matches = match_summary_df[(match_summary_df['date']>=cutoff_start_date) & (match_summary_df['date']<=cutoff_end_date)]


In [20]:
match_id_list = list(selected_matches['match_id'])

In [21]:

performance_list = []
for selected_match_id in tqdm(match_id_list):
    if selected_matches[selected_matches['match_id']==selected_match_id]['train_data'].values[0]==True:
        selected_match_details = pd.read_csv('csv_data/train/'+str(selected_match_id)+'.csv')
    else:
        selected_match_details = pd.read_csv('csv_data/test/'+str(selected_match_id)+'.csv')
        
    unique_batsman_list = selected_match_details['batsman']
    for batsman in unique_batsman_list:
        if batsman not in batsman_global_cache:
            team = selected_match_details[selected_match_details['batsman']==batsman]['team'].values[0]
            performance,batsman_global_cache=get_recent_batsman_performance(selected_matches,team,batsman,cutoff_end_date,global_cache=batsman_global_cache)
            if performance is not None:
                performance_list.append(performance)
    
        
    

HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))

  batsman_run_rate = winning_match_df[winning_match_df['batsman']==batsman]['scored_runs'].sum()/winning_match_df[winning_match_df['batsman']==batsman].shape[0]
  winning_contribution = winning_match_df[winning_match_df['batsman']==batsman]['scored_runs'].sum()/winning_match_df[winning_match_df['team']==team]['scored_runs'].sum()
  country_run_rate = winning_match_df[winning_match_df['team']==team]['scored_runs'].sum()/winning_match_df[winning_match_df['team']==team].shape[0]





In [22]:
performance_df = pd.DataFrame(performance_list)

In [23]:
batsman_columns = ['total_runs', 'run_rate', 'average_score', 'opponent_variability','player_of_the_match', 'winning_contribution', 'run_rate_effectiveness']



In [24]:
performance_df.fillna(0,inplace=True)


In [25]:
import pickle
pickle.dump(performance_df,open('batsman_performance.pkl','wb'))

In [42]:
performance_matrix = np.array(performance_df[batsman_columns])

In [58]:
mm_scaler = MinMaxScaler()

In [59]:
agg_score=np.sum(mm_scaler.fit_transform(performance_matrix),axis=1)

In [60]:
performance_df['agg_score']=agg_score

In [61]:
performance_df.sort_values('agg_score',ascending=False)

Unnamed: 0,batsman,country,total_runs,run_rate,average_score,opponent_variability,player_of_the_match,winning_contribution,run_rate_effectiveness,agg_score
143,V Kohli,India,970,0.992835,80.833333,3,4,0.187547,1.133280,4.606274
141,RG Sharma,India,1021,0.997070,60.058824,6,2,0.250063,1.134841,4.542342
31,Fakhar Zaman,Pakistan,728,0.962963,48.533333,6,2,0.276699,1.182598,4.203805
173,Rahmat Shah,Afghanistan,708,0.749206,41.647059,8,2,0.215589,1.009740,4.040339
61,BRM Taylor,Zimbabwe,839,0.874870,44.157895,7,1,0.261538,1.097580,4.039284
...,...,...,...,...,...,...,...,...,...,...
30,Azhar Ali,Pakistan,0,0.000000,0.000000,1,0,0.000000,0.000000,0.000000
106,N Pradeep,Sri Lanka,0,0.000000,0.000000,1,0,0.000000,0.000000,0.000000
287,Simandeep Singh,Hong Kong,0,0.000000,0.000000,1,0,0.000000,0.000000,0.000000
285,D Ravu,Papua New Guinea,0,0.000000,0.000000,1,0,0.000000,0.000000,0.000000
