In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm
import dateutil
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics.pairwise import cosine_similarity
import json
import pickle

from keras.models import model_from_json


In [2]:
def mape(y_true,y_predict):
    """Return the mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values. Must not contain zeros, because every error is
        divided by the matching observed value.
    y_predict : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        ``sum(|y_true - y_predict| / |y_true| * 100) / len(y_true)``.
    """
    # Coerce to float arrays so plain Python lists work as well as ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_predict = np.asarray(y_predict, dtype=float)
    # Divide by |y_true|: with a signed denominator, negative targets would
    # contribute negative percentages and cancel out real errors.
    return np.sum((np.abs(y_true-y_predict)/np.abs(y_true))*100)/len(y_true)

In [3]:
def load_model(model_name):
    """Rebuild a Keras model from ``<model_name>.json`` and ``<model_name>.h5``.

    Parameters
    ----------
    model_name : str
        Path prefix shared by the architecture file (``.json``) and the
        weights file (``.h5``).

    Returns
    -------
    The model created by ``model_from_json`` with the saved weights loaded.
    """
    # load json and create model; the context manager closes the file even
    # if reading raises.
    with open(model_name+'.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(model_name+".h5")
    return loaded_model

In [4]:
def custom_date_parser(x):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime``."""
    return datetime.strptime(x, "%Y-%m-%d")

In [5]:
# Inclusive lower bound for match dates: 1 January of the cutoff year.
cutoff_start_year = '2013'
cutoff_start_date = datetime.strptime('{}-01-01'.format(cutoff_start_year), '%Y-%m-%d')

In [6]:
# Read the match list, parsing the 'date' column with the notebook's YYYY-MM-DD parser.
# NOTE(review): `date_parser` is deprecated in pandas >= 2.0 in favour of `date_format`;
# confirm the pandas version this notebook is pinned to before upgrading.
match_summary_df = pd.read_csv(
    'csv_data/match_list.csv',
    parse_dates=['date'],
    date_parser=custom_date_parser,
)
# Keep only matches played on or after the cutoff date.
recent_match_summary_df = match_summary_df.loc[match_summary_df['date'] >= cutoff_start_date]
recent_match_summary_df.shape

(781, 11)

In [7]:
# Match statistics table; it is merged onto the match list on 'match_id' in a later cell.
match_stats_df = pd.read_csv('csv_data/match_stats.csv')

In [8]:
# Rebuild the frame from `match_summary_df` instead of from its own previous value,
# so re-running this cell never merges `match_stats_df` a second time.
recent_match_summary_df = (
    match_summary_df[match_summary_df['date']>=cutoff_start_date]
    .merge(match_stats_df,on='match_id',how='inner')
)
# Keep only the rows whose 'team_statistics' value is the 'first_innings' team.
recent_match_summary_df=recent_match_summary_df[recent_match_summary_df['first_innings']==recent_match_summary_df['team_statistics']]


In [9]:
# List the columns available after the merge.
recent_match_summary_df.columns

Index(['match_id', 'date', 'location', 'first_innings', 'second_innings',
       'winner', 'win_by', 'win_dif', 'toss_winner', 'player_of_match',
       'train_data', 'team_statistics', 'batsman_1', 'batsman_1_runs',
       'batsman_2', 'batsman_2_runs', 'batsman_3', 'batsman_3_runs',
       'batsman_4', 'batsman_4_runs', 'batsman_5', 'batsman_5_runs',
       'batsman_6', 'batsman_6_runs', 'batsman_7', 'batsman_7_runs',
       'batsman_8', 'batsman_8_runs', 'batsman_9', 'batsman_9_runs',
       'batsman_10', 'batsman_10_runs', 'batsman_11', 'batsman_11_runs',
       'bowler_1', 'bowler_1_wickets', 'bowler_2', 'bowler_2_wickets',
       'bowler_3', 'bowler_3_wickets', 'bowler_4', 'bowler_4_wickets',
       'bowler_5', 'bowler_5_wickets', 'bowler_6', 'bowler_6_wickets',
       'bowler_7', 'bowler_7_wickets', 'bowler_8', 'bowler_8_wickets',
       'bowler_9', 'bowler_9_wickets', 'bowler_10', 'bowler_10_wickets',
       'bowler_11', 'bowler_11_wickets', 'total_run', 'total_wickets'],
     

In [10]:
# Lookup tables used by the encoding loops: keyed by country name, by
# '<team> <batsman>' string, and by location respectively.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
# `with` closes each file handle, which pickle.load(open(...)) never did.
with open('country_enc_map.pkl','rb') as pkl_file:
    country_enc_map = pickle.load(pkl_file)
with open('batsman_enc_map.pkl','rb') as pkl_file:
    batsman_enc_map = pickle.load(pkl_file)
with open('loc_enc_map_for_batsman.pkl','rb') as pkl_file:
    loc_enc_map_for_batsman = pickle.load(pkl_file)

In [11]:
def get_oh_pos(pos, n_positions=11):
    """Return a one-hot integer vector for a 1-based position.

    Parameters
    ----------
    pos : int
        Position to mark, counted from 1.
    n_positions : int, optional
        Length of the returned vector (default 11).

    Returns
    -------
    numpy.ndarray
        Integer vector of length ``n_positions`` with a single 1 at index
        ``pos - 1``.

    Raises
    ------
    IndexError
        If ``pos`` is outside ``1..n_positions``. Without this check,
        ``pos <= 0`` would wrap around through negative indexing and mark
        the wrong slot silently.
    """
    if not 1 <= pos <= n_positions:
        raise IndexError(
            'position {} is outside the valid range 1..{}'.format(pos, n_positions)
        )
    vec = np.zeros(n_positions, dtype=int)
    vec[pos-1] = 1
    return vec

In [12]:
# Load the saved batsman encoder (batsman_group_encode_model.json / .h5);
# the loops below call its predict() once per match.
batsman_group_encode_model = load_model('batsman_group_encode_model')

# Linear Regression with mean/sum embedding

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [13]:
# Preview the first rows of the merged frame.
recent_match_summary_df.head()

Unnamed: 0,match_id,date,location,first_innings,second_innings,winner,win_by,win_dif,toss_winner,player_of_match,...,bowler_8,bowler_8_wickets,bowler_9,bowler_9_wickets,bowler_10,bowler_10_wickets,bowler_11,bowler_11_wickets,total_run,total_wickets
0,589309,2013-01-03,Kolkata,Pakistan,India,Pakistan,runs,85,India,Nasir Jamshed,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,250,10
2,589310,2013-01-06,Delhi,India,Pakistan,India,runs,10,India,MS Dhoni,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,167,10
4,565812,2013-01-11,Rajkot,England,India,England,runs,9,England,JC Tredwell,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,325,9
6,573014,2013-01-11,Melbourne Cricket Ground,Australia,Sri Lanka,Australia,runs,107,Australia,PJ Hughes,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,305,10
8,573015,2013-01-13,Adelaide Oval,Australia,Sri Lanka,Sri Lanka,wickets,8,Sri Lanka,HDRL Thirimanne,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,170,2


In [14]:
#recent_match_summary_df.iloc[recent_match_summary_df.shape[0]-1]

In [178]:
# Experiment 1: one feature vector per match, obtained by summing the
# batsman encoder's output over the batsmen listed for the first innings.
encoding_train_list, encoding_test_list = [], []
target_train_list, target_test_list = [], []

no_of_rows = recent_match_summary_df.shape[0]
for pos in tqdm(range(no_of_rows)):
    match_details = recent_match_summary_df.iloc[pos]
    location = match_details['location']
    team = match_details['first_innings']
    opposition = match_details['second_innings']
    total_run = match_details['total_run']
    is_train = match_details['train_data']

    # Location and opposition encodings are identical for every batsman of the match.
    loc_oh = loc_enc_map_for_batsman[location]
    opposition_oh = country_enc_map[opposition]

    batsman_oh_list = []
    position_oh_list = []
    loc_oh_list = []
    opposition_oh_list = []
    for batting_pos in range(1, 12):
        batsman = match_details['batsman_{}'.format(batting_pos)]
        # Stop at the first 'not_batted' slot; later slots are ignored.
        if batsman == 'not_batted':
            break
        # Batsman keys are '<team> <batsman>' with surrounding whitespace stripped.
        batsman_oh = batsman_enc_map[team.strip()+' '+batsman.strip()]
        position_oh = get_oh_pos(batting_pos)

        batsman_oh_list.append(batsman_oh)
        position_oh_list.append(position_oh)
        loc_oh_list.append(loc_oh)
        opposition_oh_list.append(opposition_oh)

    # One row per batsman for each of the four encoder inputs.
    batsman_mat = np.stack(batsman_oh_list)
    position_mat = np.stack(position_oh_list)
    loc_mat = np.stack(loc_oh_list)
    opposition_mat = np.stack(opposition_oh_list)

    batsman_group_enc_mat = batsman_group_encode_model.predict([batsman_mat,position_mat,loc_mat,opposition_mat])
    # Despite the name, this is the column-wise SUM of the per-batsman encodings.
    batsman_mean = batsman_group_enc_mat.sum(axis=0)

    # Route the sample to the train or test split using the 'train_data' flag.
    if is_train:
        encoding_bucket, target_bucket = encoding_train_list, target_train_list
    else:
        encoding_bucket, target_bucket = encoding_test_list, target_test_list
    encoding_bucket.append(batsman_mean)
    target_bucket.append(total_run)
    
    
    

HBox(children=(FloatProgress(value=0.0, max=781.0), HTML(value='')))




In [179]:
# Stack the per-match vectors and targets into arrays.
encoding_mat_train = np.stack(encoding_train_list)
encoding_mat_test = np.stack(encoding_test_list)
train_y = np.stack(target_train_list)
test_y = np.stack(target_test_list)

# Standardise with statistics fitted on the training split only.
scaler = StandardScaler()
scaler.fit(encoding_mat_train)
train_x = scaler.transform(encoding_mat_train)
test_x = scaler.transform(encoding_mat_test)

In [180]:
# Fit ordinary least squares on the UNSCALED embeddings (the scaled variant is
# left commented out), so predictions from `lr` must also use the unscaled matrices.
lr = LinearRegression()
#lr.fit(train_x,train_y)
lr.fit(encoding_mat_train,train_y)

LinearRegression()

In [181]:
# Predict from the raw (unscaled) embeddings, matching how `lr` was fitted;
# the scaled variant is kept commented out.
#train_y_predict = lr.predict(train_x)
#test_y_predict = lr.predict(test_x)

train_y_predict = lr.predict(encoding_mat_train)
test_y_predict = lr.predict(encoding_mat_test)

In [182]:
# Train and test MAPE (percent) of the current predictions.
mape(train_y,train_y_predict),mape(test_y,test_y_predict)

(15.954402022846267, 20.562788670042373)

In [183]:
# Number of samples in the test split.
test_y.shape

(165,)

In [184]:
# Train and test mean absolute error of the current predictions.
mean_absolute_error(train_y,train_y_predict),mean_absolute_error(test_y,test_y_predict)

(33.747131942154525, 43.53522024443655)

# Combining with team embedding

In [124]:
# Artefacts for the team-level encoders. `with` closes each pickle file,
# which pickle.load(open(...)) never did.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
#country_enc_map=pickle.load(open('country_enc_map.pkl','rb'))
with open('loc_enc_map.pkl','rb') as pkl_file:
    loc_enc_map = pickle.load(pkl_file)
group_encode_model_V2 = load_model('group_encode_model_V2')

#v1
with open('country_location_enc_map.pkl','rb') as pkl_file:
    enc_map = pickle.load(pkl_file)
group_encode_model_V1 = load_model('group_encode_model')

In [135]:
# Experiment 2: concatenate the summed batsman encoding with the V2
# team / opponent / location encoding, one vector per match.
encoding_train_list = []
encoding_test_list = []

target_train_list = []
target_test_list =[]
no_of_rows = recent_match_summary_df.shape[0]
for pos in tqdm(range(no_of_rows)):
    match_details = recent_match_summary_df.iloc[pos]
    location = match_details['location']
    team = match_details['first_innings']
    opposition = match_details['second_innings']
    total_run = match_details['total_run']
    is_train = match_details['train_data']

    loc_oh = loc_enc_map_for_batsman[location]
    opposition_oh = country_enc_map[opposition]

    # Matches at a location missing from `loc_enc_map` are dropped. Checking
    # here, before any model call, avoids running the batsman encoder on rows
    # that would be discarded anyway.
    if location not in loc_enc_map:
        continue

    batsman_oh_list =[]
    position_oh_list =[]
    loc_oh_list =[]
    opposition_oh_list =[]
    for bi in range(11):
        batsman = match_details['batsman_'+str(bi+1)]
        # Stop at the first 'not_batted' slot; later slots are ignored.
        if batsman == 'not_batted':
            break
        # Batsman keys are '<team> <batsman>' with surrounding whitespace stripped.
        batsman_oh = batsman_enc_map[team.strip()+' '+batsman.strip()]
        position_oh = get_oh_pos(bi+1)

        batsman_oh_list.append(batsman_oh)
        position_oh_list.append(position_oh)
        loc_oh_list.append(loc_oh)
        opposition_oh_list.append(opposition_oh)

    # One row per batsman for each of the four encoder inputs.
    batsman_mat = np.stack(batsman_oh_list)
    position_mat = np.stack(position_oh_list)
    loc_mat = np.stack(loc_oh_list)
    opposition_mat = np.stack(opposition_oh_list)
    batsman_group_enc_mat = batsman_group_encode_model.predict([batsman_mat,position_mat,loc_mat,opposition_mat])
    # Despite the name, this is the column-wise SUM of the per-batsman encodings.
    batsman_mean =  batsman_group_enc_mat.sum(axis=0)

    # Team-level encoding with the V2 model: each input is a single-row matrix.
    # (A V1 variant based on `enc_map` / `group_encode_model_V1` was tried
    # previously and is no longer used here.)
    team_oh_v = np.array(country_enc_map[team]).reshape(1,-1)
    opponent_oh_v = np.array(opposition_oh).reshape(1,-1)
    loc_oh_v=np.array(loc_enc_map[location]).reshape(1,-1)
    country_enc_vec = group_encode_model_V2.predict([team_oh_v,opponent_oh_v,loc_oh_v]).reshape(-1)

    final_vector = np.concatenate([batsman_mean,country_enc_vec])

    # Route the sample to the train or test split using the 'train_data' flag.
    if is_train:
        encoding_train_list.append(final_vector)
        target_train_list.append(total_run)
    else:
        encoding_test_list.append(final_vector)
        target_test_list.append(total_run)

HBox(children=(FloatProgress(value=0.0, max=781.0), HTML(value='')))




In [136]:
# Stack the per-match vectors and targets into arrays.
encoding_mat_train = np.stack(encoding_train_list)
encoding_mat_test = np.stack(encoding_test_list)
train_y = np.stack(target_train_list)
test_y = np.stack(target_test_list)

# Standardise with statistics fitted on the training split only.
scaler_exp = StandardScaler()
scaler_exp.fit(encoding_mat_train)
train_x = scaler_exp.transform(encoding_mat_train)
test_x = scaler_exp.transform(encoding_mat_test)

In [137]:
# Fit ordinary least squares on the standardised features (`train_x`);
# the unscaled variant is left commented out.
lr_exp = LinearRegression()
lr_exp.fit(train_x,train_y)
#lr.fit(encoding_mat_train,train_y)

LinearRegression()

In [138]:
# Predict from the standardised features, matching how `lr_exp` was fitted.
train_y_predict = lr_exp.predict(train_x)
test_y_predict = lr_exp.predict(test_x)

#train_y_predict = lr.predict(encoding_mat_train)
#test_y_predict = lr.predict(encoding_mat_test)

In [139]:
# Train and test MAPE (percent) of the current predictions.
mape(train_y,train_y_predict),mape(test_y,test_y_predict)

(15.183808804019803, 20.152482658521194)

# Combining with team embedding + engineered features

In [185]:
# Artefacts for the team-level encoders. `with` closes each pickle file,
# which pickle.load(open(...)) never did.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
#country_enc_map=pickle.load(open('country_enc_map.pkl','rb'))
with open('loc_enc_map.pkl','rb') as pkl_file:
    loc_enc_map = pickle.load(pkl_file)
group_encode_model_V2 = load_model('group_encode_model_V2')

#v1
with open('country_location_enc_map.pkl','rb') as pkl_file:
    enc_map = pickle.load(pkl_file)
group_encode_model_V1 = load_model('group_encode_model')

# Engineered first-innings features, one or more rows per 'match_id'.
feature_df = pd.read_csv('csv_data/feature_first_innings.csv',parse_dates=['match_date'],date_parser=custom_date_parser)
#feature_df.dropna(inplace=True)

# Columns of `feature_df` appended to each match's embedding vector.
feature_columns = ['team_score', 'opponent_score', 'location_base', 'location_mean','batsman_mean', 'batsman_max', 'bowler_mean']


# Match ids that have engineered features; matches without them are skipped later.
feature_match_id_list = list(feature_df['match_id'].unique())

In [194]:
# Experiment 3: summed batsman encoding + V2 team encoding + engineered
# features, one vector per match. Match ids are recorded alongside each
# sample so predictions can be exported per match.
match_id_train_list = []
match_id_test_list = []

encoding_train_list = []
encoding_test_list = []

target_train_list = []
target_test_list =[]
no_of_rows = recent_match_summary_df.shape[0]
# Set membership is O(1); testing against the list rescanned it for every match.
feature_match_ids = set(feature_match_id_list)
for pos in tqdm(range(no_of_rows)):
    match_details = recent_match_summary_df.iloc[pos]
    match_id = match_details['match_id']
    # Skip matches that have no engineered features.
    if match_id not in feature_match_ids:
        continue
    location = match_details['location']
    team = match_details['first_innings']
    opposition = match_details['second_innings']
    total_run = match_details['total_run']
    is_train = match_details['train_data']

    loc_oh = loc_enc_map_for_batsman[location]
    opposition_oh = country_enc_map[opposition]

    # Matches at a location missing from `loc_enc_map` are dropped. Checking
    # here, before any model call, avoids running the batsman encoder on rows
    # that would be discarded anyway.
    if location not in loc_enc_map:
        continue

    batsman_oh_list =[]
    position_oh_list =[]
    loc_oh_list =[]
    opposition_oh_list =[]
    for bi in range(11):
        batsman = match_details['batsman_'+str(bi+1)]
        # Stop at the first 'not_batted' slot; later slots are ignored.
        if batsman == 'not_batted':
            break
        # Batsman keys are '<team> <batsman>' with surrounding whitespace stripped.
        batsman_oh = batsman_enc_map[team.strip()+' '+batsman.strip()]
        position_oh = get_oh_pos(bi+1)

        batsman_oh_list.append(batsman_oh)
        position_oh_list.append(position_oh)
        loc_oh_list.append(loc_oh)
        opposition_oh_list.append(opposition_oh)

    # One row per batsman for each of the four encoder inputs.
    batsman_mat = np.stack(batsman_oh_list)
    position_mat = np.stack(position_oh_list)
    loc_mat = np.stack(loc_oh_list)
    opposition_mat = np.stack(opposition_oh_list)
    batsman_group_enc_mat = batsman_group_encode_model.predict([batsman_mat,position_mat,loc_mat,opposition_mat])
    # Despite the name, this is the column-wise SUM of the per-batsman encodings.
    batsman_mean =  batsman_group_enc_mat.sum(axis=0)

    # Team-level encoding with the V2 model: each input is a single-row matrix.
    # (A V1 variant based on `enc_map` / `group_encode_model_V1` was tried
    # previously and is no longer used here.)
    team_oh_v = np.array(country_enc_map[team]).reshape(1,-1)
    opponent_oh_v = np.array(opposition_oh).reshape(1,-1)
    loc_oh_v=np.array(loc_enc_map[location]).reshape(1,-1)
    country_enc_vec = group_encode_model_V2.predict([team_oh_v,opponent_oh_v,loc_oh_v]).reshape(-1)

    # Engineered features for this match, flattened to 1-D.
    feature_vector = np.array(feature_df[feature_df['match_id']==match_id][feature_columns]).reshape(-1)

    final_vector = np.concatenate([batsman_mean,country_enc_vec,feature_vector])

    # Route the sample to the train or test split using the 'train_data' flag.
    if is_train:
        encoding_train_list.append(final_vector)
        target_train_list.append(total_run)
        match_id_train_list.append(match_id)
    else:
        encoding_test_list.append(final_vector)
        target_test_list.append(total_run)
        match_id_test_list.append(match_id)

HBox(children=(FloatProgress(value=0.0, max=781.0), HTML(value='')))




In [195]:
# Stack the per-match vectors and targets into arrays.
encoding_mat_train = np.stack(encoding_train_list)
encoding_mat_test = np.stack(encoding_test_list)
train_y = np.stack(target_train_list)
test_y = np.stack(target_test_list)

# Standardise with statistics fitted on the training split only.
scaler_exp_2 = StandardScaler()
scaler_exp_2.fit(encoding_mat_train)
train_x = scaler_exp_2.transform(encoding_mat_train)
test_x = scaler_exp_2.transform(encoding_mat_test)

In [196]:
# Fit ordinary least squares on the standardised features (`train_x`);
# the unscaled variant is left commented out.
lr_exp_2 = LinearRegression()
lr_exp_2.fit(train_x,train_y)
#lr.fit(encoding_mat_train,train_y)

LinearRegression()

In [197]:
# Predict from the standardised features, matching how `lr_exp_2` was fitted.
# These predictions are the ones exported to CSV further down.
train_y_predict = lr_exp_2.predict(train_x)
test_y_predict = lr_exp_2.predict(test_x)

#train_y_predict = lr.predict(encoding_mat_train)
#test_y_predict = lr.predict(encoding_mat_test)

In [198]:
# Train and test MAPE (percent) of the current predictions.
mape(train_y,train_y_predict),mape(test_y,test_y_predict)

(14.8768839853773, 18.43278736845787)

In [199]:
# Train and test mean absolute error of the current predictions.
mean_absolute_error(train_y,train_y_predict),mean_absolute_error(test_y,test_y_predict)

(32.11337307271952, 39.05964316798236)

In [200]:
# Number of samples in the test split.
test_y.shape

(109,)

In [201]:
# Persist the fitted scaler and regression model. `with` closes each file, so
# the bytes are flushed to disk deterministically; pickle.dump(obj, open(...))
# left that to garbage collection.
with open('scaler_combined_embedding_first_innings_regression.pkl','wb') as pkl_file:
    pickle.dump(scaler_exp_2, pkl_file)
with open('combined_embedding_first_innings_regression.pkl','wb') as pkl_file:
    pickle.dump(lr_exp_2, pkl_file)

# Create a prediction output to be used with second innings

In [202]:
# Empty frames that the following cells fill with match ids, actual runs and predicted runs.
embedding_prediction_train  = pd.DataFrame()
embedding_prediction_test = pd.DataFrame()

In [203]:
# One row per training match: id, actual first-innings total, predicted total.
embedding_prediction_train = pd.DataFrame({
    'match_id': match_id_train_list,
    'actual_first_innings_runs': target_train_list,
    'predicted_first_innings_runs': train_y_predict,
})
embedding_prediction_train.head()

Unnamed: 0,match_id,actual_first_innings_runs,predicted_first_innings_runs
0,589310,167,197.35332
1,573014,305,317.55376
2,573015,170,226.370265
3,565813,285,262.486889
4,573016,74,191.690613


In [204]:
# One row per test match: id, actual first-innings total, predicted total.
embedding_prediction_test = pd.DataFrame({
    'match_id': match_id_test_list,
    'actual_first_innings_runs': target_test_list,
    'predicted_first_innings_runs': test_y_predict,
})
embedding_prediction_test.head()

Unnamed: 0,match_id,actual_first_innings_runs,predicted_first_innings_runs
0,1153840,371,317.416427
1,1153841,319,311.965355
2,1153842,364,379.835877
3,1144156,266,315.11473
4,1144157,203,250.437164


In [207]:
# Write both prediction tables to csv_data/ without the index column.
embedding_prediction_train.to_csv('csv_data/first_innings_embedding_prediction_train.csv',index=False)
embedding_prediction_test.to_csv('csv_data/first_innings_embedding_prediction_test.csv',index=False)