In [13]:
import pandas as pd
import numpy as np
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm
import dateutil
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics.pairwise import cosine_similarity
import json
import pickle

from keras.models import model_from_json

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import mean_absolute_error,mean_squared_error


In [2]:
def mape(y_true,y_predict):
    """Return the mean absolute percentage error, expressed in percent.

    Computes ``mean(|y_true - y_predict| / |y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like
        Observed values. Entries equal to zero produce ``inf``/``nan`` in the
        result, exactly as a plain division would.
    y_predict : array-like
        Predicted values, matched to ``y_true`` by position.

    Returns
    -------
    float
        The error as a percentage (10.0 means predictions are off by 10 % on
        average).
    """
    # Coerce to float arrays so that plain lists work and so that pandas
    # objects are compared by position rather than by index label.
    y_true = np.asarray(y_true, dtype=float)
    y_predict = np.asarray(y_predict, dtype=float)
    # Divide by |y_true|: with a signed denominator, negative targets yield
    # negative terms that cancel out genuine errors.
    return np.sum((np.abs(y_true - y_predict) / np.abs(y_true)) * 100) / len(y_true)

In [3]:
def load_model(model_name):
    """Rebuild a Keras model from ``<model_name>.json`` and ``<model_name>.h5``.

    Parameters
    ----------
    model_name : str
        Path prefix shared by the two files: the architecture is read from
        ``model_name + '.json'`` and the weights from ``model_name + '.h5'``.

    Returns
    -------
    The model returned by ``model_from_json`` with the stored weights loaded.
    """
    # Read the architecture; the context manager closes the file even if
    # reading fails part-way.
    with open(model_name+'.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # Load the weights into the freshly built model.
    loaded_model.load_weights(model_name+".h5")
    return loaded_model

In [4]:
def custom_date_parser(x):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime`` (time set to midnight)."""
    return datetime.strptime(x, "%Y-%m-%d")

In [5]:
# First year kept in the study; the cutoff date is 1 January of that year.
cutoff_start_year = '2013'
cutoff_start_date = datetime.strptime(f'{cutoff_start_year}-01-01', '%Y-%m-%d')

In [6]:
# Read the match list, parsing the 'date' column with custom_date_parser,
# then keep only the matches dated on or after the cutoff.
match_summary_df = pd.read_csv(
    'csv_data/match_list.csv',
    parse_dates=['date'],
    date_parser=custom_date_parser,
)
recent_match_summary_df = match_summary_df.loc[
    lambda matches: matches['date'] >= cutoff_start_date
]
recent_match_summary_df.shape

(781, 11)

In [7]:
# Load the match statistics table from CSV (default pandas parsing).
match_stats_df = pd.read_csv('csv_data/match_stats.csv')

In [8]:
# Attach the statistics rows to each recent match (inner join on match_id),
# then keep only the rows whose statistics belong to the side that batted
# first (team_statistics == first_innings).
recent_match_summary_df = (
    recent_match_summary_df
    .merge(match_stats_df, how='inner', on='match_id')
    .loc[lambda merged: merged['first_innings'] == merged['team_statistics']]
)


In [9]:
# Display the column labels of the merged match frame.
recent_match_summary_df.columns

Index(['match_id', 'date', 'location', 'first_innings', 'second_innings',
       'winner', 'win_by', 'win_dif', 'toss_winner', 'player_of_match',
       'train_data', 'team_statistics', 'batsman_1', 'batsman_1_runs',
       'batsman_2', 'batsman_2_runs', 'batsman_3', 'batsman_3_runs',
       'batsman_4', 'batsman_4_runs', 'batsman_5', 'batsman_5_runs',
       'batsman_6', 'batsman_6_runs', 'batsman_7', 'batsman_7_runs',
       'batsman_8', 'batsman_8_runs', 'batsman_9', 'batsman_9_runs',
       'batsman_10', 'batsman_10_runs', 'batsman_11', 'batsman_11_runs',
       'bowler_1', 'bowler_1_wickets', 'bowler_2', 'bowler_2_wickets',
       'bowler_3', 'bowler_3_wickets', 'bowler_4', 'bowler_4_wickets',
       'bowler_5', 'bowler_5_wickets', 'bowler_6', 'bowler_6_wickets',
       'bowler_7', 'bowler_7_wickets', 'bowler_8', 'bowler_8_wickets',
       'bowler_9', 'bowler_9_wickets', 'bowler_10', 'bowler_10_wickets',
       'bowler_11', 'bowler_11_wickets', 'total_run', 'total_wickets'],
     

In [116]:
# for batsman encoding
def _load_pickle(path):
    """Deserialize one pickle file, closing the file handle afterwards."""
    # NOTE(review): unpickling can execute arbitrary code; these .pkl files
    # must come from a trusted source.
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


# for batsman encoding
country_enc_map = _load_pickle('country_enc_map.pkl')
batsman_enc_map = _load_pickle('batsman_enc_map.pkl')
loc_enc_map_for_batsman = _load_pickle('loc_enc_map_for_batsman.pkl')

# for country encoding
loc_enc_map = _load_pickle('loc_enc_map.pkl')
group_encode_model_V2 = load_model('group_encode_model_V2')

In [11]:
def get_oh_pos(pos, n_positions=11):
    """Return a one-hot integer vector for a 1-based position.

    Parameters
    ----------
    pos : int
        Position to encode, counted from 1.
    n_positions : int, optional
        Length of the returned vector (defaults to 11).

    Returns
    -------
    numpy.ndarray
        Integer vector of length ``n_positions`` with a single 1 at index
        ``pos - 1``.

    Raises
    ------
    IndexError
        If ``pos`` is outside ``1..n_positions``.
    """
    # Without this guard, pos=0 (or a negative pos) would wrap around through
    # negative indexing and silently mark the wrong slot.
    if not 1 <= pos <= n_positions:
        raise IndexError(
            'position {} is outside the valid range 1..{}'.format(pos, n_positions)
        )
    vec = np.zeros(n_positions, dtype=int)
    vec[pos - 1] = 1
    return vec

In [12]:
# Restore the saved batsman group-encoding model through load_model
# (architecture from the .json file, weights from the .h5 file).
batsman_group_encode_model = load_model('batsman_group_encode_model')

In [14]:
# Preview the first rows of the recent-match frame.
recent_match_summary_df.head()

Unnamed: 0,match_id,date,location,first_innings,second_innings,winner,win_by,win_dif,toss_winner,player_of_match,...,bowler_8,bowler_8_wickets,bowler_9,bowler_9_wickets,bowler_10,bowler_10_wickets,bowler_11,bowler_11_wickets,total_run,total_wickets
0,589309,2013-01-03,Kolkata,Pakistan,India,Pakistan,runs,85,India,Nasir Jamshed,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,250,10
2,589310,2013-01-06,Delhi,India,Pakistan,India,runs,10,India,MS Dhoni,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,167,10
4,565812,2013-01-11,Rajkot,England,India,England,runs,9,England,JC Tredwell,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,325,9
6,573014,2013-01-11,Melbourne Cricket Ground,Australia,Sri Lanka,Australia,runs,107,Australia,PJ Hughes,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,305,10
8,573015,2013-01-13,Adelaide Oval,Australia,Sri Lanka,Sri Lanka,wickets,8,Sri Lanka,HDRL Thirimanne,...,not_bowled,0,not_bowled,0,not_bowled,0,not_bowled,0,170,2


In [14]:
#recent_match_summary_df.iloc[recent_match_summary_df.shape[0]-1]

In [180]:
# Per-match accumulators, split by the row's 'train_data' flag.
encoding_train_list = []
encoding_test_list = []

team_encoding_train_list = []
team_encoding_test_list = []

target_train_list = []
target_test_list = []

target_sequence_train_list = []
target_sequence_test_list = []

# Venues of the rows that were dropped for lacking a team-level location
# encoding; kept so the amount of discarded data can be inspected afterwards.
skipped_locations = []

# The batting-position one-hots are the same for every match, so the matrix
# is built once here instead of once per row.
position_mat = np.stack([get_oh_pos(bi + 1) for bi in range(11)])

no_of_rows = recent_match_summary_df.shape[0]
for pos in tqdm(range(no_of_rows)):
    match_details = recent_match_summary_df.iloc[pos]
    location = match_details['location']

    # Rows whose venue is unknown to loc_enc_map are not used. Testing this
    # first avoids running the batsman encoder on rows that would be thrown
    # away; as a consequence such rows no longer trigger the other lookups.
    if location not in loc_enc_map:
        skipped_locations.append(location)
        continue

    team = match_details['first_innings']
    opposition = match_details['second_innings']
    total_run = match_details['total_run']
    is_train = match_details['train_data']

    loc_oh = loc_enc_map_for_batsman[location]
    opposition_oh = country_enc_map[opposition]

    batsman_oh_list = []
    sequence_output = []
    for bi in range(11):
        slot = 'batsman_' + str(bi + 1)
        batsman = match_details[slot]
        if batsman == 'not_batted':
            # The placeholder has its own entry in the batsman map.
            batsman_oh = batsman_enc_map[batsman]
        else:
            # Real players are keyed as "<team> <player>".
            batsman_oh = batsman_enc_map[team.strip() + ' ' + batsman.strip()]

        batsman_oh_list.append(batsman_oh)
        sequence_output.append(match_details[slot + '_runs'])

    batsman_mat = np.stack(batsman_oh_list)
    # Venue and opposition are constant within a match: repeat them per slot.
    loc_mat = np.stack([loc_oh] * 11)
    opposition_mat = np.stack([opposition_oh] * 11)
    sequence_output_vec = np.array(sequence_output)

    # One encoding row per batting slot. verbose=0 keeps Keras from printing
    # a progress bar for every match on versions where that is the default.
    batsman_group_enc_mat = batsman_group_encode_model.predict(
        [batsman_mat, position_mat, loc_mat, opposition_mat], verbose=0)

    # Team-level encoding from the V2 group encoder.
    team_oh_v = np.array(country_enc_map[team]).reshape(1, -1)
    opponent_oh_v = np.array(opposition_oh).reshape(1, -1)
    loc_oh_v = np.array(loc_enc_map[location]).reshape(1, -1)
    country_enc_vec = group_encode_model_V2.predict(
        [team_oh_v, opponent_oh_v, loc_oh_v], verbose=0).reshape(-1)

    if is_train:
        encoding_train_list.append(batsman_group_enc_mat)
        team_encoding_train_list.append(country_enc_vec)
        target_train_list.append(total_run)
        target_sequence_train_list.append(sequence_output_vec)
    else:
        encoding_test_list.append(batsman_group_enc_mat)
        team_encoding_test_list.append(country_enc_vec)
        target_test_list.append(total_run)
        target_sequence_test_list.append(sequence_output_vec)
    

HBox(children=(FloatProgress(value=0.0, max=781.0), HTML(value='')))




In [168]:
#np.stack(encoding_train_list).shape

In [181]:
# NOTE: the scaler is created here but not applied to any array in this cell.
scaler = StandardScaler()

# Stack the per-match training records along a new leading axis.
encoding_mat_train, team_encoding_mat_train, train_y, train_y_seq = (
    np.stack(records)
    for records in (
        encoding_train_list,
        team_encoding_train_list,
        target_train_list,
        target_sequence_train_list,
    )
)

# Same stacking for the held-out records.
encoding_mat_test, team_encoding_mat_test, test_y, test_y_seq = (
    np.stack(records)
    for records in (
        encoding_test_list,
        team_encoding_test_list,
        target_test_list,
        target_sequence_test_list,
    )
)

In [182]:
#encoding_mat_train.shape,train_y_seq.shape
#team_encoding_mat_train.shape

((612, 11, 10), (612, 11))

In [171]:
import keras as k
import keras.backend as K
from keras.layers import *
from keras.models import Model
from keras.regularizers import l2

from keras.optimizers import Adam, Adadelta

In [172]:
def create_sequential_model_with_inital_state(timesteps,embedding_lenght,inital_state_vector):
    """Build an LSTM regressor whose initial state comes from a second input.

    Parameters
    ----------
    timesteps : int
        Length of the input sequence.
    embedding_lenght : int
        Number of features per timestep.
    inital_state_vector : int
        Size of the state input; also used as the number of LSTM units, since
        the same tensor is passed as both entries of ``initial_state``.

    Returns
    -------
    Model
        Uncompiled model with inputs ``[sequence_input, state_input]`` and a
        single scalar output per sample (``final_output``).
    """
    seq_in = Input(shape=(timesteps, embedding_lenght), name="sequence_input")
    state_in = Input(shape=(inital_state_vector,), name="state_input")

    recurrent = LSTM(
        units=inital_state_vector,
        activation='relu',
        return_sequences=False,
        return_state=False,
        name='lstm_1',
    )
    encoded = recurrent(seq_in, initial_state=[state_in, state_in])

    predicted_runs = Dense(units=1, name='final_output')(encoded)

    return Model(inputs=[seq_in, state_in], outputs=predicted_runs)
    

In [173]:
def create_sequential_model(timesteps,embedding_lenght):
    """Build a plain LSTM regressor over a sequence input.

    Parameters
    ----------
    timesteps : int
        Length of the input sequence.
    embedding_lenght : int
        Number of features per timestep.

    Returns
    -------
    Model
        Uncompiled model: a 100-unit LSTM, a 10-unit ReLU dense layer
        (``dense_1``) and a single linear output (``final_output``).
    """
    seq_in = Input(shape=(timesteps, embedding_lenght), name="sequence_input")

    recurrent = LSTM(
        units=100,
        activation='relu',
        return_sequences=False,
        return_state=False,
        name='lstm_1',
    )
    encoded = recurrent(seq_in)

    hidden = Dense(units=10, activation='relu', name='dense_1')(encoded)
    predicted_runs = Dense(units=1, name='final_output')(hidden)

    return Model(inputs=[seq_in], outputs=predicted_runs)
    

In [174]:
# Alternative without the team-encoding state input:
# runs_model = create_sequential_model(encoding_mat_train.shape[1],encoding_mat_train.shape[2])
#
# Sequence dimensions come from the batsman encodings; the state size comes
# from the team encodings.
runs_model = create_sequential_model_with_inital_state(
    timesteps=encoding_mat_train.shape[1],
    embedding_lenght=encoding_mat_train.shape[2],
    inital_state_vector=team_encoding_mat_train.shape[1],
)

In [175]:
# Print the layer-by-layer summary of the current runs_model.
runs_model.summary()

Model: "functional_43"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sequence_input (InputLayer)     [(None, 11, 10)]     0                                            
__________________________________________________________________________________________________
state_input (InputLayer)        [(None, 30)]         0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 30)           4920        sequence_input[0][0]             
                                                                 state_input[0][0]                
                                                                 state_input[0][0]                
______________________________________________________________________________________

In [176]:
# MSE is the training loss; MAE and MAPE are reported as extra metrics.
runs_model.compile(
    optimizer=Adam(0.00001),
    loss="mean_squared_error",
    metrics=["mean_absolute_error", "mean_absolute_percentage_error"],
)

In [177]:
# Variant for the model without the state input:
# runs_model.fit([encoding_mat_train], train_y,
#                validation_data=([encoding_mat_test],test_y),epochs=100, batch_size=10)

# Train on the (sequence, team-encoding) pair and validate on the held-out
# arrays after each epoch.
runs_model.fit(
    x=[encoding_mat_train, team_encoding_mat_train],
    y=train_y,
    validation_data=([encoding_mat_test, team_encoding_mat_test], test_y),
    epochs=100,
    batch_size=10,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100


Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7fbc67a2ca58>

In [179]:
#team_encoding_mat_train.shape

# Seq-2-Seq

In [226]:
def create_seq2seq_model_with_inital_state(timesteps,embedding_lenght,inital_state_vector):
    """Build a two-headed LSTM model: one value per timestep plus one overall value.

    Parameters
    ----------
    timesteps : int
        Length of the input sequence.
    embedding_lenght : int
        Number of features per timestep.
    inital_state_vector : int
        Size of the state input; also used as the number of LSTM units, since
        the same tensor is passed as both entries of ``initial_state``.

    Returns
    -------
    Model
        Uncompiled model with inputs ``[sequence_input, state_input]`` and two
        outputs: ``individual_output`` (the flattened per-timestep values) and
        ``total_output`` (a scalar computed from the LSTM's returned hidden
        state).
    """
    seq_in = Input(shape=(timesteps, embedding_lenght), name="sequence_input")
    state_in = Input(shape=(inital_state_vector,), name="state_input")

    recurrent = LSTM(
        units=inital_state_vector,
        activation='relu',
        return_sequences=True,
        return_state=True,
        name='lstm_1',
    )
    # The returned cell state is not used by either head.
    per_step, last_hidden, _last_cell = recurrent(
        seq_in, initial_state=[state_in, state_in]
    )

    # Head 1: the same 1-unit dense layer applied at every timestep, then
    # flattened to one vector per sample.
    per_step_runs = TimeDistributed(Dense(units=1, name='ts_individual_output'))(per_step)
    per_step_runs = Flatten(name='individual_output')(per_step_runs)

    # Head 2: a scalar read off the returned hidden state.
    overall_runs = Dense(units=1, name='total_output')(last_hidden)

    return Model(inputs=[seq_in, state_in], outputs=[per_step_runs, overall_runs])

In [227]:
# Rebinds runs_model to the two-output seq2seq variant, sized from the same
# training arrays as before.
runs_model = create_seq2seq_model_with_inital_state(
    timesteps=encoding_mat_train.shape[1],
    embedding_lenght=encoding_mat_train.shape[2],
    inital_state_vector=team_encoding_mat_train.shape[1],
)

In [228]:
# Print the layer-by-layer summary of the current runs_model.
runs_model.summary()

Model: "functional_64"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sequence_input (InputLayer)     [(None, 11, 10)]     0                                            
__________________________________________________________________________________________________
state_input (InputLayer)        [(None, 30)]         0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 11, 30), (No 4920        sequence_input[0][0]             
                                                                 state_input[0][0]                
                                                                 state_input[0][0]                
______________________________________________________________________________________

In [229]:
# A single loss/metric specification is given for the whole model.
runs_model.compile(
    optimizer=Adam(0.00001),
    loss="mean_squared_error",
    metrics=["mean_absolute_error"],
)

In [230]:
# Targets are listed in the same order as the model outputs: the per-slot
# run sequence first, then the innings total.
runs_model.fit(
    x=[encoding_mat_train, team_encoding_mat_train],
    y=[train_y_seq, train_y],
    validation_data=(
        [encoding_mat_test, team_encoding_mat_test],
        [test_y_seq, test_y],
    ),
    epochs=100,
    batch_size=10,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100


Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100


Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7fbc52a4ceb8>

In [233]:
#runs_model.predict([encoding_mat_train,team_encoding_mat_train])