In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import mean_absolute_error,mean_squared_error,accuracy_score
import pickle

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

import pickle

from keras.models import model_from_json

In [2]:
custom_date_parser = lambda x: datetime.strptime(x, "%Y-%m-%d")
second_inn_feature_df = pd.read_csv('csv_data/feature_second_innings.csv',parse_dates=['match_date'],date_parser=custom_date_parser)

In [3]:
second_inn_feature_df.dropna(inplace=True)

In [4]:
# first_inn_result_df_train=pd.read_csv('csv_data/result_df_tran.csv',parse_dates=['match_date'],date_parser=custom_date_parser)
# first_inn_result_df_test=pd.read_csv('csv_data/result_df_test.csv',parse_dates=['match_date'],date_parser=custom_date_parser)
# lr_first_innings=pickle.load(open('first_innings_linear_regression.pkl','rb'))
# scaler_first_innings = pickle.load(open('first_innings_linear_regression_scaler.pkl','rb'))

In [5]:
#first_innings_feature_columns = ['team_score', 'opponent_score', 'location_base', 'location_mean','batsman_mean', 'batsman_max', 'bowler_mean']

In [6]:
# second_inn_feature_train = second_inn_feature_df[second_inn_feature_df['is_train']==True]
# second_inn_feature_test = second_inn_feature_df[second_inn_feature_df['is_train']==False]

In [7]:
second_inn_feature_df.columns

Index(['match_id', 'match_date', 'team', 'opponent', 'location', 'team_score',
       'opponent_score', 'opponent_base', 'opponent_trend',
       'opponent_trend_predict', 'opponent_mean', 'location_base',
       'location_trend', 'location_trend_predict', 'location_mean',
       'current_base', 'current_trend', 'current_trend_predict',
       'current_mean', 'batsman_mean', 'batsman_max', 'bowler_mean',
       'bowler_max', 'is_train', 'noise', 'target_score', 'runs_scored',
       'win'],
      dtype='object')

In [8]:
second_innings_feature_columns = ['team_score',
       'opponent_score', 'opponent_base', 'opponent_trend',
       'opponent_trend_predict', 'opponent_mean', 'location_base',
       'location_trend', 'location_trend_predict', 'location_mean',
       'current_base', 'current_trend', 'current_trend_predict',
       'current_mean', 'batsman_mean', 'batsman_max', 'bowler_mean',
       'bowler_max','target_score']

# second_innings_feature_columns = ['team_score',
#         'batsman_mean', 'batsman_max', 'bowler_mean',
#        'target_score']

second_innings_target = ['win']

In [9]:
def load_model(model_name):
    # load json and create model
    json_file = open(model_name+'.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(model_name+".h5")
    return loaded_model

In [10]:
# getting all models and encoding maps

country_enc_map=pickle.load(open('country_enc_map.pkl','rb'))
loc_enc_map=pickle.load(open('loc_enc_map.pkl','rb'))
group_encode_model_V2 = load_model('group_encode_model_V2')

batsman_enc_map = pickle.load(open('batsman_enc_map.pkl','rb'))
loc_enc_map_for_batsman = pickle.load(open('loc_enc_map_for_batsman.pkl','rb'))
batsman_group_encode_model = load_model('batsman_group_encode_model')

#v1
#enc_map = pickle.load(open('country_location_enc_map.pkl','rb'))
#group_encode_model_V1 = load_model('group_encode_model')

match_stats_df = pd.read_csv('csv_data/match_stats.csv')
match_summary_df = pd.read_csv('csv_data/match_list.csv',parse_dates=['date'],date_parser=custom_date_parser)


In [11]:
match_summary_df.shape,match_stats_df.shape

((1377, 11), (1562, 48))

In [12]:
recent_match_summary_df=match_summary_df.merge(match_stats_df,on='match_id',how='inner')
recent_match_summary_df=recent_match_summary_df[recent_match_summary_df['second_innings']==recent_match_summary_df['team_statistics']]



In [13]:
def get_oh_pos(pos):
    vec=np.zeros((11)).astype(int) 
    vec[pos-1]=1
    return vec

In [14]:
feature_match_id_list = list(second_inn_feature_df['match_id'].unique())

In [15]:
#recent_match_summary_df[['second_innings','team_statistics']].head()

In [16]:
feature_match_id_list = list(second_inn_feature_df['match_id'].unique())

match_id_train_list = []
match_id_test_list = []

encoding_train_list = []
encoding_test_list = []

target_train_list = []
target_test_list =[]
no_of_rows = recent_match_summary_df.shape[0]
#print(no_of_rows)
for pos in tqdm(range(no_of_rows)):
    match_details = recent_match_summary_df.iloc[pos]
    match_id = match_details['match_id']
    if match_id not in feature_match_id_list:
        continue
    location = match_details['location']
    team = match_details['second_innings']
    opposition = match_details['first_innings']
    #total_run = match_details['total_run']
    is_train = match_details['train_data']
    
    loc_oh = loc_enc_map_for_batsman[location]
    opposition_oh = country_enc_map[opposition]
    
    batsman_oh_list =[]
    position_oh_list =[]
    loc_oh_list =[]
    opposition_oh_list =[]
    #print('getting batsman details')
    for bi in range(11):
        batsman = match_details['batsman_'+str(bi+1)]
        if batsman == 'not_batted':
            break
            #batsman_oh = batsman_enc_map[batsman]
        else:
            batsman_oh = batsman_enc_map[team.strip()+' '+batsman.strip()]
        position_oh = get_oh_pos(bi+1)
        
        batsman_oh_list.append(batsman_oh)
        position_oh_list.append(position_oh)
        loc_oh_list.append(loc_oh)
        opposition_oh_list.append(opposition_oh)
        
    batsman_mat = np.stack(batsman_oh_list)
    position_mat = np.stack(position_oh_list)
    loc_mat = np.stack(loc_oh_list)
    opposition_mat = np.stack(opposition_oh_list)
    #print('encoding')
    batsman_group_enc_mat = batsman_group_encode_model.predict([batsman_mat,position_mat,loc_mat,opposition_mat])
    batsman_mean =  batsman_group_enc_mat.sum(axis=0) 
    #batsman_mean = batsman_group_enc_mat.reshape(-1)
    
    ##create team_encoding with V2
    
    team_oh_v = np.array(country_enc_map[team]).reshape(1,-1)
    opponent_oh_v = np.array(opposition_oh).reshape(1,-1)
    if location not in loc_enc_map:
        continue
    loc_oh_v=np.array(loc_enc_map[location]).reshape(1,-1)
    country_enc_vec = group_encode_model_V2.predict([team_oh_v,opponent_oh_v,loc_oh_v]).reshape(-1)

    ##create team_encoding with V1
    
#     team_oh_v = np.array(enc_map[team]).reshape(1,-1)
#     opponent_oh_v = np.array(enc_map[opposition]).reshape(1,-1)
#     if location not in loc_enc_map:
#         continue
#     loc_oh_v=np.array(enc_map[location]).reshape(1,-1)
#     country_enc_vec = group_encode_model_V1.predict([team_oh_v,opponent_oh_v,loc_oh_v]).reshape(-1)
    
    #engineered_features
    
    feature_vector = np.array(second_inn_feature_df[second_inn_feature_df['match_id']==match_id][second_innings_feature_columns]).reshape(-1)
    win = second_inn_feature_df[second_inn_feature_df['match_id']==match_id]['win'].values[0]

    
    final_vector = np.concatenate([batsman_mean,country_enc_vec,feature_vector])
    
    if is_train:
        encoding_train_list.append(final_vector)
        target_train_list.append(win)
        match_id_train_list.append(match_id)
    else:
        encoding_test_list.append(final_vector)
        target_test_list.append(win)
        match_id_test_list.append(match_id)

HBox(children=(FloatProgress(value=0.0, max=781.0), HTML(value='')))




In [17]:
second_innings_scaler = StandardScaler()

encoding_mat_train = np.stack(encoding_train_list)
x_train = second_innings_scaler.fit_transform(encoding_mat_train)
y_train = np.stack(target_train_list)

encoding_mat_test = np.stack(encoding_test_list)
x_test = second_innings_scaler.transform(encoding_mat_test)
y_test = np.stack(target_test_list)

In [18]:
lgr = LogisticRegression()
lgr.fit(x_train,y_train)

LogisticRegression()

In [19]:
y_train_predict_lgr = lgr.predict(x_train)
y_test_predict_lgr = lgr.predict(x_test)

In [20]:
accuracy_score(y_train,y_train_predict_lgr)

0.9210526315789473

In [21]:
accuracy_score(y_test,y_test_predict_lgr)

0.8581560283687943

In [22]:
y_test_predict_lgr

array([0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 0])

In [30]:
pickle.dump(lgr,open('second_innings_model_with_embedding_lrg.pkl','wb'))
pickle.dump(second_innings_scaler,open('second_innings_scaler_with_embedding.pkl','wb'))

# Combining with first innings results
(from simple linear regression)

In [28]:
# second_inn_feature_test_copy = pd.DataFrame(second_inn_feature_test)
# second_inn_feature_train_copy = pd.DataFrame(second_inn_feature_train)

In [15]:

# second_inn_feature_test_copy = second_inn_feature.merge(first_inn_result_df_test[['match_id','first_innings_prediction']],
#                                    on='match_id',
#                                    how='inner')
# second_inn_feature_train_copy = second_inn_feature.merge(first_inn_result_df_train[['match_id','first_innings_prediction']],
#                                    on='match_id',
#                                    how='inner')

# second_inn_feature_test_copy['target_score'] = second_inn_feature_test_copy['first_innings_prediction']
# second_inn_feature_train_copy['target_score'] = second_inn_feature_train_copy['first_innings_prediction']



In [16]:
# x_train_copy = second_innings_scaler.transform(second_inn_feature_train_copy[second_innings_feature_columns])
# y_train_copy = second_inn_feature_train_copy[second_innings_target]

# x_test_copy = second_innings_scaler.transform(second_inn_feature_test_copy[second_innings_feature_columns])
# y_test_copy = second_inn_feature_test_copy[second_innings_target]

In [17]:
# y_train_copy_predict_lgr = lgr.predict(x_train_copy)
# y_test_copy_predict_lgr = lgr.predict(x_test_copy)

In [49]:
#accuracy_score(y_train_copy,y_train_copy_predict_lgr)

In [50]:
#accuracy_score(y_test_copy,y_test_copy_predict_lgr)

# Combining with first innings results
(obtained using best model-team embedding+batsman embedding+selected features)

In [23]:
first_innings_emb_prediction_train = pd.read_csv('csv_data/first_innings_embedding_prediction_train.csv')
first_innings_emb_prediction_test = pd.read_csv('csv_data/first_innings_embedding_prediction_test.csv')

In [24]:
# second_inn_feature_test_copy = pd.DataFrame(second_inn_feature_test)
# second_inn_feature_train_copy = pd.DataFrame(second_inn_feature_train)

In [25]:
second_inn_feature_test_copy = second_inn_feature_df.merge(first_innings_emb_prediction_test[['match_id','predicted_first_innings_runs']],
                                   on='match_id',
                                   how='inner')
second_inn_feature_train_copy = second_inn_feature_df.merge(first_innings_emb_prediction_train[['match_id','predicted_first_innings_runs']],
                                   on='match_id',
                                   how='inner')

In [26]:
second_inn_feature_test_copy['target_score'] = second_inn_feature_test_copy['predicted_first_innings_runs']
second_inn_feature_train_copy['target_score'] = second_inn_feature_train_copy['predicted_first_innings_runs']

second_inn_feature_df_copy = pd.concat([second_inn_feature_train_copy,second_inn_feature_test_copy])

In [27]:
feature_match_id_list = list(second_inn_feature_df_copy['match_id'].unique())

match_id_train_list = []
match_id_test_list = []

encoding_train_list = []
encoding_test_list = []

target_train_list = []
target_test_list =[]
no_of_rows = recent_match_summary_df.shape[0]
#print(no_of_rows)
for pos in tqdm(range(no_of_rows)):
    match_details = recent_match_summary_df.iloc[pos]
    match_id = match_details['match_id']
    if match_id not in feature_match_id_list:
        continue
    location = match_details['location']
    team = match_details['second_innings']
    opposition = match_details['first_innings']
    #total_run = match_details['total_run']
    is_train = match_details['train_data']
    
    loc_oh = loc_enc_map_for_batsman[location]
    opposition_oh = country_enc_map[opposition]
    
    batsman_oh_list =[]
    position_oh_list =[]
    loc_oh_list =[]
    opposition_oh_list =[]
    #print('getting batsman details')
    for bi in range(11):
        batsman = match_details['batsman_'+str(bi+1)]
        if batsman == 'not_batted':
            break
            #batsman_oh = batsman_enc_map[batsman]
        else:
            batsman_oh = batsman_enc_map[team.strip()+' '+batsman.strip()]
        position_oh = get_oh_pos(bi+1)
        
        batsman_oh_list.append(batsman_oh)
        position_oh_list.append(position_oh)
        loc_oh_list.append(loc_oh)
        opposition_oh_list.append(opposition_oh)
        
    batsman_mat = np.stack(batsman_oh_list)
    position_mat = np.stack(position_oh_list)
    loc_mat = np.stack(loc_oh_list)
    opposition_mat = np.stack(opposition_oh_list)
    #print('encoding')
    batsman_group_enc_mat = batsman_group_encode_model.predict([batsman_mat,position_mat,loc_mat,opposition_mat])
    batsman_mean =  batsman_group_enc_mat.sum(axis=0) 
    #batsman_mean = batsman_group_enc_mat.reshape(-1)
    
    ##create team_encoding with V2
    
    team_oh_v = np.array(country_enc_map[team]).reshape(1,-1)
    opponent_oh_v = np.array(opposition_oh).reshape(1,-1)
    if location not in loc_enc_map:
        continue
    loc_oh_v=np.array(loc_enc_map[location]).reshape(1,-1)
    country_enc_vec = group_encode_model_V2.predict([team_oh_v,opponent_oh_v,loc_oh_v]).reshape(-1)

    ##create team_encoding with V1
    
#     team_oh_v = np.array(enc_map[team]).reshape(1,-1)
#     opponent_oh_v = np.array(enc_map[opposition]).reshape(1,-1)
#     if location not in loc_enc_map:
#         continue
#     loc_oh_v=np.array(enc_map[location]).reshape(1,-1)
#     country_enc_vec = group_encode_model_V1.predict([team_oh_v,opponent_oh_v,loc_oh_v]).reshape(-1)
    
    #engineered_features
    
    feature_vector = np.array(second_inn_feature_df_copy[second_inn_feature_df_copy['match_id']==match_id][second_innings_feature_columns]).reshape(-1)
    win = second_inn_feature_df_copy[second_inn_feature_df_copy['match_id']==match_id]['win'].values[0]

    
    final_vector = np.concatenate([batsman_mean,country_enc_vec,feature_vector])
    
    if is_train:
        encoding_train_list.append(final_vector)
        target_train_list.append(win)
        match_id_train_list.append(match_id)
    else:
        encoding_test_list.append(final_vector)
        target_test_list.append(win)
        match_id_test_list.append(match_id)

HBox(children=(FloatProgress(value=0.0, max=781.0), HTML(value='')))




In [28]:
encoding_mat_train = np.stack(encoding_train_list)
x_train_copy = second_innings_scaler.transform(encoding_mat_train)
y_train_copy = np.stack(target_train_list)

encoding_mat_test = np.stack(encoding_test_list)
x_test_copy = second_innings_scaler.transform(encoding_mat_test)
y_test_copy = np.stack(target_test_list)




In [29]:
y_train_copy_predict_lgr = lgr.predict(x_train_copy)
y_test_copy_predict_lgr = lgr.predict(x_test_copy)

In [30]:
accuracy_score(y_train_copy,y_train_copy_predict_lgr),accuracy_score(y_test_copy,y_test_copy_predict_lgr)

(0.9126637554585153, 0.8440366972477065)

In [48]:
#
#second_inn_feature_df_copy.merge(second_inn_feature_df,on='match_id',how='inner')[['target_score_x','target_score_y']]

In [31]:
y_test_copy_predict_lgr

array([0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0])

In [32]:
match_id_test_list[3]

1144156

In [33]:
x_test_copy[3]

array([-0.68048408, -0.6710452 , -1.73575387, -0.58598919, -0.67093961,
       -0.66241659, -0.6597    , -0.9662108 , -1.33521158, -1.0786623 ,
        0.4609055 ,  0.        ,  0.2566267 ,  0.43395915,  0.36384617,
        0.34953377,  0.        ,  0.2890143 ,  0.        ,  0.34725311,
       -1.5812828 , -1.21064089, -1.22623342,  0.        ,  0.        ,
       -1.37832653, -1.11351011, -1.40918541, -1.39398236, -1.31134461,
       -1.39280565, -1.48893338, -1.59222951, -1.52886508, -1.57832128,
       -1.27367422,  0.        , -1.44798417, -1.57764853, -1.46306953,
        0.73934402,  0.05712315, -0.40674332, -0.06709866, -0.54077266,
       -0.93546584,  0.21386306, -0.14791468,  0.02021795,  0.20315482,
        2.14530766, -1.87893186, -1.13044927,  0.90494331,  0.75026558,
        1.15873729,  0.13938544, -0.54412314,  0.83929708])

# SVM

In [52]:
from sklearn import svm

In [62]:
clf = svm.SVC(C=0.7,gamma='auto',kernel='linear',probability=True)
clf.fit(x_train,y_train)

SVC(C=0.7, gamma='auto', kernel='linear', probability=True)

In [63]:
y_train_predict_svm = clf.predict(x_train)
y_test_predict_svm = clf.predict(x_test)

In [64]:
accuracy_score(y_train,y_train_predict_svm),accuracy_score(y_test,y_test_predict_svm)

(0.9157894736842105, 0.8652482269503546)

In [65]:
y_train_copy_predict_svm = lgr.predict(x_train_copy)
y_test_copy_predict_svm = lgr.predict(x_test_copy)

In [66]:
accuracy_score(y_train_copy,y_train_copy_predict_svm),accuracy_score(y_test_copy,y_test_copy_predict_svm)

(0.9126637554585153, 0.8440366972477065)

In [67]:
pickle.dump(clf,open('second_innings_model_with_embedding_svm.pkl','wb'))
#pickle.dump(second_innings_scaler,open('second_innings_scaler_with_embedding.pkl','wb'))

# NB

In [58]:
from sklearn.naive_bayes import GaussianNB

In [59]:
nb = GaussianNB()
nb.fit(x_train,y_train)

GaussianNB()

In [60]:
y_train_predict_nb = nb.predict(x_train)
y_test_predict_nb = nb.predict(x_test)

In [61]:
accuracy_score(y_train,y_train_predict_nb),accuracy_score(y_test,y_test_predict_nb)

(0.8298245614035088, 0.7659574468085106)

In [68]:
!pip install xgboost



#  XGBoost

In [69]:
import xgboost as xg
from xgboost import XGBClassifier

In [70]:
xg_model = XGBClassifier(max_depth=3,n_estimators=40)
xg_model.fit(x_train,y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=40, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [71]:
y_train_predict_xg = xg_model.predict(x_train)
y_test_predict_xg = xg_model.predict(x_test)

In [72]:
accuracy_score(y_train,y_train_predict_xg),accuracy_score(y_test,y_test_predict_xg)

(1.0, 0.8297872340425532)