In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import mean_absolute_error,mean_squared_error,accuracy_score
import pickle

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
custom_date_parser = lambda x: datetime.strptime(x, "%Y-%m-%d")
second_inn_feature_df = pd.read_csv('csv_data/feature_second_innings.csv',parse_dates=['match_date'],date_parser=custom_date_parser)

In [3]:
second_inn_feature_df.dropna(inplace=True)

In [4]:
first_inn_result_df_train=pd.read_csv('csv_data/result_df_tran.csv',parse_dates=['match_date'],date_parser=custom_date_parser)
first_inn_result_df_test=pd.read_csv('csv_data/result_df_test.csv',parse_dates=['match_date'],date_parser=custom_date_parser)
lr_first_innings=pickle.load(open('first_innings_linear_regression.pkl','rb'))
scaler_first_innings = pickle.load(open('first_innings_linear_regression_scaler.pkl','rb'))

In [5]:
first_innings_feature_columns = ['team_score', 'opponent_score', 'location_base', 'location_mean','batsman_mean', 'batsman_max', 'bowler_mean']

In [6]:
second_inn_feature_train = second_inn_feature_df[second_inn_feature_df['is_train']==True]
second_inn_feature_test = second_inn_feature_df[second_inn_feature_df['is_train']==False]

In [7]:
second_inn_feature_train.columns

Index(['match_id', 'match_date', 'team', 'opponent', 'location', 'team_score',
       'opponent_score', 'opponent_base', 'opponent_trend',
       'opponent_trend_predict', 'opponent_mean', 'location_base',
       'location_trend', 'location_trend_predict', 'location_mean',
       'current_base', 'current_trend', 'current_trend_predict',
       'current_mean', 'batsman_mean', 'batsman_max', 'bowler_mean',
       'bowler_max', 'is_train', 'noise', 'target_score', 'runs_scored',
       'win'],
      dtype='object')

In [8]:
second_innings_feature_columns = ['team_score',
       'opponent_score', 'opponent_base', 'opponent_trend',
       'opponent_trend_predict', 'opponent_mean', 'location_base',
       'location_trend', 'location_trend_predict', 'location_mean',
       'current_base', 'current_trend', 'current_trend_predict',
       'current_mean', 'batsman_mean', 'batsman_max', 'bowler_mean',
       'bowler_max','target_score']

# second_innings_feature_columns = ['team_score',
#         'batsman_mean', 'batsman_max', 'bowler_mean',
#        'target_score']

second_innings_target = ['win']

In [9]:
second_innings_scaler = StandardScaler()
x_train = second_innings_scaler.fit_transform(second_inn_feature_train[second_innings_feature_columns])
y_train = second_inn_feature_train[second_innings_target]

x_test = second_innings_scaler.transform(second_inn_feature_test[second_innings_feature_columns])
y_test = second_inn_feature_test[second_innings_target]


In [10]:
lgr = LogisticRegression()
lgr.fit(x_train,y_train)

  return f(**kwargs)


LogisticRegression()

In [11]:
y_train_predict_lgr = lgr.predict(x_train)
y_test_predict_lgr = lgr.predict(x_test)

In [12]:
accuracy_score(y_train,y_train_predict_lgr)

0.8493589743589743

In [13]:
accuracy_score(y_test,y_test_predict_lgr)

0.8297872340425532

# Combining with first innings results
(from simple linear regression)

In [14]:
second_inn_feature_test_copy = pd.DataFrame(second_inn_feature_test)
second_inn_feature_train_copy = pd.DataFrame(second_inn_feature_train)

In [15]:

second_inn_feature_test_copy = second_inn_feature_test_copy.merge(first_inn_result_df_test[['match_id','first_innings_prediction']],
                                   on='match_id',
                                   how='inner')
second_inn_feature_train_copy = second_inn_feature_train_copy.merge(first_inn_result_df_train[['match_id','first_innings_prediction']],
                                   on='match_id',
                                   how='inner')

second_inn_feature_test_copy['target_score'] = second_inn_feature_test_copy['first_innings_prediction']
second_inn_feature_train_copy['target_score'] = second_inn_feature_train_copy['first_innings_prediction']

In [16]:
x_train_copy = second_innings_scaler.transform(second_inn_feature_train_copy[second_innings_feature_columns])
y_train_copy = second_inn_feature_train_copy[second_innings_target]

x_test_copy = second_innings_scaler.transform(second_inn_feature_test_copy[second_innings_feature_columns])
y_test_copy = second_inn_feature_test_copy[second_innings_target]

In [17]:
y_train_copy_predict_lgr = lgr.predict(x_train_copy)
y_test_copy_predict_lgr = lgr.predict(x_test_copy)

In [18]:
accuracy_score(y_train_copy,y_train_copy_predict_lgr)

0.7668539325842697

In [19]:
accuracy_score(y_test_copy,y_test_copy_predict_lgr)

0.7798165137614679

# Combining with first innings results
(obtained using best model-team embedding+batsman embedding+selected features)

In [24]:
first_innings_emb_prediction_train = pd.read_csv('csv_data/first_innings_embedding_prediction_train.csv')
first_innings_emb_prediction_test = pd.read_csv('csv_data/first_innings_embedding_prediction_test.csv')

In [25]:
second_inn_feature_test_copy = pd.DataFrame(second_inn_feature_test)
second_inn_feature_train_copy = pd.DataFrame(second_inn_feature_train)

In [26]:
second_inn_feature_test_copy = second_inn_feature_test_copy.merge(first_innings_emb_prediction_test[['match_id','predicted_first_innings_runs']],
                                   on='match_id',
                                   how='inner')
second_inn_feature_train_copy = second_inn_feature_train_copy.merge(first_innings_emb_prediction_train[['match_id','predicted_first_innings_runs']],
                                   on='match_id',
                                   how='inner')

In [27]:
second_inn_feature_test_copy['target_score'] = second_inn_feature_test_copy['predicted_first_innings_runs']
second_inn_feature_train_copy['target_score'] = second_inn_feature_train_copy['predicted_first_innings_runs']

In [28]:
x_train_copy = second_innings_scaler.transform(second_inn_feature_train_copy[second_innings_feature_columns])
y_train_copy = second_inn_feature_train_copy[second_innings_target]

x_test_copy = second_innings_scaler.transform(second_inn_feature_test_copy[second_innings_feature_columns])
y_test_copy = second_inn_feature_test_copy[second_innings_target]

In [29]:
y_train_copy_predict_lgr = lgr.predict(x_train_copy)
y_test_copy_predict_lgr = lgr.predict(x_test_copy)

In [30]:
accuracy_score(y_train_copy,y_train_copy_predict_lgr),accuracy_score(y_test_copy,y_test_copy_predict_lgr)

(0.8100436681222707, 0.7889908256880734)

# playing with probability

In [150]:
y_train_predict_lgr_proba = lgr.predict_proba(x_train)
y_test_predict_lgr_proba = lgr.predict_proba(x_test)

In [151]:
y_train_predict_lgr_proba[1],y_train_predict_lgr[1]

(array([0.63285163, 0.36714837]), 0)

In [152]:
y_train_predict_lgr_proba.shape

(936, 2)

In [153]:
ananlysis_df_train = pd.DataFrame()
ananlysis_df_test = pd.DataFrame()

ananlysis_df_train['actual']=y_train['win']
ananlysis_df_train['predicted_probabilty_loose']=y_train_predict_lgr_proba[:,0]
ananlysis_df_train['predicted_probabilty_win']=y_train_predict_lgr_proba[:,1]

ananlysis_df_test['actual']=y_test['win']
ananlysis_df_test['predicted_probabilty_loose']=y_test_predict_lgr_proba[:,0]
ananlysis_df_test['predicted_probabilty_win']=y_test_predict_lgr_proba[:,1]

In [154]:
ananlysis_df_train['proba_dif']=abs(ananlysis_df_train['predicted_probabilty_loose']-ananlysis_df_train['predicted_probabilty_win'])
ananlysis_df_test['proba_dif']=abs(ananlysis_df_test['predicted_probabilty_loose']-ananlysis_df_test['predicted_probabilty_win'])

In [155]:
ananlysis_df_train.shape,ananlysis_df_test.shape

((936, 4), (141, 4))

In [156]:
ananlysis_df_train[ananlysis_df_train['proba_dif']<=0.20].shape,ananlysis_df_test[ananlysis_df_test['proba_dif']<=0.20].shape



((125, 4), (14, 4))

In [157]:
ananlysis_df_train=ananlysis_df_train[ananlysis_df_train['proba_dif']>0.10]
ananlysis_df_test = ananlysis_df_test[ananlysis_df_test['proba_dif']>0.10]

In [158]:
ananlysis_df_train['prediction']=round(ananlysis_df_train['predicted_probabilty_win']).astype(int)
ananlysis_df_test['prediction']=round(ananlysis_df_test['predicted_probabilty_win']).astype(int)

In [159]:
ananlysis_df_train[ananlysis_df_train['prediction'] == ananlysis_df_train['actual']].shape[0]/ananlysis_df_train.shape[0]


0.8727064220183486

In [160]:
ananlysis_df_test[ananlysis_df_test['prediction'] == ananlysis_df_test['actual']].shape[0]/ananlysis_df_test.shape[0]

0.8409090909090909

In [20]:
import pickle

In [22]:
pickle.dump(lgr,open('second_innings_model_lrg.pkl','wb'))
pickle.dump(second_innings_scaler,open('second_innings_scaler.pkl','wb'))

# XGBoost

In [36]:
import xgboost as xg
from xgboost import XGBClassifier

In [65]:
xg_model = XGBClassifier(max_depth=3,n_estimators=40)
xg_model.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=40, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [66]:
y_train_predict_xg = xg_model.predict(x_train)
y_test_predict_xg = xg_model.predict(x_test)

In [67]:
accuracy_score(y_train,y_train_predict_xg),accuracy_score(y_test,y_test_predict_xg)

(0.9348290598290598, 0.8014184397163121)

# Random Forest

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [73]:
rfc = RandomForestClassifier(max_depth=3)

In [74]:
rfc.fit(x_train,y_train)

  rfc.fit(x_train,y_train)


RandomForestClassifier(max_depth=3)

In [75]:
y_train_predict_rfc = rfc.predict(x_train)
y_test_predict_rfc = rfc.predict(x_test)

In [76]:
accuracy_score(y_train,y_train_predict_rfc),accuracy_score(y_test,y_test_predict_rfc)

(0.8322649572649573, 0.7304964539007093)

# SVM

In [78]:
from sklearn import svm


In [115]:
clf = svm.SVC(C=0.7,gamma='auto',kernel='linear')
clf.fit(x_train,y_train)

  return f(**kwargs)


SVC(C=0.7, gamma='auto', kernel='linear')

In [116]:
y_train_predict_svm = clf.predict(x_train)
y_test_predict_svm = clf.predict(x_test)

In [117]:
accuracy_score(y_train,y_train_predict_svm),accuracy_score(y_test,y_test_predict_svm)

(0.8482905982905983, 0.8226950354609929)

# statsmodel

In [137]:
import statsmodels.api as sm 

In [138]:
log_reg = sm.Logit(y_train, sm.add_constant(x_train)).fit() 

         Current function value: 0.363850
         Iterations: 35


  warn("Maximum Likelihood optimization failed to converge. "


In [139]:
y_train_predict_stats = np.round(np.array(log_reg.predict(sm.add_constant(x_train))))
y_test_predict_stats = np.round(np.array(log_reg.predict(sm.add_constant(x_test))))

In [140]:
accuracy_score(y_train,y_train_predict_stats),accuracy_score(y_test,y_test_predict_stats)

(0.8461538461538461, 0.8297872340425532)

In [141]:
log_reg.summary()

  bse_ = np.sqrt(np.diag(self.cov_params()))
  cond2 = cond0 & (x <= _a)


0,1,2,3
Dep. Variable:,win,No. Observations:,936.0
Model:,Logit,Df Residuals:,919.0
Method:,MLE,Df Model:,16.0
Date:,"Mon, 14 Dec 2020",Pseudo R-squ.:,0.4749
Time:,13:42:07,Log-Likelihood:,-340.56
converged:,False,LL-Null:,-648.57
Covariance Type:,nonrobust,LLR p-value:,9.143e-121

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.2652,0.100,2.664,0.008,0.070,0.460
x1,-0.9447,0.166,-5.685,0.000,-1.270,-0.619
x2,0.1016,0.140,0.726,0.468,-0.173,0.376
x3,0.0373,,,,,
x4,0.1613,0.348,0.463,0.643,-0.521,0.843
x5,0.0571,,,,,
x6,0.0958,,,,,
x7,0.3765,5.92e+06,6.36e-08,1.000,-1.16e+07,1.16e+07
x8,0.3464,0.362,0.957,0.339,-0.363,1.056


In [142]:
#constant,x1,x15,x16,x17,x19

# Naive Bayes

In [155]:
from sklearn.naive_bayes import GaussianNB

In [157]:
nb = GaussianNB()
nb.fit(x_train,y_train)

  return f(**kwargs)


GaussianNB()

In [158]:
y_train_predict_nb = nb.predict(x_train)
y_test_predict_nb = nb.predict(x_test)

In [159]:
accuracy_score(y_train,y_train_predict_nb),accuracy_score(y_test,y_test_predict_nb)

(0.7510683760683761, 0.7092198581560284)