In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import mean_absolute_error,mean_squared_error
import pickle

from keras.models import model_from_json

# LinearRegression

In [2]:
def mape(y_true,y_predict):
    return np.sum((np.abs(y_true-y_predict)/y_true)*100)/len(y_true)

In [3]:
custom_date_parser = lambda x: datetime.strptime(x, "%Y-%m-%d")
feature_df = pd.read_csv('csv_data/feature_first_innings.csv',parse_dates=['match_date'],date_parser=custom_date_parser)

In [4]:
feature_df['team'].unique()

array(['Australia', 'Sri Lanka', 'Zimbabwe', 'South Africa', 'India',
       'New Zealand', 'England', 'Pakistan', 'West Indies', 'Bangladesh',
       'Ireland', 'Scotland', 'Afghanistan', 'United Arab Emirates',
       'Hong Kong', 'Papua New Guinea', 'United States of America'],
      dtype=object)

In [5]:
# cutoff_start_year = '2014'
# cutoff_start_date = datetime.strptime(cutoff_start_year+'-01-01','%Y-%m-%d')
# #feature_df=feature_df[feature_df['match_date']>cutoff_start_date]
# feature_df=feature_df[feature_df['team']=='India']
# feature_df=feature_df[feature_df['match_date']>cutoff_start_date]
#feature_df = feature_df[feature_df['noise']==False]

In [6]:
feature_df.columns

Index(['match_id', 'match_date', 'team', 'opponent', 'location', 'team_score',
       'opponent_score', 'opponent_base', 'opponent_trend',
       'opponent_trend_predict', 'opponent_mean', 'location_base',
       'location_trend', 'location_trend_predict', 'location_mean',
       'current_base', 'current_trend', 'current_trend_predict',
       'current_mean', 'batsman_mean', 'batsman_max', 'bowler_mean',
       'bowler_max', 'is_train', 'noise', 'runs_scored'],
      dtype='object')

In [7]:
feature_df.dropna(inplace=True)
feature_df.isnull().sum()

match_id                  0
match_date                0
team                      0
opponent                  0
location                  0
team_score                0
opponent_score            0
opponent_base             0
opponent_trend            0
opponent_trend_predict    0
opponent_mean             0
location_base             0
location_trend            0
location_trend_predict    0
location_mean             0
current_base              0
current_trend             0
current_trend_predict     0
current_mean              0
batsman_mean              0
batsman_max               0
bowler_mean               0
bowler_max                0
is_train                  0
noise                     0
runs_scored               0
dtype: int64

In [8]:
# feature_columns = ['team_score',
#        'opponent_score', 'opponent_base', 'opponent_trend',
#        'opponent_trend_predict', 'opponent_mean', 'location_base',
#        'location_trend', 'location_trend_predict', 'location_mean',
#        'current_base', 'current_trend', 'current_trend_predict',
#        'current_mean', 'batsman_mean', 'batsman_max', 'bowler_mean',
#        'bowler_max']
#pvalue selection : selected x1,x2,x7,x10,x15,x16,x17

feature_columns = ['team_score', 'opponent_score', 'location_base', 'location_mean','batsman_mean', 'batsman_max', 'bowler_mean']
       


target_column = ['runs_scored']

In [9]:
feature_df_train = feature_df[feature_df['is_train']==True]
feature_df_test = feature_df[feature_df['is_train']==False]

In [10]:
#enc_map = pickle.load(open('country_enc_map.pkl','rb'))
# pickle.dump(country_enc_map,open('country_enc_map.pkl','wb'))
# pickle.dump(loc_enc_map,open('loc_enc_map.pkl','wb'))
# pickle.dump(enc_country_map,open('enc_country_map.pkl','wb'))
# pickle.dump(enc_loc_map,open('enc_loc_map.pkl','wb'))

country_enc_map=pickle.load(open('country_enc_map.pkl','rb'))
loc_enc_map=pickle.load(open('loc_enc_map.pkl','rb'))

In [49]:
#group_enc_model=load_model()

In [11]:
def load_model(model_name):
    # load json and create model
    json_file = open(model_name+'.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(model_name+".h5")
    return loaded_model

In [60]:
group_encode_model = load_model('group_encode_model_V2')

In [61]:
feature_df_train.head()

Unnamed: 0,match_id,match_date,team,opponent,location,team_score,opponent_score,opponent_base,opponent_trend,opponent_trend_predict,...,current_trend,current_trend_predict,current_mean,batsman_mean,batsman_max,bowler_mean,bowler_max,is_train,noise,runs_scored
0,351684,2009-01-16,Australia,South Africa,Melbourne Cricket Ground,3.150979,2.903334,377.0,0.0,377.0,...,-17.8,216.2,269.6,2.813601,3.685827,3.783969,4.455548,True,False,271
1,351685,2009-01-18,Australia,South Africa,Hobart,3.150979,2.903334,483.0,-106.0,165.0,...,-16.5,219.7,269.2,2.843284,3.685827,3.557268,4.455548,True,False,249
2,385749,2009-01-20,Sri Lanka,Pakistan,Karachi,2.902562,3.531763,302.0,0.0,302.0,...,-24.2,104.4,177.0,3.100178,5.01592,5.099186,5.450619,True,False,219
3,385750,2009-01-21,Sri Lanka,Pakistan,Karachi,2.902562,3.531763,385.0,-83.0,136.0,...,13.0,205.2,166.2,3.200293,5.01592,5.099186,5.450619,True,False,290
4,378759,2009-01-23,Zimbabwe,Bangladesh,Mirpur,1.294306,0.797861,205.0,0.0,205.0,...,-18.8,120.2,176.6,2.046474,3.238383,2.749104,3.034771,True,False,119


In [62]:
feature_df_train.shape

(714, 26)

In [63]:
team_loc_list = list(loc_enc_map.keys())
#df.drop(df[df['Age'] < 25].index, inplace = True) 
feature_df_train=feature_df_train[feature_df_train['location'].isin(team_loc_list)]


In [64]:
feature_df_test=feature_df_test[feature_df_test['location'].isin(team_loc_list)]

In [65]:
team_oh_list=[]
opponent_oh_list=[]
loc_oh_list=[]
for ind in range(feature_df_train.shape[0]):
    team_oh_list.append(country_enc_map[feature_df_train.iloc[ind]['team']])
    opponent_oh_list.append(country_enc_map[feature_df_train.iloc[ind]['opponent']])
    loc_oh_list.append(loc_enc_map[feature_df_train.iloc[ind]['location']])

team_oh = np.stack(team_oh_list)
opponent_oh = np.stack(opponent_oh_list)
loc_oh = np.stack(loc_oh_list)

group_enc_train = group_encode_model.predict([team_oh,opponent_oh,loc_oh])


In [66]:
team_oh_list_test=[]
opponent_oh_list_test=[]
loc_oh_list_test=[]
for ind in range(feature_df_test.shape[0]):
    team_oh_list_test.append(country_enc_map[feature_df_test.iloc[ind]['team']])
    opponent_oh_list_test.append(country_enc_map[feature_df_test.iloc[ind]['opponent']])
    loc_oh_list_test.append(loc_enc_map[feature_df_test.iloc[ind]['location']])

team_oh_test = np.stack(team_oh_list_test)
opponent_oh_test = np.stack(opponent_oh_list_test)
loc_oh_test = np.stack(loc_oh_list_test)

group_enc_test = group_encode_model.predict([team_oh_test,opponent_oh_test,loc_oh_test])



In [67]:
#group_enc_train[0]

In [68]:
scaler = StandardScaler()
#y_scaler = StandardScaler()
x_train_features = np.array(feature_df_train[feature_columns])
x_train_full = np.concatenate([x_train_features,group_enc_train],axis=1)
x_train = scaler.fit_transform(x_train_full)
y_train = np.array(feature_df_train[target_column])
#y_train_sc = y_scaler.fit_transform(feature_df_train[target_column])

x_test_features = np.array(feature_df_test[feature_columns])
x_test_full = np.concatenate([x_test_features,group_enc_test],axis=1)
x_test = scaler.fit_transform(x_test_full)
y_test = np.array(feature_df_test[target_column])
#y_test_sc = y_scaler.transform(feature_df_test[target_column])




# x_train = np.array(feature_df_train[feature_columns])
# y_train = np.array(feature_df_train[target_column])

# x_test = np.array(feature_df_test[feature_columns])
# y_test = np.array(feature_df_test[target_column])




In [69]:
lr = LinearRegression()
lr.fit(x_train,y_train)

LinearRegression()

In [70]:
y_train_predict = lr.predict(x_train)
y_test_predict = lr.predict(x_test)

In [71]:
mean_absolute_error(y_train,y_train_predict)

40.27986618270431

In [72]:
mean_absolute_error(y_test,y_test_predict)

44.90991908413388

In [73]:
mape(np.array(y_train),np.array(y_train_predict))

19.019358180580642

In [74]:
mape(np.array(y_test),np.array(y_test_predict))

20.834507566792386

In [75]:
mean_squared_error(y_test,y_test_predict)

3252.4563958807944

In [70]:
pickle.dump(lr,open('first_innings_linear_regression_enc_V2.pkl','wb'))

In [76]:
pickle.dump(scaler,open('first_innings_linear_regression_scaler_enc_V2.pkl','wb'))

In [77]:
result_df_train=pd.DataFrame(feature_df_train)
result_df_test =pd.DataFrame(feature_df_test)
result_df_train['first_innings_prediction']=y_train_predict
result_df_test['first_innings_prediction']=y_test_predict

result_df_train.to_csv('csv_data/result_df_tran.csv',index=False)
result_df_test.to_csv('csv_data/result_df_test.csv',index=False)

In [74]:
#y_test

In [76]:
# compare_df = pd.DataFrame()
# compare_df['actual']=y_test['runs_scored']
# compare_df['predicted']=y_test_predict.reshape(-1)

In [77]:
#compare_df['percentage_error']=abs(compare_df['actual']-compare_df['predicted'])/compare_df['actual']

In [79]:
#compare_df

# lasso

In [80]:
from sklearn import linear_model

In [85]:
las = linear_model.Lasso(alpha=0.3)
las.fit(x_train,y_train)

Lasso(alpha=0.3)

In [83]:
y_test_predict_ls = las.predict(x_test)
y_train_predict_ls = las.predict(x_train)

In [87]:
mape(y_train,y_train_predict)

18.75695495696914

In [94]:
mape(y_test,y_test_predict_ls.reshape(-1,1))

20.716980239553493

In [95]:
y_test.shape,y_test_predict_ls.shape

((109, 1), (109,))

# XGBOOST

In [86]:
import xgboost as xg

In [87]:
train_dmatrix = xg.DMatrix(data = x_train, label = y_train) 
test_dmatrix = xg.DMatrix(data = x_test, label = y_test) 

In [88]:
param = {"booster":"gblinear", "objective":"reg:linear"} 

In [89]:
xgb_r = xg.train(params = param, dtrain = train_dmatrix, num_boost_round = 10) 




In [90]:
y_test_predict_xg = xgb_r.predict(test_dmatrix)
y_train_predict_xg = xgb_r.predict(train_dmatrix)

In [91]:
# y_train_predict_dt = dt.predict(x_train)
# y_test_predict_dt = dt.predict(x_test)

In [92]:
mean_absolute_error(np.array(y_test),np.array(y_test_predict_xg))

45.5171721572176

In [93]:
mape(np.array(y_test).reshape(-1),y_test_predict_xg)

22.65944162359071

In [94]:
mape(np.array(y_train).reshape(-1),y_train_predict_xg)

20.25278265779196

In [95]:
#y_test_predict_xg

# Randomforest

In [96]:
from sklearn.ensemble import RandomForestRegressor

In [97]:
rf = RandomForestRegressor(max_depth=8,n_estimators=50,criterion='mae')

In [98]:
rf.fit(x_train,y_train)

  rf.fit(x_train,y_train)


RandomForestRegressor(criterion='mae', max_depth=8, n_estimators=50)

In [99]:
y_train_predict_rf=rf.predict(x_train)
y_test_predict_rf= rf.predict(x_test)

In [100]:
mape(np.array(y_test).reshape(-1),y_test_predict_rf)

23.281510548596753

In [101]:
mape(np.array(y_train).reshape(-1),y_train_predict_rf)

13.462300581977942

# statsmodel

In [102]:
import statsmodels.api as sm

In [103]:
model = sm.OLS(y_train, sm.add_constant(x_train)).fit()

In [104]:
y_test_predicted_sm=model.predict(sm.add_constant(x_test))

In [105]:
model.summary()

0,1,2,3
Dep. Variable:,runs_scored,R-squared:,0.287
Model:,OLS,Adj. R-squared:,0.272
Method:,Least Squares,F-statistic:,18.98
Date:,"Sat, 12 Dec 2020",Prob (F-statistic):,7.27e-43
Time:,20:50:43,Log-Likelihood:,-3916.2
No. Observations:,723,AIC:,7864.0
Df Residuals:,707,BIC:,7938.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,255.8728,2.048,124.907,0.000,251.851,259.895
x1,-9.0921,3.352,-2.712,0.007,-15.673,-2.511
x2,7.4274,3.097,2.398,0.017,1.347,13.507
x3,1.3299,4.247,0.313,0.754,-7.008,9.668
x4,2.0869,7.099,0.294,0.769,-11.850,16.024
x5,0.4358,4.262,0.102,0.919,-7.931,8.803
x6,1.7270,1.567,1.102,0.271,-1.349,4.803
x7,12.4907,5.957,2.097,0.036,0.795,24.187
x8,14.4552,8.949,1.615,0.107,-3.115,32.025

0,1,2,3
Omnibus:,12.311,Durbin-Watson:,1.85
Prob(Omnibus):,0.002,Jarque-Bera (JB):,13.775
Skew:,-0.246,Prob(JB):,0.00102
Kurtosis:,3.464,Cond. No.,9120000000000000.0


selected x1,x2,x7,x10,x15,x16,x17

In [106]:
mape(np.array(y_test).reshape(-1),y_test_predicted_sm)


22.039536421362772

In [107]:
mean_absolute_error(np.array(y_test).reshape(-1),y_test_predicted_sm)

45.43723924019558

# with PCA

In [108]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

In [109]:
pca_scaler = StandardScaler()

In [110]:
pca = PCA(n_components=2)

In [111]:
x_train_pca=pca.fit_transform(pca_scaler.fit_transform(x_train))

In [112]:
lr_pca = LinearRegression()
lr_pca.fit(x_train_pca,y_train)

LinearRegression()

In [113]:
y_test_predict_pca =lr_pca.predict(pca.transform(pca_scaler.transform(x_test)))

In [114]:
mape(np.array(y_test),y_test_predict_pca)

24.557551133830028

In [115]:
#y_test_predict_pca.shape

In [116]:
#np.array(y_test).reshape(-1).shape

# SVM

In [117]:
from sklearn.svm import SVR

In [118]:
svr = SVR(C=0.5, epsilon=0.01)

In [119]:
svr.fit(x_train,y_train)

  return f(**kwargs)


SVR(C=0.5, epsilon=0.01)

In [120]:
y_test_predict_svr =svr.predict(x_test)

In [121]:
mape(np.array(y_test).reshape(-1),y_test_predict_svr)

25.846783163229134

# polynomial regression

In [122]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

In [132]:
poly = PolynomialFeatures(3)

In [133]:
x_train_poly = poly.fit_transform(x_train)

In [134]:
lr_poly = LinearRegression()

In [135]:
lr_poly.fit(x_train_poly,y_train)

LinearRegression()

In [136]:
y_test_predict_poly = lr_poly.predict(poly.transform(x_test))
y_train_predict_poly = lr_poly.predict(poly.transform(x_train))

In [137]:
mape(np.array(y_test),y_test_predict_poly)

187.18202146462477

In [138]:
mape(np.array(y_train),y_train_predict_poly)

3.0130001924180565

In [139]:
compare_df = pd.DataFrame()
compare_df['actual']=np.array(y_test).reshape(-1)
compare_df['predict']=np.array(y_test_predict_poly)

In [140]:
compare_df

Unnamed: 0,actual,predict
0,371,606.328315
1,319,489.073878
2,364,343.832440
3,266,344.225914
4,203,369.945787
...,...,...
104,294,520.875259
105,231,229.444440
106,302,295.823723
107,374,650.466732


In [141]:
np.array(y_test).reshape(-1).shape

(109,)

# Comparing trend predictions

In [142]:
feature_df.columns

Index(['match_id', 'match_date', 'team', 'opponent', 'location', 'team_score',
       'opponent_score', 'opponent_base', 'opponent_trend',
       'opponent_trend_predict', 'opponent_mean', 'location_base',
       'location_trend', 'location_trend_predict', 'location_mean',
       'current_base', 'current_trend', 'current_trend_predict',
       'current_mean', 'batsman_mean', 'batsman_max', 'bowler_mean',
       'bowler_max', 'is_train', 'noise', 'runs_scored'],
      dtype='object')

In [143]:
feature_df[['opponent_trend_predict','runs_scored']]

Unnamed: 0,opponent_trend_predict,runs_scored
0,377.0,271
1,165.0,249
2,302.0,219
3,136.0,290
4,205.0,119
...,...,...
827,245.3,294
828,467.2,231
829,389.4,302
830,311.8,374


In [144]:
opponent_trend_predict = np.array(feature_df['opponent_trend_predict'])
current_trend_predict = np.array(feature_df['current_trend_predict'])
location_trend_predict = np.array(feature_df['location_trend_predict'])
runs_scored = np.array(feature_df['runs_scored'])

In [145]:
mape(runs_scored,opponent_trend_predict),mape(runs_scored,current_trend_predict),mape(runs_scored,location_trend_predict)

(33.14884160652851, 31.552384325510975, 36.28616640575539)

In [146]:
max_predict = np.max(np.array(feature_df[['opponent_trend_predict','current_trend_predict','location_trend_predict']]),axis=1)
min_predict = np.min(np.array(feature_df[['opponent_trend_predict','current_trend_predict','location_trend_predict']]),axis=1)
mean_predict = np.mean(np.array(feature_df[['opponent_trend_predict','current_trend_predict','location_trend_predict']]),axis=1)


In [147]:
mape(runs_scored,max_predict),mape(runs_scored,min_predict),mape(runs_scored,mean_predict)

(39.83091281447402, 32.73509820997395, 28.048566843159747)