### Import Libraries

In [14]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import numpy as np
import os.path
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import pickle
from joblib import dump, load

### Load pre-computed stats

In [15]:
# Load the pre-computed per-innings statistics. The directory is resolved
# relative to the current working directory and assembled with os.path.join
# so the notebook is not tied to Windows path separators.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
stats_dir = os.path.join(os.path.abspath(''), '..', 'Derived stats')
df_train = pd.read_pickle(os.path.join(stats_dir, 'match_stats_train.pkl'))
df_test = pd.read_pickle(os.path.join(stats_dir, 'match_stats_test.pkl'))

# Drop outlier innings: only innings with more than 75 runs are kept, on the
# author's assumption that teams in a normal match score above that.
df_train = df_train[df_train['Runs'] > 75]
df_test = df_test[df_test['Runs'] > 75]


def _underscored(label):
    """Return `label` with spaces replaced by underscores; non-strings pass through."""
    return label.replace(' ', '_') if isinstance(label, str) else label


# Each frame is renamed from its own labels, so a differing column order in
# the test pickle cannot silently mislabel its columns. `rename` also returns
# a fresh frame, which keeps later column assignments off a filtered slice.
df_train = df_train.rename(columns=_underscored)
df_test = df_test.rename(columns=_underscored)

# The outlier filter can remove just one innings of a match. Keep only test
# matches that still have exactly two rows. Rows without a Match_ID get no
# count and are dropped as well.
innings_per_match = df_test['Match_ID'].map(df_test['Match_ID'].value_counts())
df_test = df_test[innings_per_match == 2].copy()

### Computing additional features

#### Adding feature: Runs scored against the same opponent in the previous match (average over their matches when there is no earlier one)

In [17]:
# Feature "Prev Run": runs scored by the same team against the same opponent
# in an earlier match. `.values[-1]` takes the last matching row in frame
# order, which is the most recent match only if the frames are sorted by
# Date -- TODO confirm the pickles are date-ordered.
# The loop variable names are referenced from the query strings via `@name`.

prev_run = []
for Date, Team_Name, Opp_Team in df_train[['Date', 'Team_Name', 'Opp_Team']].itertuples(index=False):
    earlier_runs = df_train.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team & Date < @Date")['Runs']
    if len(earlier_runs) == 0:
        # No earlier meeting: fall back to the mean over ALL meetings of the pair.
        # NOTE(review): that mean includes the current row's own Runs and any
        # later matches, so the feature leaks the target for these rows.
        prev_run.append(df_train.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team")['Runs'].mean())
    else:
        # Reuse the query result instead of running the same query a second time.
        prev_run.append(earlier_runs.values[-1])
df_train['Prev Run'] = prev_run

prev_run = []
for Date, Team_Name, Opp_Team in df_test[['Date', 'Team_Name', 'Opp_Team']].itertuples(index=False):
    earlier_runs = df_test.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team & Date < @Date")['Runs']
    if len(earlier_runs) == 0:
        # Nothing earlier in the test period: look the pair up in the training period.
        earlier_runs = df_train.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team & Date < @Date")['Runs']
    # 0 marks a pair with no earlier meeting in either frame.
    prev_run.append(earlier_runs.values[-1] if len(earlier_runs) else 0)
df_test['Prev Run'] = prev_run

#### Adding feature: Average runs scored at a venue to date, irrespective of the teams

In [18]:
# Feature "Prev Venue Run": mean runs per innings in the same City before the
# match date, regardless of which teams played.
# The loop variable names are referenced from the query strings via `@name`.

prev_run = []
for Date, city in df_train[['Date', 'City']].itertuples(index=False):
    earlier = df_train.query("City == @city & Date < @Date")
    if len(earlier) == 0:
        # First match at this venue: fall back to the mean over every match there.
        # NOTE(review): that mean includes the current row and later matches,
        # so it leaks information from the future for these rows.
        all_at_venue = df_train.query("City == @city")
        if len(all_at_venue) == 0:
            prev_run.append(0)
        else:
            prev_run.append(all_at_venue['Runs'].mean())
        continue
    prev_run.append(earlier['Runs'].mean())
df_train['Prev Venue Run'] = prev_run

prev_run = []
for Date, city in df_test[['Date', 'City']].itertuples(index=False):
    # Venue history is pooled over the earlier test rows and the training rows.
    df_te = df_test.query("City == @city & Date < @Date")
    df_tr = df_train.query("City == @city & Date < @Date")
    n_earlier = len(df_te) + len(df_tr)
    if n_earlier == 0:
        # A venue with no earlier match used to produce 0/0 = NaN here; use the
        # same 0 fallback as the training branch so the model never sees NaN.
        prev_run.append(0)
        continue
    total_runs = df_te['Runs'].sum(axis = 0, skipna = True) + df_tr['Runs'].sum(axis = 0, skipna = True)
    prev_run.append(total_runs / n_earlier)
df_test['Prev Venue Run'] = prev_run

### Print data before model fitting

In [40]:
# Preview the first five rows of the training frame (head() defaults to five).
df_train.head()

Unnamed: 0,Match ID,Date,Team Name,Opp Team,Innnings,City,Past lead,Target,Avg Bat1,Curr Bat1,...,Avg Bat10,Curr Bat10,Avg Bat11,Curr Bat11,Bowl1,Bowl2,Bowl3,Bowl4,Bowl5,Runs
0,175,2010-03-12,Kolkata Knight Riders,Deccan Chargers,1,Mumbai,0,0,14.125,11.6,...,0.0,0.0,0,0,6.729167,7.833333,8.574074,9.125,10.666667,161
1,175,2010-03-12,Deccan Chargers,Kolkata Knight Riders,2,Mumbai,1,161,31.033333,33.2,...,0.0,0.0,0,0,3.75,7.145833,7.322917,9.35,10.666667,150
2,176,2010-03-13,Mumbai Indians,Rajasthan Royals,1,Mumbai,0,0,28.269231,15.8,...,0.0,0.0,0,0,7.75,8.006667,8.490741,8.595238,8.0,212
3,176,2010-03-13,Rajasthan Royals,Mumbai Indians,2,Mumbai,1,212,28.391304,14.0,...,0.0,0.0,0,0,6.243056,8.0625,9.65,8.0,8.0,208
4,177,2010-03-13,Kings XI Punjab,Delhi Daredevils,1,Chandigarh,1,0,27.6,27.6,...,1.25,1.25,10,10,7.020833,7.208333,7.424242,8.0625,8.845238,142


In [41]:
# Preview the first five rows of the test frame (head() defaults to five).
df_test.head()

Unnamed: 0,Match ID,Date,Team Name,Opp Team,Innnings,City,Past lead,Target,Avg Bat1,Curr Bat1,...,Avg Bat10,Curr Bat10,Avg Bat11,Curr Bat11,Bowl1,Bowl2,Bowl3,Bowl4,Bowl5,Runs
0,1,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,1,Hyderabad,1,0,33.73,55.963474,...,0,0,0,0,7.033333,8.236111,8.958333,10.8,8.0,207
1,1,2017-04-05,Royal Challengers Bangalore,Sunrisers Hyderabad,2,Hyderabad,0,207,37.923077,41.6,...,10,10,10,10,7.130952,7.435185,7.4375,7.440476,7.75,172
2,2,2017-04-06,Mumbai Indians,Rising Pune Supergiant,1,Pune,0,0,19.27,20.8,...,0,0,0,0,6.5,7.0,7.570833,8.364583,8.640625,184
3,2,2017-04-06,Rising Pune Supergiant,Mumbai Indians,2,Pune,0,184,30.05618,27.4,...,0,0,0,0,6.25,8.416667,8.875,8.983333,9.105263,187
4,3,2017-04-07,Gujarat Lions,Kolkata Knight Riders,1,Rajkot,1,0,10.0,10.0,...,0,0,0,0,6.1,8.006667,8.526882,8.535714,9.25,183


### Selecting Features to train the model

In [19]:
# Keep only the first-innings rows of each frame. 'Innnings' (three n's) is
# the column label as it appears in the data.
first_innings = 1
df_train_inning_1 = df_train.loc[df_train['Innnings'] == first_innings]
df_test_inning_1 = df_test.loc[df_test['Innnings'] == first_innings]

In [20]:
# Second-innings rows of the test set and the feature columns fed to the model:
# the eleven batting averages plus the two history features and 'Past_lead'.
second_innings_mask = df_test['Innnings'] == 2
df_inning_2 = df_test[second_innings_mask]
inning_2_features = ['Avg_Bat{}'.format(pos) for pos in range(1, 12)] + ['Prev Run', 'Prev Venue Run', 'Past_lead']
inning_2_input = df_inning_2[inning_2_features]

In [21]:
# Model inputs: batting averages for positions 1-11, the two history features
# and 'Past_lead'. The target is the innings total, kept as a one-column frame.
batting_cols = ['Avg_Bat%d' % pos for pos in range(1, 12)]
cols = batting_cols + ['Prev Run', 'Prev Venue Run', 'Past_lead']

X_train, y_train = df_train_inning_1[cols], df_train_inning_1[['Runs']]
X_test, y_test = df_test_inning_1[cols], df_test_inning_1[['Runs']]

### Random Forest Regressor

In [22]:
# Tune the forest with a 5-fold cross-validated grid search over tree depth,
# features considered per split and number of trees.
param_dict = {
    'max_depth': list(range(2, 7)),
    'max_features': list(range(3, 11)),
    'n_estimators': list(range(500, 751, 50)),
}
# Alternative grid with a denser n_estimators sweep, kept for reference:
#param_dict = {'max_depth':[2,3,4,5,6], 'max_features':[3,4,5,6,7,8,9,10], 'n_estimators':np.linspace(100,750,num = 20, dtype = int)}
randomRegressionModel = RandomForestRegressor(random_state=0)
# NOTE(review): `iid` was removed in scikit-learn 0.24, so this call raises
# TypeError on current releases. It is kept because the recorded outputs come
# from an older release where dropping it would change the CV scoring.
randomRegressionModel_cv = GridSearchCV(
    estimator=randomRegressionModel,
    param_grid=param_dict,
    cv=5,
    iid=False,
    n_jobs=-1,
)
# The target frame is flattened to 1-D, the shape the regressor expects.
randomRegressionModel_cv.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=0,
                                             verbose=0, warm_start=False),
             iid=False, n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4, 5, 6],
       

In [23]:
# RMSE of the grid-searched model on the rows it was fitted on.
predictedRunsTrain = randomRegressionModel_cv.predict(X_train)
train_mse = mean_squared_error(y_train, predictedRunsTrain)
print('Train random regression model RMSE:', np.sqrt(train_mse))

Train random regression model RMSE: 17.801406265857686


In [24]:
# RMSE of the grid-searched model on the held-out first-innings rows.
predictedRunsTest = randomRegressionModel_cv.predict(X_test)
test_mse = mean_squared_error(y_test, predictedRunsTest)
print('Test random regression Linear model RMSE:', np.sqrt(test_mse))

Test random regression Linear model RMSE: 22.043061471205007


In [25]:
# Display the estimator the grid search selected, with its hyper-parameters.
randomRegressionModel_cv.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
                      max_features=3, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=750,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

#### Best model parameters: max_features = 3, max_depth = 6, n_estimators = 750

In [26]:
# Fit a standalone forest with the hyper-parameters chosen by the grid search
# and report its RMSE on the train and test rows.
# Previously this cell built the model with max_depth=4 / n_estimators=600,
# which did not match the reported best parameters, and then scored
# `randomRegressionModel_cv` instead, so the final model was never evaluated.
randomRegressionModel_final = RandomForestRegressor(random_state=0, **randomRegressionModel_cv.best_params_)
randomRegressionModel_final.fit(X_train, y_train.values.ravel())

predictedRunsTrain = randomRegressionModel_final.predict(X_train)
print('Train random regression model RMSE:', np.sqrt(mean_squared_error(y_train, predictedRunsTrain)))

predictedRunsTest = randomRegressionModel_final.predict(X_test)
print('Test random regression Linear model RMSE:', np.sqrt(mean_squared_error(y_test, predictedRunsTest)))

Train random regression model RMSE: 17.801406265857686
Test random regression Linear model RMSE: 22.043061471205007


### Accuracy of winner prediction

In [27]:
# Build a lookup from match id to the innings (1 or 2) of the winning side.
# NOTE(review): this treats `team1` as the side that batted first and `team2`
# as the chasing side -- confirm against the matches.csv schema.
# Matches whose winner equals neither team (e.g. no result) get no entry.
matches = pd.read_csv('matches.csv')
matchId_winnerInning_dict = {}
match_columns = zip(matches['id'], matches['team1'], matches['team2'], matches['winner'])
for match_id, first_team, second_team, winning_team in match_columns:
    if winning_team == first_team:
        matchId_winnerInning_dict[match_id] = 1
    if winning_team == second_team:
        matchId_winnerInning_dict[match_id] = 2

In [30]:
# Predict both innings totals with the same grid-searched model and call the
# side with the higher predicted total the winner (innings 2 wins a tie).
# Innings 1 is predicted here rather than read from `predictedRunsTest`, so
# the result does not depend on which cell last rebound that name.
inning1_out = randomRegressionModel_cv.predict(X_test)
inning2_out = randomRegressionModel_cv.predict(inning_2_input)

# Predictions are paired by position, which assumes the first- and
# second-innings rows list the matches in the same order -- TODO confirm.
if len(inning1_out) != len(inning2_out):
    raise ValueError('First- and second-innings predictions differ in length: %d vs %d'
                     % (len(inning1_out), len(inning2_out)))

# Work on an explicit copy so that adding a column does not write to a slice
# of df_inning_2 (the source of the earlier SettingWithCopyWarning).
winner_df = df_inning_2[['Match_ID']].copy()
winner_df['Winner'] = np.where(inning1_out > inning2_out, 1, 2)

correct_count = 0
decided_count = 0
for match_id, predicted_winner in winner_df[['Match_ID', 'Winner']].itertuples(index=False):
    actual_winner = matchId_winnerInning_dict.get(match_id)
    if actual_winner is None:
        # No recorded winner for this match; it cannot be scored either way.
        continue
    decided_count += 1
    if predicted_winner == actual_winner:
        correct_count += 1

# Percentage of scored matches predicted correctly; NaN when none can be scored.
winPredictionAccuracy = float(correct_count * 100 / decided_count) if decided_count else float('nan')

print(winPredictionAccuracy)

74.07407407407408


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### Print actual and predicted scores

In [None]:
# Print the actual runs of every training row, one per line. Iterating the
# column directly avoids rebinding the builtin `id` to an unused loop variable.
# NOTE(review): df_train holds both innings, whereas predictedRunsTrain covers
# first-innings rows only, so the two printed lists do not line up row by row.
for runs in df_train['Runs']:
    print(runs)

In [None]:
# Print each predicted training total on its own line.
for predicted_total in predictedRunsTrain:
    print(predicted_total)