### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os.path
from tqdm import tqdm
import random
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score

### Load pre-computed stats

In [2]:
df_train = pd.read_pickle(os.path.abspath('') + '\\..\\Derived stats\\match_stats_train.pkl')
df_test = pd.read_pickle(os.path.abspath('') + '\\..\\Derived stats\\match_stats_test.pkl')

# delete outlier
""" Assuming, in most normal matches, teams 
score more than 75 runs on an average as shown
by past statistics. """
df_train = df_train[df_train['Runs'] > 75]
df_test = df_test[df_test['Runs'] > 75]

cols = df_train.columns
cols = cols.map(lambda x: x.replace(' ', '_') if isinstance(x, (str, 'utf-8')) else x)
df_train.columns = cols
df_test.columns = cols

# To remove the other inning for which outliers were removed
match_ids = df_test['Match_ID'].unique()
m_ids_toBeRemoved = []          
for m_id in match_ids:
    if len(df_test[df_test['Match_ID'] == m_id]) !=2:
        m_ids_toBeRemoved.append(m_id)

for m_id in m_ids_toBeRemoved:
    df_test = df_test[df_test['Match_ID'] != m_id]

### Computing additional features

#### Adding feature: Average runs scored against same opponent in previous matches

In [13]:
""" Previous run with same opponent """

prev_run = []
for Date, Team_Name, Opp_Team  in df_train[['Date','Team_Name', 'Opp_Team']].itertuples(index=False):
    df_t = df_train.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team & Date < @Date")['Runs']
    if len(df_t) == 0:
        prev_run.append(df_train.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team")['Runs'].mean())
        continue
    prev_run.append(df_train.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team & Date < @Date")['Runs'].values[-1])
df_train['Prev Run'] = prev_run

prev_run = []
for Date, Team_Name, Opp_Team  in df_test[['Date','Team_Name', 'Opp_Team']].itertuples(index=False):
    df_t = df_test.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team & Date < @Date")['Runs']
    if len(df_t) == 0:
        df_t_t = df_train.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team & Date < @Date")
        if len(df_t_t) == 0:
            prev_run.append(0)
        else:
            prev_run.append(df_t_t['Runs'].values[-1])
        continue
    prev_run.append(df_test.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team & Date < @Date")['Runs'].values[-1])
df_test['Prev Run'] = prev_run

#### Adding feature: Average runs scored in a venue till date irrespective of teams

In [14]:
""" Previous run on venue irrespective of team """

prev_run = []
for Date, city  in df_train[['Date', 'City']].itertuples(index=False):
    df_t = df_train.query("City == @city & Date < @Date")
    if len(df_t) == 0:
        df_t_t = df_train.query("City == @city")
        if len(df_t_t) == 0:
            prev_run.append(0)
        else:
            prev_run.append(df_t_t['Runs'].mean())
        continue
    prev_run.append(df_t['Runs'].mean())
df_train['Prev Venue Run'] = prev_run

prev_run = []
for Date, city  in df_test[['Date', 'City']].itertuples(index=False):
    df_te = df_test.query("City == @city & Date < @Date")
    df_tr = df_train.query("City == @city & Date < @Date")
    prev_run.append((df_te['Runs'].sum(axis = 0, skipna = True) + df_tr['Runs'].sum(axis = 0, skipna = True))/(len(df_te)+len(df_tr)))
df_test['Prev Venue Run'] = prev_run

### Print data before model fitting

In [None]:
df_train.head(5)

In [None]:
df_test.head(5)

### Selecting Features to train the model

In [28]:
df_train_inning_1 = df_train[df_train['Innnings'] == 1]
df_test_inning_1 = df_test[df_test['Innnings'] == 1]

In [29]:
df_inning_2 = df_test[df_test['Innnings'] == 2]
inning_2_input = df_inning_2[['Avg_Bat1', 'Avg_Bat2', 'Avg_Bat3', 'Avg_Bat4', 'Avg_Bat5', 'Avg_Bat6', 'Avg_Bat7', 'Avg_Bat8', 'Avg_Bat9', 'Avg_Bat10', 'Avg_Bat11','Prev Run', 'Prev Venue Run', 'Past_lead']]

In [30]:
X_train = df_train_inning_1[['Avg_Bat1', 'Avg_Bat2', 'Avg_Bat3', 'Avg_Bat4', 'Avg_Bat5', 'Avg_Bat6', 'Avg_Bat7', 'Avg_Bat8', 'Avg_Bat9', 'Avg_Bat10', 'Avg_Bat11', 'Prev Run', 'Prev Venue Run', 'Past_lead']]
y_train = df_train_inning_1[['Runs']]

X_test = df_test_inning_1[['Avg_Bat1', 'Avg_Bat2', 'Avg_Bat3', 'Avg_Bat4', 'Avg_Bat5', 'Avg_Bat6', 'Avg_Bat7', 'Avg_Bat8', 'Avg_Bat9', 'Avg_Bat10', 'Avg_Bat11','Prev Run', 'Prev Venue Run', 'Past_lead']]
y_test = df_test_inning_1[['Runs']]

### Support vector Regressor with linear kernel

In [31]:
svr_linear_grid_cv = GridSearchCV(estimator=SVR(kernel='linear'), param_grid={'C':np.logspace(-5, 5, num=10, base=2)}, cv= 5, iid=False, verbose=10, n_jobs = -1)
svr_linear_grid_cv.fit(X_train, y_train.values.ravel())

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0350s.) Setting batch_size=10.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of  50 | elapsed:    0.0s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   21.1s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated',
                           kernel='linear', max_iter=-1, shrinking=True,
                           tol=0.001, verbose=False),
             iid=False, n_jobs=-1,
             param_grid={'C': array([3.12500000e-02, 6.75037337e-02, 1.45816130e-01, 3.14980262e-01,
       6.80395000e-01, 1.46973449e+00, 3.17480210e+00, 6.85795186e+00,
       1.48139954e+01, 3.20000000e+01])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=10)

#### Model evaluation

In [32]:
print('Test RMSE:', np.sqrt(mean_squared_error(y_test, svr_linear_grid_cv.predict(X_test))))
print('Test R2:', r2_score(y_test, svr_linear_grid_cv.predict(X_test)))
print('Test Explained Variance:', explained_variance_score(y_test, svr_linear_grid_cv.predict(X_test)))
print()
print('Train RMSE:', np.sqrt(mean_squared_error(y_train, svr_linear_grid_cv.predict(X_train))))
print('Train R2:', r2_score(y_train, svr_linear_grid_cv.predict(X_train)))
print('Train Explained Variance:', explained_variance_score(y_train, svr_linear_grid_cv.predict(X_train)))

Test RMSE: 24.758718723646993
Test R2: -0.057016466283945855
Test Explained Variance: 0.08328952877913942

Train RMSE: 24.52398590326927
Train R2: 0.309022670892905
Train Explained Variance: 0.30911307414900524


#### Accuracy of winner prediction

In [33]:
matches = pd.read_csv('matches.csv')
matchId_winnerInning_dict = {}
for index, row in matches.iterrows():
    if row['winner'] == row['team1']:
        matchId_winnerInning_dict[row['id']] = 1
    if row['winner'] == row['team2']:
        matchId_winnerInning_dict[row['id']] = 2

In [34]:
inning1_out = svr_linear_grid_cv.predict(X_test)
inning2_out = svr_linear_grid_cv.predict(inning_2_input)

winner_df = df_inning_2[['Match_ID']]
winner = [ 1 if inning1_out[i] > inning2_out[i] else 2 for i in range(len(inning1_out))]
winner_df['Winner'] = winner
correct_count = 0
for match_id, winner  in winner_df[['Match_ID', 'Winner']].itertuples(index=False):
    if winner == matchId_winnerInning_dict[match_id]:
        correct_count += 1
winPredictionAccuracy = float((correct_count*100)/len(winner_df))

print(winPredictionAccuracy)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


66.66666666666667


### Support vector Regressor with RBF kernel

In [35]:
param_dict = {'C':np.logspace(-5, 5, num=10, base=2), 'gamma':np.logspace(-6, 6, num=10, base=2), 'degree':[1,2,3]}
svr_rbf_grid_cv = GridSearchCV(estimator=SVR(kernel='rbf'), param_grid=param_dict, cv= 5, iid=False, verbose=10, n_jobs = -1)
svr_rbf_grid_cv.fit(X_train, y_train.values.ravel())

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0350s.) Setting batch_size=10.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 348 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 458 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 568 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 698 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 828 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 978 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 1298 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 1500

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid=False, n_jobs=-1,
             param_grid={'C': array([3.12500000e-02, 6.75037337e-02, 1.45816130e-01, 3.14980262e-01,
       6.80395000e-01, 1.46973449e+00, 3.17480210e+00, 6.85795186e+00,
       1.48139954e+01, 3.20000000e+01]),
                         'degree': [1, 2, 3],
                         'gamma': array([1.56250000e-02, 3.93725328e-02, 9.92125657e-02, 2.50000000e-01,
       6.29960525e-01, 1.58740105e+00, 4.00000000e+00, 1.00793684e+01,
       2.53984168e+01, 6.40000000e+01])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=10)

#### Model evaluation

In [36]:
print('Test RMSE:', np.sqrt(mean_squared_error(y_test, svr_rbf_grid_cv.predict(X_test))))
print('Test R2:', r2_score(y_test, svr_rbf_grid_cv.predict(X_test)))
print('Test Explained Variance:', explained_variance_score(y_test, svr_rbf_grid_cv.predict(X_test)))
print()
print('Train RMSE:', np.sqrt(mean_squared_error(y_train, svr_rbf_grid_cv.predict(X_train))))
print('Train R2:', r2_score(y_train, svr_rbf_grid_cv.predict(X_train)))
print('Train Explained Variance:', explained_variance_score(y_train, svr_rbf_grid_cv.predict(X_train)))

Test RMSE: 25.819389635271946
Test R2: -0.14952220212881207
Test Explained Variance: -0.01131349896834366

Train RMSE: 10.596836971721524
Train R2: 0.8709867877542454
Train Explained Variance: 0.8710905847659012


#### Accuracy of winner prediction

In [37]:
inning1_out = svr_linear_grid_cv.predict(X_test)
inning2_out = svr_linear_grid_cv.predict(inning_2_input)

winner_df = df_inning_2[['Match_ID']]
winner = [ 1 if inning1_out[i] > inning2_out[i] else 2 for i in range(len(inning1_out))]
winner_df['Winner'] = winner
correct_count = 0
for match_id, winner  in winner_df[['Match_ID', 'Winner']].itertuples(index=False):
    if winner == matchId_winnerInning_dict[match_id]:
        correct_count += 1
winPredictionAccuracy = float((correct_count*100)/len(winner_df))

print(winPredictionAccuracy)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


66.66666666666667


### Support vector Regressor with sigmoid kernel

In [38]:
svr_sigmoid_grid_cv = GridSearchCV(estimator=SVR(kernel='sigmoid'), param_grid=param_dict, cv= 5, iid=False, verbose=10, n_jobs = -1)
svr_sigmoid_grid_cv.fit(X_train, y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0290s.) Setting batch_size=12.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 308 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 548 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 680 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 836 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 992 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 1172 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 1352 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:    7.8s finished
  y = column_or_1d(y,

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated',
                           kernel='sigmoid', max_iter=-1, shrinking=True,
                           tol=0.001, verbose=False),
             iid=False, n_jobs=-1,
             param_grid={'C': array([3.12500000e-02, 6.75037337e-02, 1.45816130e-01, 3.14980262e-01,
       6.80395000e-01, 1.46973449e+00, 3.17480210e+00, 6.85795186e+00,
       1.48139954e+01, 3.20000000e+01]),
                         'degree': [1, 2, 3],
                         'gamma': array([1.56250000e-02, 3.93725328e-02, 9.92125657e-02, 2.50000000e-01,
       6.29960525e-01, 1.58740105e+00, 4.00000000e+00, 1.00793684e+01,
       2.53984168e+01, 6.40000000e+01])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=10)

#### Model evaluation

In [39]:
print('Test RMSE:', np.sqrt(mean_squared_error(y_test, svr_sigmoid_grid_cv.predict(X_test))))
print('Test R2:', r2_score(y_test, svr_sigmoid_grid_cv.predict(X_test)))
print('Test Explained Variance:', explained_variance_score(y_test, svr_sigmoid_grid_cv.predict(X_test)))
print()
print('Train RMSE:', np.sqrt(mean_squared_error(y_train, svr_sigmoid_grid_cv.predict(X_train))))
print('Train R2:', r2_score(y_train, svr_sigmoid_grid_cv.predict(X_train)))
print('Train Explained Variance:', explained_variance_score(y_train, svr_sigmoid_grid_cv.predict(X_train)))

Test RMSE: 25.349022919295393
Test R2: -0.10802068746925042
Test Explained Variance: 0.0

Train RMSE: 29.553954064953178
Train R2: -0.0034895881520409233
Train Explained Variance: 1.1102230246251565e-16


#### Accuracy of winner prediction

In [40]:
inning1_out = svr_linear_grid_cv.predict(X_test)
inning2_out = svr_linear_grid_cv.predict(inning_2_input)

winner_df = df_inning_2[['Match_ID']]
winner = [ 1 if inning1_out[i] > inning2_out[i] else 2 for i in range(len(inning1_out))]
winner_df['Winner'] = winner
correct_count = 0
for match_id, winner  in winner_df[['Match_ID', 'Winner']].itertuples(index=False):
    if winner == matchId_winnerInning_dict[match_id]:
        correct_count += 1
winPredictionAccuracy = float((correct_count*100)/len(winner_df))

print(winPredictionAccuracy)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


66.66666666666667
