### Import Libraries

In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import seaborn as sns
import numpy as np
import math
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
import os.path

### Load pre-computed stats

In [3]:
df_train = pd.read_pickle(os.path.abspath('') + '\\..\\Derived stats\\match_stats_train.pkl')
df_test = pd.read_pickle(os.path.abspath('') + '\\..\\Derived stats\\match_stats_test.pkl')

# delete outlier
""" Assuming, in most normal matches, teams 
score more than 75 runs on an average as shown
by past statistics. """
df_train = df_train[df_train['Runs'] > 75]
df_test = df_test[df_test['Runs'] > 75]

cols = df_train.columns
cols = cols.map(lambda x: x.replace(' ', '_') if isinstance(x, (str, 'utf-8')) else x)
df_train.columns = cols
df_test.columns = cols

# To remove the other inning for which outliers were removed
match_ids = df_test['Match_ID'].unique()
m_ids_toBeRemoved = []          
for m_id in match_ids:
    if len(df_test[df_test['Match_ID'] == m_id]) !=2:
        m_ids_toBeRemoved.append(m_id)

for m_id in m_ids_toBeRemoved:
    df_test = df_test[df_test['Match_ID'] != m_id]

### Computing additional features

#### Adding feature: Average runs scored against same opponent in previous matches

In [76]:
""" Previous run with same opponent """

prev_run = []
for Date, Team_Name, Opp_Team  in df_train[['Date','Team_Name', 'Opp_Team']].itertuples(index=False):
    df_t = df_train.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team & Date < @Date")['Runs']
    if len(df_t) == 0:
        prev_run.append(df_train.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team")['Runs'].mean())
        continue
    prev_run.append(df_train.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team & Date < @Date")['Runs'].values[-1])
df_train['Prev Run'] = prev_run

prev_run = []
for Date, Team_Name, Opp_Team  in df_test[['Date','Team_Name', 'Opp_Team']].itertuples(index=False):
    df_t = df_test.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team & Date < @Date")['Runs']
    if len(df_t) == 0:
        df_t_t = df_train.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team & Date < @Date")
        if len(df_t_t) == 0:
            prev_run.append(0)
        else:
            prev_run.append(df_t_t['Runs'].values[-1])
        continue
    prev_run.append(df_test.query("Team_Name == @Team_Name & Opp_Team == @Opp_Team & Date < @Date")['Runs'].values[-1])
df_test['Prev Run'] = prev_run

#### Adding feature: Average runs scored in a venue till date irrespective of teams

In [77]:
""" Previous run on venue irrespective of team """

prev_run = []
for Date, city  in df_train[['Date', 'City']].itertuples(index=False):
    df_t = df_train.query("City == @city & Date < @Date")
    if len(df_t) == 0:
        df_t_t = df_train.query("City == @city")
        if len(df_t_t) == 0:
            prev_run.append(0)
        else:
            prev_run.append(df_t_t['Runs'].mean())
        continue
    prev_run.append(df_t['Runs'].mean())
df_train['Prev Venue Run'] = prev_run

prev_run = []
for Date, city  in df_test[['Date', 'City']].itertuples(index=False):
    df_te = df_test.query("City == @city & Date < @Date")
    df_tr = df_train.query("City == @city & Date < @Date")
    prev_run.append((df_te['Runs'].sum(axis = 0, skipna = True) + df_tr['Runs'].sum(axis = 0, skipna = True))/(len(df_te)+len(df_tr)))
df_test['Prev Venue Run'] = prev_run

### Print data before model fitting

In [81]:
df_train.head(5)

Unnamed: 0,Match_ID,Date,Team_Name,Opp_Team,Innnings,City,Past_lead,Target,Avg_Bat1,Curr_Bat1,...,Avg_Bat11,Curr_Bat11,Bowl1,Bowl2,Bowl3,Bowl4,Bowl5,Runs,Prev Run,Prev Venue Run
0,175,2010-03-12,Kolkata Knight Riders,Deccan Chargers,1,Mumbai,0,0,14.125,11.6,...,0,0,6.729167,7.833333,8.574074,9.125,10.666667,161,160.2,157.776119
1,175,2010-03-12,Deccan Chargers,Kolkata Knight Riders,2,Mumbai,1,161,31.033333,33.2,...,0,0,3.75,7.145833,7.322917,9.35,10.666667,150,147.2,157.776119
2,176,2010-03-13,Mumbai Indians,Rajasthan Royals,1,Mumbai,0,0,28.269231,15.8,...,0,0,7.75,8.006667,8.490741,8.595238,8.0,212,163.384615,155.5
3,176,2010-03-13,Rajasthan Royals,Mumbai Indians,2,Mumbai,1,212,28.391304,14.0,...,0,0,6.243056,8.0625,9.65,8.0,8.0,208,160.615385,155.5
4,177,2010-03-13,Kings XI Punjab,Delhi Daredevils,1,Chandigarh,1,0,27.6,27.6,...,10,10,7.020833,7.208333,7.424242,8.0625,8.845238,142,146.714286,155.071429


In [82]:
df_test.head(5)

Unnamed: 0,Match_ID,Date,Team_Name,Opp_Team,Innnings,City,Past_lead,Target,Avg_Bat1,Curr_Bat1,...,Avg_Bat11,Curr_Bat11,Bowl1,Bowl2,Bowl3,Bowl4,Bowl5,Runs,Prev Run,Prev Venue Run
0,1,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,1,Hyderabad,1,0,33.73,55.963474,...,0,0,7.033333,8.236111,8.958333,10.8,8.0,207,208,146.117647
1,1,2017-04-05,Royal Challengers Bangalore,Sunrisers Hyderabad,2,Hyderabad,0,207,37.923077,41.6,...,10,10,7.130952,7.435185,7.4375,7.440476,7.75,172,200,146.117647
2,2,2017-04-06,Mumbai Indians,Rising Pune Supergiant,1,Pune,0,0,19.27,20.8,...,0,0,6.5,7.0,7.570833,8.364583,8.640625,184,0,147.6
3,2,2017-04-06,Rising Pune Supergiant,Mumbai Indians,2,Pune,0,184,30.05618,27.4,...,0,0,6.25,8.416667,8.875,8.983333,9.105263,187,0,147.6
4,3,2017-04-07,Gujarat Lions,Kolkata Knight Riders,1,Rajkot,1,0,10.0,10.0,...,0,0,6.1,8.006667,8.526882,8.535714,9.25,183,125,154.5


### Data Vizualization

In [None]:
sns.set(style="ticks", color_codes=True)
sns.pairplot(df_train)

### Selecting Features to train the model

In [54]:
df_train_inning_1 = df_train[df_train['Innnings'] == 1]
df_test_inning_1 = df_test[df_test['Innnings'] == 1]

In [55]:
df_inning_2 = df_test[df_test['Innnings'] == 2]
inning_2_input = df_inning_2[['Avg_Bat1', 'Avg_Bat2', 'Avg_Bat3', 'Avg_Bat4', 'Avg_Bat5', 'Avg_Bat6', 'Avg_Bat7', 'Avg_Bat8', 'Avg_Bat9', 'Avg_Bat10', 'Avg_Bat11','Prev Run', 'Prev Venue Run', 'Past_lead']]

In [56]:
X_train = df_train_inning_1[['Avg_Bat1', 'Avg_Bat2', 'Avg_Bat3', 'Avg_Bat4', 'Avg_Bat5', 'Avg_Bat6', 'Avg_Bat7', 'Avg_Bat8', 'Avg_Bat9', 'Avg_Bat10', 'Avg_Bat11', 'Prev Run', 'Prev Venue Run','Past_lead']]
y_train = df_train_inning_1[['Runs']]

X_test = df_test_inning_1[['Avg_Bat1', 'Avg_Bat2', 'Avg_Bat3', 'Avg_Bat4', 'Avg_Bat5', 'Avg_Bat6', 'Avg_Bat7', 'Avg_Bat8', 'Avg_Bat9', 'Avg_Bat10', 'Avg_Bat11','Prev Run', 'Prev Venue Run', 'Past_lead']]
y_test = df_test_inning_1[['Runs']]

## Linear Regression

In [57]:
runsPredictor = LinearRegression()
runsPredictor.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [58]:
predictedRunsTrain = runsPredictor.predict(X_train)
print('Train Linear model RMSE:', np.sqrt(mean_squared_error(y_train, predictedRunsTrain)))

Train Linear model RMSE: 24.304875829036273


In [59]:
predictedRunsTest = runsPredictor.predict(X_test)
print('Test Linear model RMSE:', np.sqrt(mean_squared_error(y_test, predictedRunsTest)))

Test Linear model RMSE: 25.77450911129639


### Accuracy of winner prediction

In [65]:
matches = pd.read_csv('matches.csv')
matchId_winnerInning_dict = {}
for index, row in matches.iterrows():
    if row['winner'] == row['team1']:
        matchId_winnerInning_dict[row['id']] = 1
    if row['winner'] == row['team2']:
        matchId_winnerInning_dict[row['id']] = 2

In [84]:
inning1_out = predictedRunsTest
inning2_out = runsPredictor.predict(inning_2_input)

winner_df = df_inning_2[['Match_ID']]
winner = [ 1 if inning1_out[i] > inning2_out[i] else 2 for i in range(len(inning1_out))]
winner_df['Winner'] = winner
correct_count = 0
for match_id, winner  in winner_df[['Match_ID', 'Winner']].itertuples(index=False):
    if winner == matchId_winnerInning_dict[match_id]:
        correct_count += 1
winPredictionAccuracy = float((correct_count*100)/len(winner_df))

print(winPredictionAccuracy)

64.81481481481481


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Regression with Regularization

In [60]:
poly_features = PolynomialFeatures(degree = 2)  
X_train_poly = poly_features.fit_transform(X_train)

In [61]:
ridge_lr = Ridge(fit_intercept=True)
lasso_lr = Lasso(fit_intercept=True)

ridge_lr_grid_cv = GridSearchCV(estimator=ridge_lr, param_grid={'alpha':np.logspace(-5, 5, 20)}, cv= 5, iid=False, n_jobs = -1)
lasso_lr_grid_cv = GridSearchCV(estimator=lasso_lr, param_grid={'alpha':np.logspace(-5, 5, 20)}, cv= 5, iid=False, n_jobs = -1)

ridge_lr_grid_cv.fit(X_train_poly, y_train)
lasso_lr_grid_cv.fit(X_train_poly, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid=False, n_jobs=-1,
             param_grid={'alpha': array([1.00000000e-05, 3.35981829e-05, 1.12883789e-04, 3.79269019e-04,
       1.27427499e-03, 4.28133240e-03, 1.43844989e-02, 4.83293024e-02,
       1.62377674e-01, 5.45559478e-01, 1.83298071e+00, 6.15848211e+00,
       2.06913808e+01, 6.95192796e+01, 2.33572147e+02, 7.84759970e+02,
       2.63665090e+03, 8.85866790e+03, 2.97635144e+04, 1.00000000e+05])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [62]:
X_test_poly = poly_features.fit_transform(X_test)
y_ridge_pred = ridge_lr_grid_cv.predict(X_test_poly)
y_lasso_pred = lasso_lr_grid_cv.predict(X_test_poly)

In [63]:
print('Train Ridge RMSE:', np.sqrt(mean_squared_error(y_train, ridge_lr_grid_cv.predict(X_train_poly))))
print('Test Ridge RMSE:', np.sqrt(mean_squared_error(y_test, y_ridge_pred)))

Train Ridge RMSE: 21.83706939809203
Test Ridge RMSE: 24.983200604426063


In [64]:
print('Train Lasso RMSE:', np.sqrt(mean_squared_error(y_train, lasso_lr_grid_cv.predict(X_train_poly))))
print('Test Lasso RMSE:', np.sqrt(mean_squared_error(y_test, y_lasso_pred)))

Train Lasso RMSE: 24.140978068270485
Test Lasso RMSE: 24.046720768869907


### Accuracy of winner prediction

In [73]:
inning1_out = y_lasso_pred
X_inning2_poly = poly_features.transform(inning_2_input)
inning2_out = lasso_lr_grid_cv.predict(X_inning2_poly)

winner_df = df_inning_2[['Match_ID']]
winner = [ 1 if inning1_out[i] > inning2_out[i] else 2 for i in range(len(inning1_out))]
winner_df['Winner'] = winner
correct_count = 0
for match_id, winner  in winner_df[['Match_ID', 'Winner']].itertuples(index=False):
    if winner == matchId_winnerInning_dict[match_id]:
        correct_count += 1
winPredictionAccuracy = float((correct_count*100)/len(winner_df))

print(winPredictionAccuracy)

66.66666666666667


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


### Print Scores prediction

In [None]:
for id, runs in df_test[['Match_ID', 'Runs']].itertuples(index=False):
    print(runs)

In [None]:
for runs in y_ridge_pred:
    print(runs)