In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import os.path
from tqdm import tqdm
import random
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNetCV
from collections import OrderedDict

np.random.seed(500)

### Data Input and Visualization

Reading the input file

In [2]:
deliveries_unprocessed = pd.read_csv(os.path.abspath('') + "\\..\\deliveries.csv")
matches_unprocesed = pd.read_csv(os.path.abspath('') + "\\..\\matches.csv")
print('Deliveries Data Size:', deliveries_unprocessed.shape)
print('Matches Data Size:', matches_unprocesed.shape)

Deliveries Data Size: (150139, 21)
Matches Data Size: (633, 18)


Below code creates a dictionary of matchId and the date it was played and add date information in deliveries table

In [187]:
matchId_date_dict = {}
for index, row in matches_unprocesed.iterrows():
    matchId_date_dict[row['id']] = row['date']
    
date_column = []
for index, row in deliveries_unprocessed.iterrows():
    date_column.append(matches_unprocesed.iloc[row['match_id'] - 1]['date'])
deliveries_unprocessed['date'] = date_column

Removing matches with no results and result with duckworth lewis method

In [188]:
matchId_dl_and_noResult = []
for index, row in matches_unprocesed.iterrows():
    if row['result'] == 'no result':
        matchId_dl_and_noResult.append(row['id'])
    if row['dl_applied'] == 1:
        matchId_dl_and_noResult.append(row['id'])

In [189]:
drop_index_in_matches = []
for index, row in matches_unprocesed.iterrows():
    if row['id'] in matchId_dl_and_noResult:
        drop_index_in_matches.append(index)
        
drop_index_in_deliveries = []
for index, row in deliveries_unprocessed.iterrows():
    if row['match_id'] in matchId_dl_and_noResult:
        drop_index_in_deliveries.append(index)

In [190]:
matches = matches_unprocesed.drop(drop_index_in_matches)
deliveries = deliveries_unprocessed.drop(drop_index_in_deliveries)

Sorting the data with respect to date

In [191]:
deliveries = deliveries.sort_values(by=['date', 'match_id', 'inning', 'over', 'ball'])
deliveries.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder,date
13862,60,1,Kolkata Knight Riders,Royal Challengers Bangalore,1,1,SC Ganguly,BB McCullum,P Kumar,0,...,1,0,0,0,1,1,,,,2008-04-18
13863,60,1,Kolkata Knight Riders,Royal Challengers Bangalore,1,2,BB McCullum,SC Ganguly,P Kumar,0,...,0,0,0,0,0,0,,,,2008-04-18
13864,60,1,Kolkata Knight Riders,Royal Challengers Bangalore,1,3,BB McCullum,SC Ganguly,P Kumar,0,...,0,0,0,0,1,1,,,,2008-04-18
13865,60,1,Kolkata Knight Riders,Royal Challengers Bangalore,1,4,BB McCullum,SC Ganguly,P Kumar,0,...,0,0,0,0,0,0,,,,2008-04-18
13866,60,1,Kolkata Knight Riders,Royal Challengers Bangalore,1,5,BB McCullum,SC Ganguly,P Kumar,0,...,0,0,0,0,0,0,,,,2008-04-18


In [192]:
matches = matches.sort_values(by=['date'])
matches.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
59,60,2008,Bangalore,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Kolkata Knight Riders,140,0,BB McCullum,M Chinnaswamy Stadium,Asad Rauf,RE Koertzen,
61,62,2008,Delhi,2008-04-19,Rajasthan Royals,Delhi Daredevils,Rajasthan Royals,bat,normal,0,Delhi Daredevils,0,9,MF Maharoof,Feroz Shah Kotla,Aleem Dar,GA Pratapkumar,
60,61,2008,Chandigarh,2008-04-19,Chennai Super Kings,Kings XI Punjab,Chennai Super Kings,bat,normal,0,Chennai Super Kings,33,0,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",MR Benson,SL Shastri,
63,64,2008,Kolkata,2008-04-20,Deccan Chargers,Kolkata Knight Riders,Deccan Chargers,bat,normal,0,Kolkata Knight Riders,0,5,DJ Hussey,Eden Gardens,BF Bowden,K Hariharan,
62,63,2008,Mumbai,2008-04-20,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,normal,0,Royal Challengers Bangalore,0,5,MV Boucher,Wankhede Stadium,SJ Davis,DJ Harper,


Extracting the matchId-team-batsmen and matchId-team-oppositionBowler data in dictionary

In [193]:
matchId_team_batsmen_dict = OrderedDict()
matchId_team_opposition_bowler_dict = {}
for index, row in deliveries.iterrows():
    batsmen_set = matchId_team_batsmen_dict.get((row['match_id'], row['batting_team']), set())
    batsmen_set.add(row['batsman'])
    batsmen_set.add(row['non_striker'])
    matchId_team_batsmen_dict[(row['match_id'], row['batting_team'])] = batsmen_set
    
    opposition_bowler_set = matchId_team_opposition_bowler_dict.get((row['match_id'], row['batting_team']), set())
    opposition_bowler_set.add(row['bowler'])
    matchId_team_opposition_bowler_dict[(row['match_id'], row['batting_team'])] = opposition_bowler_set

Created a dictionary that assigned a value to each player

In [194]:
batsmen = deliveries['batsman'].unique()
bowlers = deliveries['bowler'].unique()
all_players = list(set(batsmen) | set(bowlers))
player_dict = {}
for i in range(len(all_players)):
    player_dict[all_players[i]] = i

Created a dictionary that stores score for each matchId and team played

In [195]:
matchId_team_score = OrderedDict()
for index, row in deliveries.iterrows():
    score = matchId_team_score.get((row['match_id'], row['batting_team']), 0)
    if int(row['is_super_over']) == 0:
        score += int(row['total_runs'])
    matchId_team_score[(row['match_id'], row['batting_team'])] = score

Following code creates 2 dictionaries - one which stores batting average for each batsmen for each day he played and another dictionary that stores the bowling econnomy of each bowler for each day he bowled in IPL

In [196]:
batting_average = pd.read_pickle('batsmen_average_data.pkl')
bowling_average = pd.read_pickle('bowlers_average_data.pkl')

In [197]:
batting_average.head()

Unnamed: 0,Name,Date,Batting Average,Average Balls Faced
0,DA Warner,2017-04-05,33.73,24.4
1,S Dhawan,2017-04-05,27.517857,23.410714
2,MC Henriques,2017-04-05,17.74359,14.461538
3,Yuvraj Singh,2017-04-05,22.27619,17.733333
4,DJ Hooda,2017-04-05,12.291667,9.333333


In [198]:
bowling_average.head()

Unnamed: 0,Name,Date,Average no of wickets,Average economy
0,YS Chahal,2017-04-05,1.380952,8.150794
1,S Aravind,2017-04-05,1.535714,8.309524
2,SR Watson,2017-04-05,1.104651,7.946705
3,STR Binny,2017-04-05,0.433962,7.738994
4,A Nehra,2017-04-05,1.353659,7.846545


In [199]:
batsmen_date_average_run = {}
bowler_date_average_economy = {}
for index, row in batting_average.iterrows():
    batsmen_date_average_run[(row['Name'], row['Date'])] = row['Batting Average']
for index, row in bowling_average.iterrows():
    bowler_date_average_economy[(row['Name'], row['Date'])] = row['Average economy']

Below code creates an input vector for each match and team played in it

In [200]:
matchId_team_player_vector_dict = OrderedDict()
for (match_id, team) in matchId_team_batsmen_dict:
    player_vector = np.zeros(len(all_players))
    for batsman in list(matchId_team_batsmen_dict[(match_id, team)]):
        player_vector[player_dict[batsman]] = 1
#         player_vector[player_dict[batsman]] = batsmen_date_average_run.get((batsman, matchId_date_dict[match_id]), 10)
    for bowler in list(matchId_team_opposition_bowler_dict[(match_id, team)]):
        player_vector[player_dict[bowler]] = 1
#         player_vector[player_dict[bowler]] = bowler_date_average_economy.get((bowler, matchId_date_dict[match_id]), 8)
    matchId_team_player_vector_dict[(match_id, team)] = player_vector

In [201]:
X_np = []
y_np = []
for (match_id, team) in matchId_team_batsmen_dict:
    X_np.append(matchId_team_player_vector_dict[(match_id, team)])
    y_np.append([match_id, team, matchId_team_score[(match_id, team)]])

In [202]:
X = pd.DataFrame(np.array(X_np))
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,484,485,486,487,488,489,490,491,492,493
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [173]:
y = pd.DataFrame(np.array(y_np))
y.head()

Unnamed: 0,0,1,2
0,60,Kolkata Knight Riders,222
1,60,Royal Challengers Bangalore,82
2,61,Chennai Super Kings,240
3,61,Kings XI Punjab,207
4,62,Rajasthan Royals,129


Train test split with first 0.8 ratio of data to train and rest to test

In [174]:
X_train = X[:int(6*len(X)/10)]
y_train = y[:int(6*len(X)/10)]
X_test = X[int(6*len(X)/10):]
y_test = y[int(6*len(X)/10):]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [204]:
X_train.shape

(740, 494)

In [205]:
X_test.shape

(494, 494)

In [206]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.preprocessing import PolynomialFeatures

In [178]:
poly_features = PolynomialFeatures(degree = 2, interaction_only=True)  
X_train_poly = poly_features.fit_transform(X_train)

In [179]:
ridge_lr = Ridge(fit_intercept=True)
lasso_lr = Lasso(fit_intercept=True)

ridge_lr_grid_cv = GridSearchCV(estimator=ridge_lr, param_grid={'alpha':np.logspace(-5, 5, 20)}, cv= 5, iid=False, n_jobs = -1, verbose = 10)
lasso_lr_grid_cv = GridSearchCV(estimator=lasso_lr, param_grid={'alpha':np.logspace(-5, 5, 20)}, cv= 5, iid=False, n_jobs = -1, verbose = 10)

ridge_lr_grid_cv.fit(X_train_poly, y_train[2])
lasso_lr_grid_cv.fit(X_train_poly, y_train[2])

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   40.1s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   54.5s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   58.1s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  88 out of 100 | elapsed:  1.8min remaining:   15.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.9min finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done  88 out of 100 | elapsed:  5.4min remaining:   44.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.5min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid=False, n_jobs=-1,
             param_grid={'alpha': array([1.00000000e-05, 3.35981829e-05, 1.12883789e-04, 3.79269019e-04,
       1.27427499e-03, 4.28133240e-03, 1.43844989e-02, 4.83293024e-02,
       1.62377674e-01, 5.45559478e-01, 1.83298071e+00, 6.15848211e+00,
       2.06913808e+01, 6.95192796e+01, 2.33572147e+02, 7.84759970e+02,
       2.63665090e+03, 8.85866790e+03, 2.97635144e+04, 1.00000000e+05])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=10)

In [180]:
X_test_poly = poly_features.fit_transform(X_test)
y_ridge_pred = ridge_lr_grid_cv.predict(X_test_poly)
y_lasso_pred = lasso_lr_grid_cv.predict(X_test_poly)

In [181]:
total_matches = len(y_ridge_pred)/2
ridge_winner_predictions = []
lasso_winner_predictions = []
correct_winner = []

for i in range(0, len(y_ridge_pred), 2):
    if y_ridge_pred[i] > y_ridge_pred[i+1]:
        ridge_winner_predictions.append(1)
    else:
        ridge_winner_predictions.append(2)

for i in range(0, len(y_lasso_pred), 2):
    if y_lasso_pred[i] > y_lasso_pred[i+1]:
        lasso_winner_predictions.append(1)
    else:
        lasso_winner_predictions.append(2)
        
for i in range(0, len(y_test[2]), 2):
    if y_test.iloc[i][2] > y_test.iloc[i+1][2]:
        correct_winner.append(1)
    else:
        correct_winner.append(2)

In [182]:
ridge_win_loss_accuracy = 0
lasso_win_loss_accuracy = 0
for i in range(0, len(correct_winner)):
    if correct_winner[i] == ridge_winner_predictions[i]:
        ridge_win_loss_accuracy += 1
    if correct_winner[i] == lasso_winner_predictions[i]:
        lasso_win_loss_accuracy += 1
        
ridge_win_loss_accuracy = ridge_win_loss_accuracy/total_matches
lasso_win_loss_accuracy = lasso_win_loss_accuracy/total_matches

print('Ridge Win-Loss Accuracy:', ridge_win_loss_accuracy)
print('Lasso Win-Loss Accuracy:', lasso_win_loss_accuracy)

Ridge Win-Loss Accuracy: 0.5303643724696356
Lasso Win-Loss Accuracy: 0.4979757085020243


In [183]:
print('Test Ridge RMSE:', np.sqrt(mean_squared_error(y_test[2], y_ridge_pred)))
print('Test Ridge R2:', r2_score(y_test[2], y_ridge_pred))
print('Test Ridge Explained Variance:', explained_variance_score(y_test[2], y_ridge_pred))
print()
print('Train Ridge RMSE:', np.sqrt(mean_squared_error(y_train[2], ridge_lr_grid_cv.predict(X_train_poly))))
print('Train Ridge R2:', r2_score(y_train[2], ridge_lr_grid_cv.predict(X_train_poly)))
print('Train Ridge Explained Variance:', explained_variance_score(y_train[2], ridge_lr_grid_cv.predict(X_train_poly)))

Test Ridge RMSE: 29.599816222749897
Test Ridge R2: -0.0054286675030368325
Test Ridge Explained Variance: -0.004700565887977959

Train Ridge RMSE: 14.52354834535788
Train Ridge R2: 0.7567646162046445
Train Ridge Explained Variance: 0.7567646162046445


In [184]:
print('Test Lasso RMSE:', np.sqrt(mean_squared_error(y_test[2], y_lasso_pred)))
print('Test Lasso R2:', r2_score(y_test[2], y_lasso_pred))
print('Test Lasso Explained Variance:', explained_variance_score(y_test[2], y_lasso_pred))
print()
print('Train Lasso RMSE:', np.sqrt(mean_squared_error(y_train[2], lasso_lr_grid_cv.predict(X_train_poly))))
print('Train Lasso R2:', r2_score(y_train[2], lasso_lr_grid_cv.predict(X_train_poly)))
print('Train Lasso Explained Variance:', explained_variance_score(y_train[2], lasso_lr_grid_cv.predict(X_train_poly)))

Test Lasso RMSE: 30.190002593404422
Test Lasso R2: -0.04592257141926104
Test Lasso Explained Variance: -0.009955266996837464

Train Lasso RMSE: 27.717923766161512
Train Lasso R2: 0.11406333591979756
Train Lasso Explained Variance: 0.11406333591979756


In [198]:
y_ridge_pred = y_ridge_pred.astype(int)
y_total = np.array([y_test, y_ridge_pred], np.int32)
df = pd.DataFrame(y_total)
filepath = 'score_prediction.xlsx'
df.to_excel(filepath, index=False)