# Predicting Pakistan Super League (PSL) Matches using Random Forest Classification

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score,roc_auc_score
from imblearn.over_sampling import SMOTE
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

## Extract Training/Validation data

* **Extract the data for all the matches.** *(From PSL:S01 to PSL:S06[Up-to Match#14])*
* **Matches with no result, or decided by the D/L method, were discarded.** *(5 matches)*
* **Tied matches are treated as 'win' for both teams.**
* **Team batting first is referred to as Team1, and Team batting second is referred to as Team2**

**Following features were extracted:**
* Run-rate in powerplay for Team1 (*PP_rr*)
* Wickets remaining after powerplay for Team1 (*PP_wkt_rem*)
* Run-rate in overs 6-15 for Team1 (*mid_rr*)
* Wickets remaining after overs 6-15 for Team1 (*mid_wkt_rem*)
* Overall run-rate for Team1 after first innings (*RR*)
* Match result w.r.t. Team1 (*Win*)
* Required Run-rate after each over for Team2 (*rrr_T2*)
* Wickets remaining after each over for Team2 (*T2_wkt_rem*)

In [None]:
# Load the full historical dataset: one row per completed match.
# Column layout (1-based):
#   1     Team 1 (batting first) run-rate at the end of the powerplay (6 overs)
#   2     Team 1 wickets remaining at the end of the powerplay
#   3     Team 1 run-rate at the end of 15 overs
#   4     Team 1 wickets remaining at the end of 15 overs
#   5     Team 1 run-rate at the end of the 1st innings (20 overs)
#   6     Result for Team 1: 0 if it lost, 1 otherwise (a tie counts as a win for both teams)
#   7-8   Team 2 (chasing) required run-rate and wickets remaining at the end of over 1
#   9-44  The same pair of values for Team 2 at the end of overs 2-19
df_data = pd.read_csv('/kaggle/input/pakistansuperleaguepsl-155matches/PSL_155matches.csv')
df_data.head()

In [None]:
# Keep only the first six columns: Team 1's first-innings features plus the match result
df1 = df_data.iloc[:, :6]
df1.head()

In [None]:
def situational_model(overnum, seed, verbose=False):
    """Train a random-forest win predictor for one point in the match.

    The model is fitted on the module-level ``df_data`` frame. The five
    first-innings columns (positions 0-4) are always used as features; for
    ``overnum >= 1`` the required run-rate and wickets-remaining columns of
    the chasing team after that over are added as well.

    Parameters
    ----------
    overnum : int
        0 to predict right after the first innings, or the number of the
        second-innings over after which the prediction is made.
    seed : int
        ``random_state`` of the ``RandomForestClassifier``. The train/test
        split and SMOTE always use a fixed seed of 42, so only the forest
        varies between calls.
    verbose : bool, optional
        When true, print the accuracy on the held-out 20% of matches.

    Returns
    -------
    RandomForestClassifier
        Classifier fitted on the SMOTE-resampled training split; the target
        is column 5 of ``df_data`` (0 if Team 1 lost, 1 otherwise).

    Raises
    ------
    ValueError
        If ``overnum`` is negative or beyond the overs available in ``df_data``.
    """
    # Columns 0-5 are first-innings features + result; every later pair of
    # columns belongs to one over of the chase.
    max_over = (df_data.shape[1] - 6) // 2
    if not 0 <= overnum <= max_over:
        raise ValueError(
            'overnum must be between 0 and %d, got %r' % (max_over, overnum)
        )

    columns = [0, 1, 2, 3, 4, 5]
    if overnum != 0:
        rrr_col = 2 * (overnum - 1) + 6  # required run-rate after `overnum`
        columns += [rrr_col, rrr_col + 1]  # ... followed by wickets remaining

    values = df_data.iloc[:, columns].values
    y = values[:, 5]  # 0 if the team batting first lost, 1 otherwise
    X = np.delete(values, 5, axis=1)  # every selected column except the target
    X[X == np.inf] = 10000  # replace inf with a large finite number so the model can be fitted

    # 80% of the matches for training, 20% held out
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    # Oversample the minority class, on the training split only
    X_res, y_res = SMOTE(random_state=42).fit_resample(X_train, y_train)

    clf2 = RandomForestClassifier(random_state=seed).fit(X_res, y_res)

    if verbose:
        print('Validation accuracy: %.2f' % clf2.score(X_test, y_test))

    return clf2

## Testing
**Now we test the performance of our model on the current matches of PSL:S06.**

I haven't written a script to extract these features for current matches. Right now I am maintaining it manually at:
<https://github.com/AmmarMalik93/PSL-Prediction/blob/main/new_matches.csv>

For example, suppose we want to evaluate the model on Match no. 6 (IU vs LQ) in the above file, and in particular we want the winning probabilities at three points: at the end of the 1st innings, after 6 overs of the second innings, and after 15 overs of the second innings. Then,

In [None]:
# 1-based position of the match of interest in the test file
num = 6
# Points at which to predict: 0 = end of the 1st innings, k = after over k of the 2nd innings
overnum = [0, 6, 15]

In [None]:
# Fetch the sheet of current-season matches; the first CSV column becomes the index
df_new = pd.read_csv(
    'https://raw.githubusercontent.com/AmmarMalik93/PSL-Prediction/main/new_matches.csv',
    index_col=0,
)
# Each match is one column of the sheet; turn the selected one into a single-row 2-D array
match = df_new.iloc[:, num - 1].values[np.newaxis, :]
# The column label is split on '_'; the first two characters of each of the
# first two parts are used as the team codes
Team1, Team2 = (df_new.columns[num - 1].split('_')[side][:2] for side in (0, 1))

print(f'{Team1} vs {Team2}')


## For other matches you can provide these values manually too

**We train our model 100 times with a different random seed each time and average the predicted probabilities, so that the reported probabilities are consistent and not the result of one lucky seed.**

In [None]:
# Per-checkpoint results, in the same order as `overnum`
Team1_prob = []
Team2_prob = []
RRR = []
Wkts = []

for over_num in overnum:
    over_num = int(over_num)

    # Pick the features for this checkpoint. The per-over pair starts at
    # position 5 here (6 in the training frame) -- presumably because the
    # match sheet has no result column; TODO confirm against new_matches.csv.
    if over_num == 0:
        feature_idx = [0, 1, 2, 3, 4]
    else:
        rrr_idx = 2 * (over_num - 1) + 5
        feature_idx = [0, 1, 2, 3, 4, rrr_idx, rrr_idx + 1]
    test = match[:, feature_idx]

    # Mirror the training-time preprocessing (inf -> 10000) so that an
    # infinite required run-rate does not make predict_proba fail.
    model_input = test.astype(float)
    model_input[model_input == np.inf] = 10000

    # Average Team 1's win probability over 100 differently seeded forests
    seed_probs = []
    for perm in tqdm(range(100)):
        clf2 = situational_model(over_num, seed=perm)
        seed_probs.append(clf2.predict_proba(model_input)[0][1])
    win_prob = np.mean(seed_probs)

    if over_num == 0:
        print('After 1st Innings')
    else:
        print('After Over # %d' % over_num)

    print('Win Probablity for %s: %.2f' % (Team1, win_prob))
    print('Win Probablity for %s: %.2f' % (Team2, 1 - win_prob))

    if over_num == 0:
        # Before the chase starts: Team 1's final run-rate plus 0.05
        # (presumably one extra run spread over 20 overs), all 10 wickets in hand
        rrr = match[:, 4] + 0.05
        wkts = np.array([10])
    else:
        rrr = test[:, -2]
        wkts = test[:, -1]

    # Pull out plain scalars for formatting; implicit conversion of a
    # size-1 array to a scalar is deprecated in recent NumPy releases.
    print('Required Run Rate: %.2f, Wickets Remaining: %d' % (float(rrr[0]), int(wkts[0])))

    RRR.append(rrr)
    Wkts.append(wkts)
    Team1_prob.append(win_prob)
    Team2_prob.append(1 - win_prob)

In [None]:
# Two-letter team codes and, at the same position, the plot colour used for each team
teams, cols = (
    list(column)
    for column in zip(
        ('KK', 'tab:blue'),
        ('LQ', 'greenyellow'),
        ('PZ', 'gold'),
        ('IU', 'tab:orange'),
        ('QG', 'tab:purple'),
        ('MS', 'tab:green'),
    )
)

## Plots

**If a single over is provided then a pie-chart is displayed showing the winning probabilities after that over for both teams.**
**If multiple overs are provided then a bar chart is displayed showing the winning probabilities after the end of each of those overs.**

In [None]:
if len(overnum) > 1:
    # Several checkpoints: stacked win-probability bars on top, and the state
    # of the chase (required run-rate and wickets remaining) underneath.
    n_overs = len(overnum)
    positions = np.arange(n_overs)
    width = 0.5

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))

    # Team 2's bar is stacked on top of Team 1's
    ax1.bar(positions, Team1_prob[0:n_overs], width, label=Team1,
            color=cols[teams.index(Team1)])
    ax1.bar(positions, Team2_prob[0:n_overs], width, label=Team2,
            bottom=Team1_prob[0:n_overs], color=cols[teams.index(Team2)])
    ax1.set_ylabel('Win Probablity', fontsize=16, fontweight='bold')
    ax1.set_xlabel('After Over #', fontsize=16, fontweight='bold')
    ax1.set_title(str('%svs%s' % (Team1, Team2)), fontsize=16, fontweight='bold')
    ax1.legend(fontsize=16)
    ax1.set_xticks(positions)
    ax1.set_xticklabels(overnum)
    ax1.tick_params(labelsize=16)

    # Required run-rate as a line on the left axis ...
    ax2.plot(positions, RRR[0:n_overs], 'bP--', label='Required RR',
             linewidth=5.0, fillstyle='full', markersize=10)
    ax2.set_xlabel('After Over #', fontsize=16, fontweight='bold')
    # ... and wickets remaining as bars on a second y-axis sharing the same x-axis
    ax3 = ax2.twinx()
    ax3.bar(positions, np.asarray(Wkts).flatten()[0:n_overs], width,
            label='Wkt Rem.', color='green', alpha=0.5)

    ax2.set_xticks(positions)
    ax2.set_xticklabels(overnum)
    ax2.tick_params(axis='y', labelsize=16, labelcolor='blue')
    ax2.tick_params(axis='x', labelsize=16, labelcolor='black')
    ax2.set_ylabel('Required RR', fontsize=16, fontweight='bold', color='blue')

    ax3.tick_params(labelsize=16, labelcolor='green')
    ax3.set_ylabel('Wickets Remaining', fontsize=16, fontweight='bold', color='green')

    ax1.grid(linestyle='-.')
    ax2.grid(linestyle='-.')
    # To save the figure: plt.savefig('results/%svs%s_overbyover.png'%(Team1,Team2))
    plt.show()
else:
    # Single checkpoint: pie chart of the two win probabilities.
    # `win_prob` and `over_num` are the values left by the prediction loop.
    plt.subplots(figsize=(10, 8))
    sizes = [np.mean(win_prob), 1 - np.mean(win_prob)]
    plt.pie(sizes, labels=[Team1, Team2], autopct='%1.1f%%', wedgeprops=dict(width=0.65),
            colors=[cols[teams.index(Team1)], cols[teams.index(Team2)]],
            textprops={'fontsize': 14, 'fontweight': 'bold'},
            shadow=True, startangle=90)

    # `num + 14` turns the position in the test file into a match number --
    # presumably because the training data ends at match #14 of the season; TODO confirm.
    # The check must use the scalar `over_num`: comparing the list `overnum`
    # with 0 is always true and would hide the "After 1st Innings" title.
    if over_num != 0:
        plt.title('%s vs %s, Match # %d, Prediction After %d overs 2nd Innings' % (Team1, Team2, num + 14, over_num),
                  fontsize=16, fontweight='bold')
    else:
        plt.title('%s vs %s, Match # %d, Prediction After 1st Innings' % (Team1, Team2, num + 14),
                  fontsize=16, fontweight='bold')
    # To save the figure: plt.savefig('results/%svs%s_pie.png'%(Team1,Team2))
    plt.show()