In [84]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
import seaborn as sb
sb.set_style("white")
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
import time

import warnings
warnings.filterwarnings('ignore')

In [63]:
import tensorflow as tf
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Dense,Input,Dropout
from tensorflow.keras.utils import to_categorical,plot_model

In [2]:
data = pd.read_csv("C:/Users/hp/Downloads/Studies/Data Mining/cp3/latest.csv")
data

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.1,India,Sri Lanka,RG Sharma,Shubman Gill,...,0,,,,,,,,,
1,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.2,India,Sri Lanka,RG Sharma,Shubman Gill,...,0,,,,,,bowled,RG Sharma,,
2,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.3,India,Sri Lanka,V Kohli,Shubman Gill,...,0,,,,,,,,,
3,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.4,India,Sri Lanka,V Kohli,Shubman Gill,...,0,,,,,,,,,
4,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.5,India,Sri Lanka,V Kohli,Shubman Gill,...,0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22574,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,34.5,New Zealand,South Africa,MJ Henry,GD Phillips,...,0,,,,,,,,,
22575,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,34.6,New Zealand,South Africa,MJ Henry,GD Phillips,...,0,,,,,,,,,
22576,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,35.1,New Zealand,South Africa,GD Phillips,MJ Henry,...,0,,,,,,,,,
22577,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,35.2,New Zealand,South Africa,GD Phillips,MJ Henry,...,0,,,,,,,,,


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22579 entries, 0 to 22578
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   match_id                22579 non-null  int64  
 1   season                  22579 non-null  object 
 2   start_date              22579 non-null  object 
 3   venue                   22579 non-null  object 
 4   innings                 22579 non-null  int64  
 5   ball                    22579 non-null  float64
 6   batting_team            22579 non-null  object 
 7   bowling_team            22579 non-null  object 
 8   striker                 22579 non-null  object 
 9   non_striker             22579 non-null  object 
 10  bowler                  22579 non-null  object 
 11  runs_off_bat            22579 non-null  int64  
 12  extras                  22579 non-null  int64  
 13  wides                   580 non-null    float64
 14  noballs                 50 non-null   

In [4]:
for i in data.columns:
    if len(data[i].unique()):
        print(data[i].value_counts())
        print()
        print()

27         618
19         616
22         614
8          613
23         601
1384433    601
21         599
1384427    598
18         597
1384430    597
7          597
4          594
26         589
6          585
30         582
11         575
28         566
10         561
1384429    558
17         558
2          557
5          557
13         556
1384431    545
15         539
1          525
9          525
32         524
16         519
29         518
1384425    489
14         485
1384428    485
31         473
1384426    470
20         448
24         445
12         442
3          441
1384424    431
1384432    425
25         361
Name: match_id, dtype: int64


2023/24    22579
Name: season, dtype: int64


2023-10-10    1210
2023-10-28    1184
2023-11-04    1068
2023-10-21    1064
2023-10-07    1035
2023-10-23     614
2023-11-10     601
2023-10-24     601
2023-10-22     599
2023-11-07     597
2023-10-20     597
2023-10-27     589
2023-10-09     585
2023-10-30     582
2023-10-13     575
2023-10-

In [5]:
print(data.isnull().sum())

match_id                      0
season                        0
start_date                    0
venue                         0
innings                       0
ball                          0
batting_team                  0
bowling_team                  0
striker                       0
non_striker                   0
bowler                        0
runs_off_bat                  0
extras                        0
wides                     21999
noballs                   22529
byes                      22547
legbyes                   22414
penalty                   22578
wicket_type               21938
player_dismissed          21938
other_wicket_type         22578
other_player_dismissed    22578
dtype: int64


### Data Pre-processing and Feature Engineering

In [6]:
#creating a column 'wicket' with value 1 if a wicket has fallen in that ball
data["wicket"] = data.apply(lambda row: 1 if isinstance(row["player_dismissed"], str) 
                            or isinstance(row["other_player_dismissed"], str) else 0, axis=1)

In [7]:
#dropping 'Other Wicket Type' and 'Other Player Dismissed' since they are not required
data.drop(['other_wicket_type','other_player_dismissed'], axis = 1, inplace = True)

In [8]:
#Filling None values in runs-related attributes
data[["wides", "noballs", "byes", "legbyes", "penalty"]] = data[["wides", "noballs", "byes", "legbyes", "penalty"]].fillna(0)

In [9]:
#Filling None values in wickets-related attributes
data.wicket_type.fillna("no wicket", inplace=True)
data.player_dismissed.fillna("no wicket", inplace=True)

In [10]:
#Separating over number and ball number
data["ball"] = data["ball"].astype(str)
data[["over", "ball_num"]] = data["ball"].str.split(".", expand=True).astype(int)
data["ball"] = data["ball"].astype(float)
data["over"] = data["over"] + 1
data["ball_left"] = 306 - (data["over"]*6 + data["ball_num"])
data["ball_left"] = data["ball_left"].clip(lower=0)

In [11]:
#creating a new column to add total runs scored per ball
data["total_runs"] = data["runs_off_bat"] + data["extras"]

In [12]:
#creating a new column to keep track of total runs scored so far in the innings
data["score"] = data.groupby(["match_id", "innings"])["total_runs"].cumsum()

In [13]:
#creating a new column to keep track of wickets remaining
data["wickets_remaining"] = data.groupby(["match_id","innings"])["wicket"].apply(lambda x: 10 - x.cumsum())

In [14]:
#calculating current run rate
data["run_rate"] = data.groupby(["match_id","innings"]).apply(lambda x: (x["score"]*6)/(300-x["ball_left"])).reset_index(level=[0,1], drop=True)

In [15]:
#setting targets in the data
targets = data[data["innings"] == 1].groupby("match_id")["score"].max().reset_index()
targets.rename(columns={"score": "target"}, inplace=True)

data = data.merge(targets, on="match_id", how="left")
data.loc[data['innings'] == 1, 'target'] = 0
data.loc[data['innings'] == 2, 'target'] += 1

In [16]:
#Finding the winning team
idx_winner = data.loc[data.groupby('match_id')['score'].idxmax()]
winners = idx_winner.set_index('match_id')['batting_team']
data['winner'] = data['match_id'].map(winners)

In [17]:
data

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,wicket,over,ball_num,ball_left,total_runs,score,wickets_remaining,run_rate,target,winner
0,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.1,India,Sri Lanka,RG Sharma,Shubman Gill,...,0,1,1,299,4,4,10,24.000000,0,India
1,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.2,India,Sri Lanka,RG Sharma,Shubman Gill,...,1,1,2,298,0,4,9,12.000000,0,India
2,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.3,India,Sri Lanka,V Kohli,Shubman Gill,...,0,1,3,297,0,4,9,8.000000,0,India
3,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.4,India,Sri Lanka,V Kohli,Shubman Gill,...,0,1,4,296,0,4,9,6.000000,0,India
4,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.5,India,Sri Lanka,V Kohli,Shubman Gill,...,0,1,5,295,0,4,9,4.800000,0,India
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22574,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,34.5,New Zealand,South Africa,MJ Henry,GD Phillips,...,0,35,5,91,0,161,1,4.622010,358,South Africa
22575,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,34.6,New Zealand,South Africa,MJ Henry,GD Phillips,...,0,35,6,90,0,161,1,4.600000,358,South Africa
22576,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,35.1,New Zealand,South Africa,GD Phillips,MJ Henry,...,0,36,1,89,0,161,1,4.578199,358,South Africa
22577,32,2023/24,2023-11-01,"Maharashtra Cricket Association Stadium, Pune",2,35.2,New Zealand,South Africa,GD Phillips,MJ Henry,...,0,36,2,88,6,167,1,4.726415,358,South Africa


In [18]:
#Setting Req runs and req run rate for second innings
mask = data['target'] > 0

data.loc[mask, 'req_runs'] = data.loc[mask, 'target'] - data.loc[mask, 'score']
data.loc[mask, 'req_rr'] = (data.loc[mask, 'req_runs'] * 6) / data.loc[mask, 'ball_left']

In [20]:
data.fillna(0, inplace=True)

In [21]:
#Creating Target Variable named 'result' which shows 1 if the batting team won, 0 otherwise
data["result"] = data.apply(lambda x: 1 if x["batting_team"] == x["winner"] else 0, axis=1)

In [22]:
data.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,ball_left,total_runs,score,wickets_remaining,run_rate,target,winner,req_runs,req_rr,result
0,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.1,India,Sri Lanka,RG Sharma,Shubman Gill,...,299,4,4,10,24.0,0,India,0.0,0.0,1
1,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.2,India,Sri Lanka,RG Sharma,Shubman Gill,...,298,0,4,9,12.0,0,India,0.0,0.0,1
2,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.3,India,Sri Lanka,V Kohli,Shubman Gill,...,297,0,4,9,8.0,0,India,0.0,0.0,1
3,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.4,India,Sri Lanka,V Kohli,Shubman Gill,...,296,0,4,9,6.0,0,India,0.0,0.0,1
4,1384424,2023/24,2023-11-02,"Wankhede Stadium, Mumbai",1,0.5,India,Sri Lanka,V Kohli,Shubman Gill,...,295,0,4,9,4.8,0,India,0.0,0.0,1


### Model to predict Winner of the match

In [23]:
data.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'wicket', 'over',
       'ball_num', 'ball_left', 'total_runs', 'score', 'wickets_remaining',
       'run_rate', 'target', 'winner', 'req_runs', 'req_rr', 'result'],
      dtype='object')

In [28]:
#filtering required columns for modelling
df_model = data[["venue", "batting_team", "bowling_team", "ball", "score", "run_rate", "req_rr",
                      "ball_left", "req_runs",'wickets_remaining', "target", "result"]]

In [29]:
#shuffling the dataset
df_model = df_model.sample(frac=1)

In [30]:
df_model = df_model[~(df_model.req_rr > 100)]

In [31]:
X = df_model.iloc[:,:-1]
y = df_model.iloc[:, -1]

In [38]:
ohe_cols = ['venue', 'batting_team', 'bowling_team']
scale_cols = ['score', 'target','ball', 'run_rate', 'req_rr', 'ball_left', 'req_runs','wickets_remaining']

In [34]:
#Hyperparameter Tuning for
#   1. Logitistic Regression
#   2. Gradient Boosting
#   3. XGBoost
#   4. CatBoost
#   5. LightGBM Classifier
#   6. Deep Learning: MLP

param_grids = {
    'LogisticRegression': {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__penalty': ['l1', 'l2']
    },
    'GradientBoosting': {
        'classifier__learning_rate': [1e-5, 1e-3, 3e-1, 3e-5, 5e-1, 5e-5],
        'classifier__n_estimators': [50, 100, 200, 300],
        'classifier__max_depth': [3, 5, 7]
    },
    'XGBoost': {
        'classifier__learning_rate': [1e-5, 1e-3, 3e-1, 3e-5, 5e-1, 5e-5],
        'classifier__n_estimators': [50, 100, 200, 300],
        'classifier__max_depth': [3, 5, 7]
    },
    'CatBoost': {
        'classifier__learning_rate': [1e-5, 1e-3, 3e-1, 3e-5, 5e-1, 5e-5],
        'classifier__n_estimators': [50, 100, 200, 300],
        'classifier__depth': [3, 5, 7]
    },
    'LGBMClassifier': {
        'classifier__learning_rate': [1e-5, 1e-3, 3e-1, 3e-5, 5e-1, 5e-5],
        'classifier__n_estimators': [50, 100, 200, 300],
        'classifier__max_depth': [3, 5, 7]
    },
    'DNN': {
        'classifier__learning_rate_init': [1e-5, 1e-3, 3e-1, 3e-5, 5e-1, 5e-5],
        'classifier__batch_size': [8, 16, 32, 64],
        'classifier__alpha': [0.1, 0.01, 0.001]
    }
}

In [39]:
#Building the Pipeline

ct = ColumnTransformer(transformers=[('cat', OneHotEncoder(sparse=False, drop="first"), ohe_cols),
        ('num', StandardScaler(), scale_cols)])

pipelines = {
    'LogisticRegression': Pipeline([('transformer', ct), ('classifier', LogisticRegression())]),
    'GradientBoosting': Pipeline([('transformer', ct), ('classifier', GradientBoostingClassifier())]),
    'XGBoost': Pipeline([('transformer', ct), ('classifier', XGBClassifier())]),
    'CatBoost': Pipeline([('transformer', ct), ('classifier', CatBoostClassifier())]),
    'LGBMClassifier': Pipeline([('transformer', ct), ('classifier', LGBMClassifier())]),
    'DNN': Pipeline([('transformer', ct), ('classifier', MLPClassifier(max_iter=100))]),

}

In [40]:
#Running the models
start_time = time.time()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=83)

for model_name, pipeline in pipelines.items():
    print(f"Performing grid search for {model_name}")
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print(f"Best parameters found for {model_name}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy for {model_name}: {grid_search.best_score_:.4f}")

    test_accuracy = grid_search.score(X_test, y_test)
    print(f"Test set accuracy for {model_name}: {test_accuracy:.4f}\n")
    
    print("--- %s seconds ---" % (time.time() - start_time))
    print()
    print()
    start_time = time.time()

Performing grid search for LogisticRegression
Best parameters found for LogisticRegression: {'classifier__C': 100, 'classifier__penalty': 'l2'}
Best cross-validation accuracy for LogisticRegression: 0.9301
Test set accuracy for LogisticRegression: 0.9322

--- 4.3592143058776855 seconds ---


Performing grid search for GradientBoosting
Best parameters found for GradientBoosting: {'classifier__learning_rate': 0.3, 'classifier__max_depth': 3, 'classifier__n_estimators': 100}
Best cross-validation accuracy for GradientBoosting: 1.0000
Test set accuracy for GradientBoosting: 1.0000

--- 1145.0884323120117 seconds ---


Performing grid search for XGBoost
Best parameters found for XGBoost: {'classifier__learning_rate': 0.3, 'classifier__max_depth': 3, 'classifier__n_estimators': 100}
Best cross-validation accuracy for XGBoost: 1.0000
Test set accuracy for XGBoost: 1.0000

--- 62.598756551742554 seconds ---


Performing grid search for CatBoost
0:	learn: 0.5565997	total: 156ms	remaining: 7.65s

### Final Prediction using Categorical Boost

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=83)

ct = ColumnTransformer(transformers=[('cat', OneHotEncoder(sparse=False, drop="first"), ohe_cols),
        ('num', StandardScaler(), scale_cols)])

pipe = Pipeline([('transformer', ct),
    ('CatBoost', CatBoostClassifier(depth=3, learning_rate=0.3, n_estimators=50))])

In [83]:
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
acc = accuracy_score(y_test, pred)
acc

0:	learn: 0.5565997	total: 8.35ms	remaining: 409ms
1:	learn: 0.4560492	total: 16.5ms	remaining: 397ms
2:	learn: 0.3793275	total: 24.5ms	remaining: 383ms
3:	learn: 0.3459146	total: 32.2ms	remaining: 371ms
4:	learn: 0.2833504	total: 41ms	remaining: 369ms
5:	learn: 0.2611717	total: 49.4ms	remaining: 362ms
6:	learn: 0.2269433	total: 58ms	remaining: 356ms
7:	learn: 0.1957649	total: 65.7ms	remaining: 345ms
8:	learn: 0.1757062	total: 74ms	remaining: 337ms
9:	learn: 0.1543784	total: 82.1ms	remaining: 328ms
10:	learn: 0.1384024	total: 89.9ms	remaining: 319ms
11:	learn: 0.1280810	total: 97.4ms	remaining: 308ms
12:	learn: 0.1128893	total: 106ms	remaining: 301ms
13:	learn: 0.1077580	total: 113ms	remaining: 292ms
14:	learn: 0.0932243	total: 121ms	remaining: 283ms
15:	learn: 0.0820449	total: 129ms	remaining: 274ms
16:	learn: 0.0712899	total: 137ms	remaining: 266ms
17:	learn: 0.0639145	total: 145ms	remaining: 257ms
18:	learn: 0.0588002	total: 152ms	remaining: 248ms
19:	learn: 0.0516107	total: 160ms	r

1.0

In [85]:
report = classification_report(y_test, pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2751
           1       1.00      1.00      1.00      2894

    accuracy                           1.00      5645
   macro avg       1.00      1.00      1.00      5645
weighted avg       1.00      1.00      1.00      5645



### Calculating Run Rate for all the teams

In [92]:
idx_rr = data.groupby(['match_id', 'innings'])['ball'].idxmax()
cols = ['batting_team', 'score','run_rate','ball_left']
run_rate_df = data.loc[idx_rr, cols]
run_rate_df

Unnamed: 0,batting_team,score,run_rate,ball_left
5502,England,282,5.640000,0
5723,New Zealand,283,7.788991,82
6025,Pakistan,286,5.836735,6
6280,Netherlands,205,4.979757,53
6508,Afghanistan,156,4.178571,76
...,...,...,...,...
4000,Netherlands,179,4.794643,76
2755,Sri Lanka,171,3.664286,20
2897,New Zealand,172,7.371429,160
4907,Afghanistan,244,4.880000,0


In [94]:
run_rate_df['balls_played'] = 300-run_rate_df['ball_left']

team_avg_rr = pd.DataFrame()
team_avg_rr['total_runs'] = (run_rate_df.groupby(["batting_team"])["score"].sum())
team_avg_rr['total_balls_played'] = (run_rate_df.groupby(["batting_team"])["balls_played"].sum())
team_avg_rr

Unnamed: 0_level_0,total_runs,total_balls_played
batting_team,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,2095,2386
Australia,2324,2228
Bangladesh,1747,2150
England,1908,1973
India,2113,2078
Netherlands,1612,2012
New Zealand,2537,2328
Pakistan,2084,2044
South Africa,2685,2489
Sri Lanka,2048,2267


In [95]:
team_avg_rr['run_rate']= (team_avg_rr['total_runs']*6)/team_avg_rr['total_balls_played']
team_avg_rr

Unnamed: 0_level_0,total_runs,total_balls_played,run_rate
batting_team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,2095,2386,5.268231
Australia,2324,2228,6.258528
Bangladesh,1747,2150,4.875349
England,1908,1973,5.802331
India,2113,2078,6.101059
Netherlands,1612,2012,4.807157
New Zealand,2537,2328,6.53866
Pakistan,2084,2044,6.117417
South Africa,2685,2489,6.472479
Sri Lanka,2048,2267,5.420379


### Creating new DataFrame for SemiFinals Winners Prediction

In [87]:
X_test.columns

Index(['venue', 'batting_team', 'bowling_team', 'ball', 'score', 'run_rate',
       'req_rr', 'ball_left', 'req_runs', 'wickets_remaining', 'target'],
      dtype='object')

In [96]:
new_data = pd.DataFrame(np.array(["Wankhede Stadium, Mumbai","India",'New Zealand',0.1,0,6.101059,0,299,0,10,0]).reshape(1,-1))
new_data.columns = X_test.columns

new_data2 = pd.DataFrame({'venue': ["Wankhede Stadium, Mumbai","Eden Gardens, Kolkata","Eden Gardens, Kolkata"],
            'batting_team': ["New Zealand","South Africa", "Australia"], 'bowling_team': ['India', 'Australia','South Africa'],
            'ball': [0.1,0.1,0.1], 'score':[0,0,0], 'run_rate':[6.538660,6.472479,6.258528], 'req_rr':[0,0,0],
            'ball_left':[299,299,299], 'req_runs':[0,0,0], 'wickets_remaining':[10,10,10], 'target':[0,0,0]
           })

new_data = pd.concat([new_data, new_data2], ignore_index=True)
new_data

Unnamed: 0,venue,batting_team,bowling_team,ball,score,run_rate,req_rr,ball_left,req_runs,wickets_remaining,target
0,"Wankhede Stadium, Mumbai",India,New Zealand,0.1,0,6.101059,0,299,0,10,0
1,"Wankhede Stadium, Mumbai",New Zealand,India,0.1,0,6.53866,0,299,0,10,0
2,"Eden Gardens, Kolkata",South Africa,Australia,0.1,0,6.472479,0,299,0,10,0
3,"Eden Gardens, Kolkata",Australia,South Africa,0.1,0,6.258528,0,299,0,10,0


In [97]:
pred2 = pipe.predict(new_data)
pred2

array([1, 0, 1, 1], dtype=int64)

In [98]:
prob = pipe.predict_proba(new_data)
np.round(prob, 3)

array([[0.009, 0.991],
       [0.802, 0.198],
       [0.001, 0.999],
       [0.039, 0.961]])

According to our model India will win the first semi-finals and enter the finals. However, both South Africa and Australia show high probability of winning if given the chance to bat first. If South Africa gets to bat first, then it's chances of winning are 99%.

### Finals Winner Prediction

In [99]:
finals = pd.DataFrame({'venue': ["Narendra Modi Stadium, Ahmedabad","Narendra Modi Stadium, Ahmedabad","Narendra Modi Stadium, Ahmedabad","Narendra Modi Stadium, Ahmedabad"],
            'batting_team': ["India","India","South Africa", "Australia"], 'bowling_team': ['Australia','South Africa','India','India'],
            'ball': [0.1,0.1,0.1,0.1], 'score':[0,0,0,0], 'run_rate':[6.101059,6.101059,6.472479,6.258528], 'req_rr':[0,0,0,0],
            'ball_left':[299,299,299,299], 'req_runs':[0,0,0,0], 'wickets_remaining':[10,10,10,10], 'target':[0,0,0,0]
           })
finals.columns = X_test.columns

pred_finals = pipe.predict(finals)
pred_finals

array([1, 1, 1, 0], dtype=int64)

In [100]:
prob = pipe.predict_proba(finals)
np.round(prob, 3)

array([[0.042, 0.958],
       [0.169, 0.831],
       [0.426, 0.574],
       [0.98 , 0.02 ]])

If India plays the finals against Australia, India has 95% chance of winning if Australia bowls first. If India bowls first, then India has 98% chance of winning.

If India and South Africa play the finals, then India has 83% chance of winning if South Africa bowls first. Whereas, if India bowls, they have only 42% of winning the finals and ICC World Cup trophy.