In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [95]:
prev_wc_df = pd.read_csv('./wc_stats.csv')
odi_results_df = pd.read_csv('./odi_results.csv')

In [96]:
prev_wc_df.head()

Unnamed: 0,Team,Group,Previous \nappearances,Previous \r\ntitles,Previous\r\n finals,Previous\r\n semifinals,Current \r rank
0,England,A,12,1,4,6,7
1,South Africa,A,7,0,0,5,2
2,Netherlands,A,5,0,0,0,10
3,Pakistan,A,12,1,2,6,5
4,New Zealand,A,12,0,2,9,4


In [97]:
odi_results_df.head()

Unnamed: 0,date,Team_1,Team_2,Winner,Margin,Ground
0,"Jan 12, 2011",South Africa,India,South Africa,135 runs,Durban
1,"Jan 15, 2011",South Africa,India,India,1 run,Johannesburg
2,"Jan 16, 2011",Australia,England,Australia,6 wickets,Melbourne
3,"Jan 18, 2011",South Africa,India,India,2 wickets,Cape Town
4,"Jan 21, 2011",Australia,England,Australia,46 runs,Hobart


In [98]:
# Restricting to the teams in the 2023 World Cup.
worldcup_teams = ['England', 'South Africa', 'Netherlands', 'Pakistan', 'New Zealand', 'Sri Lanka', 'Afghanistan', 'Australia', 'Bangladesh', 'India']

results_team_1 = odi_results_df.loc[odi_results_df['Team_1'].isin(worldcup_teams)]
results_team_2 = odi_results_df.loc[odi_results_df['Team_2'].isin(worldcup_teams)]

results_df = pd.concat([results_team_1, results_team_2])

In [99]:
new_results_df = results_df.drop(['date','Margin', 'Ground'], axis=1)
new_results_df

Unnamed: 0,Team_1,Team_2,Winner
0,South Africa,India,South Africa
1,South Africa,India,India
2,Australia,England,Australia
3,South Africa,India,India
4,Australia,England,Australia
...,...,...,...
1572,India,Australia,India
1573,Bangladesh,New Zealand,New Zealand
1575,India,Australia,India
1576,Bangladesh,New Zealand,New Zealand


In [100]:
# Convert 'Team_1' and 'Team_2' from categorical variables to continuous inputs
final_encoded_df = pd.concat([pd.get_dummies(new_results_df['Team_1'], prefix='Team_1'),
                           pd.get_dummies(new_results_df['Team_2'], prefix='Team_2'),
                           new_results_df.drop(['Team_1', 'Team_2'], axis=1)], axis=1)


X = final_encoded_df.drop(['Winner'], axis=1)
y = final_encoded_df['Winner']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [101]:
final_encoded_df.head()

Unnamed: 0,Team_1_Afghanistan,Team_1_Australia,Team_1_Bangladesh,Team_1_Canada,Team_1_England,Team_1_Hong Kong,Team_1_India,Team_1_Ireland,Team_1_Kenya,Team_1_Nepal,...,Team_2_Oman,Team_2_Pakistan,Team_2_Scotland,Team_2_South Africa,Team_2_Sri Lanka,Team_2_U.A.E.,Team_2_U.S.A.,Team_2_West Indies,Team_2_Zimbabwe,Winner
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,South Africa
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,India
2,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Australia
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,India
4,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Australia


In [102]:
# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, 30],
    # Add other parameters you want to tune
}

# Create the RandomForestClassifier
rf = RandomForestClassifier(random_state=0)

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)




Best Parameters: {'max_depth': 10, 'n_estimators': 50}


In [103]:
# Get the best estimator from the grid search
best_rf = grid_search.best_estimator_

# Evaluate the model on the training set
train_score = best_rf.score(X_train, y_train)
print("Training Set Accuracy:", train_score)

# Evaluate the model on the test set
test_score = best_rf.score(X_test, y_test)
print("Test Set Accuracy:", test_score)


Training Set Accuracy: 0.6672672672672673
Test Set Accuracy: 0.645083932853717


In [104]:
# rf = RandomForestClassifier(n_estimators=100, max_depth=20,random_state=0)
# rf.fit(X_train, y_train) 


# score = rf.score(X_train, y_train)
# score2 = rf.score(X_test, y_test)


# print(f"Training set accuracy: {score:.3f}")
# print(f"Test set accuracy: {score2:.3f}")

In [105]:
# Team with higher ICC ranking will be favourite to win a match against team with lower ICC ranking

ranking_df = pd.read_csv('./icc_rankings.csv') 
fixtures_df = pd.read_csv('./group_stage.csv')

In [106]:
# Adding ICC ranks for each team in the fixtures df

# Merge fixtures_df with ranking_df for 'Team_1' positions
fixtures_df = fixtures_df.merge(ranking_df[['Team', 'Position']], left_on='Team_1', right_on='Team', how='left')
fixtures_df.rename(columns={'Position': 'first_position'}, inplace=True)
fixtures_df.drop('Team', axis=1, inplace=True)

# Merge fixtures_df with ranking_df for 'Team_2' positions
fixtures_df = fixtures_df.merge(ranking_df[['Team', 'Position']], left_on='Team_2', right_on='Team', how='left')
fixtures_df.rename(columns={'Position': 'second_position'}, inplace=True)
fixtures_df.drop('Team', axis=1, inplace=True)


In [107]:
fixtures_df.tail()

Unnamed: 0,Round Number,Date,Location,Team_1,Team_2,Group,Result,first_position,second_position
40,1,"Nov 9, 2023",Bengaluru,New Zealand,Sri Lanka,Group A,,5.0,7.0
41,1,"Nov 10, 2023",Ahmedabad,Afghanistan,South Africa,Group A,,9.0,3.0
42,1,"Nov 11, 2023",Pune,Australia,Bangladesh,Group A,,2.0,8.0
43,1,"Nov 11, 2023",Eden Gardens,England,Pakistan,Group A,,6.0,4.0
44,1,"Nov 12, 2023",Bengaluru,India,Netherlands,Group A,,1.0,


In [108]:
grp_stage = []

# Loop to add teams to group stage dataset based on the ranking position of each team
for index, row in fixtures_df.iterrows():
    if row['first_position'] < row['second_position']:
        grp_stage.append({'Team_1': row['Team_1'], 'Team_2': row['Team_2'], 'winning_team': None})
    else:
        grp_stage.append({'Team_1': row['Team_2'], 'Team_2': row['Team_1'], 'winning_team': None})
        
grp_stage = pd.DataFrame(grp_stage)
backup_pred_set = grp_stage
grp_stage.head()

Unnamed: 0,Team_1,Team_2,winning_team
0,New Zealand,England,
1,Pakistan,Netherlands,
2,Bangladesh,Afghanistan,
3,South Africa,Sri Lanka,
4,India,Australia,


In [109]:
# Convert categorical variables to continuous inputs
grp_stage_encoded = pd.concat([pd.get_dummies(grp_stage['Team_1'], prefix='Team_1'),
                               pd.get_dummies(grp_stage['Team_2'], prefix='Team_2'),
                               grp_stage.drop(['Team_1', 'Team_2'], axis=1)], axis=1)

# Make sure that that group stage df and training set df has same columns
missing_cols = set(final_encoded_df.columns) - set(grp_stage.columns)
for c in missing_cols:
    grp_stage[c] = 0
grp_stage = grp_stage[final_encoded_df.columns]

grp_stage = grp_stage.drop(['Winner'], axis=1)

In [110]:
# Making the points table for 2023 World Cup

points_table_df = pd.DataFrame(columns=['team', 'matches_played', 'won', 'lost', 'points'])
points_table_df.set_index('team', inplace=True)

for team in worldcup_teams:
    points_table_df.loc[team] = [0, 0, 0, 0]
points_table_df

Unnamed: 0_level_0,matches_played,won,lost,points
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
England,0,0,0,0
South Africa,0,0,0,0
Netherlands,0,0,0,0
Pakistan,0,0,0,0
New Zealand,0,0,0,0
Sri Lanka,0,0,0,0
Afghanistan,0,0,0,0
Australia,0,0,0,0
Bangladesh,0,0,0,0
India,0,0,0,0


In [111]:
# Predicting group match results and updating points table

predictions = best_rf.predict(grp_stage)
for i in range(fixtures_df.shape[0]):
    team1 = backup_pred_set.iloc[i, 1]
    team2 = backup_pred_set.iloc[i, 0]

    points_table_df.loc[team1, 'matches_played'] += 1
    points_table_df.loc[team2, 'matches_played'] += 1
    
    print(team1 + " vs " + team2)
    if predictions[i] == 1:
        print("Winner: " + team1)
        points_table_df.loc[team1, 'won'] += 1
        points_table_df.loc[team2, 'lost'] += 1
        points_table_df.loc[team1, 'points'] += 2
    else:
        print("Winner: " + team2)
        points_table_df.loc[team2, 'won'] += 1
        points_table_df.loc[team1, 'lost'] += 1
        points_table_df.loc[team2, 'points'] += 2

    # print(points_table_df)
    print("")

England vs New Zealand
Winner: New Zealand

Netherlands vs Pakistan
Winner: Pakistan

Afghanistan vs Bangladesh
Winner: Bangladesh

Sri Lanka vs South Africa
Winner: South Africa

Australia vs India
Winner: India

Netherlands vs New Zealand
Winner: New Zealand

Bangladesh vs England
Winner: England

Sri Lanka vs Pakistan
Winner: Pakistan

Afghanistan vs India
Winner: India

South Africa vs Australia
Winner: Australia

Bangladesh vs New Zealand
Winner: New Zealand

Pakistan vs India
Winner: India

Afghanistan vs England
Winner: England

Sri Lanka vs Australia
Winner: Australia

Netherlands vs South Africa
Winner: South Africa

Afghanistan vs New Zealand
Winner: New Zealand

Bangladesh vs India
Winner: India

Pakistan vs Australia
Winner: Australia

Netherlands vs Sri Lanka
Winner: Sri Lanka

England vs South Africa
Winner: South Africa

New Zealand vs India
Winner: India

Afghanistan vs Pakistan
Winner: Pakistan

Bangladesh vs South Africa
Winner: South Africa

Australia vs Netherlands


In [112]:
points_table_df = points_table_df.sort_values(by='points', ascending=False)
print(points_table_df)

              matches_played  won  lost  points
team                                           
India                      9    8     1      16
South Africa               9    7     2      14
Australia                  9    7     2      14
Pakistan                   9    6     3      12
Netherlands                9    5     4      10
New Zealand                9    5     4      10
England                    9    3     6       6
Sri Lanka                  9    3     6       6
Bangladesh                 9    1     8       2
Afghanistan                9    0     9       0


In [113]:
# Getting the teams for semi finals
top_4_teams = points_table_df.head(4)
top_4_team_names = top_4_teams.index.tolist()

print(top_4_team_names)

['India', 'South Africa', 'Australia', 'Pakistan']


In [114]:
semi_finals = [(top_4_team_names[0], top_4_team_names[3]),
               (top_4_team_names[1], top_4_team_names[2])]

In [115]:
def predict_finals(matches, ranking_df, final_encoded_df, best_rf):

    pred = []
    for match in matches:
        dict = {}
        
        team1 = match[0]
        team2 = match[1]

        team1_pos = ranking_df.loc[ranking_df['Team'] == team1, 'Position'].iloc[0]
        team2_pos = ranking_df.loc[ranking_df['Team'] == team2, 'Position'].iloc[0]

        if team1_pos < team2_pos:
            dict['Team_1'] = team1
            dict['Team_2'] = team2
        else:
            dict['Team_1'] = team2
            dict['Team_2'] = team1

        pred.append(dict)

    # print(pred)

    pred = pd.DataFrame(pred)
    backup_pred_set = pred

    pred = pd.concat([pd.get_dummies(pred['Team_1'], prefix='Team_1'),
                    pd.get_dummies(pred['Team_2'], prefix='Team_2')], axis=1)
    

    # Make sure that that pred df and training set df has same columns
    missing_cols2 = set(final_encoded_df.columns) - set(pred.columns)
    for c in missing_cols2:
        pred[c] = 0
    pred = pred[final_encoded_df.columns]

    pred = pred.drop(['Winner'], axis=1)

    
    if len(pred) == 2 :
        print("---------SEMI FINALS-----------\n")
    else :
        print("----------FINALS---------\n")

    
    
    predictions = best_rf.predict(pred)
    winners = []
    for i in range(len(pred)):
        team1 = backup_pred_set.iloc[i, 1]
        team2 = backup_pred_set.iloc[i, 0]

        print(str(team1) + " vs " + str(team2))

        if predictions[i] == 1:
            print("Winner: " + str(team1))
            winners.append(str(team1))
        else:
            print("Winner: " + str(team2))
            winners.append(str(team2))
        print("")

    if len(winners) == 2 :
        return [(winners[0], winners[1])]
    else :
        return winners[0]

In [116]:
# Results of semi finals
finals = predict_finals(semi_finals, ranking_df, final_encoded_df, best_rf)

---------SEMI FINALS-----------

Pakistan vs India
Winner: India

South Africa vs Australia
Winner: Australia



In [117]:
# Teams in finals

print(finals)

[('India', 'Australia')]


In [118]:
# Result of the finals

wc_winner = predict_finals(finals, ranking_df, final_encoded_df, best_rf)

print(f"{wc_winner} will win the 2023 Cricket World Cup")

----------FINALS---------

Australia vs India
Winner: India

India will win the 2023 Cricket World Cup


In [119]:
import pickle
pickle_out = open("classifier.pkl","wb")
pickle.dump(wc_winner, pickle_out)

pickle_out.close()