In [1]:
import pandas as pd
import pickle
import re
import os
import warnings
warnings.filterwarnings("ignore")

# Read the pre-computed per-player score table and the Player-of-the-Match
# counts; both pickles are looked up relative to the working directory.
player_scores = pd.read_pickle("Player_Scores.pkl")
pom_frequency = pd.read_pickle("pom_frequency.pkl")


# Print each frame's column index so the schema relied on below is visible
for message, frame in (
    ("Columns in player_scores DataFrame:", player_scores),
    ("Columns in pom_frequency DataFrame:", pom_frequency),
):
    print(message, frame.columns)

Columns in player_scores DataFrame: Index(['Player', 'Combined Form_Batting', 'Combined Consistency_Batting',
       'Combined Form_Bowling', 'Combined Consistency_Bowling',
       'Combined Form_Fielding', 'Combined Consistency_Fielding'],
      dtype='object')
Columns in pom_frequency DataFrame: Index(['Player_of_Match', 'Frequency'], dtype='object')


## Preprocessing Pipeline

In [2]:
# Squads from the 2023 player list (player form was calculated on 2022 data).
# Maps each team name to its list of player names.
teams_players = {
    "Rajasthan Royals": ['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D Padikkal', 'SO Hetmyer', 'R Ashwin', 'R Parag', 'TA Boult', 'OC McCoy', 'M Prasidh Krishna', 'YS Chahal'],
    "Royal Challengers Bangalore": ['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ Maxwell', 'MK Lomror', 'KD Karthik', 'Shahbaz Ahmed', 'PWH de Silva', 'HV Patel', 'JR Hazlewood', 'Mohammed Siraj'],
    "Sunrisers Hyderabad": ['PK Garg', 'Abhishek Sharma', 'RA Tripathi', 'AK Markram', 'N Pooran', 'Washington Sundar', 'R Shepherd', 'J Suchith', 'B Kumar', 'Umran Malik', 'Fazalhaq Farooqi'],
    "Delhi Capitals": ['PP Shaw', 'DA Warner', 'MR Marsh', 'RR Pant', 'SN Khan', 'R Powell', 'AR Patel', 'SN Thakur', 'Kuldeep Yadav', 'A Nortje', 'KK Ahmed'],
    "Chennai Super Kings": ['RD Gaikwad', 'DP Conway', 'MM Ali', 'N Jagadeesan', 'AT Rayudu', 'MS Dhoni', 'MJ Santner', 'Simarjeet Singh', 'Mukesh Choudhary', 'PH Solanki', 'M Pathirana'],
    "Gujarat Titans": ['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pandya', 'DA Miller', 'R Tewatia', 'Rashid Khan', 'R Sai Kishore', 'LH Ferguson', 'Yash Dayal', 'Mohammed Shami'],
    "Kolkata Knight Riders": ['VR Iyer', 'AM Rahane', 'N Rana', 'SS Iyer', 'SW Billings', 'RK Singh', 'AD Russell', 'SP Narine', 'UT Yadav', 'TG Southee', 'CV Varun'],
    "Punjab Kings": ['JM Bairstow', 'S Dhawan', 'PBB Rajapaksa', 'LS Livingstone', 'MA Agarwal', 'JM Sharma', 'Harpreet Brar', 'R Dhawan', 'RD Chahar', 'K Rabada', 'Arshdeep Singh'],
    "Mumbai Indians": ['Ishan Kishan', 'RG Sharma', 'SA Yadav', 'Tilak Varma', 'KA Pollard', 'TH David', 'DR Sams', 'M Ashwin', 'K Kartikeya', 'JJ Bumrah', 'RP Meredith'],
    "Lucknow Super Giants": ['Q de Kock', 'KL Rahul', 'DJ Hooda', 'KH Pandya', 'A Badoni', 'MP Stoinis', 'JO Holder', 'PVD Chameera', 'Avesh Khan', 'Mohsin Khan', 'Ravi Bishnoi'],
}

# Flatten the mapping into a long (Team, Player) table: one row per squad member,
# teams in dict order and players in list order.
teams_players_df = pd.DataFrame(
    {
        'Team': [team for team, squad in teams_players.items() for _ in squad],
        'Player': [player for squad in teams_players.values() for player in squad],
    }
)
print("Teams and Players DataFrame:\n", teams_players_df)

Teams and Players DataFrame:
                      Team        Player
0        Rajasthan Royals   YBK Jaiswal
1        Rajasthan Royals    JC Buttler
2        Rajasthan Royals     SV Samson
3        Rajasthan Royals    D Padikkal
4        Rajasthan Royals    SO Hetmyer
..                    ...           ...
105  Lucknow Super Giants     JO Holder
106  Lucknow Super Giants  PVD Chameera
107  Lucknow Super Giants    Avesh Khan
108  Lucknow Super Giants   Mohsin Khan
109  Lucknow Super Giants  Ravi Bishnoi

[110 rows x 2 columns]


In [3]:
# Helper function to calculate team impact stats
def get_team_impact_stats(players, stats_df, score_type):
    """Return the total of one score column over a team's players.

    Rows of `stats_df` are kept when their 'Player' value appears in
    `players`; the `score_type` column of those rows is summed.
    """
    on_team = stats_df['Player'].isin(players)
    return stats_df.loc[on_team, score_type].sum()

# Function to calculate top performers
def get_top_performers(players, stats_df, score_type, top_n=4):
    """Return the `top_n` largest `score_type` values among a team's players.

    The result is a plain list in descending order; it is shorter than
    `top_n` when fewer matching rows exist.
    """
    team_rows = stats_df.loc[stats_df['Player'].isin(players)]
    best_scores = team_rows[score_type].nlargest(top_n)
    return best_scores.tolist()

def preprocess_input(team1, team2, teams_players_df, player_scores, pom_frequency, venue):
    """Assemble the one-row feature frame describing a single fixture.

    Columns are produced in this order:

    1. ``Team1_<team>`` / ``Team2_<team>`` 0/1 indicators for every name in the
       module-level ``IPL_TEAMS`` (the selected teams are set to 1).
    2. ``Venue_<venue>`` 0/1 indicators for every name in the module-level
       ``venues`` (the selected venue is set to 1).
    3. ``POM_Frequency``: the larger of the two squads' highest
       Player-of-the-Match frequency (0 for a squad with no entry).
    4. For each ``Combined <metric>_<discipline>`` score column: both squads'
       totals, then the four highest individual scores of each squad
       (padded with 0 when a squad has fewer than four scores).

    ``IPL_TEAMS`` and ``venues`` are read when the function is called, so
    whichever assignment ran last in the notebook is the one used.

    Parameters
    ----------
    team1, team2 : str
        Team names, matched against the 'Team' column of ``teams_players_df``.
    teams_players_df : pandas.DataFrame
        Roster table with 'Team' and 'Player' columns.
    player_scores : pandas.DataFrame
        Has a 'Player' column plus the six
        ``Combined {Form,Consistency}_{Batting,Bowling,Fielding}`` columns.
    pom_frequency : pandas.DataFrame
        Has 'Player_of_Match' and 'Frequency' columns. It is only read; no
        column is added to the caller's frame.
    venue : str
        Venue name used for the ``Venue_<venue>`` indicator.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns listed above.
    """
    print(f"Preprocessing input for teams {team1} and {team2} at venue {venue}")

    # Get players for each team
    team1_players = teams_players_df.loc[teams_players_df['Team'] == team1, 'Player'].tolist()
    team2_players = teams_players_df.loc[teams_players_df['Team'] == team2, 'Player'].tolist()

    # An unknown team name would otherwise silently produce all-zero player features.
    for team_name, roster in ((team1, team1_players), (team2, team2_players)):
        if not roster:
            print(f"Warning: no players listed for {team_name}; its player-based features will all be 0.")

    # One-hot team indicators
    data = {f'Team1_{team}': 0 for team in IPL_TEAMS}
    data.update({f'Team2_{team}': 0 for team in IPL_TEAMS})
    data[f'Team1_{team1}'] = 1
    data[f'Team2_{team2}'] = 1

    # Ensure venue column matches model input format
    data.update({f'Venue_{v}': 0 for v in venues})
    data[f'Venue_{venue}'] = 1  # Set venue feature

    # Determine the higher POM Frequency between the two teams.
    # The player -> team labels are kept in a separate Series (aligned on the
    # frame's index) instead of being written into pom_frequency, so the
    # caller's frame is left untouched.
    team_assignments = {player: team1 for player in team1_players}
    team_assignments.update({player: team2 for player in team2_players})
    player_team = pom_frequency['Player_of_Match'].map(team_assignments)
    team_max_pom = pom_frequency['Frequency'].groupby(player_team).max().to_dict()
    data['POM_Frequency'] = max(team_max_pom.get(team1, 0), team_max_pom.get(team2, 0))

    # Collect every remaining feature in the dict and build the frame once at
    # the end; inserting columns one by one fragments a DataFrame.
    top_n = 4
    for score_type in ['Batting', 'Bowling', 'Fielding']:
        for metric in ['Form', 'Consistency']:
            combined_type = f'Combined {metric}_{score_type}'

            # Aggregate stats
            data[f'Team1_{metric}_{score_type}'] = get_team_impact_stats(team1_players, player_scores, combined_type)
            data[f'Team2_{metric}_{score_type}'] = get_team_impact_stats(team2_players, player_scores, combined_type)

            # Top performers, padded with 0 so both squads always yield top_n values
            team1_top_scores = get_top_performers(team1_players, player_scores, combined_type, top_n=top_n)
            team2_top_scores = get_top_performers(team2_players, player_scores, combined_type, top_n=top_n)
            team1_top_scores = (team1_top_scores + [0] * top_n)[:top_n]
            team2_top_scores = (team2_top_scores + [0] * top_n)[:top_n]

            for rank, (score1, score2) in enumerate(zip(team1_top_scores, team2_top_scores), start=1):
                data[f'Team1_{metric}_{score_type}_Top{rank}'] = score1
                data[f'Team2_{metric}_{score_type}_Top{rank}'] = score2

    return pd.DataFrame([data])

# Defining the venues in the same format as your model inputs
# NOTE(review): `venues` is assigned again, with a shorter list, in the
# "Making the prediction" section further down. preprocess_input reads the
# global at call time, so once that later cell has run this list is no longer
# the one in effect.
venues = [
    "Arun Jaitley Stadium, Delhi", "Barabati Stadium", "Brabourne Stadium", "Brabourne Stadium, Mumbai", "Buffalo Park", "De Beers Diamond Oval", "Dr DY Patil Sports Academy", 
    "Dr DY Patil Sports Academy, Mumbai", "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium", "Dubai International Cricket Stadium", "Eden Gardens", "Eden Gardens, Kolkata", 
    "Feroz Shah Kotla", "Green Park", "Himachal Pradesh Cricket Association Stadium", "Holkar Cricket Stadium", "JSCA International Stadium Complex", "Kingsmead", 
    "M Chinnaswamy Stadium", "M.Chinnaswamy Stadium", "MA Chidambaram Stadium", "MA Chidambaram Stadium, Chepauk", "MA Chidambaram Stadium, Chepauk, Chennai", 
    "Maharashtra Cricket Association Stadium", "Maharashtra Cricket Association Stadium, Pune", "Narendra Modi Stadium, Ahmedabad", "Nehru Stadium", "New Wanderers Stadium", 
    "Newlands", "OUTsurance Oval", "Punjab Cricket Association IS Bindra Stadium", "Punjab Cricket Association IS Bindra Stadium, Mohali", 
    "Punjab Cricket Association Stadium, Mohali", "Rajiv Gandhi International Stadium", "Rajiv Gandhi International Stadium, Uppal", "Sardar Patel Stadium, Motera", 
    "Saurashtra Cricket Association Stadium", "Sawai Mansingh Stadium", "Shaheed Veer Narayan Singh International Stadium", "Sharjah Cricket Stadium", "Sheikh Zayed Stadium", 
    "St George's Park", "Subrata Roy Sahara Stadium", "SuperSport Park", "Vidarbha Cricket Association Stadium, Jamtha", "Wankhede Stadium", "Wankhede Stadium, Mumbai", 
    "Zayed Cricket Stadium, Abu Dhabi"
]

# Team names used by preprocess_input for the Team1_<team> / Team2_<team>
# indicator columns.
# NOTE(review): this list contains "Kings XI Punjab", whereas the roster dict
# `teams_players` is keyed "Punjab Kings"; looking up "Kings XI Punjab" in the
# roster therefore finds no players. "Kochi Tuskers Kerala" has no roster
# entry either. Confirm which spelling the trained model expects.
IPL_TEAMS = [
    "Chennai Super Kings", "Delhi Capitals", "Gujarat Titans", "Kings XI Punjab",
    "Kochi Tuskers Kerala", "Kolkata Knight Riders", "Lucknow Super Giants",
    "Mumbai Indians", "Rajasthan Royals", "Royal Challengers Bangalore", "Sunrisers Hyderabad"
]

In [4]:
# Deserialize the pre-trained classifier from its pickle file
model_filename = 'majority_voting_classifier.pkl'
with open(model_filename, mode='rb') as model_file:
    voting_clf = pickle.load(model_file)

In [5]:
# Load the pre-trained model, but only when the pickle exists and is non-empty
model_filename = 'majority_voting_classifier.pkl'
if not os.path.exists(model_filename) or os.path.getsize(model_filename) <= 0:
    print("File does not exist or is empty.")
else:
    with open(model_filename, mode='rb') as model_file:
        voting_clf = pickle.load(model_file)
    print("Model loaded successfully!")

# Function to sanitize feature names
def sanitize_feature_names(df):
    """Rewrite `df`'s column names in place into identifier-like names.

    For each name: characters that are neither word characters nor
    whitespace are removed, every whitespace run becomes a single
    underscore, and a name starting with digits gets a leading underscore.
    The same `df` object is returned.
    """
    def _clean(name):
        without_punctuation = re.sub(r'[^\w\s]', '', name)
        underscored = re.sub(r'\s+', '_', without_punctuation)
        return re.sub(r'^(\d+)', r'_\1', underscored)

    df.columns = list(map(_clean, df.columns))
    return df

Model loaded successfully!


In [6]:
# Read the pickled input-feature frame; its column labels, in order, are the
# feature names the prediction step aligns its input to.
input_features_df = pd.read_pickle('Input_features.pkl')
model_features = list(input_features_df.columns)

## Making the prediction

In [7]:
# Team menu shown by run_prediction_pipeline and used by preprocess_input for
# the Team1_<team> / Team2_<team> indicator columns.
# NOTE(review): this re-assigns IPL_TEAMS with the same names as the earlier
# definition next to preprocess_input. It lists "Kings XI Punjab" while the
# roster dict `teams_players` is keyed "Punjab Kings", so that menu entry has
# no players in the roster.
IPL_TEAMS = [
    "Chennai Super Kings", "Delhi Capitals", "Gujarat Titans", "Kings XI Punjab",
    "Kochi Tuskers Kerala", "Kolkata Knight Riders", "Lucknow Super Giants",
    "Mumbai Indians", "Rajasthan Royals", "Royal Challengers Bangalore", "Sunrisers Hyderabad"
]

# Venue menu shown by run_prediction_pipeline.
# NOTE(review): this replaces the longer `venues` list defined earlier next to
# preprocess_input; after this cell runs, both the menu and the Venue_<venue>
# indicator columns are built from this shorter list.
venues = [
    "Arun Jaitley Stadium, Delhi", "Barabati Stadium", "Brabourne Stadium", "Buffalo Park", "De Beers Diamond Oval", 
    "Dr DY Patil Sports Academy, Mumbai", "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium", "Dubai International Cricket Stadium", "Eden Gardens, Kolkata", 
    "Feroz Shah Kotla", "Green Park", "Himachal Pradesh Cricket Association Stadium", "Holkar Cricket Stadium", "JSCA International Stadium Complex", "Kingsmead", 
    "M Chinnaswamy Stadium", "MA Chidambaram Stadium, Chepauk",  
    "Maharashtra Cricket Association Stadium, Pune", "Narendra Modi Stadium, Ahmedabad", "Nehru Stadium", "New Wanderers Stadium", 
    "Newlands", "OUTsurance Oval", 
    "Punjab Cricket Association Stadium, Mohali", "Rajiv Gandhi International Stadium, Uppal", "Sardar Patel Stadium, Motera", 
    "Saurashtra Cricket Association Stadium", "Sawai Mansingh Stadium", "Shaheed Veer Narayan Singh International Stadium", "Sharjah Cricket Stadium",  
    "St George's Park", "Subrata Roy Sahara Stadium", "SuperSport Park", "Vidarbha Cricket Association Stadium, Jamtha", "Wankhede Stadium", "Zayed Cricket Stadium, Abu Dhabi"
]

In [8]:
def run_prediction_pipeline():
    """Prompt for two teams and a venue, then print team 1's predicted win probability.

    Relies on notebook-level names defined in earlier cells: ``IPL_TEAMS``,
    ``venues``, ``teams_players_df``, ``player_scores``, ``pom_frequency``,
    ``model_features``, ``preprocess_input`` and ``sanitize_feature_names``.

    Returns ``None``; every result and error message is printed. The function
    returns early when the model file is missing or empty.
    """
    # Load the pre-trained model.
    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only point this at a model file you trust.
    model_filename = 'majority_voting_classifier.pkl'
    if not (os.path.exists(model_filename) and os.path.getsize(model_filename) > 0):
        print("File does not exist or is empty.")
        return
    with open(model_filename, 'rb') as file:
        voting_clf = pickle.load(file)
    print("Model loaded successfully!")

    # Function to select team or venue
    def select_option(options):
        """Show a numbered menu and keep asking until a valid number is entered.

        A loop is used rather than recursion so that repeated invalid input
        cannot grow the call stack.
        """
        while True:
            for idx, option in enumerate(options, 1):
                print(f"{idx}. {option}")
            selection = input("Choose an option by number: ").strip()
            # isdecimal() only accepts characters int() can parse; isdigit()
            # also accepts e.g. superscript digits, on which int() raises.
            if selection.isdecimal() and 1 <= int(selection) <= len(options):
                return options[int(selection) - 1]
            print("Invalid selection.")

    print("Available Teams:")
    team1 = select_option(IPL_TEAMS)
    print(f"Selected Team 1: {team1}")

    print("Available Teams:")
    team2 = select_option(IPL_TEAMS)
    # A team cannot play itself; ask again until the second pick differs.
    while team2 == team1:
        print("Team 2 must be different from Team 1.")
        team2 = select_option(IPL_TEAMS)
    print(f"Selected Team 2: {team2}")

    print("Available Venues:")
    venue = select_option(venues)
    print(f"Selected Venue: {venue}")
    # select_option only ever returns an element of the list it was given, so
    # no further membership check on team1/team2/venue is needed here.

    # Preprocessing input and preparing the data
    match_data_df = preprocess_input(team1, team2, teams_players_df, player_scores, pom_frequency, venue)
    match_data_df = sanitize_feature_names(match_data_df)

    # Report non-zero inputs that the model has no feature for; they would
    # otherwise be dropped without any notice by the alignment below.
    unused = match_data_df.loc[:, ~match_data_df.columns.isin(model_features)]
    ignored_inputs = unused.columns[(unused != 0).any(axis=0)].tolist()
    if ignored_inputs:
        print(f"Warning: the model has no feature for {ignored_inputs}; these inputs are ignored.")

    # Align to the model's feature list in one step: missing features are
    # added as 0, unknown columns are dropped, and the order follows model_features.
    match_data_df = match_data_df.reindex(columns=model_features, fill_value=0)

    # Prediction
    win_probability = voting_clf.predict_proba(match_data_df)[:, 1]  # Assuming positive class is at index 1
    print(f"Predicted probability of {team1} winning against {team2} at {venue}: {win_probability[0] * 100:.2f}%")

# Run the prediction pipeline
# The guard holds when this code executes as the top-level program; the
# captured output below shows the pipeline running interactively from this cell.
if __name__ == "__main__":
    run_prediction_pipeline()

Model loaded successfully!
Available Teams:
1. Chennai Super Kings
2. Delhi Capitals
3. Gujarat Titans
4. Kings XI Punjab
5. Kochi Tuskers Kerala
6. Kolkata Knight Riders
7. Lucknow Super Giants
8. Mumbai Indians
9. Rajasthan Royals
10. Royal Challengers Bangalore
11. Sunrisers Hyderabad
Choose an option by number: 8
Selected Team 1: Mumbai Indians
Available Teams:
1. Chennai Super Kings
2. Delhi Capitals
3. Gujarat Titans
4. Kings XI Punjab
5. Kochi Tuskers Kerala
6. Kolkata Knight Riders
7. Lucknow Super Giants
8. Mumbai Indians
9. Rajasthan Royals
10. Royal Challengers Bangalore
11. Sunrisers Hyderabad
Choose an option by number: 10
Selected Team 2: Royal Challengers Bangalore
Available Venues:
1. Arun Jaitley Stadium, Delhi
2. Barabati Stadium
3. Brabourne Stadium
4. Buffalo Park
5. De Beers Diamond Oval
6. Dr DY Patil Sports Academy, Mumbai
7. Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium
8. Dubai International Cricket Stadium
9. Eden Gardens, Kolkata
10. Feroz Shah Kotla
11