In [1]:
# Import essential libraries
import pandas as pd  # Data manipulation
import numpy as np  # Numerical operations
import matplotlib.pyplot as plt  # Data visualization
import seaborn as sns  # Statistical data visualization

# Import machine learning tools
from sklearn.model_selection import train_test_split  # Splitting data into training and testing sets
from sklearn.preprocessing import OneHotEncoder, StandardScaler  # Data preprocessing
from sklearn.compose import ColumnTransformer  # Handling categorical and numerical features
from sklearn.linear_model import LogisticRegression, LinearRegression  # Classification & Regression models
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor  # Random Forest models
from sklearn.model_selection import RandomizedSearchCV  # Hyperparameter tuning
from xgboost import XGBRegressor, XGBClassifier  # XGBoost models
from sklearn.pipeline import Pipeline  # Creating machine learning pipelines
from sklearn.metrics import accuracy_score, r2_score  # Model evaluation metrics

import joblib  # Saving and loading models

# Suppress warnings for clean output
import warnings
warnings.filterwarnings('ignore')

# Display full column width in pandas DataFrame
pd.set_option('display.max_colwidth', None)

In [2]:
# Load the datasets
deliveries = pd.read_csv('deliveries.csv')  # Ball-by-ball match details
matches = pd.read_csv('cricket_matches.csv')  # Match summary data

In [3]:
# Get unique match IDs from the deliveries dataset
unique_match_ids = deliveries["match_id"].unique()

In [4]:
# Randomly select 55 match IDs to remove from the dataset
match_ids_to_remove = np.random.choice(unique_match_ids, 55, replace=False)

In [5]:
# Remove the selected matches from the deliveries dataset
deliveries = deliveries[~deliveries["match_id"].isin(match_ids_to_remove)]

In [6]:
# Calculate total runs scored in the first innings for each match
total_score_df = deliveries.groupby(['match_id', 'inning']).sum()['total_runs'].reset_index()

In [7]:
# Filter out only first innings scores
total_score_df = total_score_df[total_score_df['inning'] == 1]

In [8]:
# Merge the total first innings scores with the matches dataset
match_df = matches.merge(total_score_df[['match_id', 'total_runs']], left_on='id', right_on='match_id')

In [9]:
# Define the list of teams to retain in the dataset
teams = [
    'Royal Challengers Bangalore', 'Mumbai Indians', 'Kolkata Knight Riders',
    'Rajasthan Royals', 'Chennai Super Kings', 'Sunrisers Hyderabad',
    'Delhi Capitals', 'Punjab Kings'
]

# Standardizing team names for consistency
# Updating 'Delhi Daredevils' to 'Delhi Capitals'
match_df['team1'] = match_df['team1'].str.replace('Delhi Daredevils', 'Delhi Capitals')
match_df['team2'] = match_df['team2'].str.replace('Delhi Daredevils', 'Delhi Capitals')
deliveries['bowling_team'] = deliveries['bowling_team'].str.replace('Delhi Daredevils', 'Delhi Capitals')
deliveries['batting_team'] = deliveries['batting_team'].str.replace('Delhi Daredevils', 'Delhi Capitals')

# Updating 'Deccan Chargers' to 'Sunrisers Hyderabad'
match_df['team1'] = match_df['team1'].str.replace('Deccan Chargers', 'Sunrisers Hyderabad')
match_df['team2'] = match_df['team2'].str.replace('Deccan Chargers', 'Sunrisers Hyderabad')
deliveries['bowling_team'] = deliveries['bowling_team'].str.replace('Deccan Chargers', 'Sunrisers Hyderabad')
deliveries['batting_team'] = deliveries['batting_team'].str.replace('Deccan Chargers', 'Sunrisers Hyderabad')

# Updating 'Kings XI Punjab' to 'Punjab Kings'
match_df['team1'] = match_df['team1'].str.replace('Kings XI Punjab', 'Punjab Kings')
match_df['team2'] = match_df['team2'].str.replace('Kings XI Punjab', 'Punjab Kings')
deliveries['bowling_team'] = deliveries['bowling_team'].str.replace('Kings XI Punjab', 'Punjab Kings')
deliveries['batting_team'] = deliveries['batting_team'].str.replace('Kings XI Punjab', 'Punjab Kings')

# Fixing typo: Updating 'Royal Challengers Bangaluru' to 'Royal Challengers Bangalore'
match_df['team1'] = match_df['team1'].str.replace('Royal Challengers Bangaluru', 'Royal Challengers Bangalore')
match_df['team2'] = match_df['team2'].str.replace('Royal Challengers Bangaluru', 'Royal Challengers Bangalore')
deliveries['bowling_team'] = deliveries['bowling_team'].str.replace('Royal Challengers Bangaluru', 'Royal Challengers Bangalore')
deliveries['batting_team'] = deliveries['batting_team'].str.replace('Royal Challengers Bangaluru', 'Royal Challengers Bangalore')

# Filtering datasets to keep only the selected teams
match_df = match_df[match_df['team1'].isin(teams)]
match_df = match_df[match_df['team2'].isin(teams)]
deliveries = deliveries[deliveries['bowling_team'].isin(teams)]
deliveries = deliveries[deliveries['batting_team'].isin(teams)]


In [10]:
# Remove tied matches to focus only on decisive results
match_df = match_df[match_df['result'] != 'tie']

# Remove matches affected by the Duckworth-Lewis (D/L) method to ensure consistency in score analysis
match_df = match_df[match_df['method'] != 'D/L']

# Retain only the necessary columns for further analysis
match_df = match_df[['match_id', 'city', 'winner', 'total_runs']]


In [11]:
# Merge match data with delivery-level data using 'match_id'
deliveries_df = match_df.merge(deliveries, on='match_id')

# Filter only the second innings data (since target chasing happens in the second innings)
deliveries_df = deliveries_df[deliveries_df['inning'] == 2]

In [12]:
# Convert 'total_runs_y' column to numeric type, coercing errors to NaN (if any non-numeric values exist)
deliveries_df['total_runs_y'] = pd.to_numeric(deliveries_df['total_runs_y'], errors='coerce')

# Replace NaN values (if any) with 0 to ensure clean calculations
deliveries_df['total_runs_y'] = deliveries_df['total_runs_y'].fillna(0)

# Calculate the cumulative score for each match in the second innings
deliveries_df['current_score'] = deliveries_df.groupby('match_id')['total_runs_y'].cumsum()

In [13]:
# Calculate the remaining runs needed to win (adding 1 to account for tie-breaking)
deliveries_df['runs_left'] = deliveries_df['total_runs_x'] - deliveries_df['current_score'] + 1

# Identify legitimate deliveries (excluding wides and no-balls)
deliveries_df['is_legit'] = ~deliveries_df['extras_type'].isin(['wides', 'noballs'])

# Compute cumulative count of legitimate balls bowled per match and innings
deliveries_df['legit_balls_bowled'] = deliveries_df.groupby(['match_id', 'inning'])['is_legit'].cumsum()

# Calculate remaining balls in the innings (max 120 balls in T20 cricket)
deliveries_df['balls_left'] = 120 - deliveries_df['legit_balls_bowled']

# Track cumulative wickets fallen in the match
deliveries_df['wickets_fallen'] = deliveries_df.groupby('match_id')['is_wicket'].cumsum()

# Compute Current Run Rate (CRR) = (Runs scored * 6) / Overs faced
deliveries_df['current_run_rate'] = np.round((deliveries_df['current_score'] * 6) / (120 - deliveries_df['balls_left']), 2)

# Handle division by zero cases where CRR might be infinite
deliveries_df['current_run_rate'] = np.where(
    deliveries_df['current_run_rate'] == np.inf, 0, deliveries_df['current_run_rate']
)

# Compute Required Run Rate (RRR) = (Runs left * 6) / Balls left
deliveries_df['required_run_rate'] = np.round((deliveries_df['runs_left'] * 6) / deliveries_df['balls_left'], 2)

# Calculate the number of wickets remaining
deliveries_df['wickets_left'] = 10 - deliveries_df['wickets_fallen']

In [14]:
# Function to check if the batting team won the match
def target_completed(row):
    row['winner'] = 1 if row['winner'] == row['batting_team'] else 0
    return row  

# Apply the function to update the 'winner' column for each row in deliveries_df
deliveries_df = deliveries_df.apply(target_completed, axis=1)

In [15]:
# Selecting relevant columns for model training
deliveries_df = deliveries_df[['batting_team', 'bowling_team', 'current_score', 'runs_left', 
                               'balls_left', 'wickets_left', 'required_run_rate', 'current_run_rate', 
                               'winner', 'city', 'batter', 'bowler', 'non_striker']]

# Dropping rows with missing values to ensure data quality
deliveries_df.dropna(inplace=True)

In [16]:
# Removing rows where the match has already been decided
deliveries_df = deliveries_df[deliveries_df['balls_left'] > 0]  # Ensuring overs are still remaining
deliveries_df = deliveries_df[deliveries_df['runs_left'] >= 0]  # Ensuring target is still chaseable

In [17]:
# Defining features (X) and target variable (y)
X = deliveries_df[['batting_team', 'bowling_team', 'current_score', 'runs_left', 
                   'balls_left', 'wickets_left', 'required_run_rate', 'current_run_rate']]
y = deliveries_df[['winner']]  # Binary target (1 if batting team wins, else 0)

# Splitting data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
# Identify categorical and numerical features
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

# Preprocessing: Scale numerical features and one-hot encode categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ]
)

# Define the Logistic Regression model
logreg = LogisticRegression()

# Create a pipeline with preprocessing and model
logreg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', logreg)
])

# Define hyperparameter grid for RandomizedSearchCV
param_dist = {
    'model__C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'model__penalty': ['l1', 'l2', 'elasticnet'],  # Regularization type
    'model__solver': ['liblinear', 'saga']  # Suitable solvers for small datasets
}

# Perform Randomized Search for hyperparameter tuning (3-fold cross-validation)
logreg_random = RandomizedSearchCV(
    logreg_pipeline, param_distributions=param_dist, cv=3, n_jobs=-1
)

# Train the model using the best-found hyperparameters
logreg_random.fit(X_train, y_train)

In [19]:
# Retrieve the best model from RandomizedSearchCV
best_model = logreg_random.best_estimator_

# Get the best hyperparameters
best_params = logreg_random.best_params_
print("Best Parameters:", best_params)

# Get the best cross-validation score
best_score = logreg_random.best_score_
print("Best Score:", best_score)

Best Parameters: {'model__solver': 'liblinear', 'model__penalty': 'l1', 'model__C': 100}
Best Score: 0.7989884424368023


In [20]:
# Extract best parameters and clean key names by removing 'model__' prefix
best_params = logreg_random.best_params_
best_params_cleaned = {key.replace('model__', ''): value for key, value in best_params.items()}

# Define the final pipeline with the best parameters
winner_predictor_pipeline = Pipeline(
    steps=[
        ('step 1', preprocessor),  # Preprocessing step
        ('step 2', LogisticRegression(**best_params_cleaned))  # Logistic Regression with optimized parameters
    ]
)

In [21]:
# Train the final model pipeline with the training data
winner_predictor_pipeline.fit(X_train, y_train)

# Predict the match winner for the test dataset
y_pred = winner_predictor_pipeline.predict(X_test)

In [22]:
# Evaluate model performance using accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

Model Accuracy: 0.7975717689776467


In [23]:
# Save the trained pipeline for future predictions
joblib.dump(winner_predictor_pipeline, "winner_predictor_pipeline.pkl")

['winner_predictor_pipeline.pkl']

In [47]:
# Save DataFrame as a compressed CSV file using gzip
deliveries_df.to_csv('deliveries.csv.gz', compression='gzip', index=False)

In [51]:
# Storing the version
import sklearn
print(sklearn.__version__)

1.6.1
