Web Scraping Data From https://www.espncricinfo.com/ of australia with data of last 4 years only against all opponents in test/odi/t20s

In [84]:
import pandas as pd

# df_batting = pd.read_html('https://stats.espncricinfo.com/ci/engine/stats/index.html?class=11;home_or_away=1;home_or_away=2;home_or_away=3;opposition=1;result=1;result=2;result=3;result=4;spanmin1=19+Jul+2019;spanval1=span;team=2;template=results;type=batting')[2]
df_batting = pd.read_html('https://stats.espncricinfo.com/ci/engine/stats/index.html?class=11;spanmax1=06+Aug+2023;spanmin1=06+Aug+2019;spanval1=span;team=2;template=results;type=batting')[2]
df_batting
df_batting.to_csv('bat_scrappeddata.csv')


In [85]:
import pandas as pd

# df_bowling = pd.read_html('https://stats.espncricinfo.com/ci/engine/stats/index.html?class=11;home_or_away=1;home_or_away=2;home_or_away=3;opposition=1;orderby=bbi;result=1;result=2;result=3;result=4;spanmin1=19+Jul+2019;spanval1=span;team=2;template=results;type=bowling')[2]
df_bowling = pd.read_html('https://stats.espncricinfo.com/ci/engine/stats/index.html?class=11;spanmax1=06+Aug+2023;spanmin1=06+Aug+2019;spanval1=span;team=2;template=results;type=bowling')[2]
df_bowling
df_bowling.to_csv('bowlingscrappeddata.csv')

In [86]:
df_fielders = pd.read_html('https://stats.espncricinfo.com/ci/engine/stats/index.html?class=11;spanmax1=06+Aug+2023;spanmin1=06+Aug+2019;spanval1=span;team=2;template=results;type=fielding')[2]
df_fielders
df_fielders.to_csv('fieldersscrappeddata.csv')

In [87]:
import pandas as pd

# df_allrounders = pd.read_html('https://stats.espncricinfo.com/ci/engine/stats/index.html?class=11;home_or_away=1;home_or_away=2;home_or_away=3;opposition=1;result=1;result=2;result=3;result=4;spanmin1=19+Jul+2019;spanval1=span;team=2;template=results;type=allround')[2]
df_allrounders = pd.read_html('https://stats.espncricinfo.com/ci/engine/stats/index.html?class=11;spanmax1=06+Aug+2023;spanmin1=06+Aug+2019;spanval1=span;team=2;template=results;type=allround')[2]
df_allrounders
df_allrounders.to_csv('allroundscarppeddata.csv')

Preprocessing Dataset Batting Data
Remove empty columns ('Unnamed: 15' in this case).
Handle missing values (if any).
Convert the 'HS' column to only integer values.
Rename the columns according to the provided format.

In [90]:
import pandas as pd
import numpy as np

# Load the batting dataset
batting_data = pd.read_csv('C:/Users/Premr/bat_scrappeddata.csv')

# Step 1: Remove empty columns
batting_data = batting_data.dropna(axis=1, how='all')



# Step 3: Convert 'HS' column to only integer values
def convert_highest_score(hs):
    if pd.notna(hs) and isinstance(hs, str) and '*' in hs:
        return int(hs[:-1])
    return hs

batting_data['HS'] = batting_data['HS'].apply(convert_highest_score)

# Step 2: Handle missing values (replace '-' with NaN)
batting_data = batting_data.replace(['-', np.nan], 0)

# Create a new column 'role' and assign role 'wicketkeeper' based on the condition
batting_data['role'] = 'Batsmen'

# Step 4: Rename columns
column_mapping = {
    'Mat': 'Matches Played',
    'Inns': 'Innings Batted',
    'NO': 'Not Outs',
    'HS': 'Highest innings score',
    'Ave': 'Batting Average',
    'BF': 'Balls Faced',
    'SR': 'Strike Rate',
    '100': 'Centuries',
    '50': 'Half Centuries',
    '0': 'Duck Out',
    '4s': 'Boundary Fours',
    '6s': 'Boundary Sixes'
}

batting_data = batting_data.rename(columns=column_mapping)

# Step 6: Remove the unnamed column without a header
batting_data = batting_data.iloc[:, 1:]

# Print the preprocessed batting dataset
batting_data

batting_data.to_csv('preprocessed_updated_battingdata.csv' , index=False)



Preprocessing Dataset Bowling Data
Remove empty columns
Handle missing values
Preprocess 'BBI' column
Preprocess 'BBM' column
Rename columns
Remove rows with NaN values
Remove the unnamed column without a header

In [91]:
import pandas as pd
import numpy as np

# Load the bowling dataset
bowling_data = pd.read_csv('C:/Users/Premr/bowlingscrappeddata.csv')

# Step 1: Remove empty columns
bowling_data = bowling_data.dropna(axis=1, how='all')



# Step 3: Preprocess the 'BBI' column
def extract_runs_wickets(bbi_value):
    if pd.notna(bbi_value) and isinstance(bbi_value, str) and '-' not in bbi_value:
        runs, wickets = bbi_value.split('/')
        return int(runs), int(wickets)
    else:
        return np.nan, np.nan

bowling_data['BBI_Runs'], bowling_data['BBI_Wickets'] = zip(*bowling_data['BBI'].apply(extract_runs_wickets))
bowling_data = bowling_data.drop('BBI', axis=1)

# Step 4: Preprocess the 'BBM' column
def extract_runs_wickets(bbm_value):
    if pd.notna(bbm_value) and isinstance(bbm_value, str) and '/' in bbm_value:
        runs, wickets = bbm_value.split('/')
        return int(runs), int(wickets)
    else:
        return np.nan, np.nan

bowling_data['BBM_Runs'], bowling_data['BBM_Wickets'] = zip(*bowling_data['BBM'].apply(extract_runs_wickets))
bowling_data = bowling_data.drop('BBM', axis=1)

# Step 2: Handle missing values (replace '-' with NaN)
bowling_data = bowling_data.replace(['-', np.nan], 0)

# Create a new column 'role' and assign role 'wicketkeeper' based on the condition
bowling_data['role'] = 'Bowler'

# Step 5: Rename columns
column_mapping = {
    'Mat': 'Matches Played',
    'Inns': 'Innings Bowled',
    'Overs': 'Overs Bowled',
    'Mdns': 'Maidens Earned',
    'Runs': 'Runs Conceded',
    'Wkts': 'Wickets Taken',
    'Ave': 'Bowling Average',
    'Econ': 'Economy Rate',
    'SR': 'Bowling Strike Rate',
    '5': 'Five Wickets',
    '10': 'Ten Wickets'
}

bowling_data = bowling_data.rename(columns=column_mapping)

# Step 7: Remove the unnamed column without a header
bowling_data = bowling_data.iloc[:, 1:]

# Create a new DataFrame with the processed data
preprocessed_bowling_data = bowling_data.copy()

# Print the preprocessed bowling dataset
preprocessed_bowling_data

preprocessed_bowling_data.to_csv('preprocessed_updated_bowlingdata.csv', index=False)

Preprocessing Dataset AllRounder Data
Remove empty columns
Handle missing values
Preprocess the 'BBI' column
Rename columns
Remove rows with NaN values
Remove the unnamed column without a header

In [96]:
import pandas as pd
import numpy as np

# Load the allround dataset
allround_data = pd.read_csv('C:/Users/Premr/allroundscarppeddata.csv')

# Step 1: Remove empty columns
allround_data = allround_data.dropna(axis=1, how='all')



# Step 3: Preprocess the 'BBI' column
def extract_runs_wickets(bbi_value):
    if pd.notna(bbi_value) and isinstance(bbi_value, str) and '-' not in bbi_value:
        runs, wickets = bbi_value.split('/')
        return int(runs), int(wickets)
    else:
        return np.nan, np.nan

allround_data['BBI_Runs'], allround_data['BBI_Wickets'] = zip(*allround_data['BBI'].apply(extract_runs_wickets))
allround_data = allround_data.drop('BBI', axis=1)

# Step 3.1: Convert 'HS' column to only integer values
def convert_highest_score(hs):
    if pd.notna(hs) and isinstance(hs, str) and '*' in hs:
        return int(hs[:-1])
    return hs

allround_data['HS'] = allround_data['HS'].apply(convert_highest_score)

# Step 2: Handle missing values
allround_data = allround_data.replace(['-', np.nan], 0)

# Create a new column 'role' and assign role 'wicketkeeper' based on the condition
allround_data['role'] = 'allrounder'


# Step 4: Rename columns
column_mapping = {
    'Mat': 'Matches Played',
    'Runs': 'Runs Scored',
    'HS': 'Highest innings score',
    'Bat Av': 'Batting Average',
    '100': 'Centuries',
    'Wkts': 'Wickets Taken',
    'Bowl Av': 'Bowling Average',
    '5': 'Five Wickets Taken',
    'Ct': 'Catches Taken',
    'St': 'Stumpings Made',
    'Ave Diff': 'Bat-Bowl Average'
}

allround_data = allround_data.rename(columns=column_mapping)


# Step 6: Remove the unnamed column without a header
allround_data = allround_data.iloc[:, 1:]

# Create a new DataFrame with the processed data
preprocessed_allround_data = allround_data.copy()

# Print the preprocessed allround dataset
preprocessed_allround_data

# Save the preprocessed allround data to a new CSV file
preprocessed_allround_data.to_csv('preprocessed_updated_allround_data.csv', index=False)

Preprocessing Dataset Fielder Data
Remove empty columns
Handle missing values
Preprocess the columns
Rename columns
Remove rows with NaN values
Remove the unnamed column without a header

In [97]:
import pandas as pd
import numpy as np

# Load the allround dataset
fielders_data = pd.read_csv('C:/Users/Premr/fieldersscrappeddata.csv')

# Step 1: Remove empty columns
fielders_data = fielders_data.dropna(axis=1, how='all')

# drop the 'MD' column
fielders_data = fielders_data.drop('MD', axis=1)

# Step 2: Handle missing values
fielders_data = fielders_data.replace(['-', np.nan], 0)

# Create a new column 'role' and assign role 'wicketkeeper' based on the condition
fielders_data['role'] = 'fielder'
fielders_data.loc[fielders_data['Ct Wk'] >= 1, 'role'] = 'wicketkeeper'

# Step 4: Rename columns
column_mapping = {
    'Mat': 'Matches Played',
    'Inns': 'Innings Fielded',
    'Dis': 'Fielding Dismissals Made',
    'Ct': 'Catches Taken',
    'St': 'Stumpings Made',
    'Ct Wk': 'Catches as a keeper',
    'Ct Fi': 'Catches as a fielder',
    'D/I': 'Dismissals per innings',
}

fielders_data = fielders_data.rename(columns=column_mapping)

# Step 6: Remove the unnamed column without a header
fielders_data = fielders_data.iloc[:, 1:]

# Create a new DataFrame with the processed data
preprocessed_fielders_data = fielders_data.copy()

# Print the preprocessed allround dataset
preprocessed_fielders_data

# Save the preprocessed allround data to a new CSV file
preprocessed_fielders_data.to_csv('preprocessed_fielders_updated_data.csv', index=False)

BEST 5 BATSMEN

RANDOM FOREST REGRESSOR

The concept of accuracy is typically used in classification problems, where the goal is to predict discrete classes. In your case, you're working with a regression problem (predicting a continuous variable, batting average), so we usually evaluate regression models using metrics like Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), etc., rather than accuracy.

In [108]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_updated_battingdata.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features and target
features = ['Player', 'Matches Played', 'Runs', 'Centuries', 'Half Centuries', 'Boundary Fours', 'Boundary Sixes']
target = 'Batting Average'

X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict batting averages
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
r2 = r2_score(y_test, y_pred)
print(f"Coefficient of Determination (R-squared): {r2}")

# Rank players based on predicted batting averages
data['Predicted Batting Average'] = model.predict(X)
top_batsmen = data.nlargest(5, 'Predicted Batting Average')

# Get player names using their IDs
player_names = [player_mapping[player_id] for player_id in top_batsmen['Player']]

print("Top 5 Batsmen:")
for player_name, predicted_average in zip(player_names, top_batsmen['Predicted Batting Average']):
    print(f"Player: {player_name}, Predicted Batting Average: {predicted_average}")

Mean Absolute Error: 3.3545000000000007
Root Mean Squared Error (RMSE): 4.3609704213626586
Mean Squared Error (MSE): 19.018063016000003
Coefficient of Determination (R-squared): 0.7216204409606601
Top 5 Batsmen:
Player: UT Khawaja, Predicted Batting Average: 48.67650000000007
Player: M Labuschagne, Predicted Batting Average: 47.63760000000002
Player: SPD Smith, Predicted Batting Average: 46.74870000000007
Player: TM Head, Predicted Batting Average: 46.71350000000001
Player: DA Warner, Predicted Batting Average: 43.03650000000005


XG BOOST

In [117]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_updated_battingdata.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features and target
features = ['Player', 'Matches Played', 'Runs', 'Centuries', 'Half Centuries', 'Boundary Fours', 'Boundary Sixes']
target = 'Batting Average'

X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = XGBRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict batting averages
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
r2 = r2_score(y_test, y_pred)
print(f"Coefficient of Determination (R-squared): {r2}")

# Rank players based on predicted batting averages
data['Predicted Batting Average'] = model.predict(X)
top_batsmen = data.nlargest(5, 'Predicted Batting Average')

# Get player names using their IDs
player_names = [player_mapping[player_id] for player_id in top_batsmen['Player']]

print("Top 4 Batsmen:")
for player_name, predicted_average in zip(player_names, top_batsmen['Predicted Batting Average']):
    print(f"Player: {player_name}, Predicted Batting Average: {predicted_average}")


Mean Absolute Error: 3.0459433994293215
Root Mean Squared Error (RMSE): 4.2683635495086785
Mean Squared Error (MSE): 18.218927390774322
Coefficient of Determination (R-squared): 0.7333179005166512
Top 4 Batsmen:
Player: UT Khawaja, Predicted Batting Average: 56.048465728759766
Player: M Labuschagne, Predicted Batting Average: 48.65013885498047
Player: SPD Smith, Predicted Batting Average: 47.799293518066406
Player: TM Head, Predicted Batting Average: 47.00031280517578
Player: DA Warner, Predicted Batting Average: 41.3201904296875


SVR

In [116]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix , classification_report , accuracy_score

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_updated_battingdata.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features and target
features = ['Player', 'Matches Played', 'Runs', 'Centuries', 'Half Centuries', 'Boundary Fours', 'Boundary Sixes']
target = 'Batting Average'

X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = SVR()
model.fit(X_train, y_train)

# Predict batting averages
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
r2 = r2_score(y_test, y_pred)
print(f"Coefficient of Determination (R-squared): {r2}")
# Rank players based on predicted batting averages
data['Predicted Batting Average'] = model.predict(X)
top_batsmen = data.nlargest(5, 'Predicted Batting Average')

# Get player names using their IDs
player_names = [player_mapping[player_id] for player_id in top_batsmen['Player']]

print("Top 5 Batsmen:")
for player_name, predicted_average in zip(player_names, top_batsmen['Predicted Batting Average']):
    print(f"Player: {player_name}, Predicted Batting Average: {predicted_average}")




Mean Absolute Error: 7.282830694256361
Root Mean Squared Error (RMSE): 9.296664084405878
Mean Squared Error (MSE): 86.42796309828219
Coefficient of Determination (R-squared): -0.26510140573866603
Top 5 Batsmen:
Player: AJ Finch, Predicted Batting Average: 31.024124925821248
Player: TM Head, Predicted Batting Average: 30.94636000175267
Player: UT Khawaja, Predicted Batting Average: 30.914886116701012
Player: AT Carey, Predicted Batting Average: 30.51028716088826
Player: DA Warner, Predicted Batting Average: 30.01789501366855


Gradient Boosting

In [118]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_updated_battingdata.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features and target
features = ['Player', 'Matches Played', 'Runs', 'Centuries', 'Half Centuries', 'Boundary Fours', 'Boundary Sixes']
target = 'Batting Average'

X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict batting averages
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
r2 = r2_score(y_test, y_pred)
print(f"Coefficient of Determination (R-squared): {r2}")

# Rank players based on predicted batting averages
data['Predicted Batting Average'] = model.predict(X)
top_batsmen = data.nlargest(5, 'Predicted Batting Average')

# Get player names using their IDs
player_names = [player_mapping[player_id] for player_id in top_batsmen['Player']]

print("Top 4 Batsmen:")
for player_name, predicted_average in zip(player_names, top_batsmen['Predicted Batting Average']):
    print(f"Player: {player_name}, Predicted Batting Average: {predicted_average}")


Mean Absolute Error: 3.7549122501695806
Root Mean Squared Error (RMSE): 4.8031795158618795
Mean Squared Error (MSE): 23.070533461595158
Coefficient of Determination (R-squared): 0.6623018376562311
Top 4 Batsmen:
Player: UT Khawaja, Predicted Batting Average: 55.892451117864354
Player: M Labuschagne, Predicted Batting Average: 48.53860147028111
Player: SPD Smith, Predicted Batting Average: 47.67811019674603
Player: TM Head, Predicted Batting Average: 47.07681462287189
Player: DA Warner, Predicted Batting Average: 41.41540981407636


BEST 4 BOWLERS AND 1 SUB

HYPERPARAMETER USING RANDOM FOREST

In [101]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_updated_bowlingdata.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features and target
features = ['Player', 'Matches Played', 'Overs Bowled', 'Maidens Earned', 'Runs Conceded', 'Wickets Taken',
            'Bowling Average', 'Economy Rate', 'Bowling Strike Rate', 'Five Wickets', 'Ten Wickets']
target = 'Wickets Taken'

X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize RandomForestRegressor
model = RandomForestRegressor(random_state=42)

# Initialize GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_absolute_error', cv=5)

# Fit the model to the data and perform hyperparameter tuning
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Initialize a new model with the best hyperparameters
best_model = RandomForestRegressor(**best_params, random_state=42)

# Train the best model
best_model.fit(X_train, y_train)

# Predict wickets taken
y_pred = best_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
r2 = r2_score(y_test, y_pred)
print(f"Coefficient of Determination (R-squared): {r2}")

# Rank players based on predicted wickets taken
data['Predicted Wickets Taken'] = best_model.predict(X)

# Select top 4 bowlers and 1 substitute
selected_players = data.nlargest(5, 'Predicted Wickets Taken')

# Get player names using their IDs
selected_player_names = [player_mapping[player_id] for player_id in selected_players['Player']]

print("Top 4 Bowlers + 1 Substitute:")
for player_name, predicted_wickets in zip(selected_player_names, selected_players['Predicted Wickets Taken']):
    print(f"Player: {player_name}, Predicted Wickets Taken: {predicted_wickets}")

Mean Absolute Error: 0.9860000000000001
Root Mean Squared Error (RMSE): 1.2275117107384355
Mean Squared Error (MSE): 1.5067850000000005
Coefficient of Determination (R-squared): 0.9669854294478527
Top 4 Bowlers + 1 Substitute:
Player: MA Starc, Predicted Wickets Taken: 196.285
Player: PJ Cummins, Predicted Wickets Taken: 189.455
Player: JR Hazlewood, Predicted Wickets Taken: 162.315
Player: NM Lyon, Predicted Wickets Taken: 154.495
Player: A Zampa, Predicted Wickets Taken: 113.24


SVR

In [102]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_updated_bowlingdata.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features and target
features = ['Player', 'Matches Played', 'Overs Bowled', 'Maidens Earned', 'Runs Conceded', 'Bowling Average',
            'Economy Rate', 'Bowling Strike Rate', 'Five Wickets', 'Ten Wickets']
target = 'Wickets Taken'

X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the SVR model
model = SVR()
model.fit(X_train, y_train)

# Predict bowling averages
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
r2 = r2_score(y_test, y_pred)
print(f"Coefficient of Determination (R-squared): {r2}")

# Rank players based on predicted wickets taken
data['Predicted Wickets Taken'] = model.predict(X)
top_bowlers = data.nlargest(5, 'Predicted Wickets Taken')

# Get player names using their IDs
player_names = [player_mapping[player_id] for player_id in top_bowlers['Player']]

print("Top 4 Bowlers + 1 Substitute:")
for player_name, predicted_wickets in zip(player_names, top_bowlers['Predicted Wickets Taken']):
    print(f"Player: {player_name}, Predicted Wickets Taken: {predicted_wickets}")

Mean Absolute Error: 5.370964647778326
Root Mean Squared Error (RMSE): 6.084172415243607
Mean Squared Error (MSE): 37.01715397841123
Coefficient of Determination (R-squared): 0.18893177084988533
Top 4 Bowlers + 1 Substitute:
Player: A Zampa, Predicted Wickets Taken: 17.352575171539932
Player: JR Hazlewood, Predicted Wickets Taken: 16.953695172552152
Player: NM Lyon, Predicted Wickets Taken: 16.763708217988516
Player: C Green, Predicted Wickets Taken: 16.72429778259164
Player: PJ Cummins, Predicted Wickets Taken: 16.589089082152256


Gradient Boosting

In [119]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_updated_bowlingdata.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features and target
features = ['Player', 'Matches Played', 'Overs Bowled', 'Maidens Earned', 'Runs Conceded', 'Bowling Strike Rate',
            'Economy Rate', 'Bowling Average', 'Five Wickets', 'Ten Wickets']
target = 'Wickets Taken'

X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict bowling averages
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
r2 = r2_score(y_test, y_pred)
print(f"Coefficient of Determination (R-squared): {r2}")

# Rank players based on predicted wickets taken
data['Predicted Wickets Taken'] = model.predict(X)
top_bowlers = data.nlargest(5, 'Predicted Wickets Taken')

# Get player names using their IDs
player_names = [player_mapping[player_id] for player_id in top_bowlers['Player']]

print("Top 4 Bowlers + 1 Substitute:")
for player_name, predicted_wickets in zip(player_names, top_bowlers['Predicted Wickets Taken']):
    print(f"Player: {player_name}, Predicted Wickets Taken: {predicted_wickets}")

Mean Absolute Error: 0.7446908488472843
Root Mean Squared Error (RMSE): 1.01066245068977
Mean Squared Error (MSE): 1.0214385892342517
Coefficient of Determination (R-squared): 0.9776196628125712
Top 4 Bowlers + 1 Substitute:
Player: MA Starc, Predicted Wickets Taken: 210.99440758061817
Player: PJ Cummins, Predicted Wickets Taken: 195.99540822181172
Player: JR Hazlewood, Predicted Wickets Taken: 159.99638043890224
Player: NM Lyon, Predicted Wickets Taken: 143.99709481309125
Player: A Zampa, Predicted Wickets Taken: 124.99776458031982


XG Boost

In [120]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_updated_bowlingdata.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features and target
features = ['Player', 'Matches Played', 'Overs Bowled', 'Maidens Earned', 'Runs Conceded', 'Bowling Average',
            'Economy Rate', 'Bowling Strike Rate', 'Five Wickets', 'Ten Wickets']
target = 'Wickets Taken'

X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost model
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

# Predict wickets taken
y_pred = model.predict(X_test)


#Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
r2 = r2_score(y_test, y_pred)
print(f"Coefficient of Determination (R-squared): {r2}")

# Rank players based on predicted wickets taken
data['Predicted Wickets Taken'] = model.predict(X)
top_bowlers = data.nlargest(5, 'Predicted Wickets Taken')

# Get player names using their IDs
player_names = [player_mapping[player_id] for player_id in top_bowlers['Player']]

print("Top 4 Bowlers + 1 Substitute:")
for player_name, predicted_wickets in zip(player_names, top_bowlers['Predicted Wickets Taken']):
    print(f"Player: {player_name}, Predicted Wickets Taken: {predicted_wickets}")


Mean Absolute Error: 1.121386395310401
Root Mean Squared Error (RMSE): 1.6584185101877893
Mean Squared Error (MSE): 2.7503519549334863
Coefficient of Determination (R-squared): 0.9397381254396695
Top 4 Bowlers + 1 Substitute:
Player: MA Starc, Predicted Wickets Taken: 210.9987030029297
Player: PJ Cummins, Predicted Wickets Taken: 195.99981689453125
Player: JR Hazlewood, Predicted Wickets Taken: 160.00022888183594
Player: NM Lyon, Predicted Wickets Taken: 143.99978637695312
Player: A Zampa, Predicted Wickets Taken: 124.99947357177734


2 + 1 ALL ROUNDERS

RANDOM FOREST

In [103]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_updated_allround_data.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features and target
features = ['Player', 'Matches Played', 'Runs Scored', 'Batting Average', 'Centuries',
            'Wickets Taken', 'Bowling Average', 'Five Wickets Taken', 'Catches Taken', 'Stumpings Made']
target = 'Bat-Bowl Average'

X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict bat-bowl averages
y_pred = model.predict(X_test)

#Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
r2 = r2_score(y_test, y_pred)
print(f"Coefficient of Determination (R-squared): {r2}")

# Rank players based on predicted bat-bowl averages
data['Predicted Bat-Bowl Average'] = model.predict(X)
top_allrounders = data.nsmallest(3, 'Predicted Bat-Bowl Average')

# Get player names using their IDs
player_names = [player_mapping[player_id] for player_id in top_allrounders['Player']]

print("Top 2 All-rounders and 1 Substitute:")
for player_name, predicted_avg in zip(player_names, top_allrounders['Predicted Bat-Bowl Average']):
    print(f"Player: {player_name}, Predicted Bat-Bowl Average: {predicted_avg}")


Mean Absolute Error: 3.9378100000000003
Root Mean Squared Error (RMSE): 5.250811380443979
Mean Squared Error (MSE): 27.571020153000006
Coefficient of Determination (R-squared): 0.4272310787791751
Top 2 All-rounders and 1 Substitute:
Player: M Labuschagne, Predicted Bat-Bowl Average: -39.174900000000015
Player: MP Kuhnemann, Predicted Bat-Bowl Average: -23.18719999999998
Player: MJ Swepson, Predicted Bat-Bowl Average: -20.99720000000002


SVR

In [105]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_updated_allround_data.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features and target
features = ['Player', 'Matches Played', 'Runs Scored', 'Batting Average', 'Centuries',
            'Wickets Taken', 'Bowling Average', 'Five Wickets Taken', 'Catches Taken', 'Stumpings Made']
target = 'Bat-Bowl Average'

X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Ridge model with L2 regularization
model = Ridge(alpha=1.0)  # You can adjust the alpha parameter as needed

# Train the model
model.fit(X_train, y_train)

# Predict bat-bowl averages
y_pred = model.predict(X_test)

# Evaluate the model
#Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
r2 = r2_score(y_test, y_pred)
print(f"Coefficient of Determination (R-squared): {r2}")

# Rank players based on predicted bat-bowl averages
data['Predicted Bat-Bowl Average'] = model.predict(X)
top_allrounders = data.nsmallest(3, 'Predicted Bat-Bowl Average')

# Get player names using their IDs
player_names = [player_mapping[player_id] for player_id in top_allrounders['Player']]

print("Top 2 All-rounders and 1 Substitute:")
for player_name, predicted_avg in zip(player_names, top_allrounders['Predicted Bat-Bowl Average']):
    print(f"Player: {player_name}, Predicted Bat-Bowl Average: {predicted_avg}")


Mean Absolute Error: 3.430465257321462
Root Mean Squared Error (RMSE): 4.5494039981717505
Mean Squared Error (MSE): 20.69707673858111
Coefficient of Determination (R-squared): 0.5700325105782555
Top 2 All-rounders and 1 Substitute:
Player: M Labuschagne, Predicted Bat-Bowl Average: -50.118000209620426
Player: DR Sams, Predicted Bat-Bowl Average: -20.351768946075076
Player: PM Siddle, Predicted Bat-Bowl Average: -19.453382741330223


XG Boost

In [121]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_updated_allround_data.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features and target
features = ['Player', 'Matches Played', 'Runs Scored', 'Batting Average', 'Centuries',
            'Wickets Taken', 'Bowling Average', 'Five Wickets Taken', 'Catches Taken', 'Stumpings Made']
target = 'Bat-Bowl Average'

X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost model
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

# Predict bat-bowl averages
y_pred = model.predict(X_test)

#Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
r2 = r2_score(y_test, y_pred)
print(f"Coefficient of Determination (R-squared): {r2}")

# Rank players based on predicted bat-bowl averages
data['Predicted Bat-Bowl Average'] = model.predict(X)
top_allrounders = data.nsmallest(3, 'Predicted Bat-Bowl Average')

# Get player names using their IDs
player_names = [player_mapping[player_id] for player_id in top_allrounders['Player']]

print("Top 2 All-rounders and 1 Substitute:")
for player_name, predicted_avg in zip(player_names, top_allrounders['Predicted Bat-Bowl Average']):
    print(f"Player: {player_name}, Predicted Bat-Bowl Average: {predicted_avg}")


Mean Absolute Error: 3.4601156911998983
Root Mean Squared Error (RMSE): 5.160104669183342
Mean Squared Error (MSE): 26.62668019692773
Coefficient of Determination (R-squared): 0.4468490898249645
Top 2 All-rounders and 1 Substitute:
Player: M Labuschagne, Predicted Bat-Bowl Average: -66.49870300292969
Player: MP Kuhnemann, Predicted Bat-Bowl Average: -25.787696838378906
Player: MJ Swepson, Predicted Bat-Bowl Average: -23.920198440551758


Gradient Boosting

In [122]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_updated_allround_data.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features and target
features = ['Player', 'Matches Played', 'Runs Scored', 'Batting Average', 'Centuries',
            'Wickets Taken', 'Bowling Average', 'Five Wickets Taken', 'Catches Taken', 'Stumpings Made']
target = 'Bat-Bowl Average'

X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Ridge model with L2 regularization
model = Ridge(alpha=1.0)  # You can adjust the alpha parameter as needed

# Train the model
model.fit(X_train, y_train)

# Predict bat-bowl averages
y_pred = model.predict(X_test)

# Evaluate the model
#Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
r2 = r2_score(y_test, y_pred)
print(f"Coefficient of Determination (R-squared): {r2}")

# Rank players based on predicted bat-bowl averages
data['Predicted Bat-Bowl Average'] = model.predict(X)
top_allrounders = data.nsmallest(3, 'Predicted Bat-Bowl Average')

# Get player names using their IDs
player_names = [player_mapping[player_id] for player_id in top_allrounders['Player']]

print("Top 2 All-rounders and 1 Substitute:")
for player_name, predicted_avg in zip(player_names, top_allrounders['Predicted Bat-Bowl Average']):
    print(f"Player: {player_name}, Predicted Bat-Bowl Average: {predicted_avg}")

Mean Absolute Error: 3.430465257321462
Root Mean Squared Error (RMSE): 4.5494039981717505
Mean Squared Error (MSE): 20.69707673858111
Coefficient of Determination (R-squared): 0.5700325105782555
Top 2 All-rounders and 1 Substitute:
Player: M Labuschagne, Predicted Bat-Bowl Average: -50.118000209620426
Player: DR Sams, Predicted Bat-Bowl Average: -20.351768946075076
Player: PM Siddle, Predicted Bat-Bowl Average: -19.453382741330223


1 + 1 WICKETKEEPER

RANDOM FOREST WITH HYPERPARAMETER

In [106]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_fielders_updated_data.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features
features = ['Player', 'Matches Played', 'Innings Fielded', 'Fielding Dismissals Made', 'Catches Taken', 'Stumpings Made',
            'Catches as a fielder', 'Dismissals per innings']

X = data[features]
y = data['Catches as a keeper']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameters for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'random_state': [42]
}

# Initialize the Random Forest Regressor model
model = RandomForestRegressor()

# Initialize GridSearchCV with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the model with hyperparameter tuning
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Predict 'Catches as a keeper'
y_pred = best_model.predict(X_test)

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Coefficient of Determination (R-squared): {r2}")


# Rank players based on predicted 'Catches as a keeper'
data['Predicted Catches as a keeper'] = best_model.predict(X)
sorted_data = data.sort_values(by='Predicted Catches as a keeper', ascending=False)

# Select the top wicketkeeper and substitute wicketkeeper
top_wicketkeeper = sorted_data.iloc[0]
substitute_wicketkeeper = sorted_data.iloc[1]

# Get player names using their IDs
top_wicketkeeper_name = player_mapping[top_wicketkeeper['Player']]
substitute_wicketkeeper_name = player_mapping[substitute_wicketkeeper['Player']]

print("\nTop Wicketkeeper:")
print(f"Player: {top_wicketkeeper_name}, Predicted Catches as a Keeper: {top_wicketkeeper['Predicted Catches as a keeper']}")

print("\nSubstitute Wicketkeeper:")
print(f"Player: {substitute_wicketkeeper_name}, Predicted Catches as a Keeper: {substitute_wicketkeeper['Predicted Catches as a keeper']}")


Mean Absolute Error (MAE): 0.45758333333333334
Mean Squared Error (MSE): 1.839947291666667
Root Mean Squared Error (RMSE): 1.356446567936484
Coefficient of Determination (R-squared): 0.0

Top Wicketkeeper:
Player: AT Carey, Predicted Catches as a Keeper: 82.11849999999998

Substitute Wicketkeeper:
Player: TD Paine, Predicted Catches as a Keeper: 60.39849999999999


SVR

In [107]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_fielders_updated_data.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features
features = ['Player', 'Matches Played', 'Innings Fielded', 'Fielding Dismissals Made', 'Catches Taken', 'Stumpings Made',
            'Catches as a fielder', 'Dismissals per innings']

X = data[features]
y = data['Catches as a keeper']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the SVR model
model = SVR()

# Train the model
model.fit(X_train, y_train)

# Predict 'Catches as a keeper'
y_pred = model.predict(X_test)

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Coefficient of Determination (R-squared): {r2}")


# Rank players based on predicted 'Catches as a keeper'
data['Predicted Catches as a keeper'] = model.predict(X)
sorted_data = data.sort_values(by='Predicted Catches as a keeper', ascending=False)

# Select the top wicketkeeper and substitute wicketkeeper
top_wicketkeeper = sorted_data.iloc[0]
substitute_wicketkeeper = sorted_data.iloc[1]

# Get player names using their IDs
top_wicketkeeper_name = player_mapping[top_wicketkeeper['Player']]
substitute_wicketkeeper_name = player_mapping[substitute_wicketkeeper['Player']]

print("\nTop Wicketkeeper:")
print(f"Player: {top_wicketkeeper_name}, Predicted Catches as a Keeper: {top_wicketkeeper['Predicted Catches as a keeper']}")

print("\nSubstitute Wicketkeeper:")
print(f"Player: {substitute_wicketkeeper_name}, Predicted Catches as a Keeper: {substitute_wicketkeeper['Predicted Catches as a keeper']}")

Mean Absolute Error (MAE): 0.08266761261741266
Mean Squared Error (MSE): 0.007110605361576138
Root Mean Squared Error (RMSE): 0.08432440549198161
Coefficient of Determination (R-squared): 0.0

Top Wicketkeeper:
Player: AT Carey, Predicted Catches as a Keeper: 1.5819917616311225

Substitute Wicketkeeper:
Player: TD Paine, Predicted Catches as a Keeper: 1.2534253860485118


XG BOOST

In [123]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_fielders_updated_data.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features
features = ['Player', 'Matches Played', 'Innings Fielded', 'Fielding Dismissals Made', 'Catches Taken', 'Stumpings Made',
            'Catches as a fielder', 'Dismissals per innings']

X = data[features]
y = data['Catches as a keeper']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost Regressor model
model = XGBRegressor(n_estimators=100, random_state=42)  # You can adjust the number of estimators as needed

# Train the model
model.fit(X_train, y_train)

# Predict 'Catches as a keeper'
y_pred = model.predict(X_test)

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Coefficient of Determination (R-squared): {r2}")


# Rank players based on predicted 'Catches as a keeper'
data['Predicted Catches as a keeper'] = model.predict(X)
sorted_data = data.sort_values(by='Predicted Catches as a keeper', ascending=False)

# Select the top wicketkeeper and substitute wicketkeeper
top_wicketkeeper = sorted_data.iloc[0]
substitute_wicketkeeper = sorted_data.iloc[1]

# Get player names using their IDs
top_wicketkeeper_name = player_mapping[top_wicketkeeper['Player']]
substitute_wicketkeeper_name = player_mapping[substitute_wicketkeeper['Player']]

print("\nTop Wicketkeeper:")
print(f"Player: {top_wicketkeeper_name}, Predicted Catches as a Keeper: {top_wicketkeeper['Predicted Catches as a keeper']}")

print("\nSubstitute Wicketkeeper:")
print(f"Player: {substitute_wicketkeeper_name}, Predicted Catches as a Keeper: {substitute_wicketkeeper['Predicted Catches as a keeper']}")

Mean Absolute Error (MAE): 0.0009481525950832293
Mean Squared Error (MSE): 6.357191069930243e-06
Root Mean Squared Error (RMSE): 0.002521347074468377
Coefficient of Determination (R-squared): 0.0

Top Wicketkeeper:
Player: AT Carey, Predicted Catches as a Keeper: 128.99880981445312

Substitute Wicketkeeper:
Player: TD Paine, Predicted Catches as a Keeper: 59.99977111816406


Gradient Boosting

In [124]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV

# Load your dataset (replace 'data.csv' with your actual filename)
data = pd.read_csv('C:/Users/Premr/preprocessed_fielders_updated_data.csv')

# Convert 'Player' column to numerical IDs
player_mapping = dict(enumerate(data['Player'].unique()))
data['Player'] = data['Player'].map({v: k for k, v in player_mapping.items()})

# Define features
features = ['Player', 'Matches Played', 'Innings Fielded', 'Fielding Dismissals Made', 'Catches Taken', 'Stumpings Made',
            'Catches as a fielder', 'Dismissals per innings']

X = data[features]
y = data['Catches as a keeper']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameters for Gradient Boosting
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'random_state': [42]
}

# Initialize the Gradient Boosting Regressor model
model = GradientBoostingRegressor()

# Initialize GridSearchCV with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the model with hyperparameter tuning
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Predict 'Catches as a keeper'
y_pred = best_model.predict(X_test)

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Coefficient of Determination (R-squared): {r2}")


# Rank players based on predicted 'Catches as a keeper'
data['Predicted Catches as a keeper'] = best_model.predict(X)
sorted_data = data.sort_values(by='Predicted Catches as a keeper', ascending=False)

# Select the top wicketkeeper and substitute wicketkeeper
top_wicketkeeper = sorted_data.iloc[0]
substitute_wicketkeeper = sorted_data.iloc[1]

# Get player names using their IDs
top_wicketkeeper_name = player_mapping[top_wicketkeeper['Player']]
substitute_wicketkeeper_name = player_mapping[substitute_wicketkeeper['Player']]

print("\nTop Wicketkeeper:")
print(f"Player: {top_wicketkeeper_name}, Predicted Catches as a Keeper: {top_wicketkeeper['Predicted Catches as a keeper']}")

print("\nSubstitute Wicketkeeper:")
print(f"Player: {substitute_wicketkeeper_name}, Predicted Catches as a Keeper: {substitute_wicketkeeper['Predicted Catches as a keeper']}")


Mean Absolute Error (MAE): 1.0633431035552339
Mean Squared Error (MSE): 8.326404836999464
Root Mean Squared Error (RMSE): 2.8855510456409297
Coefficient of Determination (R-squared): 0.0

Top Wicketkeeper:
Player: AT Carey, Predicted Catches as a Keeper: 127.45877933332827

Substitute Wicketkeeper:
Player: TD Paine, Predicted Catches as a Keeper: 60.47641880092647
