In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv('../combined_standardized_innings_output_of_all_types.csv')

# Drop the 'file' column
df.drop(columns=['file'], inplace=True)

# Fill missing values
df.fillna(0, inplace=True)

# Encode categorical variables
le_team = LabelEncoder()
le_player = LabelEncoder()

df['team'] = le_team.fit_transform(df['team'])
df['opponent_team'] = le_team.transform(df['opponent_team'])
df['batter'] = le_player.fit_transform(df['batter'])
df['bowler'] = le_player.transform(df['bowler'])
df['non_striker'] = le_player.transform(df['non_striker'])

# Normalize numerical columns
scaler = MinMaxScaler()
df[['over', 'batter_runs', 'extras_runs', 'total_runs', 'wicket',
    'extra_wides_runs', 'extra_byes_runs', 'extra_noballs_runs',
    'extra_legbyes_runs', 'extra_penalty_runs', 'fantasy_points']] = scaler.fit_transform(
    df[['over', 'batter_runs', 'extras_runs', 'total_runs', 'wicket',
        'extra_wides_runs', 'extra_byes_runs', 'extra_noballs_runs',
        'extra_legbyes_runs', 'extra_penalty_runs', 'fantasy_points']]
)

# Group by batter and opponent_team to get average fantasy points
average_points = df.groupby(['batter', 'opponent_team'])['fantasy_points'].mean().reset_index()


ValueError: y contains previously unseen labels: 'Unknown'

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense

model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(None, X_train.shape[2])))
model.add(GRU(64))
model.add(Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mse')


In [None]:
# Creating sequences for LSTM/GRU
def create_sequences(data, seq_length):
    sequences = []
    for i in range(len(data) - seq_length):
        sequences.append(data[i:i + seq_length])
    return np.array(sequences)

seq_length = 10
data = df.drop(columns=['fantasy_points']).values
sequences = create_sequences(data, seq_length)

X = sequences[:, :-1, :]
y = sequences[:, -1, -1]  # Only the fantasy_points column

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=64)


In [None]:
def predict_fantasy_points(batter, opponent_team):
    batter_encoded = le_player.transform([batter])[0]
    opponent_team_encoded = le_team.transform([opponent_team])[0]

    # Get average fantasy points for the player against the given opponent
    avg_points = average_points[
        (average_points['batter'] == batter_encoded) &
        (average_points['opponent_team'] == opponent_team_encoded)
    ]

    if not avg_points.empty:
        return avg_points['fantasy_points'].values[0]  # Return the average points
    else:
        return np.nan  # No data available for this combination

# Example usage
fantasy_points = predict_fantasy_points('SC Cook', 'Unknown')
print(f'Predicted Fantasy Points: {fantasy_points}')


-----------------------------------------------------Previous Code Tests -----------------------------------------------------

In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, LSTM, GRU, Input, Concatenate, Reshape
import joblib
import os

features = ['Batting Average', 'Bowling Average', 'Strike Rate', 'Economy Rate', 
            'Centuries Scored', 'Half Centuries Scored', 'Ducks Scored', 
            'Wickets Taken Last Match', 'Runs Scored Last Match', 
            'Player Age', 'Player Experience', 'Player Role', 'Team']
y = data[['Runs Scored', 'Wickets Taken', 'Balls Faced', 'Balls Bowled', 
          'Overs Bowled', 'Maidens Bowled', 'Runs Conceded']]
# If the model exists, load it
model = load_model('cricket_model.h5')

# Load the scalers
scaler_X = joblib.load('scaler_X.joblib')
scaler_y = joblib.load('scaler_y.joblib')

# Load the LabelEncoder
label_enc = joblib.load('label_encoder.joblib')

# Fantasy points calculation function
def calculate_fantasy_points(row, match_type):
    points = 0
    points += 4  # Appearance points
    points += row['Runs Scored']
    points += row['Wickets Taken'] * 25 if match_type != 'Test' else row['Wickets Taken'] * 16
    
    # Duck penalty
    if row['Runs Scored'] == 0 and row['Balls Faced'] > 0:
        points -= 2 if match_type in ['T20', 'T10'] else 3 if match_type == 'ODI' else 4
    
    # Batting bonuses
    if row['Runs Scored'] >= 50:
        points += 8 if match_type == 'T20' else 4 if match_type in ['ODI', 'Test'] else 16
    if row['Runs Scored'] >= 100:
        points += 16 if match_type == 'T20' else 8
    
    # Bowling bonuses
    if row['Maidens Bowled'] > 0:
        points += row['Maidens Bowled'] * (12 if match_type == 'T20' else 4 if match_type == 'ODI' else 16)
    if row['Wickets Taken'] >= 4:
        points += 8 if match_type == 'T20' else 4
    if row['Wickets Taken'] >= 5:
        points += 16 if match_type == 'T20' else 8
    
    # Economy rate bonuses/penalties
    if row['Overs Bowled'] >= 2:
        economy_rate = row['Runs Conceded'] / row['Overs Bowled']
        if economy_rate < 6:
            points += 4
        elif economy_rate > 9:
            points -= 2
    
    return points

# Function to select top 11 players from specific teams
def get_top_11_players_for_teams(data: pd.DataFrame, team_1, team_2, match_type='ODI'):
    # Filter data for the specified teams
    team_data = data[data['Team'].isin([team_1, team_2])]
    # Prepare the data for prediction
    team_1_encoded = label_enc.transform([team_1])[0]
    team_2_encoded = label_enc.transform([team_2])[0]
    team_data.replace(team_1, team_1_encoded, inplace=True)
    team_data.replace(team_2, team_2_encoded, inplace=True)
    X_team = team_data[features]    
    X_team_scaled = scaler_X.transform(X_team)
    # Predict stats for the filtered dataset
    predicted_stats_scaled = model.predict(X_team_scaled)
    predicted_stats = scaler_y.inverse_transform(predicted_stats_scaled)
    
    predicted_df = pd.DataFrame(predicted_stats, columns=y.columns)
    predicted_df['Player Name'] = team_data['Player Name'].values
    predicted_df['Team'] = team_data['Team'].values
    predicted_df['Player Role'] = team_data['Player Role'].values
    
    for col in y.columns:
        if col in ['Runs Scored', 'Balls Faced', 'Balls Bowled', 'Runs Conceded']:
            predicted_df[col] = predicted_df[col].round().astype(int)
        elif col in ['Wickets Taken', 'Maidens Bowled']:
            predicted_df[col] = predicted_df[col].round(1)
        elif col == 'Overs Bowled':
            predicted_df[col] = predicted_df[col].round(1)
    
    predicted_df['Predicted Fantasy Points'] = predicted_df.apply(lambda row: calculate_fantasy_points(row, match_type), axis=1)
    
    return select_top_11_with_roles(predicted_df)

# Top 11 player selection function with role consideration
def select_top_11_with_roles(df):
    sorted_players = df.sort_values(by='Predicted Fantasy Points', ascending=False)
    
    team = []
    roles_covered = set()
    teams_covered = set()
    
    # Ensure at least one player from each team is selected
    for team_id in sorted_players['Team'].unique():
        team_players = sorted_players[sorted_players['Team'] == team_id]
        if not team_players.empty:
            best_player = team_players.iloc[0]
            team.append(best_player)
            roles_covered.add(best_player['Player Role'])
            teams_covered.add(team_id)
    
    # Fill the remaining spots
    for _, player in sorted_players.iterrows():
        if len(team) >= 11 and len(roles_covered) == len(df['Player Role'].unique()):
            break
        if player['Player Role'] not in roles_covered or len(team) < 11:
            if player['Player Name'] not in [p['Player Name'] for p in team]:
                team.append(player)
                roles_covered.add(player['Player Role'])
    
    # If we still don't have 11 players, add the next best available
    while len(team) < 11:
        for _, player in sorted_players.iterrows():
            if player['Player Name'] not in [p['Player Name'] for p in team]:
                team.append(player)
                break
    
    return pd.DataFrame(team)

In [35]:
# Example usage
team_1 = 'Sri Lanka'
team_2 = 'Australia'
data = pd.read_csv('./data.csv')
# Filter data based on team names
top_players = get_top_11_players_for_teams(data, team_1, team_2)

print("Top 11 Players based on Predicted Fantasy Points:")
print(top_players[['Player Name', 'Team', 'Predicted Fantasy Points', 'Player Role']])

print("\nPredicted Stats for Top Players (per match):")
print(top_players[['Player Name', 'Runs Scored', 'Wickets Taken', 'Balls Faced', 'Balls Bowled', 
                   'Overs Bowled', 'Maidens Bowled', 'Runs Conceded']])

role_distribution = top_players['Player Role'].value_counts()
print("\nRole Distribution:")
print(role_distribution)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data.replace(team_1, team_1_encoded, inplace=True)
  team_data.replace(team_2, team_2_encoded, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data.replace(team_2, team_2_encoded, inplace=True)


Top 11 Players based on Predicted Fantasy Points:
           Player Name  Team  Predicted Fantasy Points  Player Role
33      James Faulkner     0                 73.299999            3
10     Janith Liyanage     2                 55.000000            3
14         Steve Smith     0                 72.400000            1
20         Aaron Finch     0                 70.000001            1
23          Adam Zampa     0                 69.500001            4
31     Kane Richardson     0                 68.200002            4
18      Mitchell Starc     0                 67.900000            4
24      Josh Hazlewood     0                 67.700000            4
32         Sean Abbott     0                 63.200000            4
15        David Warner     0                 63.000000            1
16         Pat Cummins     0                 62.899999            4
7   Maheesh Theekshana     2                 37.800000            2

Predicted Stats for Top Players (per match):
           Player Na