In [None]:
# Mount Google Drive at /content/drive (Colab-only helper) so the CSV files
# read further down under /content/drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [None]:
# Folder on the mounted Drive that holds every CSV used by this notebook.
# Change this one constant to run the notebook against another location.
DATA_DIR = "/content/drive/MyDrive/AmEx"

# Match-level feature tables (train / test / round 2).
df_train = pd.read_csv(f"{DATA_DIR}/663e2b6d54457_train_data_with_samplefeatures.csv")
test_data = pd.read_csv(f"{DATA_DIR}/6644a1e287df6_test_data_with_samplefeatures.csv")
round2_data = pd.read_csv(f"{DATA_DIR}/667a986f0b981_r2_data_with_samplefeatures.csv")

# Historical scorecards used to derive additional features.
batsman_df = pd.read_csv(f"{DATA_DIR}/663e2b548c98c_batsman_level_scorecard.csv")
bowler_df = pd.read_csv(f"{DATA_DIR}/663e2b2c60743_bowler_level_scorecard.csv")
match_df = pd.read_csv(f"{DATA_DIR}/664389efa0868_match_level_scorecard.csv")

print(df_train.shape, test_data.shape, round2_data.shape)

(948, 23) (271, 21) (207, 21)


In [None]:
def preprocess_batsman(batsman_df):
    """Clean the batsman-level scorecard: fix dtypes, fill gaps, drop the name column.

    Parameters
    ----------
    batsman_df : pandas.DataFrame
        Must contain every column named in ``numeric_columns`` below plus
        ``batsman_id`` and ``match_dt``.

    Returns
    -------
    pandas.DataFrame
        Frame whose numeric columns are float, ``batsman_id`` is int,
        ``match_dt`` is datetime, missing ``Fours`` / ``Sixes`` /
        ``balls_faced`` / ``strike_rate`` are 0, and without the ``batsman``
        column when it was present.

    Notes
    -----
    The dtype conversions and fills are written into the frame that was passed
    in. When the ``batsman`` column is dropped the returned object is a new frame.
    """
    numeric_columns = ['match id', 'is_batsman_captain', 'is_batsman_keeper', 'inning',
                       'runs', 'balls_faced', 'over_faced_first', 'bowler_id', 'is_bowler_keeper',
                       'is_bowler_captain', 'strike_rate', 'Fours', 'Sixes']
    zero_fill_columns = ['Fours', 'Sixes', 'balls_faced', 'strike_rate']

    # Dtype conversion.
    batsman_df[numeric_columns] = batsman_df[numeric_columns].astype(float)
    batsman_df['batsman_id'] = batsman_df['batsman_id'].astype(int)
    batsman_df['match_dt'] = pd.to_datetime(batsman_df['match_dt'])

    # Missing counts are treated as 0 for later calculations. The result is
    # assigned back instead of calling fillna(inplace=True) on a column
    # selection, which is deprecated chained assignment and does not update the
    # frame when pandas Copy-on-Write is active.
    batsman_df[zero_fill_columns] = batsman_df[zero_fill_columns].fillna(0)

    # The player name is not needed once batsman_id is available.
    if "batsman" in batsman_df.columns:
        batsman_df = batsman_df.drop("batsman", axis=1)

    return batsman_df

# Rebind batsman_df to its cleaned version (dtypes fixed, gaps filled, name column dropped).
batsman_df = preprocess_batsman(batsman_df)

In [None]:
def preprocess_bolwers(bowler_df):
    """Parse the ``match_dt`` column to datetime and return the same frame.

    The column is overwritten on the frame that was passed in; no other column
    is touched.
    """
    parsed_dates = pd.to_datetime(bowler_df["match_dt"])
    bowler_df["match_dt"] = parsed_dates
    return bowler_df

# Parse match_dt on the bowler scorecard so it can be compared with other dates.
bowler_df = preprocess_bolwers(bowler_df)

In [None]:
# The match-level scorecard is passed through preprocess_bolwers as well:
# that helper only parses the match_dt column, which is all that is done here.
match_df = preprocess_bolwers(match_df)

In [None]:
import pandas as pd
def preprocess_train(df_train, match_df):
    """Normalise dtypes of a match-level feature frame and parse the roster columns.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain the columns named in ``numeric_columns`` below, ``match_dt``,
        ``team1_roster_ids`` and ``team2_roster_ids``. Roster values are
        ``':'``-separated id strings (e.g. ``"12.0:45.0"``) or already-parsed lists.
    match_df : pandas.DataFrame
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` (modified in place) with float numeric columns, a datetime
        ``match_dt`` and roster columns holding lists of ints.
    """
    numeric_columns = ['team1_id', 'team2_id', 'ground_id',
                       'team_count_50runs_last15', 'team_winp_last5',
                       'team1only_avg_runs_last15', 'team1_winp_team2_last15',
                       'ground_avg_runs_last15']

    df_train[numeric_columns] = df_train[numeric_columns].astype(float)
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])

    def _to_id_list(roster):
        # Already-parsed values pass through, so calling this function twice on
        # the same frame is safe. The check is per value rather than on the first
        # row only, which also avoids an IndexError on an empty frame.
        if isinstance(roster, list):
            return roster
        # Ids may be written as floats ("12.0"), hence int(float(...)).
        return [int(float(token)) for token in roster.split(':')]

    for roster_column in ('team1_roster_ids', 'team2_roster_ids'):
        df_train[roster_column] = df_train[roster_column].apply(_to_id_list)

    return df_train

In [None]:
def one_hot_encode_stuff(df_train):
    """Turn the categorical match columns into float codes, in place.

    * ``lighting``      -> position of the value in ``df_train['lighting'].unique()``
    * ``winner``        -> 1.0 when ``winner_id`` equals ``team1_id`` (only when
      both ``winner`` and ``winner_id`` columns exist)
    * ``toss winner``   -> 1.0 when it equals ``team1``
    * ``toss decision`` -> 1.0 when it is ``'bat'``

    NOTE(review): the ``lighting`` codes follow order of first appearance in the
    frame passed in, so two different frames may receive different codes for the
    same label -- confirm this is acceptable for train vs. test.

    Returns the same frame object.
    """
    lighting_levels = df_train['lighting'].unique()
    lighting_codes = dict(zip(lighting_levels, map(float, range(len(lighting_levels)))))
    df_train['lighting'] = df_train['lighting'].map(lighting_codes).astype('float64')

    has_outcome = {'winner', 'winner_id'}.issubset(df_train.columns)
    if has_outcome:
        team1_won = df_train['winner_id'] == df_train['team1_id']
        df_train['winner'] = team1_won.astype('float64')

    team1_won_toss = df_train['toss winner'] == df_train['team1']
    df_train['toss winner'] = team1_won_toss.astype('float64')

    chose_to_bat = df_train['toss decision'] == 'bat'
    df_train['toss decision'] = chose_to_bat.astype('float64')

    return df_train

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def plot_win_percentage_vs_feature(df, feature_name, target_name='winner', bins=20):
    """Plot the mean of ``target_name`` (in percent) over equal-width bins of a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``feature_name`` and ``target_name``. It is not modified.
    feature_name : str
        Numeric column to bin between its minimum and maximum.
    target_name : str, default 'winner'
        Column whose per-bin mean is plotted (multiplied by 100).
    bins : int, default 20
        Number of equal-width bins.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Equal-width bin edges spanning the observed range of the feature.
    feature_min = df[feature_name].min()
    feature_max = df[feature_name].max()
    bin_edges = np.linspace(feature_min, feature_max, bins + 1)

    # include_lowest=True keeps rows equal to the minimum: with the default
    # right-closed intervals they would fall outside the first bin and be dropped.
    # The bins are kept in a local Series so the caller's frame gets no
    # temporary column.
    binned_feature = pd.cut(df[feature_name], bins=bin_edges, include_lowest=True)

    # observed=False keeps empty bins in the result (their mean is NaN).
    win_percentage = df.groupby(binned_feature, observed=False)[target_name].mean() * 100

    # One point per bin, placed at the bin midpoint.
    bin_centers = [interval.mid for interval in win_percentage.index]
    win_percentage_values = win_percentage.values

    plt.figure(figsize=(14, 7))
    plt.plot(bin_centers, win_percentage_values, marker='o', linestyle='-', color='b')
    plt.title(f'{feature_name} vs Win Percentage of Team 1')
    plt.xlabel(feature_name)
    plt.ylabel('Win Percentage of Team 1')
    plt.grid(True)
    plt.show()


# plot_win_percentage_vs_feature(train_df, 'Overall_performance_relative_VenueVise')


In [None]:
import pandas as pd

def calculate_venue_based_bat_first_win_probability(df_train, match_df):
    """Add the share of earlier matches at each venue won by the side batting first.

    For each row of ``df_train`` only ``match_df`` rows with the same ``venue``
    and a strictly earlier ``match_dt`` are considered. A match counts as a
    bat-first win when the toss decision was ``'bat'`` and the toss winner won,
    or the decision was ``'field'`` and the toss winner did not win. The
    denominator is the number of matches with a ``'bat'`` or ``'field'``
    decision; 0.0 is stored when there are none.

    Both frames get their ``match_dt`` column converted to datetime in place.
    Returns ``df_train`` with ``venue_based_bat_first_win_probability`` set.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _bat_first_win_rate(venue, played_on):
        history = match_df[(match_df['venue'] == venue) & (match_df['match_dt'] < played_on)]

        chose_bat = history['toss decision'] == 'bat'
        chose_field = history['toss decision'] == 'field'
        toss_winner_won = history['winner'] == history['toss winner']

        decided_matches = int(chose_bat.sum()) + int(chose_field.sum())
        if decided_matches <= 0:
            return 0.0

        # Bat-first side won: toss winner batted and won, or fielded and lost.
        bat_first_wins = int((chose_bat & toss_winner_won).sum()) \
            + int((chose_field & ~toss_winner_won).sum())
        return bat_first_wins / decided_matches

    df_train['venue_based_bat_first_win_probability'] = [
        _bat_first_win_rate(venue, played_on)
        for venue, played_on in zip(df_train['venue'], df_train['match_dt'])
    ]

    return df_train

In [None]:
def team1_toss_based_win_chances(df_train):
    """Add ``team1_toss_based_win_chance`` derived from the toss flags.

    Expects numeric ``toss winner`` and ``toss decision`` flags and the column
    ``venue_based_bat_first_win_probability``. When both flags are 1 or both are
    0, the venue probability is copied; otherwise its complement (1 - p) is used.

    Returns the same frame with the new column set.
    """
    def _chance_for_match(match):
        bat_first_prob = match['venue_based_bat_first_win_probability']
        both_flags_set = match['toss winner'] == 1 and match['toss decision'] == 1
        both_flags_clear = match['toss winner'] == 0 and match['toss decision'] == 0
        if both_flags_set or both_flags_clear:
            return bat_first_prob
        return 1 - bat_first_prob

    chances = df_train.apply(_chance_for_match, axis=1)
    df_train['team1_toss_based_win_chance'] = chances

    return df_train

In [None]:
# venue based inning 1 runs
def calculate_avg_inning1_runs_venue(df_train, match_df):
    """Add the average first-innings total of earlier matches at each venue.

    For every row of ``df_train`` the mean of ``inning1_runs`` is taken over the
    ``match_df`` rows with the same ``venue`` and a strictly earlier
    ``match_dt``; 0.0 is stored when there is no earlier match at the venue.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``venue`` and ``match_dt``. Modified in place.
    match_df : pandas.DataFrame
        Needs ``venue``, ``match_dt`` and ``inning1_runs``. Its ``match_dt``
        column is converted to datetime in place.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` with the float64 column ``avg_inning1_runs_venue`` set.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    venue_averages = []
    for venue, played_on in zip(df_train['venue'], df_train['match_dt']):
        # Past matches only, so the feature never uses information from the future.
        earlier = match_df[(match_df['venue'] == venue) & (match_df['match_dt'] < played_on)]
        venue_averages.append(earlier['inning1_runs'].mean() if not earlier.empty else 0.0)

    # The Series carries df_train's own index: a default RangeIndex would be
    # label-aligned on assignment and yield NaN or misplaced values for a frame
    # that was filtered or re-indexed. The column is always overwritten so a
    # re-run never keeps stale values.
    df_train['avg_inning1_runs_venue'] = pd.Series(
        venue_averages, index=df_train.index, dtype='float64')

    return df_train

In [None]:
def calculate_avg_inning1_runs_venue_last5(df_train, match_df):
    """Add the average first-innings total of the 5 most recent earlier matches at each venue.

    For every row of ``df_train`` the ``match_df`` rows with the same ``venue``
    and a strictly earlier ``match_dt`` are sorted newest first, at most 5 are
    kept, and the mean of their ``inning1_runs`` is stored; 0.0 is stored when
    there is no earlier match at the venue.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``venue`` and ``match_dt``. Modified in place.
    match_df : pandas.DataFrame
        Needs ``venue``, ``match_dt`` and ``inning1_runs``. Its ``match_dt``
        column is converted to datetime in place.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` with the float64 column ``avg_inning1_runs_venue_last5`` set.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    recent_averages = []
    for venue, played_on in zip(df_train['venue'], df_train['match_dt']):
        # Past matches only, newest first, capped at five.
        earlier = match_df[(match_df['venue'] == venue) & (match_df['match_dt'] < played_on)]
        recent = earlier.sort_values(by='match_dt', ascending=False).head(5)
        recent_averages.append(recent['inning1_runs'].mean() if not recent.empty else 0.0)

    # Use df_train's own index so assignment cannot misalign on a non-default
    # index, and always overwrite so re-running never keeps stale values.
    df_train['avg_inning1_runs_venue_last5'] = pd.Series(
        recent_averages, index=df_train.index, dtype='float64')

    return df_train

In [None]:
# venue based inning 2 runs
def calculate_avg_inning2_runs_venue(df_train, match_df):
    """Add the average second-innings total of earlier matches at each venue.

    For every row of ``df_train`` the mean of ``inning2_runs`` is taken over the
    ``match_df`` rows with the same ``venue`` and a strictly earlier
    ``match_dt``; 0.0 is stored when there is no earlier match at the venue.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``venue`` and ``match_dt``. Modified in place.
    match_df : pandas.DataFrame
        Needs ``venue``, ``match_dt`` and ``inning2_runs``. Its ``match_dt``
        column is converted to datetime in place.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` with the float64 column ``avg_inning2_runs_venue`` set.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    venue_averages = []
    for venue, played_on in zip(df_train['venue'], df_train['match_dt']):
        # Past matches only, so the feature never uses information from the future.
        earlier = match_df[(match_df['venue'] == venue) & (match_df['match_dt'] < played_on)]
        venue_averages.append(earlier['inning2_runs'].mean() if not earlier.empty else 0.0)

    # Use df_train's own index so assignment cannot misalign on a non-default
    # index, and always overwrite so re-running never keeps stale values.
    df_train['avg_inning2_runs_venue'] = pd.Series(
        venue_averages, index=df_train.index, dtype='float64')

    return df_train

In [None]:
def calculate_avg_inning2_runs_venue_last5(df_train, match_df):
    """Add the average second-innings total of the 5 most recent earlier matches at each venue.

    For every row of ``df_train`` the ``match_df`` rows with the same ``venue``
    and a strictly earlier ``match_dt`` are sorted newest first, at most 5 are
    kept, and the mean of their ``inning2_runs`` is stored; 0.0 is stored when
    there is no earlier match at the venue.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``venue`` and ``match_dt``. Modified in place.
    match_df : pandas.DataFrame
        Needs ``venue``, ``match_dt`` and ``inning2_runs``. Its ``match_dt``
        column is converted to datetime in place.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` with the float64 column ``avg_inning2_runs_venue_last5`` set.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    recent_averages = []
    for venue, played_on in zip(df_train['venue'], df_train['match_dt']):
        # Past matches only, newest first, capped at five.
        earlier = match_df[(match_df['venue'] == venue) & (match_df['match_dt'] < played_on)]
        recent = earlier.sort_values(by='match_dt', ascending=False).head(5)
        recent_averages.append(recent['inning2_runs'].mean() if not recent.empty else 0.0)

    # Use df_train's own index so assignment cannot misalign on a non-default
    # index, and always overwrite so re-running never keeps stale values.
    df_train['avg_inning2_runs_venue_last5'] = pd.Series(
        recent_averages, index=df_train.index, dtype='float64')

    return df_train

In [None]:
# venue based inning 1 wickets
def calculate_avg_inning1_wickets_venue(df_train, match_df):
    """Add the average first-innings wickets of earlier matches at each venue.

    For every row of ``df_train`` the mean of ``inning1_wickets`` is taken over
    the ``match_df`` rows with the same ``venue`` and a strictly earlier
    ``match_dt``; 0.0 is stored when there is no earlier match at the venue.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``venue`` and ``match_dt``. Modified in place.
    match_df : pandas.DataFrame
        Needs ``venue``, ``match_dt`` and ``inning1_wickets``. Its ``match_dt``
        column is converted to datetime in place.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` with the float64 column ``avg_inning1_wickets_venue`` set.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    venue_averages = []
    for venue, played_on in zip(df_train['venue'], df_train['match_dt']):
        # Past matches only, so the feature never uses information from the future.
        earlier = match_df[(match_df['venue'] == venue) & (match_df['match_dt'] < played_on)]
        venue_averages.append(earlier['inning1_wickets'].mean() if not earlier.empty else 0.0)

    # Use df_train's own index so assignment cannot misalign on a non-default
    # index, and always overwrite so re-running never keeps stale values.
    df_train['avg_inning1_wickets_venue'] = pd.Series(
        venue_averages, index=df_train.index, dtype='float64')

    return df_train

In [None]:
def calculate_avg_inning1_wickets_venue_last5(df_train, match_df):
    """Add the average first-innings wickets of the 5 most recent earlier matches at each venue.

    For every row of ``df_train`` the ``match_df`` rows with the same ``venue``
    and a strictly earlier ``match_dt`` are sorted newest first, at most 5 are
    kept, and the mean of their ``inning1_wickets`` is stored; 0.0 is stored
    when there is no earlier match at the venue.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``venue`` and ``match_dt``. Modified in place.
    match_df : pandas.DataFrame
        Needs ``venue``, ``match_dt`` and ``inning1_wickets``. Its ``match_dt``
        column is converted to datetime in place.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` with the float64 column ``avg_inning1_wickets_venue_last5`` set.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    recent_averages = []
    for venue, played_on in zip(df_train['venue'], df_train['match_dt']):
        # Past matches only, newest first, capped at five.
        earlier = match_df[(match_df['venue'] == venue) & (match_df['match_dt'] < played_on)]
        recent = earlier.sort_values(by='match_dt', ascending=False).head(5)
        recent_averages.append(recent['inning1_wickets'].mean() if not recent.empty else 0.0)

    # Use df_train's own index so assignment cannot misalign on a non-default
    # index, and always overwrite so re-running never keeps stale values.
    df_train['avg_inning1_wickets_venue_last5'] = pd.Series(
        recent_averages, index=df_train.index, dtype='float64')

    return df_train

In [None]:
# venue based inning 2 wickets
def calculate_avg_inning2_wickets_venue(df_train, match_df):
    """Add the average second-innings wickets of earlier matches at each venue.

    For every row of ``df_train`` the mean of ``inning2_wickets`` is taken over
    the ``match_df`` rows with the same ``venue`` and a strictly earlier
    ``match_dt``; 0.0 is stored when there is no earlier match at the venue.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``venue`` and ``match_dt``. Modified in place.
    match_df : pandas.DataFrame
        Needs ``venue``, ``match_dt`` and ``inning2_wickets``. Its ``match_dt``
        column is converted to datetime in place.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` with the float64 column ``avg_inning2_wickets_venue`` set.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    venue_averages = []
    for venue, played_on in zip(df_train['venue'], df_train['match_dt']):
        # Past matches only, so the feature never uses information from the future.
        earlier = match_df[(match_df['venue'] == venue) & (match_df['match_dt'] < played_on)]
        venue_averages.append(earlier['inning2_wickets'].mean() if not earlier.empty else 0.0)

    # Use df_train's own index so assignment cannot misalign on a non-default
    # index, and always overwrite so re-running never keeps stale values.
    df_train['avg_inning2_wickets_venue'] = pd.Series(
        venue_averages, index=df_train.index, dtype='float64')

    return df_train

In [None]:
import pandas as pd

def calculate_avg_inning2_wickets_venue_last5(df_train, match_df):
    """Add the average second-innings wickets of the 5 most recent earlier matches at each venue.

    For every row of ``df_train`` the ``match_df`` rows with the same ``venue``
    and a strictly earlier ``match_dt`` are sorted newest first, at most 5 are
    kept, and the mean of their ``inning2_wickets`` is stored; 0.0 is stored
    when there is no earlier match at the venue.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``venue`` and ``match_dt``. Modified in place.
    match_df : pandas.DataFrame
        Needs ``venue``, ``match_dt`` and ``inning2_wickets``. Its ``match_dt``
        column is converted to datetime in place.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` with the float64 column ``avg_inning2_wickets_venue_last5`` set.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    recent_averages = []
    for venue, played_on in zip(df_train['venue'], df_train['match_dt']):
        # Past matches only, newest first, capped at five.
        earlier = match_df[(match_df['venue'] == venue) & (match_df['match_dt'] < played_on)]
        recent = earlier.sort_values(by='match_dt', ascending=False).head(5)
        recent_averages.append(recent['inning2_wickets'].mean() if not recent.empty else 0.0)

    # Use df_train's own index so assignment cannot misalign on a non-default
    # index, and always overwrite so re-running never keeps stale values.
    df_train['avg_inning2_wickets_venue_last5'] = pd.Series(
        recent_averages, index=df_train.index, dtype='float64')

    return df_train

In [None]:
# victory by runs and victory by wickets in the past
def calculate_victory_columns(df_train, match_df):
    """Add each team's average past winning margin, by runs and by wickets.

    For every row of ``df_train`` and for each of ``team1`` / ``team2``, the
    ``match_df`` rows with a strictly earlier ``match_dt`` where that team is
    the ``winner`` are split on ``by`` (``'runs'`` or ``'wickets'``) and the
    mean ``win amount`` of each group is stored; 0.0 is stored for a group with
    no matches.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``team1``, ``team2`` and ``match_dt``. Modified in place.
    match_df : pandas.DataFrame
        Needs ``match_dt``, ``winner``, ``by`` and ``win amount``. Its
        ``match_dt`` column is converted to datetime in place.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` with float64 columns ``victory_by_runs_team1``,
        ``victory_by_runs_team2``, ``victory_by_wickets_team1`` and
        ``victory_by_wickets_team2`` set.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _average_margin(past_matches, team, margin_kind):
        # Mean winning margin of `team` for one margin kind, 0.0 without such wins.
        wins = past_matches[(past_matches['winner'] == team) & (past_matches['by'] == margin_kind)]
        return wins['win amount'].mean() if not wins.empty else 0.0

    margins = {
        'victory_by_runs_team1': [],
        'victory_by_runs_team2': [],
        'victory_by_wickets_team1': [],
        'victory_by_wickets_team2': [],
    }

    for team1, team2, played_on in zip(df_train['team1'], df_train['team2'], df_train['match_dt']):
        # Past matches only, so the feature never uses information from the future.
        past_matches = match_df[match_df['match_dt'] < played_on]

        margins['victory_by_runs_team1'].append(_average_margin(past_matches, team1, 'runs'))
        margins['victory_by_wickets_team1'].append(_average_margin(past_matches, team1, 'wickets'))
        margins['victory_by_runs_team2'].append(_average_margin(past_matches, team2, 'runs'))
        # Each team's emptiness check is made on its own matches; previously the
        # team2 wickets value was guarded by team1's wicket wins.
        margins['victory_by_wickets_team2'].append(_average_margin(past_matches, team2, 'wickets'))

    # Use df_train's own index so assignment cannot misalign on a non-default
    # index, and always overwrite so re-running never keeps stale values.
    for column_name, values in margins.items():
        df_train[column_name] = pd.Series(values, index=df_train.index, dtype='float64')

    return df_train

In [None]:
def calculate_victory_columns_last5(df_train, match_df):
    """Add each team's average winning margin over its 5 most recent wins of each kind.

    For every row of ``df_train`` and for each of ``team1`` / ``team2``, the
    ``match_df`` rows with a strictly earlier ``match_dt`` where that team is
    the ``winner`` are split on ``by`` (``'runs'`` or ``'wickets'``); each group
    is sorted newest first, capped at 5 rows, and the mean ``win amount`` is
    stored. 0.0 is stored for a group with no matches.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``team1``, ``team2`` and ``match_dt``. Modified in place.
    match_df : pandas.DataFrame
        Needs ``match_dt``, ``winner``, ``by`` and ``win amount``. Its
        ``match_dt`` column is converted to datetime in place.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` with float64 columns ``victory_by_runs_team1_last5``,
        ``victory_by_runs_team2_last5``, ``victory_by_wickets_team1_last5`` and
        ``victory_by_wickets_team2_last5`` set.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _recent_average_margin(past_matches, team, margin_kind):
        # Mean margin over the team's five latest wins of this kind, 0.0 without any.
        wins = past_matches[(past_matches['winner'] == team) & (past_matches['by'] == margin_kind)]
        recent_wins = wins.sort_values(by='match_dt', ascending=False).head(5)
        return recent_wins['win amount'].mean() if not recent_wins.empty else 0.0

    margins = {
        'victory_by_runs_team1_last5': [],
        'victory_by_runs_team2_last5': [],
        'victory_by_wickets_team1_last5': [],
        'victory_by_wickets_team2_last5': [],
    }

    for team1, team2, played_on in zip(df_train['team1'], df_train['team2'], df_train['match_dt']):
        # Past matches only, so the feature never uses information from the future.
        past_matches = match_df[match_df['match_dt'] < played_on]

        margins['victory_by_runs_team1_last5'].append(
            _recent_average_margin(past_matches, team1, 'runs'))
        margins['victory_by_wickets_team1_last5'].append(
            _recent_average_margin(past_matches, team1, 'wickets'))
        margins['victory_by_runs_team2_last5'].append(
            _recent_average_margin(past_matches, team2, 'runs'))
        margins['victory_by_wickets_team2_last5'].append(
            _recent_average_margin(past_matches, team2, 'wickets'))

    # Use df_train's own index so assignment cannot misalign on a non-default
    # index, and always overwrite so re-running never keeps stale values.
    for column_name, values in margins.items():
        df_train[column_name] = pd.Series(values, index=df_train.index, dtype='float64')

    return df_train

In [None]:
# inning1 avg runs by team1 and team2
def calculate_inning1_avg_runs(df_train, match_df):
    """Add each team's average first-innings total from earlier matches in which it batted first.

    A team is taken to have batted first in a ``match_df`` row when it appears
    as ``team1`` or ``team2`` and either it is the ``toss winner`` with
    ``toss decision == 'bat'`` or it is not the toss winner and the decision is
    ``'field'``. Only rows with a strictly earlier ``match_dt`` are used; 0.0 is
    stored when a team has no such match.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``team1``, ``team2`` and ``match_dt``. Modified in place.
    match_df : pandas.DataFrame
        Needs ``match_dt``, ``team1``, ``team2``, ``toss winner``,
        ``toss decision`` and ``inning1_runs``. Its ``match_dt`` column is
        converted to datetime in place.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` with float64 columns ``inning1_avg_runs_team1`` and
        ``inning1_avg_runs_team2`` set.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _average_batting_first(past_matches, team):
        # Mean inning1_runs over past matches where `team` batted first.
        took_part = (past_matches['team1'] == team) | (past_matches['team2'] == team)
        won_toss = past_matches['toss winner'] == team
        batted_first = (won_toss & (past_matches['toss decision'] == 'bat')) | \
                       (~won_toss & (past_matches['toss decision'] == 'field'))
        first_innings = past_matches[took_part & batted_first]
        return first_innings['inning1_runs'].mean() if not first_innings.empty else 0.0

    team1_averages = []
    team2_averages = []
    for team1, team2, played_on in zip(df_train['team1'], df_train['team2'], df_train['match_dt']):
        # Past matches only, so the feature never uses information from the future.
        past_matches = match_df[match_df['match_dt'] < played_on]
        team1_averages.append(_average_batting_first(past_matches, team1))
        team2_averages.append(_average_batting_first(past_matches, team2))

    # Use df_train's own index so assignment cannot misalign on a non-default
    # index, and always overwrite so re-running never keeps stale values.
    df_train['inning1_avg_runs_team1'] = pd.Series(
        team1_averages, index=df_train.index, dtype='float64')
    df_train['inning1_avg_runs_team2'] = pd.Series(
        team2_averages, index=df_train.index, dtype='float64')

    return df_train

In [None]:
def calculate_inning1_avg_runs_last5(df_train, match_df):
    """Add each side's mean ``inning1_runs`` over its five latest bat-first matches.

    For every row of ``df_train`` only ``match_df`` rows dated strictly before
    that row's ``match_dt`` are considered.  A past match qualifies for a team
    when the team is listed as ``team1`` or ``team2`` and either it won the toss
    and chose ``'bat'`` or it did not win the toss and the decision was
    ``'field'``.  The five most recent qualifying matches (fewer if that is all
    there is) are averaged; a team without any qualifying match gets ``0.0``.

    Args:
        df_train: frame with ``team1``, ``team2`` and ``match_dt`` columns.
            It is modified in place and also returned.
        match_df: match history with ``match_dt``, ``team1``, ``team2``,
            ``toss winner``, ``toss decision`` and ``inning1_runs`` columns.
            Its ``match_dt`` column is converted to datetime in place.

    Returns:
        ``df_train`` with float64 columns ``inning1_avg_runs_team1_last5`` and
        ``inning1_avg_runs_team2_last5``.  A column that already exists is only
        cast to float64, not overwritten.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _recent_avg(team, history):
        # One mask instead of four OR-ed clauses: the team takes part in the
        # match and the toss outcome had it batting first.
        takes_part = (history['team1'] == team) | (history['team2'] == team)
        won_toss = history['toss winner'] == team
        chose_bat = history['toss decision'] == 'bat'
        chose_field = history['toss decision'] == 'field'
        qualifies = takes_part & ((won_toss & chose_bat) | (~won_toss & chose_field))
        recent = history[qualifies].sort_values(by='match_dt', ascending=False).head(5)
        return recent['inning1_runs'].mean() if not recent.empty else 0.0

    team1_avgs = []
    team2_avgs = []
    for team1, team2, current_dt in zip(df_train['team1'], df_train['team2'], df_train['match_dt']):
        history = match_df[match_df['match_dt'] < current_dt]
        team1_avgs.append(_recent_avg(team1, history))
        team2_avgs.append(_recent_avg(team2, history))

    computed = (
        ('inning1_avg_runs_team1_last5', team1_avgs),
        ('inning1_avg_runs_team2_last5', team2_avgs),
    )
    for column, values in computed:
        if column not in df_train.columns:
            # The values are in row order, so the Series must carry df_train's
            # own index; a default 0..n-1 index would be label-aligned on
            # assignment and give NaN/misplaced values on a filtered frame.
            df_train[column] = pd.Series(values, index=df_train.index, dtype='float64')
        else:
            # NOTE(review): an existing column is kept and the freshly computed
            # values are discarded -- confirm this is intended.
            df_train[column] = df_train[column].astype('float64')

    return df_train

In [None]:
# inning2 avg runs by team1 and team2
def calculate_inning2_avg_runs(df_train, match_df):
    """Add each side's historical mean ``inning2_runs`` from its field-first matches.

    For every row of ``df_train`` only ``match_df`` rows dated strictly before
    that row's ``match_dt`` are considered.  A past match qualifies for a team
    when the team is listed as ``team1`` or ``team2`` and either it won the toss
    and chose ``'field'`` or it did not win the toss and the decision was
    ``'bat'``.  All qualifying matches are averaged; a team without any
    qualifying match gets ``0.0``.

    Args:
        df_train: frame with ``team1``, ``team2`` and ``match_dt`` columns.
            It is modified in place and also returned.
        match_df: match history with ``match_dt``, ``team1``, ``team2``,
            ``toss winner``, ``toss decision`` and ``inning2_runs`` columns.
            Its ``match_dt`` column is converted to datetime in place.

    Returns:
        ``df_train`` with float64 columns ``inning2_avg_runs_team1`` and
        ``inning2_avg_runs_team2``.  A column that already exists is only cast
        to float64, not overwritten.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _history_avg(team, history):
        # The team takes part in the match and the toss outcome had it
        # fielding first, i.e. batting in the second innings.
        takes_part = (history['team1'] == team) | (history['team2'] == team)
        won_toss = history['toss winner'] == team
        chose_bat = history['toss decision'] == 'bat'
        chose_field = history['toss decision'] == 'field'
        qualifies = takes_part & ((won_toss & chose_field) | (~won_toss & chose_bat))
        selected = history[qualifies]
        return selected['inning2_runs'].mean() if not selected.empty else 0.0

    team1_avgs = []
    team2_avgs = []
    for team1, team2, current_dt in zip(df_train['team1'], df_train['team2'], df_train['match_dt']):
        history = match_df[match_df['match_dt'] < current_dt]
        team1_avgs.append(_history_avg(team1, history))
        team2_avgs.append(_history_avg(team2, history))

    computed = (
        ('inning2_avg_runs_team1', team1_avgs),
        ('inning2_avg_runs_team2', team2_avgs),
    )
    for column, values in computed:
        if column not in df_train.columns:
            # Carry df_train's own index so the row-ordered values are not
            # label-aligned against a default 0..n-1 index on assignment.
            df_train[column] = pd.Series(values, index=df_train.index, dtype='float64')
        else:
            # NOTE(review): an existing column is kept and the freshly computed
            # values are discarded -- confirm this is intended.
            df_train[column] = df_train[column].astype('float64')

    return df_train

In [None]:
def calculate_inning2_avg_runs_last5(df_train, match_df):
    """Add each side's mean ``inning2_runs`` over its five latest field-first matches.

    For every row of ``df_train`` only ``match_df`` rows dated strictly before
    that row's ``match_dt`` are considered.  A past match qualifies for a team
    when the team is listed as ``team1`` or ``team2`` and either it won the toss
    and chose ``'field'`` or it did not win the toss and the decision was
    ``'bat'``.  The five most recent qualifying matches (fewer if that is all
    there is) are averaged; a team without any qualifying match gets ``0.0``.

    Args:
        df_train: frame with ``team1``, ``team2`` and ``match_dt`` columns.
            It is modified in place and also returned.
        match_df: match history with ``match_dt``, ``team1``, ``team2``,
            ``toss winner``, ``toss decision`` and ``inning2_runs`` columns.
            Its ``match_dt`` column is converted to datetime in place.

    Returns:
        ``df_train`` with float64 columns ``inning2_avg_runs_team1_last5`` and
        ``inning2_avg_runs_team2_last5``.  A column that already exists is only
        cast to float64, not overwritten.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _recent_avg(team, history):
        # The team takes part in the match and the toss outcome had it
        # fielding first, i.e. batting in the second innings.
        takes_part = (history['team1'] == team) | (history['team2'] == team)
        won_toss = history['toss winner'] == team
        chose_bat = history['toss decision'] == 'bat'
        chose_field = history['toss decision'] == 'field'
        qualifies = takes_part & ((won_toss & chose_field) | (~won_toss & chose_bat))
        recent = history[qualifies].sort_values(by='match_dt', ascending=False).head(5)
        return recent['inning2_runs'].mean() if not recent.empty else 0.0

    team1_avgs = []
    team2_avgs = []
    for team1, team2, current_dt in zip(df_train['team1'], df_train['team2'], df_train['match_dt']):
        history = match_df[match_df['match_dt'] < current_dt]
        team1_avgs.append(_recent_avg(team1, history))
        team2_avgs.append(_recent_avg(team2, history))

    computed = (
        ('inning2_avg_runs_team1_last5', team1_avgs),
        ('inning2_avg_runs_team2_last5', team2_avgs),
    )
    for column, values in computed:
        if column not in df_train.columns:
            # Carry df_train's own index so the row-ordered values are not
            # label-aligned against a default 0..n-1 index on assignment.
            df_train[column] = pd.Series(values, index=df_train.index, dtype='float64')
        else:
            # NOTE(review): an existing column is kept and the freshly computed
            # values are discarded -- confirm this is intended.
            df_train[column] = df_train[column].astype('float64')

    return df_train

In [None]:
# inning1 avg wickets by team1 and team2
def calculate_inning1_avg_wickets(df_train, match_df):
    """Add each side's historical mean ``inning1_wickets`` from its field-first matches.

    For every row of ``df_train`` only ``match_df`` rows dated strictly before
    that row's ``match_dt`` are considered.  A past match qualifies for a team
    when the team is listed as ``team1`` or ``team2`` and either it won the toss
    and chose ``'field'`` or it did not win the toss and the decision was
    ``'bat'`` -- i.e. the toss had the team in the field during the first
    innings.  All qualifying matches are averaged; a team without any
    qualifying match gets ``0.0``.

    Args:
        df_train: frame with ``team1``, ``team2`` and ``match_dt`` columns.
            It is modified in place and also returned.
        match_df: match history with ``match_dt``, ``team1``, ``team2``,
            ``toss winner``, ``toss decision`` and ``inning1_wickets`` columns.
            Its ``match_dt`` column is converted to datetime in place.

    Returns:
        ``df_train`` with float64 columns ``inning1_avg_wickets_team1`` and
        ``inning1_avg_wickets_team2``.  A column that already exists is only
        cast to float64, not overwritten.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _history_avg(team, history):
        # The team takes part in the match and the toss outcome had it
        # fielding during the first innings.
        takes_part = (history['team1'] == team) | (history['team2'] == team)
        won_toss = history['toss winner'] == team
        chose_bat = history['toss decision'] == 'bat'
        chose_field = history['toss decision'] == 'field'
        qualifies = takes_part & ((won_toss & chose_field) | (~won_toss & chose_bat))
        selected = history[qualifies]
        return selected['inning1_wickets'].mean() if not selected.empty else 0.0

    team1_avgs = []
    team2_avgs = []
    for team1, team2, current_dt in zip(df_train['team1'], df_train['team2'], df_train['match_dt']):
        history = match_df[match_df['match_dt'] < current_dt]
        team1_avgs.append(_history_avg(team1, history))
        team2_avgs.append(_history_avg(team2, history))

    computed = (
        ('inning1_avg_wickets_team1', team1_avgs),
        ('inning1_avg_wickets_team2', team2_avgs),
    )
    for column, values in computed:
        if column not in df_train.columns:
            # Carry df_train's own index so the row-ordered values are not
            # label-aligned against a default 0..n-1 index on assignment.
            df_train[column] = pd.Series(values, index=df_train.index, dtype='float64')
        else:
            # NOTE(review): an existing column is kept and the freshly computed
            # values are discarded -- confirm this is intended.
            df_train[column] = df_train[column].astype('float64')

    return df_train

In [None]:
def calculate_inning1_avg_wickets_last5(df_train, match_df):
    """Add each side's mean ``inning1_wickets`` over its five latest field-first matches.

    For every row of ``df_train`` only ``match_df`` rows dated strictly before
    that row's ``match_dt`` are considered.  A past match qualifies for a team
    when the team is listed as ``team1`` or ``team2`` and either it won the toss
    and chose ``'field'`` or it did not win the toss and the decision was
    ``'bat'`` -- i.e. the toss had the team in the field during the first
    innings.  The five most recent qualifying matches (fewer if that is all
    there is) are averaged; a team without any qualifying match gets ``0.0``.

    Args:
        df_train: frame with ``team1``, ``team2`` and ``match_dt`` columns.
            It is modified in place and also returned.
        match_df: match history with ``match_dt``, ``team1``, ``team2``,
            ``toss winner``, ``toss decision`` and ``inning1_wickets`` columns.
            Its ``match_dt`` column is converted to datetime in place.

    Returns:
        ``df_train`` with float64 columns ``inning1_avg_wickets_team1_last5``
        and ``inning1_avg_wickets_team2_last5``.  A column that already exists
        is only cast to float64, not overwritten.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _recent_avg(team, history):
        # The team takes part in the match and the toss outcome had it
        # fielding during the first innings.
        takes_part = (history['team1'] == team) | (history['team2'] == team)
        won_toss = history['toss winner'] == team
        chose_bat = history['toss decision'] == 'bat'
        chose_field = history['toss decision'] == 'field'
        qualifies = takes_part & ((won_toss & chose_field) | (~won_toss & chose_bat))
        recent = history[qualifies].sort_values(by='match_dt', ascending=False).head(5)
        return recent['inning1_wickets'].mean() if not recent.empty else 0.0

    team1_avgs = []
    team2_avgs = []
    for team1, team2, current_dt in zip(df_train['team1'], df_train['team2'], df_train['match_dt']):
        history = match_df[match_df['match_dt'] < current_dt]
        team1_avgs.append(_recent_avg(team1, history))
        team2_avgs.append(_recent_avg(team2, history))

    computed = (
        ('inning1_avg_wickets_team1_last5', team1_avgs),
        ('inning1_avg_wickets_team2_last5', team2_avgs),
    )
    for column, values in computed:
        if column not in df_train.columns:
            # Carry df_train's own index so the row-ordered values are not
            # label-aligned against a default 0..n-1 index on assignment.
            df_train[column] = pd.Series(values, index=df_train.index, dtype='float64')
        else:
            # NOTE(review): an existing column is kept and the freshly computed
            # values are discarded -- confirm this is intended.
            df_train[column] = df_train[column].astype('float64')

    return df_train

In [None]:
# inning2 avg wickets by team1 and team2
def calculate_inning2_avg_wickets(df_train, match_df):
    """Add each side's historical mean ``inning2_wickets`` from its bat-first matches.

    For every row of ``df_train`` only ``match_df`` rows dated strictly before
    that row's ``match_dt`` are considered.  A past match qualifies for a team
    when the team is listed as ``team1`` or ``team2`` and either it won the toss
    and chose ``'bat'`` or it did not win the toss and the decision was
    ``'field'`` -- i.e. the toss had the team in the field during the second
    innings.  All qualifying matches are averaged; a team without any
    qualifying match gets ``0.0``.

    Args:
        df_train: frame with ``team1``, ``team2`` and ``match_dt`` columns.
            It is modified in place and also returned.
        match_df: match history with ``match_dt``, ``team1``, ``team2``,
            ``toss winner``, ``toss decision`` and ``inning2_wickets`` columns.
            Its ``match_dt`` column is converted to datetime in place.

    Returns:
        ``df_train`` with float64 columns ``inning2_avg_wickets_team1`` and
        ``inning2_avg_wickets_team2``.  A column that already exists is only
        cast to float64, not overwritten.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _history_avg(team, history):
        # The team takes part in the match and the toss outcome had it
        # batting first, i.e. fielding during the second innings.
        takes_part = (history['team1'] == team) | (history['team2'] == team)
        won_toss = history['toss winner'] == team
        chose_bat = history['toss decision'] == 'bat'
        chose_field = history['toss decision'] == 'field'
        qualifies = takes_part & ((won_toss & chose_bat) | (~won_toss & chose_field))
        selected = history[qualifies]
        return selected['inning2_wickets'].mean() if not selected.empty else 0.0

    team1_avgs = []
    team2_avgs = []
    for team1, team2, current_dt in zip(df_train['team1'], df_train['team2'], df_train['match_dt']):
        history = match_df[match_df['match_dt'] < current_dt]
        team1_avgs.append(_history_avg(team1, history))
        team2_avgs.append(_history_avg(team2, history))

    computed = (
        ('inning2_avg_wickets_team1', team1_avgs),
        ('inning2_avg_wickets_team2', team2_avgs),
    )
    for column, values in computed:
        if column not in df_train.columns:
            # Carry df_train's own index so the row-ordered values are not
            # label-aligned against a default 0..n-1 index on assignment.
            df_train[column] = pd.Series(values, index=df_train.index, dtype='float64')
        else:
            # NOTE(review): an existing column is kept and the freshly computed
            # values are discarded -- confirm this is intended.
            df_train[column] = df_train[column].astype('float64')

    return df_train

In [None]:
def calculate_inning2_avg_wickets_last5(df_train, match_df):
    """Add each side's mean ``inning2_wickets`` over its five latest bat-first matches.

    For every row of ``df_train`` only ``match_df`` rows dated strictly before
    that row's ``match_dt`` are considered.  A past match qualifies for a team
    when the team is listed as ``team1`` or ``team2`` and either it won the toss
    and chose ``'bat'`` or it did not win the toss and the decision was
    ``'field'`` -- i.e. the toss had the team in the field during the second
    innings.  The five most recent qualifying matches (fewer if that is all
    there is) are averaged; a team without any qualifying match gets ``0.0``.

    Args:
        df_train: frame with ``team1``, ``team2`` and ``match_dt`` columns.
            It is modified in place and also returned.
        match_df: match history with ``match_dt``, ``team1``, ``team2``,
            ``toss winner``, ``toss decision`` and ``inning2_wickets`` columns.
            Its ``match_dt`` column is converted to datetime in place.

    Returns:
        ``df_train`` with float64 columns ``inning2_avg_wickets_team1_last5``
        and ``inning2_avg_wickets_team2_last5``.  A column that already exists
        is only cast to float64, not overwritten.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _recent_avg(team, history):
        # The team takes part in the match and the toss outcome had it
        # batting first, i.e. fielding during the second innings.
        takes_part = (history['team1'] == team) | (history['team2'] == team)
        won_toss = history['toss winner'] == team
        chose_bat = history['toss decision'] == 'bat'
        chose_field = history['toss decision'] == 'field'
        qualifies = takes_part & ((won_toss & chose_bat) | (~won_toss & chose_field))
        recent = history[qualifies].sort_values(by='match_dt', ascending=False).head(5)
        return recent['inning2_wickets'].mean() if not recent.empty else 0.0

    team1_avgs = []
    team2_avgs = []
    for team1, team2, current_dt in zip(df_train['team1'], df_train['team2'], df_train['match_dt']):
        history = match_df[match_df['match_dt'] < current_dt]
        team1_avgs.append(_recent_avg(team1, history))
        team2_avgs.append(_recent_avg(team2, history))

    computed = (
        ('inning2_avg_wickets_team1_last5', team1_avgs),
        ('inning2_avg_wickets_team2_last5', team2_avgs),
    )
    for column, values in computed:
        if column not in df_train.columns:
            # Carry df_train's own index so the row-ordered values are not
            # label-aligned against a default 0..n-1 index on assignment.
            df_train[column] = pd.Series(values, index=df_train.index, dtype='float64')
        else:
            # NOTE(review): an existing column is kept and the freshly computed
            # values are discarded -- confirm this is intended.
            df_train[column] = df_train[column].astype('float64')

    return df_train

In [None]:
# generate team_won_in_past_ratio
def calculate_past_wins(df_train, match_df):
    """Add each side's share of past matches won before the row's match date.

    For every row of ``df_train`` a team's past matches are the ``match_df``
    rows in which it is listed as ``team1`` or ``team2`` and whose ``match_dt``
    is strictly earlier than the row's ``match_dt``.  The ratio is the number
    of those matches whose ``winner`` equals the team divided by their count;
    a team with no past match gets ``0.0``.

    Args:
        df_train: frame with ``team1``, ``team2`` and ``match_dt`` columns.
            It is modified in place and also returned.
        match_df: match history with ``match_dt``, ``team1``, ``team2`` and
            ``winner`` columns.  Its ``match_dt`` column is converted to
            datetime in place.

    Returns:
        ``df_train`` with float64 columns ``team1_won_in_past`` and
        ``team2_won_in_past`` (always overwritten).
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _win_ratio(team, before):
        takes_part = (match_df['team1'] == team) | (match_df['team2'] == team)
        played = match_df[takes_part & (match_df['match_dt'] < before)]
        total = played.shape[0]
        if total == 0:
            return 0.0
        # Every row in `played` has the team on one side, so "winner equals
        # the side the team is listed on" reduces to "winner equals the team".
        wins = int((played['winner'] == team).sum())
        return wins / total

    team1_ratios = []
    team2_ratios = []
    for team1, team2, match_date in zip(df_train['team1'], df_train['team2'], df_train['match_dt']):
        team1_ratios.append(float(_win_ratio(team1, match_date)))
        team2_ratios.append(float(_win_ratio(team2, match_date)))

    # Carry df_train's own index: a Series with a default 0..n-1 index would be
    # label-aligned on assignment and give NaN/misplaced values on a frame
    # whose index is not 0..n-1.
    df_train['team1_won_in_past'] = pd.Series(team1_ratios, index=df_train.index, dtype='float64')
    df_train['team2_won_in_past'] = pd.Series(team2_ratios, index=df_train.index, dtype='float64')

    return df_train

In [None]:
def calculate_past_wins_last5(df_train, match_df):
    """Add each side's win share over its five most recent past matches.

    For every row of ``df_train`` a team's past matches are the ``match_df``
    rows in which it is listed as ``team1`` or ``team2`` and whose ``match_dt``
    is strictly earlier than the row's ``match_dt``.  Only the five most recent
    of them (fewer if that is all there is) are used: the ratio is the number
    whose ``winner`` equals the team divided by how many were used; a team with
    no past match gets ``0.0``.

    Args:
        df_train: frame with ``team1``, ``team2`` and ``match_dt`` columns.
            It is modified in place and also returned.
        match_df: match history with ``match_dt``, ``team1``, ``team2`` and
            ``winner`` columns.  Its ``match_dt`` column is converted to
            datetime in place.

    Returns:
        ``df_train`` with float64 columns ``team1_won_in_past_last5`` and
        ``team2_won_in_past_last5`` (always overwritten).
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _recent_win_ratio(team, before):
        takes_part = (match_df['team1'] == team) | (match_df['team2'] == team)
        played = match_df[takes_part & (match_df['match_dt'] < before)]
        recent = played.sort_values(by='match_dt', ascending=False).head(5)
        total = recent.shape[0]
        if total == 0:
            return 0.0
        # Every row in `recent` has the team on one side, so "winner equals
        # the side the team is listed on" reduces to "winner equals the team".
        wins = int((recent['winner'] == team).sum())
        return wins / total

    team1_ratios = []
    team2_ratios = []
    for team1, team2, match_date in zip(df_train['team1'], df_train['team2'], df_train['match_dt']):
        team1_ratios.append(float(_recent_win_ratio(team1, match_date)))
        team2_ratios.append(float(_recent_win_ratio(team2, match_date)))

    # Carry df_train's own index: a Series with a default 0..n-1 index would be
    # label-aligned on assignment and give NaN/misplaced values on a frame
    # whose index is not 0..n-1.
    df_train['team1_won_in_past_last5'] = pd.Series(team1_ratios, index=df_train.index, dtype='float64')
    df_train['team2_won_in_past_last5'] = pd.Series(team2_ratios, index=df_train.index, dtype='float64')

    return df_train

In [None]:
# calculate nrr
def calculate_nrr(df_train, match_df):
    """Add a runs-per-ball difference feature for both sides of every match.

    For every row of ``df_train`` only ``match_df`` rows dated strictly before
    that row's ``match_dt`` are considered.  For a team two rates are built
    from that history:

    * first-innings rate: ``inning1_runs`` / ``inning1_balls`` summed over the
      matches where the team is listed as ``team1``;
    * second-innings rate: ``inning2_runs`` / ``inning2_balls`` summed over
      the matches where the team is listed as ``team2``.

    ``nrr_team1`` is the first-innings rate minus the second-innings rate of
    the row's ``team1``; ``nrr_team2`` is the second-innings rate minus the
    first-innings rate of the row's ``team2``.  When either ball total is not
    positive the value is ``0.0``.

    Args:
        df_train: frame with ``team1``, ``team2`` and ``match_dt`` columns.
            It is modified in place and also returned.
        match_df: match history with ``match_dt``, ``team1``, ``team2``,
            ``inning1_runs``, ``inning1_balls``, ``inning2_runs`` and
            ``inning2_balls`` columns.  Its ``match_dt`` column is converted
            to datetime in place.

    Returns:
        ``df_train`` with float64 columns ``nrr_team1`` and ``nrr_team2``
        (always overwritten).
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _innings_rates(team, history):
        """Return (first-innings rate, second-innings rate), or None if undefined."""
        listed_first = history[history['team1'] == team]
        listed_second = history[history['team2'] == team]
        # Series.sum() skips missing values, so one incomplete scorecard does
        # not turn the whole running total into NaN.
        inning1_runs = listed_first['inning1_runs'].sum()
        inning1_balls = listed_first['inning1_balls'].sum()
        inning2_runs = listed_second['inning2_runs'].sum()
        inning2_balls = listed_second['inning2_balls'].sum()
        if inning1_balls > 0 and inning2_balls > 0:
            return inning1_runs / inning1_balls, inning2_runs / inning2_balls
        return None

    nrr_team1 = []
    nrr_team2 = []
    for team1, team2, current_dt in zip(df_train['team1'], df_train['team2'], df_train['match_dt']):
        history = match_df[match_df['match_dt'] < current_dt]

        rates_team1 = _innings_rates(team1, history)
        rates_team2 = _innings_rates(team2, history)

        nrr_team1.append(rates_team1[0] - rates_team1[1] if rates_team1 is not None else 0.0)
        # NOTE(review): the sign is the opposite of nrr_team1 (second innings
        # minus first), so the same history scores differently depending on
        # which side of the row the team is on -- confirm this is intended.
        nrr_team2.append(rates_team2[1] - rates_team2[0] if rates_team2 is not None else 0.0)

    # Carry df_train's own index: a Series with a default 0..n-1 index would be
    # label-aligned on assignment and give NaN/misplaced values on a frame
    # whose index is not 0..n-1.
    df_train['nrr_team1'] = pd.Series(nrr_team1, index=df_train.index, dtype='float64')
    df_train['nrr_team2'] = pd.Series(nrr_team2, index=df_train.index, dtype='float64')

    return df_train

In [None]:
def calculate_nrr_last5(df_train, match_df):
    """Add run-rate difference features based on each team's last five matches.

    For every row of ``df_train`` the matches in ``match_df`` dated strictly
    before the row's ``match_dt`` are considered. For each of the row's two
    teams, the (up to) five most recent of those matches in which the team
    appears as ``team1`` or ``team2`` are selected and their
    ``inning1_runs`` / ``inning1_balls`` / ``inning2_runs`` / ``inning2_balls``
    columns are summed.

    * ``nrr_team1_last5`` = inning-1 runs per ball - inning-2 runs per ball
    * ``nrr_team2_last5`` = inning-2 runs per ball - inning-1 runs per ball

    A value of 0.0 is stored when either ball total is not positive.

    Both frames are modified in place: their ``match_dt`` columns are converted
    to datetime and the two feature columns are added to ``df_train``.

    NOTE(review): the totals are summed over all selected matches without
    checking on which side (team1/team2) the team played - confirm this is the
    intended definition.

    Args:
        df_train: frame with ``match_dt``, ``team1`` and ``team2`` columns.
        match_df: match-level scorecard with ``match_dt``, ``team1``, ``team2``
            and the four innings runs/balls columns.

    Returns:
        ``df_train`` with ``nrr_team1_last5`` and ``nrr_team2_last5`` added.
    """
    # Ensure the 'match_dt' columns are in datetime format
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    def _last5_totals(past_matches, team):
        # Sum runs and balls per innings over the team's five latest matches.
        involved = (past_matches['team1'] == team) | (past_matches['team2'] == team)
        latest = past_matches[involved].sort_values(by='match_dt', ascending=False).head(5)
        return (
            latest['inning1_runs'].sum(),
            latest['inning1_balls'].sum(),
            latest['inning2_runs'].sum(),
            latest['inning2_balls'].sum(),
        )

    # Initialize lists to store the calculated NRRs (one entry per df_train row)
    nrr_team1_last5 = []
    nrr_team2_last5 = []

    # Iterate through each row in df_train
    for _, row in df_train.iterrows():
        # Only matches strictly before the current match may contribute
        past_matches = match_df[match_df['match_dt'] < row['match_dt']]

        runs1, balls1, runs2, balls2 = _last5_totals(past_matches, row['team1'])
        if balls1 > 0 and balls2 > 0:
            nrr1 = (runs1 / balls1) - (runs2 / balls2)
        else:
            nrr1 = 0.0

        runs1, balls1, runs2, balls2 = _last5_totals(past_matches, row['team2'])
        if balls1 > 0 and balls2 > 0:
            # Sign is reversed relative to team1: inning 2 rate minus inning 1 rate
            nrr2 = (runs2 / balls2) - (runs1 / balls1)
        else:
            nrr2 = 0.0

        nrr_team1_last5.append(nrr1)
        nrr_team2_last5.append(nrr2)

    # Build the Series on df_train's own index: a Series with a fresh RangeIndex
    # would be aligned by label and yield NaN / misplaced values whenever
    # df_train does not carry the default 0..n-1 index.
    df_train['nrr_team1_last5'] = pd.Series(nrr_team1_last5, index=df_train.index, dtype='float64')
    df_train['nrr_team2_last5'] = pd.Series(nrr_team2_last5, index=df_train.index, dtype='float64')

    return df_train

In [None]:
# add lighting win ratios
def add_lighting_win_ratios(df_train, match_df):
    """Add each team's historical day-win to night-win ratio.

    For every row of ``df_train`` the matches in ``match_df`` that involve the
    team (as ``team1`` or ``team2``) and are dated strictly before the row's
    ``match_dt`` are collected, and the number of wins in ``'day match'``
    lighting is compared with the number of wins in ``'night match'`` lighting.

    Both frames are modified in place: their ``match_dt`` columns are converted
    to datetime and the columns ``team1_win_lighting1`` and
    ``team2_win_lighting1`` are added to ``df_train``.

    Args:
        df_train: frame with ``match_dt``, ``team1`` and ``team2`` columns.
        match_df: match-level scorecard with ``match_dt``, ``team1``, ``team2``,
            ``winner`` and ``lighting`` columns.

    Returns:
        ``df_train`` with the two ratio columns added.
    """
    def calculate_win_ratio(team, past_matches):
        # Count the team's wins separately per lighting type.
        team_won = past_matches['winner'] == team
        wins_day = past_matches[team_won & (past_matches['lighting'] == 'day match')].shape[0]
        wins_night = past_matches[team_won & (past_matches['lighting'] == 'night match')].shape[0]

        # The zero cases replace the plain quotient so no division by zero occurs.
        if wins_night == 0:
            return wins_day  # No night wins: the day-win count itself is used
        elif wins_day == 0:
            return 1 / wins_night  # No day wins: reciprocal of the night-win count
        else:
            return wins_day / wins_night

    # Initialize the new columns as floats. The ratios written below are
    # fractional, and writing a float into an int column is a dtype-incompatible
    # set in pandas.
    df_train['team1_win_lighting1'] = 0.0
    df_train['team2_win_lighting1'] = 0.0

    # Convert match_dt columns to datetime
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    # Traverse each row of df_train
    for index, row in df_train.iterrows():
        team1 = row['team1']
        team2 = row['team2']
        # Only matches strictly before the current one may contribute
        is_earlier = match_df['match_dt'] < row['match_dt']

        past_matches_team1 = match_df[((match_df['team1'] == team1) | (match_df['team2'] == team1)) & is_earlier]
        past_matches_team2 = match_df[((match_df['team1'] == team2) | (match_df['team2'] == team2)) & is_earlier]

        # Assign the calculated win ratios to the respective columns
        df_train.at[index, 'team1_win_lighting1'] = calculate_win_ratio(team1, past_matches_team1)
        df_train.at[index, 'team2_win_lighting1'] = calculate_win_ratio(team2, past_matches_team2)

    return df_train

In [None]:
def add_lighting_win_ratios_last5(df_train, match_df):
    """Add day-win to night-win ratios restricted to recent matches.

    For each team of each ``df_train`` row, the team's matches dated strictly
    before the row's ``match_dt`` are split by lighting; the five most recent
    ``'day match'`` games and the five most recent ``'night match'`` games are
    taken, and the wins in each window are counted and compared.

    Both frames are modified in place (``match_dt`` converted to datetime,
    ``team1_win_lighting1_last5`` / ``team2_win_lighting1_last5`` added to
    ``df_train``). Returns ``df_train``.
    """
    def calculate_win_ratio(team, match_df, match_date):
        # Earlier matches in which the team took part on either side
        takes_part = (match_df['team1'] == team) | (match_df['team2'] == team)
        history = match_df[takes_part & (match_df['match_dt'] < match_date)]

        def recent_wins(lighting):
            # Wins inside the five latest matches played under this lighting
            window = history[history['lighting'] == lighting]
            window = window.sort_values(by='match_dt', ascending=False).head(5)
            return window[window['winner'] == team].shape[0]

        day_wins = recent_wins('day match')
        night_wins = recent_wins('night match')

        # Guard clauses stand in for the quotient whenever a count is zero
        if night_wins == 0:
            return day_wins
        if day_wins == 0:
            return 1 / night_wins
        return day_wins / night_wins

    # (team column, output column) pairs, in the order the columns are created
    targets = (
        ('team1', 'team1_win_lighting1_last5'),
        ('team2', 'team2_win_lighting1_last5'),
    )

    for _, out_col in targets:
        df_train[out_col] = 0.0

    # Normalise both date columns to datetime before comparing them
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    for index, row in df_train.iterrows():
        played_on = row['match_dt']
        ratios = [calculate_win_ratio(row[team_col], match_df, played_on) for team_col, _ in targets]
        for (_, out_col), ratio in zip(targets, ratios):
            df_train.at[index, out_col] = ratio

    return df_train

In [None]:
def add_day_night_win_ratios(df_train, match_df):
    """Add each team's historical win ratio in day/night matches.

    For every row of ``df_train`` the matches in ``match_df`` that involve the
    team (as ``team1`` or ``team2``) and are dated strictly before the row's
    ``match_dt`` are collected. The ratio is the number of those matches with
    ``lighting == 'day/night match'`` that the team won, divided by the number
    of such matches; it is 0 when the team has no earlier day/night match.

    Both frames are modified in place: their ``match_dt`` columns are converted
    to datetime and the columns ``team1_win_lighting2`` and
    ``team2_win_lighting2`` are added to ``df_train``.

    Args:
        df_train: frame with ``match_dt``, ``team1`` and ``team2`` columns.
        match_df: match-level scorecard with ``match_dt``, ``team1``, ``team2``,
            ``winner`` and ``lighting`` columns.

    Returns:
        ``df_train`` with the two ratio columns added.
    """
    def calculate_day_night_win_ratio(team, past_matches):
        # Restrict to day/night fixtures, then count how many the team won
        day_night = past_matches[past_matches['lighting'] == 'day/night match']
        total_day_night_matches = day_night.shape[0]
        wins_day_night = day_night[day_night['winner'] == team].shape[0]

        return wins_day_night / total_day_night_matches if total_day_night_matches != 0 else 0

    # Initialize the new columns as floats. The ratios written below are
    # fractional, and writing a float into an int column is a dtype-incompatible
    # set in pandas.
    df_train['team1_win_lighting2'] = 0.0
    df_train['team2_win_lighting2'] = 0.0

    # Convert match_dt columns to datetime
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    # Traverse each row of df_train
    for index, row in df_train.iterrows():
        team1 = row['team1']
        team2 = row['team2']
        # Only matches strictly before the current one may contribute
        is_earlier = match_df['match_dt'] < row['match_dt']

        past_matches_team1 = match_df[((match_df['team1'] == team1) | (match_df['team2'] == team1)) & is_earlier]
        past_matches_team2 = match_df[((match_df['team1'] == team2) | (match_df['team2'] == team2)) & is_earlier]

        # Assign the calculated win ratios to the respective columns
        df_train.at[index, 'team1_win_lighting2'] = calculate_day_night_win_ratio(team1, past_matches_team1)
        df_train.at[index, 'team2_win_lighting2'] = calculate_day_night_win_ratio(team2, past_matches_team2)

    return df_train

In [None]:
def add_day_night_win_ratios_last5(df_train, match_df):
    """Add each team's win ratio over its five latest day/night matches.

    For each team of each ``df_train`` row, the team's ``'day/night match'``
    fixtures dated strictly before the row's ``match_dt`` are sorted from newest
    to oldest and the first five are kept. The ratio is wins divided by the
    number of matches kept, or 0 when none exist.

    Both frames are modified in place (``match_dt`` converted to datetime,
    ``team1_win_lighting2_last5`` / ``team2_win_lighting2_last5`` added to
    ``df_train``). Returns ``df_train``.
    """
    def calculate_day_night_win_ratio(team, match_df, match_date):
        # Earlier matches in which the team took part on either side
        takes_part = (match_df['team1'] == team) | (match_df['team2'] == team)
        history = match_df[takes_part & (match_df['match_dt'] < match_date)]

        # Five most recent day/night fixtures
        day_night_only = history[history['lighting'] == 'day/night match']
        window = day_night_only.sort_values(by='match_dt', ascending=False).head(5)

        played = window.shape[0]
        if played == 0:
            return 0
        won = window[window['winner'] == team].shape[0]
        return won / played

    # (team column, output column) pairs, in the order the columns are created
    targets = (
        ('team1', 'team1_win_lighting2_last5'),
        ('team2', 'team2_win_lighting2_last5'),
    )

    for _, out_col in targets:
        df_train[out_col] = 0.0

    # Normalise both date columns to datetime before comparing them
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    for index, row in df_train.iterrows():
        played_on = row['match_dt']
        ratios = [calculate_day_night_win_ratio(row[team_col], match_df, played_on) for team_col, _ in targets]
        for (_, out_col), ratio in zip(targets, ratios):
            df_train.at[index, out_col] = ratio

    return df_train

In [None]:
def calculate_wickets_avg_lighting(df_train, match_df):
    """Add per-lighting average wickets features for both teams.

    For every row of ``df_train`` and every lighting condition
    (``'day match'``, ``'night match'``, ``'day/night match'``) two averages are
    taken over the matches in ``match_df`` dated strictly before the row's
    ``match_dt`` that involve the team under that lighting:

    * mean ``inning1_wickets`` over matches where the team fielded first
      (it won the toss and chose ``'field'``, or the other side won the toss
      and chose ``'bat'``);
    * mean ``inning2_wickets`` over matches where the team fielded second
      (it won the toss and chose ``'bat'``, or the other side won the toss
      and chose ``'field'``).

    The stored feature is the mean of those two averages; an average with no
    qualifying match counts as 0.0.

    Both frames are modified in place: their ``match_dt`` columns are converted
    to datetime and six ``team{1,2}_<lighting>_wickets_avg`` columns are added
    to ``df_train``.

    Args:
        df_train: frame with ``match_dt``, ``team1`` and ``team2`` columns.
        match_df: match-level scorecard with ``match_dt``, ``team1``, ``team2``,
            ``lighting``, ``toss winner``, ``toss decision``,
            ``inning1_wickets`` and ``inning2_wickets`` columns.

    Returns:
        ``df_train`` with the six feature columns added.
    """
    # Convert match_dt columns to datetime
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    conditions = ['day match', 'night match', 'day/night match']
    sides = ['team1', 'team2']

    # Initialize new columns in df_train with 0 values
    for side in sides:
        for condition in conditions:
            df_train[f'{side}_{condition}_wickets_avg'] = 0.0

    def _wickets_avg(past_matches, team, condition):
        # Match must involve the team on EITHER side. The previous masks tested
        # `team1 == team` twice and never `team2 == team` for the lost-toss
        # case, silently dropping those matches.
        involved = (past_matches['team1'] == team) | (past_matches['team2'] == team)
        relevant = involved & (past_matches['lighting'] == condition)
        won_toss = past_matches['toss winner'] == team
        lost_toss = past_matches['toss winner'] != team
        chose_bat = past_matches['toss decision'] == 'bat'
        chose_field = past_matches['toss decision'] == 'field'

        field_first = past_matches[relevant & ((won_toss & chose_field) | (lost_toss & chose_bat))]
        field_second = past_matches[relevant & ((won_toss & chose_bat) | (lost_toss & chose_field))]

        var1 = field_first['inning1_wickets'].mean() if not field_first.empty else 0.0
        var2 = field_second['inning2_wickets'].mean() if not field_second.empty else 0.0
        return (var1 + var2) / 2.0

    # Iterate over each row in df_train
    for idx, row in df_train.iterrows():
        # Filter past matches in match_df based on match date
        past_matches = match_df[match_df['match_dt'] < row['match_dt']]

        for condition in conditions:
            for side in sides:
                df_train.at[idx, f'{side}_{condition}_wickets_avg'] = _wickets_avg(past_matches, row[side], condition)

    return df_train

In [None]:
def calculate_runs_avg_lighting(df_train, match_df):
    """Add per-lighting average runs features for both teams.

    For every row of ``df_train`` and every lighting condition
    (``'day match'``, ``'night match'``, ``'day/night match'``) two averages are
    taken over the matches in ``match_df`` dated strictly before the row's
    ``match_dt`` that involve the team under that lighting:

    * mean ``inning1_runs`` over matches where the team batted first
      (it won the toss and chose ``'bat'``, or the other side won the toss
      and chose ``'field'``);
    * mean ``inning2_runs`` over matches where the team batted second
      (it won the toss and chose ``'field'``, or the other side won the toss
      and chose ``'bat'``).

    The stored feature is the mean of those two averages; an average with no
    qualifying match counts as 0.0.

    Both frames are modified in place: their ``match_dt`` columns are converted
    to datetime and six ``team{1,2}_<lighting>_runs_avg`` columns are added to
    ``df_train``.

    Args:
        df_train: frame with ``match_dt``, ``team1`` and ``team2`` columns.
        match_df: match-level scorecard with ``match_dt``, ``team1``, ``team2``,
            ``lighting``, ``toss winner``, ``toss decision``, ``inning1_runs``
            and ``inning2_runs`` columns.

    Returns:
        ``df_train`` with the six feature columns added.
    """
    # Convert match_dt columns to datetime
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    conditions = ['day match', 'night match', 'day/night match']
    sides = ['team1', 'team2']

    # Initialize new columns in df_train with 0 values
    for side in sides:
        for condition in conditions:
            df_train[f'{side}_{condition}_runs_avg'] = 0.0

    def _runs_avg(past_matches, team, condition):
        # Match must involve the team on EITHER side. The previous masks tested
        # `team1 == team` twice and never `team2 == team` for the lost-toss
        # case, silently dropping those matches.
        involved = (past_matches['team1'] == team) | (past_matches['team2'] == team)
        relevant = involved & (past_matches['lighting'] == condition)
        won_toss = past_matches['toss winner'] == team
        lost_toss = past_matches['toss winner'] != team
        chose_bat = past_matches['toss decision'] == 'bat'
        chose_field = past_matches['toss decision'] == 'field'

        bat_first = past_matches[relevant & ((won_toss & chose_bat) | (lost_toss & chose_field))]
        bat_second = past_matches[relevant & ((won_toss & chose_field) | (lost_toss & chose_bat))]

        var1 = bat_first['inning1_runs'].mean() if not bat_first.empty else 0.0
        var2 = bat_second['inning2_runs'].mean() if not bat_second.empty else 0.0
        return (var1 + var2) / 2.0

    # Iterate over each row in df_train
    for idx, row in df_train.iterrows():
        # Filter past matches in match_df based on match date
        past_matches = match_df[match_df['match_dt'] < row['match_dt']]

        for condition in conditions:
            for side in sides:
                df_train.at[idx, f'{side}_{condition}_runs_avg'] = _runs_avg(past_matches, row[side], condition)

    return df_train

In [None]:
def calculate_wickets_avg_of_team_on_venue(df_train, match_df):
    """Add per-venue average wickets features for both teams.

    For each ``df_train`` row, matches in ``match_df`` at the same ``venue`` and
    dated strictly before the row's ``match_dt`` are used. For each team:

    * ``<team>_wickets_avg_inning1_venue`` is the mean ``inning1_wickets`` over
      matches where the team fielded first (won the toss and chose ``'field'``,
      or the other side won the toss and chose ``'bat'``);
    * ``<team>_wickets_avg_inning2_venue`` is the mean ``inning2_wickets`` over
      matches where the team fielded second.

    0.0 is stored when no match qualifies. Both frames are modified in place
    (``match_dt`` converted to datetime, four columns added to ``df_train``).
    Returns ``df_train``.
    """
    # Normalise both date columns to datetime before comparing them
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    # Output columns, listed in the order they are created on df_train
    for new_col in (
        'team1_wickets_avg_inning1_venue',
        'team2_wickets_avg_inning1_venue',
        'team1_wickets_avg_inning2_venue',
        'team2_wickets_avg_inning2_venue',
    ):
        df_train[new_col] = 0.0

    # team column -> (fielded-first output, fielded-second output)
    outputs = {
        'team1': ('team1_wickets_avg_inning1_venue', 'team1_wickets_avg_inning2_venue'),
        'team2': ('team2_wickets_avg_inning1_venue', 'team2_wickets_avg_inning2_venue'),
    }

    def mean_or_zero(frame, column):
        return 0.0 if frame.empty else frame[column].mean()

    for idx, row in df_train.iterrows():
        # Earlier matches played at this row's venue
        same_venue = match_df['venue'] == row['venue']
        earlier = match_df['match_dt'] < row['match_dt']
        at_venue = match_df[same_venue & earlier]

        opted_field = at_venue['toss decision'] == 'field'
        opted_bat = at_venue['toss decision'] == 'bat'

        for team_col, (first_col, second_col) in outputs.items():
            team = row[team_col]
            took_part = (at_venue['team1'] == team) | (at_venue['team2'] == team)
            toss_won = at_venue['toss winner'] == team
            toss_lost = at_venue['toss winner'] != team

            fielded_first = at_venue[took_part & ((toss_won & opted_field) | (toss_lost & opted_bat))]
            fielded_second = at_venue[took_part & ((toss_won & opted_bat) | (toss_lost & opted_field))]

            df_train.at[idx, first_col] = mean_or_zero(fielded_first, 'inning1_wickets')
            df_train.at[idx, second_col] = mean_or_zero(fielded_second, 'inning2_wickets')

    return df_train

In [None]:
def calculate_wickets_avg_of_team_on_venue_last5(df_train, match_df):
    """Add per-venue average wickets features over the five latest matches.

    For each ``df_train`` row, matches in ``match_df`` at the same ``venue`` and
    dated strictly before the row's ``match_dt`` are used. For each team:

    * ``<team>_wickets_avg_inning1_venue_last5`` is the mean ``inning1_wickets``
      over the five most recent matches where the team fielded first (won the
      toss and chose ``'field'``, or the other side won the toss and chose
      ``'bat'``);
    * ``<team>_wickets_avg_inning2_venue_last5`` is the mean ``inning2_wickets``
      over the five most recent matches where the team fielded second.

    0.0 is stored when no match qualifies. Both frames are modified in place
    (``match_dt`` converted to datetime, four columns added to ``df_train``).
    Returns ``df_train``.
    """
    # Normalise both date columns to datetime before comparing them
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    # Output columns, listed in the order they are created on df_train
    for new_col in (
        'team1_wickets_avg_inning1_venue_last5',
        'team2_wickets_avg_inning1_venue_last5',
        'team1_wickets_avg_inning2_venue_last5',
        'team2_wickets_avg_inning2_venue_last5',
    ):
        df_train[new_col] = 0.0

    # team column -> (fielded-first output, fielded-second output)
    outputs = {
        'team1': ('team1_wickets_avg_inning1_venue_last5', 'team1_wickets_avg_inning2_venue_last5'),
        'team2': ('team2_wickets_avg_inning1_venue_last5', 'team2_wickets_avg_inning2_venue_last5'),
    }

    def recent_mean(frame, column):
        # Keep the five newest rows, then average (0.0 if nothing is left)
        newest = frame.sort_values(by='match_dt', ascending=False).head(5)
        return 0.0 if newest.empty else newest[column].mean()

    for idx, row in df_train.iterrows():
        # Earlier matches played at this row's venue
        same_venue = match_df['venue'] == row['venue']
        earlier = match_df['match_dt'] < row['match_dt']
        at_venue = match_df[same_venue & earlier]

        opted_field = at_venue['toss decision'] == 'field'
        opted_bat = at_venue['toss decision'] == 'bat'

        for team_col, (first_col, second_col) in outputs.items():
            team = row[team_col]
            took_part = (at_venue['team1'] == team) | (at_venue['team2'] == team)
            toss_won = at_venue['toss winner'] == team
            toss_lost = at_venue['toss winner'] != team

            fielded_first = at_venue[took_part & ((toss_won & opted_field) | (toss_lost & opted_bat))]
            fielded_second = at_venue[took_part & ((toss_won & opted_bat) | (toss_lost & opted_field))]

            df_train.at[idx, first_col] = recent_mean(fielded_first, 'inning1_wickets')
            df_train.at[idx, second_col] = recent_mean(fielded_second, 'inning2_wickets')

    return df_train

In [None]:
def calculate_runs_avg_of_team_on_venue(df_train, match_df):
    """Add per-venue average runs features for both teams.

    For each ``df_train`` row, matches in ``match_df`` at the same ``venue`` and
    dated strictly before the row's ``match_dt`` are used. For each team:

    * ``<team>_runs_avg_inning1_venue`` is the mean ``inning1_runs`` over
      matches where the team batted first (won the toss and chose ``'bat'``, or
      the other side won the toss and chose ``'field'``);
    * ``<team>_runs_avg_inning2_venue`` is the mean ``inning2_runs`` over
      matches where the team batted second.

    0.0 is stored when no match qualifies. Both frames are modified in place
    (``match_dt`` converted to datetime, four columns added to ``df_train``).
    Returns ``df_train``.
    """
    # Normalise both date columns to datetime before comparing them
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    # Output columns, listed in the order they are created on df_train
    for new_col in (
        'team1_runs_avg_inning1_venue',
        'team2_runs_avg_inning1_venue',
        'team1_runs_avg_inning2_venue',
        'team2_runs_avg_inning2_venue',
    ):
        df_train[new_col] = 0.0

    # team column -> (batted-first output, batted-second output)
    outputs = {
        'team1': ('team1_runs_avg_inning1_venue', 'team1_runs_avg_inning2_venue'),
        'team2': ('team2_runs_avg_inning1_venue', 'team2_runs_avg_inning2_venue'),
    }

    def mean_or_zero(frame, column):
        return 0.0 if frame.empty else frame[column].mean()

    for idx, row in df_train.iterrows():
        # Earlier matches played at this row's venue
        same_venue = match_df['venue'] == row['venue']
        earlier = match_df['match_dt'] < row['match_dt']
        at_venue = match_df[same_venue & earlier]

        opted_bat = at_venue['toss decision'] == 'bat'
        opted_field = at_venue['toss decision'] == 'field'

        for team_col, (first_col, second_col) in outputs.items():
            team = row[team_col]
            took_part = (at_venue['team1'] == team) | (at_venue['team2'] == team)
            toss_won = at_venue['toss winner'] == team
            toss_lost = at_venue['toss winner'] != team

            batted_first = at_venue[took_part & ((toss_won & opted_bat) | (toss_lost & opted_field))]
            batted_second = at_venue[took_part & ((toss_won & opted_field) | (toss_lost & opted_bat))]

            df_train.at[idx, first_col] = mean_or_zero(batted_first, 'inning1_runs')
            df_train.at[idx, second_col] = mean_or_zero(batted_second, 'inning2_runs')

    return df_train

In [None]:
def calculate_runs_avg_of_team_on_venue_last5(df_train, match_df):
    """Add each team's recent batting average at the fixture's venue.

    For every row of ``df_train`` the matches in ``match_df`` played at the
    same venue strictly before the row's ``match_dt`` are collected. For each
    team and each innings, the (up to) five most recent of those matches in
    which the team batted in that innings are averaged over ``inning1_runs``
    or ``inning2_runs``. The result is 0.0 when no such match exists.

    Side effects: ``match_dt`` is converted to datetime in place on BOTH
    ``df_train`` and ``match_df``, and four ``*_runs_avg_inning*_venue_last5``
    columns are added to ``df_train``.

    Returns:
        The same ``df_train`` object, with the new columns filled in.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    # Create the output columns up front (same order as they were originally added).
    for new_column in ('team1_runs_avg_inning1_venue_last5',
                       'team2_runs_avg_inning1_venue_last5',
                       'team1_runs_avg_inning2_venue_last5',
                       'team2_runs_avg_inning2_venue_last5'):
        df_train[new_column] = 0.0

    def recent_runs_avg(history, team, inning):
        """Mean runs of ``team`` batting in ``inning`` over its 5 latest rows of ``history``."""
        took_part = (history['team1'] == team) | (history['team2'] == team)
        won_toss = history['toss winner'] == team
        lost_toss = history['toss winner'] != team
        chose_bat = history['toss decision'] == 'bat'
        chose_field = history['toss decision'] == 'field'

        # A side bats first when it wins the toss and bats, or loses the toss
        # and the opponent elects to field; it bats second in the mirror cases.
        if inning == 1:
            batted_here = (won_toss & chose_bat) | (lost_toss & chose_field)
        else:
            batted_here = (won_toss & chose_field) | (lost_toss & chose_bat)

        latest = history[took_part & batted_here].sort_values(by='match_dt', ascending=False).head(5)
        if latest.empty:
            return 0.0
        return latest[f'inning{inning}_runs'].mean()

    for idx, row in df_train.iterrows():
        # Earlier matches at the same ground.
        venue_matches = match_df[(match_df['venue'] == row['venue']) & (match_df['match_dt'] < row['match_dt'])]

        for side in ('team1', 'team2'):
            for inning in (1, 2):
                df_train.at[idx, f'{side}_runs_avg_inning{inning}_venue_last5'] = recent_runs_avg(
                    venue_matches, row[side], inning)

    return df_train

In [None]:
# calculate venue team win stats
def venue_team_wins(df_train, match_df):
    """Add each team's historical win ratio at the fixture's venue.

    For every row of ``df_train`` the ratio ``wins / matches played`` is
    computed for ``team1`` and ``team2`` over the matches in ``match_df`` that
    were played at the row's venue strictly BEFORE the row's ``match_dt``.
    Restricting to earlier matches keeps the fixture's own result (and any
    later results) out of the feature, matching the date filter used by
    ``calculate_runs_avg_of_team_on_venue_last5``.

    Args:
        df_train: fixtures; needs ``team1``, ``team2``, ``venue``, ``match_dt``.
        match_df: match history; needs ``team1``, ``team2``, ``venue``,
            ``winner``, ``match_dt``. It is not modified.

    Returns:
        The same ``df_train`` object with ``team1_venue_wins`` and
        ``team2_venue_wins`` added (0.0 when a team has no earlier match at
        the venue).
    """
    def calculate_venue_win_ratio(past_matches, team, venue):
        # Matches the team played at this venue, on either side of the fixture.
        played_here = (((past_matches['team1'] == team) | (past_matches['team2'] == team))
                       & (past_matches['venue'] == venue))
        matches_at_venue = past_matches[played_here]

        total_matches = matches_at_venue.shape[0]
        if total_matches == 0:
            return 0  # No earlier match at the venue

        matches_won = matches_at_venue[matches_at_venue['winner'] == team].shape[0]
        return matches_won / total_matches

    # Parse dates on local copies so neither caller frame is mutated.
    history = match_df.assign(match_dt=pd.to_datetime(match_df['match_dt']))
    fixture_dates = pd.to_datetime(df_train['match_dt'])

    df_train['team1_venue_wins'] = 0.0
    df_train['team2_venue_wins'] = 0.0

    for (index, row), match_date in zip(df_train.iterrows(), fixture_dates):
        # Only results known before this fixture may contribute.
        past_matches = history[history['match_dt'] < match_date]
        venue = row['venue']

        df_train.at[index, 'team1_venue_wins'] = calculate_venue_win_ratio(past_matches, row['team1'], venue)
        df_train.at[index, 'team2_venue_wins'] = calculate_venue_win_ratio(past_matches, row['team2'], venue)

    return df_train

In [None]:
# calculate venue team win stats last 5 matches
def venue_team_wins_last5(df_train, match_df):
    """Add each team's win ratio over its last five matches at the venue.

    For every row of ``df_train`` the (up to) five most recent matches a team
    played at the row's venue strictly BEFORE the row's ``match_dt`` are
    selected, and ``wins / matches`` is computed over them. Without the date
    restriction "last five" would mean the five latest matches in the whole
    history, i.e. the same (possibly future) matches for every fixture,
    including the fixture's own result.

    Args:
        df_train: fixtures; needs ``team1``, ``team2``, ``venue``, ``match_dt``.
        match_df: match history; needs ``team1``, ``team2``, ``venue``,
            ``winner``, ``match_dt``. It is not modified.

    Returns:
        The same ``df_train`` object with ``team1_venue_wins_last5`` and
        ``team2_venue_wins_last5`` added (0.0 when a team has no earlier
        match at the venue).
    """
    def calculate_venue_win_ratio(past_matches, team, venue):
        # Matches the team played at this venue, on either side of the fixture.
        played_here = (((past_matches['team1'] == team) | (past_matches['team2'] == team))
                       & (past_matches['venue'] == venue))

        # Latest five, ordered on real datetimes rather than raw strings.
        matches_at_venue = past_matches[played_here].sort_values(by='match_dt', ascending=False).head(5)

        total_matches = matches_at_venue.shape[0]
        if total_matches == 0:
            return 0  # No earlier match at the venue

        matches_won = matches_at_venue[matches_at_venue['winner'] == team].shape[0]
        return matches_won / total_matches

    # Parse dates on local copies so neither caller frame is mutated.
    history = match_df.assign(match_dt=pd.to_datetime(match_df['match_dt']))
    fixture_dates = pd.to_datetime(df_train['match_dt'])

    df_train['team1_venue_wins_last5'] = 0.0
    df_train['team2_venue_wins_last5'] = 0.0

    for (index, row), match_date in zip(df_train.iterrows(), fixture_dates):
        # Only results known before this fixture may contribute.
        past_matches = history[history['match_dt'] < match_date]
        venue = row['venue']

        df_train.at[index, 'team1_venue_wins_last5'] = calculate_venue_win_ratio(past_matches, row['team1'], venue)
        df_train.at[index, 'team2_venue_wins_last5'] = calculate_venue_win_ratio(past_matches, row['team2'], venue)

    return df_train

In [None]:
def calculate_batting_chances_winningOfTeam_VenueVise(df_train):
    """Express each team's venue batting average as a percentage of the venue average.

    The innings used for each team depends on the encoded
    ``toss winner`` / ``toss decision`` pair of the row:

    * (1, 1) or (0, 0): team1 is scored on inning-1 columns, team2 on inning-2.
    * (0, 1) or (1, 0): team1 is scored on inning-2 columns, team2 on inning-1.
    * anything else: both scores are 0.

    NOTE(review): presumably 1/0 encode "team1 won the toss" and "chose to
    bat" -- confirm against the encoding step.

    A score is ``team average / venue average * 100`` and falls back to 0
    when the venue average is 0. Adds the two per-team columns plus
    ``relative_batting_chance_venueVise`` (team1 minus team2) and returns the
    same ``df_train`` object.
    """
    first_case = ((1, 1), (0, 0))
    second_case = ((0, 1), (1, 0))

    def pct_of_venue_avg(row, team_col, venue_col):
        """team value / venue value * 100, or 0 when the venue value is 0."""
        venue_avg = row[venue_col]
        if venue_avg == 0:
            return 0
        return (row[team_col] / venue_avg) * 100

    def batting_chance(row, first_cols, second_cols):
        """Pick the (team, venue) column pair that matches the row's toss outcome."""
        toss = (row['toss winner'], row['toss decision'])
        if toss in first_case:
            return pct_of_venue_avg(row, *first_cols)
        if toss in second_case:
            return pct_of_venue_avg(row, *second_cols)
        return 0

    inning1_venue = 'avg_inning1_runs_venue_last5'
    inning2_venue = 'avg_inning2_runs_venue_last5'

    df_train['team1_batting_chance_winning_VenueVise'] = df_train.apply(
        batting_chance, axis=1,
        args=(('team1_runs_avg_inning1_venue_last5', inning1_venue),
              ('team1_runs_avg_inning2_venue_last5', inning2_venue)))
    df_train['team2_batting_chance_winning_VenueVise'] = df_train.apply(
        batting_chance, axis=1,
        args=(('team2_runs_avg_inning2_venue_last5', inning2_venue),
              ('team2_runs_avg_inning1_venue_last5', inning1_venue)))

    team1_score = df_train['team1_batting_chance_winning_VenueVise']
    team2_score = df_train['team2_batting_chance_winning_VenueVise']
    df_train['relative_batting_chance_venueVise'] = team1_score - team2_score
    return df_train

In [None]:
def calculate_batting_chances_winningOfTeam_FormVise(df_train):
    """Express each team's recent-form batting average as a percentage of the venue average.

    The innings used for each team depends on the encoded
    ``toss winner`` / ``toss decision`` pair of the row:

    * (1, 1) or (0, 0): team1 is scored on inning-1 columns, team2 on inning-2.
    * (0, 1) or (1, 0): team1 is scored on inning-2 columns, team2 on inning-1.
    * anything else: both scores are 0.

    NOTE(review): presumably 1/0 encode "team1 won the toss" and "chose to
    bat" -- confirm against the encoding step.

    A score is ``team last-5 average / venue last-5 average * 100`` and falls
    back to 0 when the venue average is 0. Adds the two per-team columns plus
    ``relative_batting_chance_formvise`` (team1 minus team2) and returns the
    same ``df_train`` object.
    """
    first_case = ((1, 1), (0, 0))
    second_case = ((0, 1), (1, 0))

    def pct_of_venue_avg(row, team_col, venue_col):
        """team value / venue value * 100, or 0 when the venue value is 0."""
        venue_avg = row[venue_col]
        if venue_avg == 0:
            return 0
        return (row[team_col] / venue_avg) * 100

    def batting_chance(row, first_cols, second_cols):
        """Pick the (team, venue) column pair that matches the row's toss outcome."""
        toss = (row['toss winner'], row['toss decision'])
        if toss in first_case:
            return pct_of_venue_avg(row, *first_cols)
        if toss in second_case:
            return pct_of_venue_avg(row, *second_cols)
        return 0

    inning1_venue = 'avg_inning1_runs_venue_last5'
    inning2_venue = 'avg_inning2_runs_venue_last5'

    df_train['team1_batting_chance_winning_FormVise'] = df_train.apply(
        batting_chance, axis=1,
        args=(('inning1_avg_runs_team1_last5', inning1_venue),
              ('inning2_avg_runs_team1_last5', inning2_venue)))
    df_train['team2_batting_chance_winning_FormVise'] = df_train.apply(
        batting_chance, axis=1,
        args=(('inning2_avg_runs_team2_last5', inning2_venue),
              ('inning1_avg_runs_team2_last5', inning1_venue)))

    team1_score = df_train['team1_batting_chance_winning_FormVise']
    team2_score = df_train['team2_batting_chance_winning_FormVise']
    df_train['relative_batting_chance_formvise'] = team1_score - team2_score
    return df_train

In [None]:
def calculate_bowling_chances_winningOfTeam_VenueVise(df_train):
    """Express each team's venue wicket average as a percentage of the venue average.

    The innings used for each team depends on the encoded
    ``toss winner`` / ``toss decision`` pair of the row:

    * (1, 0) or (0, 1): team1 is scored on inning-1 columns, team2 on inning-2.
    * (1, 1) or (0, 0): team1 is scored on inning-2 columns, team2 on inning-1.
    * anything else: both scores are 0.

    NOTE(review): presumably the first case is "team1 bowls first" -- confirm
    against the toss encoding.

    A score is ``team wickets average / venue wickets average * 100`` and
    falls back to 0 when the venue average is 0. Adds the two per-team
    columns plus ``relative_bowling_chance_venueVise`` (team1 minus team2)
    and returns the same ``df_train`` object.
    """
    first_case = ((1, 0), (0, 1))
    second_case = ((1, 1), (0, 0))

    def pct_of_venue_avg(row, team_col, venue_col):
        """team value / venue value * 100, or 0 when the venue value is 0."""
        venue_avg = row[venue_col]
        if venue_avg == 0:
            return 0
        return (row[team_col] / venue_avg) * 100

    def bowling_chance(row, first_cols, second_cols):
        """Pick the (team, venue) column pair that matches the row's toss outcome."""
        toss = (row['toss winner'], row['toss decision'])
        if toss in first_case:
            return pct_of_venue_avg(row, *first_cols)
        if toss in second_case:
            return pct_of_venue_avg(row, *second_cols)
        return 0

    inning1_venue = 'avg_inning1_wickets_venue_last5'
    inning2_venue = 'avg_inning2_wickets_venue_last5'

    df_train['team1_bowling_chance_winning_VenueVise'] = df_train.apply(
        bowling_chance, axis=1,
        args=(('team1_wickets_avg_inning1_venue_last5', inning1_venue),
              ('team1_wickets_avg_inning2_venue_last5', inning2_venue)))
    df_train['team2_bowling_chance_winning_VenueVise'] = df_train.apply(
        bowling_chance, axis=1,
        args=(('team2_wickets_avg_inning2_venue_last5', inning2_venue),
              ('team2_wickets_avg_inning1_venue_last5', inning1_venue)))

    team1_score = df_train['team1_bowling_chance_winning_VenueVise']
    team2_score = df_train['team2_bowling_chance_winning_VenueVise']
    df_train['relative_bowling_chance_venueVise'] = team1_score - team2_score
    return df_train

In [None]:
def calculate_bowling_chances_winningOfTeam_FormVise(df_train):
    """Express each team's recent-form wicket average as a percentage of the venue average.

    The innings used for each team depends on the encoded
    ``toss winner`` / ``toss decision`` pair of the row:

    * (1, 0) or (0, 1): team1 is scored on inning-1 columns, team2 on inning-2.
    * (1, 1) or (0, 0): team1 is scored on inning-2 columns, team2 on inning-1.
    * anything else: both scores are 0.

    NOTE(review): presumably the first case is "team1 bowls first" -- confirm
    against the toss encoding.

    A score is ``team last-5 wickets average / venue last-5 wickets average *
    100`` and falls back to 0 when the venue average is 0. Adds the two
    per-team columns plus ``relative_bowling_chance_formvise`` (team1 minus
    team2) and returns the same ``df_train`` object.
    """
    first_case = ((1, 0), (0, 1))
    second_case = ((1, 1), (0, 0))

    def pct_of_venue_avg(row, team_col, venue_col):
        """team value / venue value * 100, or 0 when the venue value is 0."""
        venue_avg = row[venue_col]
        if venue_avg == 0:
            return 0
        return (row[team_col] / venue_avg) * 100

    def bowling_chance(row, first_cols, second_cols):
        """Pick the (team, venue) column pair that matches the row's toss outcome."""
        toss = (row['toss winner'], row['toss decision'])
        if toss in first_case:
            return pct_of_venue_avg(row, *first_cols)
        if toss in second_case:
            return pct_of_venue_avg(row, *second_cols)
        return 0

    inning1_venue = 'avg_inning1_wickets_venue_last5'
    inning2_venue = 'avg_inning2_wickets_venue_last5'

    df_train['team1_bowling_chance_winning_FormVise'] = df_train.apply(
        bowling_chance, axis=1,
        args=(('inning1_avg_wickets_team1_last5', inning1_venue),
              ('inning2_avg_wickets_team1_last5', inning2_venue)))
    df_train['team2_bowling_chance_winning_FormVise'] = df_train.apply(
        bowling_chance, axis=1,
        args=(('inning2_avg_wickets_team2_last5', inning2_venue),
              ('inning1_avg_wickets_team2_last5', inning1_venue)))

    team1_score = df_train['team1_bowling_chance_winning_FormVise']
    team2_score = df_train['team2_bowling_chance_winning_FormVise']
    df_train['relative_bowling_chance_formvise'] = team1_score - team2_score
    return df_train

In [None]:
# Define a function to calculate performance based on the given conditions
def calculate_performace_matchVise(train_df):
    """Add combined (runs + wickets) team1-minus-team2 performance features.

    Two columns are added to ``train_df``:

    * ``performance_relative_venue_vise`` from the ``*_venue_last5`` columns.
    * ``performance_relative_formVise`` from the ``inning*_avg_*_last5``
      recent-form columns.

    Each value is ``(team1 runs - team2 runs) + (team1 wickets - team2
    wickets)``, where the innings read for each team depends on the encoded
    ``toss winner`` / ``toss decision`` pair:

    * (1, 1) or (0, 0): team1 inning-1 runs and inning-2 wickets versus
      team2 inning-2 runs and inning-1 wickets.
    * (0, 1) or (1, 0): team1 inning-2 runs and inning-1 wickets versus
      team2 inning-1 runs and inning-2 wickets.
    * anything else: 0.

    These are the same column pairings that
    ``calculate_performance_matchVise_new`` uses for its separate batting and
    bowling features. NOTE(review): presumably the first case means "team1
    bats first" -- confirm against the toss encoding.

    Returns:
        The same ``train_df`` object with the two columns added.
    """
    def calculate_performance(row):
        if (row['toss winner'] == 1 and row['toss decision'] == 1) or (row['toss winner'] == 0 and row['toss decision'] == 0):
            performance = (row['team1_runs_avg_inning1_venue_last5'] - row['team2_runs_avg_inning2_venue_last5'] +
                          row['team1_wickets_avg_inning2_venue_last5'] - row['team2_wickets_avg_inning1_venue_last5'])
        elif (row['toss winner'] == 0 and row['toss decision'] == 1) or (row['toss winner'] == 1 and row['toss decision'] == 0):
            # Mirror of the branch above with the innings swapped; every term
            # is still "team1 minus team2".
            performance = (row['team1_runs_avg_inning2_venue_last5'] - row['team2_runs_avg_inning1_venue_last5'] +
                          row['team1_wickets_avg_inning1_venue_last5'] - row['team2_wickets_avg_inning2_venue_last5'])
        else:
            performance = 0  # Handle cases where conditions are not met

        return performance

    def calculate_performance_FormVise(row):
        if (row['toss winner'] == 1 and row['toss decision'] == 1) or (row['toss winner'] == 0 and row['toss decision'] == 0):
            performance = (row['inning1_avg_runs_team1_last5'] - row['inning2_avg_runs_team2_last5'] +
                          row['inning2_avg_wickets_team1_last5'] - row['inning1_avg_wickets_team2_last5'])
        elif (row['toss winner'] == 0 and row['toss decision'] == 1) or (row['toss winner'] == 1 and row['toss decision'] == 0):
            performance = (row['inning2_avg_runs_team1_last5'] - row['inning1_avg_runs_team2_last5'] +
                          row['inning1_avg_wickets_team1_last5'] - row['inning2_avg_wickets_team2_last5'])
        else:
            # 0 (not None) so the column stays numeric, like the venue-wise one.
            performance = 0

        return performance

    # Each column gets its own scorer; the form-wise column must not reuse
    # the venue-wise function.
    train_df['performance_relative_venue_vise'] = train_df.apply(calculate_performance, axis=1)
    train_df['performance_relative_formVise'] = train_df.apply(calculate_performance_FormVise, axis=1)

    return train_df

In [None]:
import pandas as pd

# Define a function to calculate performance based on the given conditions
def calculate_performance_matchVise_new(train_df):
    """Add separate batting and bowling team1-minus-team2 features.

    Four columns are added to ``train_df``; each is a plain difference
    ``team1 column - team2 column`` where the column pair depends on the
    encoded ``toss winner`` / ``toss decision`` of the row:

    * (1, 1) or (0, 0): the first column pair listed below is used.
    * (0, 1) or (1, 0): the second column pair is used.
    * anything else: the value is 0.

    NOTE(review): presumably the first case means "team1 bats first" --
    confirm against the toss encoding.

    Returns:
        The same ``train_df`` object with the four columns added.
    """
    first_case = ((1, 1), (0, 0))
    second_case = ((0, 1), (1, 0))

    def team1_minus_team2(row, first_pair, second_pair):
        """Difference of the (team1, team2) column pair selected by the toss outcome."""
        toss = (row['toss winner'], row['toss decision'])
        if toss in first_case:
            team1_col, team2_col = first_pair
        elif toss in second_case:
            team1_col, team2_col = second_pair
        else:
            return 0  # Handle cases where conditions are not met
        return row[team1_col] - row[team2_col]

    # (output column, pair for the first case, pair for the second case)
    feature_specs = (
        ('performance_relative_venue_vise_batting',
         ('team1_runs_avg_inning1_venue_last5', 'team2_runs_avg_inning2_venue_last5'),
         ('team1_runs_avg_inning2_venue_last5', 'team2_runs_avg_inning1_venue_last5')),
        ('performance_relative_venue_vise_bowling',
         ('team1_wickets_avg_inning2_venue_last5', 'team2_wickets_avg_inning1_venue_last5'),
         ('team1_wickets_avg_inning1_venue_last5', 'team2_wickets_avg_inning2_venue_last5')),
        ('performance_relative_formVise_batting',
         ('inning1_avg_runs_team1_last5', 'inning2_avg_runs_team2_last5'),
         ('inning2_avg_runs_team1_last5', 'inning1_avg_runs_team2_last5')),
        ('performance_relative_formVise_bowling',
         ('inning2_avg_wickets_team1_last5', 'inning1_avg_wickets_team2_last5'),
         ('inning1_avg_wickets_team1_last5', 'inning2_avg_wickets_team2_last5')),
    )

    for output_column, first_pair, second_pair in feature_specs:
        train_df[output_column] = train_df.apply(team1_minus_team2, axis=1, args=(first_pair, second_pair))

    return train_df



In [None]:
# create relative (team1 - team2) columns
def create_relative_columns(df_train):
    """Add ``team1 - team2`` difference columns for every paired feature.

    For each template below, the team1 and team2 variants of the column are
    subtracted and stored under the name ``"<team1 col> - <team2 col>"``.
    Two ``Overall_performance_relative_*`` columns are then added, each the
    bowling-chance difference plus the batting-chance difference of the
    corresponding ``VenueVise`` / ``FormVise`` columns.

    All input columns must already exist on ``df_train``; the same object is
    returned with the new columns appended in the order listed here.
    """
    # '{team}' is replaced by 'team1' and 'team2' to obtain the two inputs.
    paired_templates = [
        '{team}_venue_wins_last5',
        '{team}_venue_wins',
        '{team}_runs_avg_inning1_venue',
        '{team}_runs_avg_inning2_venue',
        '{team}_wickets_avg_inning1_venue',
        '{team}_wickets_avg_inning2_venue',
        '{team}_day match_wickets_avg',
        '{team}_night match_wickets_avg',
        '{team}_day/night match_wickets_avg',
        '{team}_day match_runs_avg',
        '{team}_night match_runs_avg',
        '{team}_day/night match_runs_avg',
        '{team}_win_lighting1',
        '{team}_win_lighting2',
        '{team}_win_lighting1_last5',
        '{team}_win_lighting2_last5',
        'nrr_{team}',
        'nrr_{team}_last5',
        '{team}_won_in_past',
        '{team}_won_in_past_last5',
        'inning1_avg_wickets_{team}',
        'inning2_avg_wickets_{team}',
        'inning1_avg_runs_{team}',
        'inning2_avg_runs_{team}',
        'inning1_avg_wickets_{team}_last5',
        'inning2_avg_wickets_{team}_last5',
        'inning1_avg_runs_{team}_last5',
        'inning2_avg_runs_{team}_last5',
        'victory_by_runs_{team}',
        'victory_by_wickets_{team}',
        'victory_by_runs_{team}_last5',
        'victory_by_wickets_{team}_last5',
    ]

    for template in paired_templates:
        team1_col = template.format(team='team1')
        team2_col = template.format(team='team2')
        df_train[f'{team1_col} - {team2_col}'] = df_train[team1_col] - df_train[team2_col]

    # Overall score: bowling difference plus batting difference, per flavour.
    for flavour in ('VenueVise', 'FormVise'):
        df_train[f'Overall_performance_relative_{flavour}'] = (
            df_train[f'team1_bowling_chance_winning_{flavour}']
            - df_train[f'team2_bowling_chance_winning_{flavour}']
            + df_train[f'team1_batting_chance_winning_{flavour}']
            - df_train[f'team2_batting_chance_winning_{flavour}']
        )

    return df_train

In [None]:
def calculate_batsman_ability_score(batsman_df, player_id, date):
    """Score a batsman from the averages of his earlier innings.

    Rows of ``batsman_df`` with ``batsman_id == player_id`` and ``match_dt``
    strictly before ``date`` are averaged column by column, and the averages
    are combined as a fixed weighted sum.

    Returns:
        The weighted sum, or 0 when the player has no earlier rows.
    """
    before_date = batsman_df['match_dt'] < date
    is_player = batsman_df['batsman_id'] == player_id
    innings = batsman_df[before_date & is_player]
    if innings.empty:
        return 0

    # (column, weight) in the order the terms are accumulated.
    weighted_columns = (
        ('runs', 1.5181487193128311),
        ('strike_rate', 0.3083602790196472),
        ('Fours', 0.7891150637823163),
        ('Sixes', 1.0788199036938635),
        ('balls_faced', 0.4561330128642989),
    )

    ability_score = None
    for column, weight in weighted_columns:
        term = innings[column].mean() * weight
        ability_score = term if ability_score is None else ability_score + term

    return ability_score


In [None]:
def calculate_bowler_ability_score(bowler_data, player_id, date):
    """Score a bowler from the averages of his earlier spells.

    Rows of ``bowler_data`` with ``bowler_id == player_id`` and ``match_dt``
    strictly before ``date`` are averaged column by column, and the averages
    are combined as a fixed weighted sum. No-balls, wides and economy carry
    negative weights, so they lower the score.

    Returns:
        The weighted sum, or 0 when the player has no earlier rows.
    """
    before_date = bowler_data['match_dt'] < date
    is_player = bowler_data['bowler_id'] == player_id
    spells = bowler_data[before_date & is_player]
    if spells.empty:
        return 0

    # (column, weight) in the order the terms are accumulated.
    # Note: the workload term averages 'balls_bowled' (balls, not overs).
    weighted_columns = (
        ('wicket_count', 2.0809690120474973),
        ('balls_bowled', 1.0785331019058244),
        ('maiden', 2.6357976951696878),
        ('noballs', -0.5321420132185873),
        ('wides', -0.7445927503032397),
        ('economy', -0.7620290752804595),
        ('dots', 1.8390768223557485),
    )

    ability_score = None
    for column, weight in weighted_columns:
        term = spells[column].mean() * weight
        ability_score = term if ability_score is None else ability_score + term

    return ability_score

In [None]:
def calculate_batsman_related_functions(df_train, batsman_df):
    """Add per-team batting ability features derived from each roster.

    For ``team1`` and ``team2`` this adds:

    * ``<team>_BatsmanAbilities_list``: one ``calculate_batsman_ability_score``
      value per entry of ``<team>_roster_ids``, using only data before the
      row's ``match_dt`` (an empty list when either value is falsy).
    * ``Sum_of_BatsmanAbilities_<team>``: the sum of that list.
    * ``top_batsman1_<team>`` .. ``top_batsman3_<team>``: the three highest
      scores, padded with 0 when the roster has fewer than three entries.

    NOTE(review): ``<team>_roster_ids`` is assumed to already be an iterable
    of player ids (e.g. a list) -- confirm it is parsed before this call.

    Returns:
        The same ``df_train`` object with the new columns added.
    """
    def top_three(scores):
        """Three largest scores in descending order, 0-padded to length 3."""
        best = sorted(scores, reverse=True)[:3]
        best.extend([0] * (3 - len(best)))
        return best

    def roster_scores(row, team):
        """Ability score of every roster member as of the row's match date."""
        roster = row[f'{team}_roster_ids']
        played_on = row['match_dt']
        if not (roster and played_on):
            return []
        return [calculate_batsman_ability_score(batsman_df, player_id, played_on) for player_id in roster]

    # Registers DataFrame.progress_apply with a labelled progress bar.
    tqdm.pandas(desc="Calculating Batsman Abilities")

    for team in ('team1', 'team2'):
        list_column = f'{team}_BatsmanAbilities_list'
        df_train[list_column] = df_train.progress_apply(roster_scores, args=(team,), axis=1)
        df_train[f'Sum_of_BatsmanAbilities_{team}'] = df_train[list_column].apply(sum)

        top_columns = [f'top_batsman{rank}_{team}' for rank in (1, 2, 3)]
        df_train[top_columns] = df_train[list_column].apply(top_three).apply(pd.Series)

    return df_train

In [None]:
import pandas as pd
from tqdm import tqdm  # Import tqdm for progress bar

def calculate_bowler_related_functions(df_train, bowler_df):
    """Add per-team bowling ability features derived from each roster.

    For ``team1`` and ``team2`` this adds:

    * ``<team>_BowlerAbilities_list``: one ``calculate_bowler_ability_score``
      value per entry of ``<team>_roster_ids``, using only data before the
      row's ``match_dt`` (an empty list when either value is falsy).
    * ``Sum_of_BowlerAbilities_<team>``: the sum of that list.
    * ``top_bowler1_<team>`` .. ``top_bowler3_<team>``: the three highest
      scores, padded with 0 when the roster has fewer than three entries.

    NOTE(review): ``<team>_roster_ids`` is assumed to already be an iterable
    of player ids (e.g. a list) -- confirm it is parsed before this call.

    Returns:
        The same ``df_train`` object with the new columns added.
    """
    def top_three(scores):
        """Three largest scores in descending order, 0-padded to length 3."""
        best = sorted(scores, reverse=True)[:3]
        best.extend([0] * (3 - len(best)))
        return best

    def roster_scores(row, team):
        """Ability score of every roster member as of the row's match date."""
        roster = row[f'{team}_roster_ids']
        played_on = row['match_dt']
        if not (roster and played_on):
            return []
        return [calculate_bowler_ability_score(bowler_df, player_id, played_on) for player_id in roster]

    # Registers DataFrame.progress_apply with a labelled progress bar.
    tqdm.pandas(desc="Calculating Bowler Abilities")

    for team in ('team1', 'team2'):
        list_column = f'{team}_BowlerAbilities_list'
        df_train[list_column] = df_train.progress_apply(roster_scores, args=(team,), axis=1)
        df_train[f'Sum_of_BowlerAbilities_{team}'] = df_train[list_column].apply(sum)

        top_columns = [f'top_bowler{rank}_{team}' for rank in (1, 2, 3)]
        df_train[top_columns] = df_train[list_column].apply(top_three).apply(pd.Series)

    return df_train


In [None]:
def update_batting_bowling_depth(df):
    """Add combined batting+bowling strength columns and team1-minus-team2 gaps.

    Reads ``Sum_of_BatsmanAbilities_{team}`` and ``Sum_of_BowlerAbilities_{team}``
    for ``team1`` and ``team2`` and writes, in this order:
    ``Sum_batting-bowling_team1``, ``Sum_batting-bowling_team2``,
    ``Batting-Bowling_diffrence``, ``Batting_ability_diffrence`` and
    ``Bowling_ability_diffrence``.

    Args:
        df: Frame that already holds the four ``Sum_of_*Abilities_*`` columns.

    Returns:
        The same ``df`` object, modified in place.
    """
    def team1_minus_team2(prefix):
        return df[f'{prefix}_team1'] - df[f'{prefix}_team2']

    # Overall strength per side = batting total + bowling total.
    for side in ('team1', 'team2'):
        df[f'Sum_batting-bowling_{side}'] = (
            df[f'Sum_of_BatsmanAbilities_{side}'] + df[f'Sum_of_BowlerAbilities_{side}']
        )

    # Column names (including the original spelling) are kept as-is for downstream code.
    df['Batting-Bowling_diffrence'] = team1_minus_team2('Sum_batting-bowling')
    df['Batting_ability_diffrence'] = team1_minus_team2('Sum_of_BatsmanAbilities')
    df['Bowling_ability_diffrence'] = team1_minus_team2('Sum_of_BowlerAbilities')

    return df

In [None]:
def under_pressure_parameters(df_train, batsman_df, bowler_df, match_df):
    """Add per-team "under pressure" batting scores to ``df_train``.

    A player's score is a weighted sum of the means of ``runs``,
    ``strike_rate``, ``Fours``, ``Sixes`` and ``balls_faced`` over that
    player's rows in ``batsman_df`` that are dated strictly before the match
    and have ``over_faced_first > 15``. A player with no such rows scores 0.
    A team's score is the sum over the ids in its roster column.

    Columns written: ``team1_performance_under_Pressure``,
    ``team2_performance_under_Pressure`` and
    ``performance_under_pressure_relative`` (team1 minus team2).

    Args:
        df_train: Match-level frame with ``match_dt``, ``team1_roster_ids`` and
            ``team2_roster_ids`` (each roster cell must be iterable).
        batsman_df: Batsman scorecard. NaNs in ``Fours``, ``Sixes``,
            ``balls_faced`` and ``strike_rate`` are replaced by 0 on this
            frame as a side effect.
        bowler_df: Unused; kept so existing callers keep working.
        match_df: Unused; kept so existing callers keep working.

    Returns:
        The same ``df_train`` object, modified in place.
    """
    # Assign the filled columns back instead of `batsman_df[col].fillna(0, inplace=True)`:
    # an inplace call on a selected column is chained assignment, which pandas
    # warns about and which stops updating the parent frame under Copy-on-Write.
    fill_cols = ['Fours', 'Sixes', 'balls_faced', 'strike_rate']
    batsman_df[fill_cols] = batsman_df[fill_cols].fillna(0)

    # The over filter does not depend on player or date, so apply it once.
    late_entries = batsman_df[batsman_df['over_faced_first'] > 15]

    # Weights applied to each averaged statistic.
    runs_weight = 1.5
    strike_rate_weight = 1.2
    fours_weight = 5
    sixes_weight = 5
    balls_faced_weight = 0.7

    def calculate_ability_score_under_Pressure(player_id, date):
        player_data = late_entries[(late_entries['match_dt'] < date) &
                                   (late_entries['batsman_id'] == player_id)]
        if player_data.empty:
            return 0

        return (player_data['runs'].mean() * runs_weight +
                player_data['strike_rate'].mean() * strike_rate_weight +
                player_data['Fours'].mean() * fours_weight +
                player_data['Sixes'].mean() * sixes_weight +
                player_data['balls_faced'].mean() * balls_faced_weight)

    def calculate_team_performance(team_column):
        # One total per match row, in row order.
        return [
            sum(calculate_ability_score_under_Pressure(player_id, match_date)
                for player_id in roster_ids)
            for roster_ids, match_date in zip(df_train[team_column], df_train['match_dt'])
        ]

    df_train['team1_performance_under_Pressure'] = calculate_team_performance('team1_roster_ids')
    df_train['team2_performance_under_Pressure'] = calculate_team_performance('team2_roster_ids')
    df_train['performance_under_pressure_relative'] = (
        df_train['team1_performance_under_Pressure'] - df_train['team2_performance_under_Pressure']
    )

    return df_train

In [None]:
def venue_teams_rivalry(df_train, match_df):
    """Add head-to-head win ratios at the match venue for both sides.

    For every row of ``df_train`` the earlier meetings between ``team1`` and
    ``team2`` (in either order) at the same ``venue`` are selected from
    ``match_df``. ``team1_wins_vs_team2_venue`` and
    ``team2_wins_vs_team1_venue`` are each side's share of those meetings
    where ``winner`` equals that side. Both stay 0.0 when there is no
    earlier meeting.

    Args:
        df_train: Match-level frame with ``team1``, ``team2``, ``venue`` and
            ``match_dt``.
        match_df: Historical matches with ``team1``, ``team2``, ``venue``,
            ``match_dt`` and ``winner``.

    Returns:
        The same ``df_train`` object, modified in place. ``match_dt`` is
        converted to datetime on both input frames as a side effect.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    # Default for fixtures with no earlier meeting at the venue.
    df_train['team1_wins_vs_team2_venue'] = 0.0
    df_train['team2_wins_vs_team1_venue'] = 0.0

    for index, row in df_train.iterrows():
        team1 = row['team1']
        team2 = row['team2']

        same_pairing = (((match_df['team1'] == team1) & (match_df['team2'] == team2)) |
                        ((match_df['team2'] == team1) & (match_df['team1'] == team2)))
        meetings = match_df[(match_df['venue'] == row['venue']) &
                            same_pairing &
                            (match_df['match_dt'] < row['match_dt'])]

        total_meetings = meetings.shape[0]
        if total_meetings == 0:
            continue

        df_train.at[index, 'team1_wins_vs_team2_venue'] = (meetings['winner'] == team1).sum() / total_meetings
        # This column was previously initialised but never filled in.
        df_train.at[index, 'team2_wins_vs_team1_venue'] = (meetings['winner'] == team2).sum() / total_meetings

    return df_train

In [None]:
def calculate_avg_runs_in_rivalry(df_train, match_df):
    """Add average innings runs from earlier team1-vs-team2 meetings.

    For each row, the earlier matches between the two sides (either order) are
    taken from ``match_df``. The toss decides who batted first: the toss
    winner if the decision was ``'bat'``, otherwise the other side. The side
    that batted first gets ``inning1_runs``; the other side gets
    ``inning2_runs``. Matches whose toss winner is neither side are ignored.
    Each column is the mean of its collected values, or 0.0 when none exist.

    Columns written: ``team1_vs_team2_avg_inning1_runs``,
    ``team2_vs_team1_avg_inning1_runs``, ``team1_vs_team2_avg_inning2_runs``,
    ``team2_vs_team1_avg_inning2_runs``.

    Returns:
        The same ``df_train`` object, modified in place. ``match_dt`` is
        converted to datetime on both input frames as a side effect.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    for new_col in ('team1_vs_team2_avg_inning1_runs',
                    'team2_vs_team1_avg_inning1_runs',
                    'team1_vs_team2_avg_inning2_runs',
                    'team2_vs_team1_avg_inning2_runs'):
        df_train[new_col] = 0.0

    def mean_or_zero(values):
        # Guard against dividing by zero when no meeting contributed a value.
        return sum(values) / len(values) if values else 0.0

    for idx, fixture in df_train.iterrows():
        side_a = fixture['team1']
        side_b = fixture['team2']
        earlier = match_df['match_dt'] < fixture['match_dt']

        a_listed_first = (match_df['team1'] == side_a) & (match_df['team2'] == side_b)
        b_listed_first = (match_df['team1'] == side_b) & (match_df['team2'] == side_a)
        history = match_df[(a_listed_first & earlier) | (b_listed_first & earlier)]

        a_first, b_first, a_second, b_second = [], [], [], []

        for _, past in history.iterrows():
            caller = past['toss winner']
            chose_bat = past['toss decision'] == 'bat'

            if caller == side_a:
                a_batted_first = chose_bat
            elif caller == side_b:
                a_batted_first = not chose_bat
            else:
                # Toss winner matches neither side: nothing is recorded.
                continue

            if a_batted_first:
                a_first.append(past['inning1_runs'])
                b_second.append(past['inning2_runs'])
            else:
                b_first.append(past['inning1_runs'])
                a_second.append(past['inning2_runs'])

        df_train.at[idx, 'team1_vs_team2_avg_inning1_runs'] = mean_or_zero(a_first)
        df_train.at[idx, 'team2_vs_team1_avg_inning1_runs'] = mean_or_zero(b_first)
        df_train.at[idx, 'team1_vs_team2_avg_inning2_runs'] = mean_or_zero(a_second)
        df_train.at[idx, 'team2_vs_team1_avg_inning2_runs'] = mean_or_zero(b_second)

    return df_train

In [None]:
def calculate_avg_wickets_in_rivalry(df_train, match_df):
    """Add average innings wickets from earlier team1-vs-team2 meetings.

    For each row, the earlier matches between the two sides (either order) are
    taken from ``match_df``. When the toss winner chose ``'field'``, that
    side's inning-1 bucket receives ``inning1_wickets`` and the opponent's
    inning-2 bucket receives ``inning2_wickets``; any other decision swaps the
    two sides. Matches whose toss winner is neither side are ignored. Each
    column is the mean of its bucket, or 0.0 when the bucket is empty.

    Columns written: ``team1_vs_team2_avg_inning1_wickets``,
    ``team2_vs_team1_avg_inning1_wickets``,
    ``team1_vs_team2_avg_inning2_wickets``,
    ``team2_vs_team1_avg_inning2_wickets``.

    Returns:
        The same ``df_train`` object, modified in place. ``match_dt`` is
        converted to datetime on both input frames as a side effect.
    """
    df_train['match_dt'] = pd.to_datetime(df_train['match_dt'])
    match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])

    out_cols = {
        ('team1', 1): 'team1_vs_team2_avg_inning1_wickets',
        ('team2', 1): 'team2_vs_team1_avg_inning1_wickets',
        ('team1', 2): 'team1_vs_team2_avg_inning2_wickets',
        ('team2', 2): 'team2_vs_team1_avg_inning2_wickets',
    }
    for col_name in out_cols.values():
        df_train[col_name] = 0.0

    for idx, fixture in df_train.iterrows():
        home, away = fixture['team1'], fixture['team2']
        cutoff = fixture['match_dt']

        is_earlier = match_df['match_dt'] < cutoff
        forward = (match_df['team1'] == home) & (match_df['team2'] == away) & is_earlier
        reverse = (match_df['team1'] == away) & (match_df['team2'] == home) & is_earlier
        meetings = match_df[forward | reverse]

        # One bucket per (side, innings); dict order matches the column write order.
        buckets = {key: [] for key in out_cols}

        for _, played in meetings.iterrows():
            toss_side = played['toss winner']
            elected_field = played['toss decision'] == 'field'

            if toss_side == home:
                first_bucket_side = 'team1' if elected_field else 'team2'
            elif toss_side == away:
                first_bucket_side = 'team2' if elected_field else 'team1'
            else:
                # Toss winner matches neither side: nothing is recorded.
                continue

            second_bucket_side = 'team2' if first_bucket_side == 'team1' else 'team1'
            buckets[(first_bucket_side, 1)].append(played['inning1_wickets'])
            buckets[(second_bucket_side, 2)].append(played['inning2_wickets'])

        for key, col_name in out_cols.items():
            collected = buckets[key]
            # Empty bucket -> 0.0 rather than a division by zero.
            df_train.at[idx, col_name] = sum(collected) / len(collected) if collected else 0.0

    return df_train

In [None]:
import pandas as pd

def calculate_past_head_on_vs(df_train):
    """Add ``past_head_on_vs``, a single head-to-head score per row.

    team1 is treated as batting first when it won the toss and the decision is
    ``'bat'``, or when it did not win the toss and the decision is ``'field'``.
    With B as team1's batting innings for this match and O as the other
    innings, the score is::

        team1 avg runs(B) + team1 avg wickets(O)
            - team2 avg runs(O) - team2 avg wickets(B)

    using the ``team*_vs_team*_avg_inning*_runs`` and ``..._wickets`` columns
    already present on the frame.

    Returns:
        The same ``df_train`` object, modified in place.
    """
    # Create the column up front; it is overwritten by the row-wise result below.
    df_train['past_head_on_vs'] = 0.0

    def head_on_score(fixture):
        team1_won_toss = fixture['toss winner'] == fixture['team1']
        decision = fixture['toss decision']

        if (team1_won_toss and decision == 'bat') or (not team1_won_toss and decision == 'field'):
            batting_inn, other_inn = 'inning1', 'inning2'
        else:
            batting_inn, other_inn = 'inning2', 'inning1'

        return (
            fixture[f'team1_vs_team2_avg_{batting_inn}_runs'] +
            fixture[f'team1_vs_team2_avg_{other_inn}_wickets'] -
            fixture[f'team2_vs_team1_avg_{other_inn}_runs'] -
            fixture[f'team2_vs_team1_avg_{batting_inn}_wickets']
        )

    df_train['past_head_on_vs'] = df_train.apply(head_on_score, axis=1)

    return df_train

# Example usage:
# Assuming df_train is defined and contains the necessary columns
# df_train = calculate_past_head_on_vs(df_train)


In [None]:
import pandas as pd

def calculate_past_head_on_vs_new(df_train):
    """Add separate batting and bowling head-to-head gaps per row.

    team1 is treated as batting first when it won the toss and the decision is
    ``'bat'``, or when it did not win the toss and the decision is ``'field'``.
    With B as team1's batting innings for this match and O as the other
    innings:

    * ``past_head_on_vs_batting`` = team1 avg runs(B) - team2 avg runs(O)
    * ``past_head_on_vs_bowling`` = team1 avg wickets(O) - team2 avg wickets(B)

    Returns:
        The same ``df_train`` object, modified in place.
    """
    target_cols = ['past_head_on_vs_batting', 'past_head_on_vs_bowling']

    # Create both columns up front; they are overwritten by the row-wise result below.
    df_train[target_cols[0]] = 0.0
    df_train[target_cols[1]] = 0.0

    def batting_and_bowling_gap(fixture):
        toss_to_team1 = fixture['toss winner'] == fixture['team1']
        call = fixture['toss decision']
        team1_bats_first = (toss_to_team1 and call == 'bat') or (not toss_to_team1 and call == 'field')

        mine, theirs = ('inning1', 'inning2') if team1_bats_first else ('inning2', 'inning1')

        batting_gap = (fixture[f'team1_vs_team2_avg_{mine}_runs']
                       - fixture[f'team2_vs_team1_avg_{theirs}_runs'])
        bowling_gap = (fixture[f'team1_vs_team2_avg_{theirs}_wickets']
                       - fixture[f'team2_vs_team1_avg_{mine}_wickets'])
        # Returning a Series makes apply() produce a two-column frame.
        return pd.Series([batting_gap, bowling_gap])

    df_train[target_cols] = df_train.apply(batting_and_bowling_gap, axis=1)

    return df_train

# Example usage:
# Assuming df_train is defined and contains the necessary columns
# df_train = calculate_past_head_on_vs_new(df_train)


In [None]:
def total_process(df_train, match_df, batsman_df, bowler_df):
    """Run every feature-engineering step, in a fixed order, on one frame.

    Each step receives the frame returned by the previous step plus the extra
    lookup frames listed beside it, and its return value becomes the input of
    the next step. After the last step the ``winner_id`` column is dropped if
    it is present.

    Args:
        df_train: Match-level frame to enrich (the notebook also passes the
            test and round-2 frames through this function).
        match_df: Match-level scorecard handed to the steps that take it.
        batsman_df: Batsman scorecard handed to the steps that take it.
        bowler_df: Bowler scorecard handed to the steps that take it.

    Returns:
        The frame returned by the final step, without ``winner_id``.
    """
    uses_matches = (match_df,)
    frame_only = ()

    # (step, extra positional arguments). Order matters: later steps read
    # columns written by earlier ones.
    # NOTE(review): preprocess_train appears twice in this sequence, as in the
    # original code - confirm the second pass is intentional.
    pipeline = [
        (preprocess_train, uses_matches),
        (one_hot_encode_stuff, frame_only),
        (calculate_avg_inning1_runs_venue, uses_matches),
        (calculate_avg_inning1_runs_venue_last5, uses_matches),
        (calculate_avg_inning2_runs_venue, uses_matches),
        (calculate_avg_inning2_runs_venue_last5, uses_matches),
        (calculate_avg_inning1_wickets_venue, uses_matches),
        (calculate_avg_inning1_wickets_venue_last5, uses_matches),
        (calculate_avg_inning2_wickets_venue, uses_matches),
        (calculate_avg_inning2_wickets_venue_last5, uses_matches),
        (calculate_victory_columns, uses_matches),
        (calculate_victory_columns_last5, uses_matches),
        (calculate_inning1_avg_runs, uses_matches),
        (calculate_inning1_avg_runs_last5, uses_matches),
        (calculate_inning2_avg_runs, uses_matches),
        (calculate_inning2_avg_runs_last5, uses_matches),
        (calculate_inning1_avg_wickets, uses_matches),
        (calculate_inning1_avg_wickets_last5, uses_matches),
        (calculate_inning2_avg_wickets, uses_matches),
        (calculate_inning2_avg_wickets_last5, uses_matches),
        (calculate_past_wins, uses_matches),
        (calculate_past_wins_last5, uses_matches),
        (calculate_nrr, uses_matches),
        (calculate_nrr_last5, uses_matches),
        (add_lighting_win_ratios, uses_matches),
        (add_lighting_win_ratios_last5, uses_matches),
        (add_day_night_win_ratios, uses_matches),
        (add_day_night_win_ratios_last5, uses_matches),
        (calculate_wickets_avg_lighting, uses_matches),
        (calculate_runs_avg_lighting, uses_matches),
        (calculate_wickets_avg_of_team_on_venue, uses_matches),
        (calculate_runs_avg_of_team_on_venue, uses_matches),
        (venue_team_wins, uses_matches),
        (venue_team_wins_last5, uses_matches),
        (calculate_wickets_avg_of_team_on_venue_last5, uses_matches),
        (calculate_runs_avg_of_team_on_venue_last5, uses_matches),
        (calculate_batting_chances_winningOfTeam_VenueVise, frame_only),
        (calculate_batting_chances_winningOfTeam_FormVise, frame_only),
        (calculate_bowling_chances_winningOfTeam_VenueVise, frame_only),
        (calculate_bowling_chances_winningOfTeam_FormVise, frame_only),
        (calculate_performace_matchVise, frame_only),
        (create_relative_columns, frame_only),
        (calculate_venue_based_bat_first_win_probability, uses_matches),
        (team1_toss_based_win_chances, frame_only),
        (preprocess_train, uses_matches),
        (calculate_bowler_related_functions, (bowler_df,)),
        (calculate_batsman_related_functions, (batsman_df,)),
        (update_batting_bowling_depth, frame_only),
        (under_pressure_parameters, (batsman_df, bowler_df, match_df)),
        (venue_teams_rivalry, uses_matches),
        (calculate_avg_runs_in_rivalry, uses_matches),
        (calculate_avg_wickets_in_rivalry, uses_matches),
        (calculate_past_head_on_vs, frame_only),
        (calculate_performance_matchVise_new, frame_only),
        (calculate_past_head_on_vs_new, frame_only),
    ]

    for step, extra_args in pipeline:
        df_train = step(df_train, *extra_args)

    if 'winner_id' in df_train.columns:
        df_train.drop(columns='winner_id', inplace=True)

    return df_train

In [None]:
# Run the full feature pipeline on the training frame and rebind the name to
# the result. Re-running this cell feeds the already-processed frame back in.
df_train = total_process(df_train, match_df, batsman_df, bowler_df)

  df_train['team2_runs_avg_inning2_venue_last5'] = 0.0
  df_train['team1_batting_chance_winning_VenueVise'] = df_train.apply(calculate_batting_chance_winning, axis=1)
  df_train['team2_batting_chance_winning_VenueVise'] = df_train.apply(calculate_team2_batting_chance_winning, axis=1)
  df_train['relative_batting_chance_venueVise']=df_train['team1_batting_chance_winning_VenueVise']-df_train['team2_batting_chance_winning_VenueVise']
  df_train['team1_batting_chance_winning_FormVise'] = df_train.apply(calculate_batting_chance_winning, axis=1)
  df_train['team2_batting_chance_winning_FormVise'] = df_train.apply(calculate_team2_batting_chance_winning, axis=1)
  df_train['relative_batting_chance_formvise']=df_train['team1_batting_chance_winning_FormVise']-df_train['team2_batting_chance_winning_FormVise']
  df_train['team1_bowling_chance_winning_VenueVise'] = df_train.apply(calculate_batting_chance_winning, axis=1)
  df_train['team2_bowling_chance_winning_VenueVise'] = df_train.apply(calculat

In [None]:
# Same pipeline, same lookup frames, applied to the test frame.
test_data = total_process(test_data, match_df, batsman_df, bowler_df)

  df_train['team2_batting_chance_winning_VenueVise'] = df_train.apply(calculate_team2_batting_chance_winning, axis=1)
  df_train['relative_batting_chance_venueVise']=df_train['team1_batting_chance_winning_VenueVise']-df_train['team2_batting_chance_winning_VenueVise']
  df_train['team1_batting_chance_winning_FormVise'] = df_train.apply(calculate_batting_chance_winning, axis=1)
  df_train['team2_batting_chance_winning_FormVise'] = df_train.apply(calculate_team2_batting_chance_winning, axis=1)
  df_train['relative_batting_chance_formvise']=df_train['team1_batting_chance_winning_FormVise']-df_train['team2_batting_chance_winning_FormVise']
  df_train['team1_bowling_chance_winning_VenueVise'] = df_train.apply(calculate_batting_chance_winning, axis=1)
  df_train['team2_bowling_chance_winning_VenueVise'] = df_train.apply(calculate_team2_bowling_chance_winning, axis=1)
  df_train['relative_bowling_chance_venueVise']=df_train['team1_bowling_chance_winning_VenueVise']-df_train['team2_bowling_chan

In [None]:
# Same pipeline, same lookup frames, applied to the round-2 frame.
round2_data = total_process(round2_data, match_df, batsman_df, bowler_df)

In [None]:
# Replace every remaining NaN with 0, in place, in all three processed frames.
df_train.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)
round2_data.fillna(0, inplace=True)

In [None]:
# Sanity check: (rows, columns) of each processed frame.
print(df_train.shape)
print(test_data.shape)
print(round2_data.shape)

In [None]:
# Write the processed frames to the mounted Drive folder as CSV; index=False
# keeps the row index out of the files.
df_train.to_csv('/content/drive/MyDrive/AmEx/processed_train_f.csv', index=False)
test_data.to_csv('/content/drive/MyDrive/AmEx/processed_test_f.csv', index=False)
round2_data.to_csv('/content/drive/MyDrive/AmEx/round2_processed_f.csv', index=False)