# Select features we will use

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from scipy.stats import skew

train = pd.read_csv("train_data.csv")

# Load the data
file_path = '2024_test_data.csv'  # Adjust the path if necessary
df = pd.read_csv(file_path)

testing_max_id = df['id'].max()

# Filter rows where all specified columns are NaN
filtered_rows = df[df[['home_team_rest', 'away_team_rest', 'home_pitcher_rest', 'away_pitcher_rest']].isna().all(axis=1)][:-1]

# 'home_batting', 'away_batting', 
columns_to_drop = [col for col in df.columns if col.startswith(('home_pitcher', 'away_pitcher'))]
df = df.drop(columns=columns_to_drop)

# Store filtered rows separately
remaining_rows = df[~df.index.isin(filtered_rows.index)]
print("Rows with all specified columns as NaN:")
print(filtered_rows)

# Remove these rows from the DataFrame
df = df.drop(filtered_rows.index)
df['home_team_win'] = np.nan
df['season'] = 2024

# Step 1: Normalize and restore home_team_wins_mean
# (x-9689)/(2429-9689)
home_mean_max = train.loc[train['id'] == 2429, 'home_team_wins_mean'].iloc[0]
home_mean_min = train.loc[train['id'] == 9689, 'home_team_wins_mean'].iloc[0]
df['home_team_wins_mean_original'] = (df['home_team_wins_mean'] - home_mean_min) / (home_mean_max - home_mean_min)

# Step 2: Normalize and restore home_team_wins_std
home_std_max = train.loc[train['id'] == 6338, 'home_team_wins_std'].iloc[0]
home_std_min = train.loc[train['id'] == 9689, 'home_team_wins_std'].iloc[0]
df['home_team_wins_std_original'] = (df['home_team_wins_std'] - home_std_min) / (2 * (home_std_max - home_std_min))

# Find the value closest to 0
home_shift_skew = train.loc[train['id'] == 4316, 'home_team_wins_skew'].iloc[0]
home_skew_bias_original = skew([0, 0, 1])
home_skew_for_one_thrid = train.loc[train['id'] == 6782, 'home_team_wins_skew'].iloc[0]
df['home_team_wins_skew_original'] = df['home_team_wins_skew'].apply(
    lambda x: ( (x - home_shift_skew) * home_skew_bias_original) / (home_skew_for_one_thrid - home_shift_skew) if pd.notna(x) else np.nan
)

# Step 3: Replace NaN values in home_team_wins_mean_original using home_team_wins_std_original
df['home_std_error'] = df['home_team_wins_std_original'] - (df['home_team_wins_mean_original'] * (1 - df['home_team_wins_mean_original'])) ** 0.5
df['home_skew_error'] = df['home_team_wins_skew_original'] - df['home_team_wins_mean_original'] * (1 - df['home_team_wins_mean_original']) * (1 - 2 * df['home_team_wins_mean_original']) / (df['home_team_wins_std_original'] ** 3)
df['home_mean_error'] = df['home_team_wins_mean_original'] - (1 - df['home_team_wins_skew_original'] * df['home_team_wins_std_original']) / 2
df.loc[df['home_team_wins_mean_original'].isna(), 'home_team_wins_mean_original'] = (
    (1 - df['home_team_wins_skew_original'] * df['home_team_wins_std_original']) / 2
)

# Step 4: Normalize and restore away_team_wins_mean
# (x-2454)/(9689-2454)
away_mean_max = train.loc[train['id'] == 9689, 'away_team_wins_mean'].iloc[0]
away_mean_min = train.loc[train['id'] == 2454, 'away_team_wins_mean'].iloc[0]
df['away_team_wins_mean_original'] = (df['away_team_wins_mean'] - away_mean_min) / (away_mean_max - away_mean_min)

# Step 5: Normalize and restore away_team_wins_std
away_std_max = train.loc[train['id'] == 4316, 'away_team_wins_std'].iloc[0]
away_std_min = train.loc[train['id'] == 9689, 'away_team_wins_std'].iloc[0]
df['away_team_wins_std_original'] = (df['away_team_wins_std'] - away_std_min) / (2 * (away_std_max - away_std_min))

# Find the value closest to 0
away_shift_skew = train.loc[train['id'] == 4316, 'away_team_wins_skew'].iloc[0]
away_skew_bias_original = skew([0, 0, 1])
away_skew_for_one_thrid = train.loc[train['id'] == 1366, 'away_team_wins_skew'].iloc[0]
df['away_team_wins_skew_original'] = df['away_team_wins_skew'].apply(
    lambda x: ( (x - away_shift_skew) * away_skew_bias_original) / (away_skew_for_one_thrid - away_shift_skew) if pd.notna(x) else np.nan
)

# Step 6: Replace NaN values in away_team_wins_mean_original using away_team_wins_std_original
df['away_std_error'] = df['away_team_wins_std_original'] - (df['away_team_wins_mean_original'] * (1 - df['away_team_wins_mean_original'])) ** 0.5
df['away_skew_error'] = df['away_team_wins_skew_original'] - df['away_team_wins_mean_original'] * (1 - df['away_team_wins_mean_original']) * (1 - 2 * df['away_team_wins_mean_original']) / (df['away_team_wins_std_original'] ** 3)
df['away_mean_error'] = df['away_team_wins_mean_original'] - (1 - df['away_team_wins_skew_original'] * df['away_team_wins_std_original']) / 2
df.loc[df['away_team_wins_mean_original'].isna(), 'away_team_wins_mean_original'] = (
    (1 - df['away_team_wins_skew_original'] * df['away_team_wins_std_original']) / 2
)

# (x-2429)/(9689-2429)
home_errors_mean_max = train.loc[train['id'] == 9689, 'home_team_errors_mean'].iloc[0]
home_errors_mean_min = train.loc[train['id'] == 2429, 'home_team_errors_mean'].iloc[0]
df['home_team_errors_mean_original'] = (df['home_team_errors_mean'] - home_errors_mean_min) / (home_errors_mean_max - home_errors_mean_min)

# (x-9296)/(2429-9296)
away_errors_mean_max = train.loc[train['id'] == 2429, 'away_team_errors_mean'].iloc[0]
away_errors_mean_min = train.loc[train['id'] == 9296, 'away_team_errors_mean'].iloc[0]
df['away_team_errors_mean_original'] = (df['away_team_errors_mean'] - away_errors_mean_min) / (away_errors_mean_max - away_errors_mean_min)

# (x-2111)/[(1366-2111)*2]
home_batting_RBI_mean_max = train.loc[train['id'] == 1366, 'home_batting_RBI_mean'].iloc[0]
home_batting_RBI_mean_min = train.loc[train['id'] == 2111, 'home_batting_RBI_mean'].iloc[0]
df['home_batting_RBI_mean_original'] = (df['home_batting_RBI_mean'] - home_batting_RBI_mean_min) / [(home_batting_RBI_mean_max - home_batting_RBI_mean_min) * 2]

# (x-5837)/(9694-5837)
away_batting_RBI_mean_max = train.loc[train['id'] == 9694, 'away_batting_RBI_mean'].iloc[0]
away_batting_RBI_mean_min = train.loc[train['id'] == 5837, 'away_batting_RBI_mean'].iloc[0]
df['away_batting_RBI_mean_original'] = (df['away_batting_RBI_mean'] - away_batting_RBI_mean_min) / (away_batting_RBI_mean_max - away_batting_RBI_mean_min)

# Save the updated DataFrame
df.to_csv(f'data.csv', index=False)

print('home_team_errors_mean_original')
print(df['home_team_errors_mean_original'].mean())
print('away_team_errors_mean_original')
print(df['away_team_errors_mean_original'].mean())
print('home_batting_RBI_mean_original')
print(df['home_batting_RBI_mean_original'].mean())
print('away_batting_RBI_mean_original')
print(df['away_batting_RBI_mean_original'].mean())

Rows with all specified columns as NaN:
        id home_team_abbr away_team_abbr is_night_game home_pitcher  \
146    146            PJT            GLO         False     rossty01   
154    154            XFB            GKO          True     wilkad01   
447    447            KJP            RAV         False    johnser04   
594    594            RKN            RLJ          True    buehlwa01   
749    749            YHA            JEM          True    campbpa02   
904    904            HXK            UPV         False    graveke01   
1224  1224            GUT            VQC         False    gonzame01   
1305  1305            JBM            ZQF         False    litteza01   
1373  1373            QDH            FBW           NaN    scholje01   
1406  1406            KFH            SAJ         False    neideni01   
1638  1638            DPS            MOO          True    boshebu01   
1679  1679            ECN            HAN         False    alvarjo02   
1948  1948            STC            

In [2]:
import bisect
import pandas as pd
from math import isqrt
from math import gcd

numerators = range(1, 2000)
denominators = range(1, 162)
fraction_values = []
fraction_pairs = []
for numerator in numerators:
    for denominator in denominators:
        fraction_values.append(numerator / denominator)
        fraction_pairs.append((numerator, denominator))

fraction_values, fraction_pairs = zip(*sorted(zip(fraction_values, fraction_pairs)))

def find_closest_fraction(value):
    odd = value
    value = value % 1 # Ensure the value is positive

    # Perform binary search to find the closest value
    idx = bisect.bisect_left(fraction_values, value)

    # Determine the closest fraction
    if idx == 0:
        closest_fraction = fraction_pairs[0]
    elif idx == len(fraction_values):
        closest_fraction = fraction_pairs[-1]
    else:
        left = fraction_values[idx - 1]
        right = fraction_values[idx]
        closest_fraction = (
            fraction_pairs[idx - 1]
            if abs(value - left) < abs(value - right)
            else fraction_pairs[idx]
        )

    # Extract numerator and denominator
    closest_wins, closest_total_games = closest_fraction[0], closest_fraction[1]

    # Reduce the fraction to its simplest form
    common_divisor = gcd(closest_wins, closest_total_games)
    reduced_wins = closest_wins // common_divisor
    reduced_total_games = closest_total_games // common_divisor
    if reduced_total_games == 161:
        print(f'Old:{odd} , Original value: {value}, Closest fraction: {reduced_wins}/{reduced_total_games}')

    # Return the denominator of the reduced fraction
    return reduced_total_games

df['home_team_errors_mean_den'] = df['home_team_errors_mean_original'].apply(
    lambda x: 1 if pd.isna(x) or abs(x % 1) < 1e-16 else find_closest_fraction(x))
df['away_team_errors_mean_den'] = df['away_team_errors_mean_original'].apply(
    lambda x: 1 if pd.isna(x) or abs(x % 1) < 1e-16 else find_closest_fraction(x))
df['home_batting_RBI_mean_den'] = df['home_batting_RBI_mean_original'].apply(
    lambda x: 1 if pd.isna(x) or abs(x % 1) < 1e-16 else find_closest_fraction(x))
df['away_batting_RBI_mean_den'] = df['away_batting_RBI_mean_original'].apply(
    lambda x: 1 if pd.isna(x) or abs(x % 1) < 1e-16 else find_closest_fraction(x))

# 如果原始值是 NaN，則保持分母列為 NaN
df['home_team_errors_mean_den'] = np.where(df['home_team_errors_mean'].isna(), np.nan, df['home_team_errors_mean_den'])
df['away_team_errors_mean_den'] = np.where(df['away_team_errors_mean'].isna(), np.nan, df['away_team_errors_mean_den'])
df['home_batting_RBI_mean_den'] = np.where(df['home_batting_RBI_mean'].isna(), np.nan, df['home_batting_RBI_mean_den'])
df['away_batting_RBI_mean_den'] = np.where(df['away_batting_RBI_mean'].isna(), np.nan, df['away_batting_RBI_mean_den'])

# 保存到新的 CSV 文件
df.to_csv(f'data.csv', index=False)

Old:0.6770186335403727 , Original value: 0.6770186335403727, Closest fraction: 109/161
Old:0.5093167701863354 , Original value: 0.5093167701863354, Closest fraction: 82/161
Old:0.49068322981366463 , Original value: 0.49068322981366463, Closest fraction: 79/161
Old:0.5527950310559008 , Original value: 0.5527950310559008, Closest fraction: 89/161
Old:0.47204968944099385 , Original value: 0.47204968944099385, Closest fraction: 76/161
Old:0.596273291925466 , Original value: 0.596273291925466, Closest fraction: 96/161
Old:0.3850931677018634 , Original value: 0.3850931677018634, Closest fraction: 62/161
Old:0.7080745341614907 , Original value: 0.7080745341614907, Closest fraction: 114/161
Old:0.5403726708074534 , Original value: 0.5403726708074534, Closest fraction: 87/161
Old:0.5403726708074534 , Original value: 0.5403726708074534, Closest fraction: 87/161
Old:0.5590062111801242 , Original value: 0.5590062111801242, Closest fraction: 90/161
Old:0.5279503105590062 , Original value: 0.5279503

# **Calculating** the winner

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
from math import gcd
from bisect import bisect_left

# Load the data
file_path = 'data.csv'  # adjust path if necessary

df = pd.read_csv(file_path)

# Precompute possible win rates for all combinations of wins and total games
def precompute_win_rates(max_games=200):
    win_rate_list = []
    for total_games in range(1, max_games + 1):
        for wins in range(0, total_games + 1):
            rate = wins / total_games
            win_rate_list.append((rate, wins, total_games))
    # Sort by win rate
    win_rate_list.sort(key=lambda x: x[0])
    return win_rate_list

# Precompute the win rates
precomputed_win_rates = precompute_win_rates()
precomputed_rates = [rate[0] for rate in precomputed_win_rates]

# Define a function to 
# culate the closest number of wins and total games using binary search
def find_closest_games(winning_rate):
    # Use binary search to find the closest rate
    pos = bisect_left(precomputed_rates, winning_rate)
    if pos == 0:
        closest = precomputed_win_rates[0]
    elif pos == len(precomputed_rates):
        closest = precomputed_win_rates[-1]
    else:
        before = precomputed_win_rates[pos - 1]
        after = precomputed_win_rates[pos]
        closest = before if abs(before[0] - winning_rate) < abs(after[0] - winning_rate) else after

    closest_wins, closest_total_games = closest[1], closest[2]
    # Reduce the ratio of wins and total games
    common_divisor = gcd(closest_wins, closest_total_games)
    reduced_wins = closest_wins // common_divisor
    reduced_total_games = closest_total_games // common_divisor

    return closest_wins, closest_total_games, reduced_wins, reduced_total_games

# Calculate closest wins and total games for each row, for both home and away teams
df[['home_closest_wins', 'home_closest_total_games', 'home_reduced_wins', 'home_reduced_total_games']] = df.apply(
    lambda row: pd.Series(find_closest_games(row['home_team_wins_mean_original'])) if pd.notnull(row['home_team_wins_mean_original']) else pd.Series([np.nan, np.nan, np.nan, np.nan]), axis=1
)

# Calculate closest wins and total games for away team
# Use the restored away_team_wins_mean_original column for calculation
df[['away_closest_wins', 'away_closest_total_games', 'away_reduced_wins', 'away_reduced_total_games']] = df.apply(
    lambda row: pd.Series(find_closest_games(row['away_team_wins_mean_original'])) if pd.notnull(row['away_team_wins_mean_original']) else pd.Series([np.nan, np.nan, np.nan, np.nan]), axis=1
)

# Calculate the estimated win rate and error for home and away teams
def calculate_estimated_rate_and_error(row, original_col, closest_wins_col, closest_total_games_col):
    if pd.notnull(row[original_col]):
        estimated_wins = row[closest_wins_col]
        estimated_total_games = row[closest_total_games_col]
        estimated_rate = estimated_wins / estimated_total_games if estimated_total_games > 0 else np.nan
        error = abs(estimated_rate - row[original_col])
        return pd.Series([estimated_rate, error])
    else:
        return pd.Series([np.nan, np.nan])

# Apply the function to calculate error for home team
df[['home_estimated_win_rate', 'home_rate_error']] = df.apply(
    lambda row: calculate_estimated_rate_and_error(row, 'home_team_wins_mean_original', 'home_closest_wins', 'home_closest_total_games'), axis=1
)

# Apply the function to calculate error for away team
df[['away_estimated_win_rate', 'away_rate_error']] = df.apply(
    lambda row: calculate_estimated_rate_and_error(row, 'away_team_wins_mean_original', 'away_closest_wins', 'away_closest_total_games'), axis=1
)

assert (df['home_mean_error'].dropna() < 1e-14).all(), "Not all values are less than 10^-14"
assert (df['away_mean_error'].dropna() < 1e-14).all(), "Not all values are less than 10^-14"
assert (df['home_std_error'].dropna() < 1e-14).all(), "Not all values are less than 10^-14"
assert (df['away_std_error'].dropna() < 1e-14).all(), "Not all values are less than 10^-14"
assert (df['home_skew_error'].dropna() < 1e-14).all(), "Not all values are less than 10^-14"
assert (df['away_skew_error'].dropna() < 1e-14).all(), "Not all values are less than 10^-14"
assert (df['home_rate_error'].dropna() < 1e-14).all(), "Not all values are less than 10^-14"
assert (df['away_rate_error'].dropna() < 1e-14).all(), "Not all values are less than 10^-14"

def consensus(x, y):
    if x == np.nan or y == np.nan:
        return np.nan
    elif x % y == 0:
        return x
    elif y % x  == 0:
        return y
    else:
        return x

def lcm(x, y):
    if np.isnan(x):
        return y
    elif np.isnan(y):
        return np.nan
    else:
        num = abs(x * y) // gcd(int(x), int(y))
        assert num <= 161
        return num

df['home_consensus_other'] = df.apply(lambda row: consensus(row['home_team_errors_mean_den'], row['home_batting_RBI_mean_den']), axis=1)
df['home_lcm'] = df.apply(lambda row: lcm(row['home_consensus_other'], row['home_reduced_total_games']), axis=1)

df['home_reduced_wins_expanded'] = df.apply(
    lambda row: row['home_reduced_wins'] * (row['home_lcm']  / row['home_reduced_total_games'])
    if not np.isnan(row['home_reduced_total_games']) else np.nan, axis=1)
df['home_reduced_total_games_expanded'] = df['home_lcm'] 


df['away_consensus_other'] = df.apply(lambda row: consensus(row['away_team_errors_mean_den'], row['away_batting_RBI_mean_den']), axis=1)
df['away_lcm'] = df.apply(lambda row: lcm(row['away_consensus_other'], row['away_reduced_total_games']), axis=1)

df['away_reduced_wins_expanded'] = df.apply(
    lambda row: row['away_reduced_wins'] * (row['away_lcm'] / row['away_reduced_total_games'])
    if not np.isnan(row['away_reduced_total_games']) else np.nan, axis=1)
df['away_reduced_total_games_expanded'] = df['away_lcm']


df['home_reduced_wins_original'] = df['home_reduced_wins']
df['home_reduced_total_games_original'] = df['home_reduced_total_games']
df['away_reduced_wins_original'] = df['away_reduced_wins']
df['away_reduced_total_games_original'] = df['away_reduced_total_games']

df['home_reduced_wins'] = df['home_reduced_wins_expanded']
df['home_reduced_total_games'] = df['home_reduced_total_games_expanded']
df['away_reduced_wins'] = df['away_reduced_wins_expanded']
df['away_reduced_total_games'] = df['away_reduced_total_games_expanded']

# Keep only the required columns
columns_to_keep = ['id', 'home_team_abbr', 'away_team_abbr', 'home_team_win', 'season', 
                   # 'home_team_wins_mean', 
                   'home_team_wins_mean_original',
                   # 'away_team_wins_mean', 
                   'away_team_wins_mean_original',
                   # 'home_team_wins_std', 'home_team_wins_std_original',
                   # 'away_team_wins_std', 'away_team_wins_std_original',
                   # 'home_std_error', 'away_std_error',
                   # 'home_team_wins_skew', 'home_team_wins_skew_original',
                   # 'away_team_wins_skew', 'away_team_wins_skew_original',
                   # 'home_skew_error', 'away_skew_error',
                   # 'home_mean_error', 'away_mean_error',
                   'home_reduced_wins_expanded', 'away_reduced_wins_expanded',
                   'home_reduced_total_games_expanded', 'away_reduced_total_games_expanded',
                   'home_lcm', 'away_lcm',
                   'home_team_errors_mean_den', 'away_team_errors_mean_den',
                   'home_reduced_wins', 'home_reduced_total_games',
                   'home_reduced_wins_original', 'home_reduced_total_games_original',
                   'away_reduced_wins', 'away_reduced_total_games',
                   'away_reduced_wins_original', 'away_reduced_total_games_original',
                   'home_estimated_win_rate', 'home_rate_error',
                   'away_estimated_win_rate', 'away_rate_error'
                   ]
df = df[columns_to_keep]

# Save the filtered DataFrame to a new CSV file
filtered_file_path = 'filtered_data.csv'
df.to_csv(filtered_file_path, index=False)

In [4]:
from collections import Counter

# 使用 Counter 來計算每個元素的出現次數
counter = Counter(df['home_reduced_total_games_expanded'])

# 按鍵值從大到小排序
sorted_by_key = dict(sorted(counter.items(), key=lambda x: x[0], reverse=True))

# 顯示結果
for element, frequency in sorted_by_key.items():
    print(f"Element: {element}, Frequency: {frequency}")

Element: 161.0, Frequency: 14
Element: 155.0, Frequency: 15
Element: 154.0, Frequency: 11
Element: 153.0, Frequency: 14
Element: 152.0, Frequency: 10
Element: 150.0, Frequency: 11
Element: 149.0, Frequency: 14
Element: 148.0, Frequency: 13
Element: 147.0, Frequency: 13
Element: 145.0, Frequency: 14
Element: 144.0, Frequency: 14
Element: 143.0, Frequency: 15
Element: 142.0, Frequency: 8
Element: 141.0, Frequency: 15
Element: 140.0, Frequency: 15
Element: 131.0, Frequency: 17
Element: 129.0, Frequency: 14
Element: 127.0, Frequency: 15
Element: 126.0, Frequency: 13
Element: 124.0, Frequency: 11
Element: 123.0, Frequency: 12
Element: 121.0, Frequency: 16
Element: 119.0, Frequency: 15
Element: 118.0, Frequency: 14
Element: 116.0, Frequency: 10
Element: 115.0, Frequency: 14
Element: 111.0, Frequency: 12
Element: 106.0, Frequency: 11
Element: 101.0, Frequency: 14
Element: 99.0, Frequency: 14
Element: 97.0, Frequency: 13
Element: 96.0, Frequency: 9
Element: 93.0, Frequency: 14
Element: 91.0, F

In [5]:
import pandas as pd
import numpy as np
from math import floor, ceil

# 定義處理的函數
def adjust_reduced_games(row):
    home_total = row['home_reduced_total_games']
    away_total = row['away_reduced_total_games']
    home_wins = row['home_reduced_wins']
    away_wins = row['away_reduced_wins']

    # 找出較大值與較小值
    max_games = max(home_total, away_total)
    min_games = min(home_total, away_total)
    thres = 3
    if max_games > 5 and abs(home_total - away_total) > thres and min_games >= 3:
        
        # 計算比值
        ratio = max_games / min_games
        
        # 找到可能的擴分因子，floor 和 ceil
        factors_to_test = [floor(ratio), ceil(ratio)]
        
        # 初始化變量來存儲最佳擴分
        best_scaling_factor = None
        min_difference = float('inf')  # 初始設置為無限大，便於比較
        
        # 測試兩個可能的擴分因子
        for factor in factors_to_test:
            new_min_games = min_games * factor
            
            # 檢查新的擴分後是否符合條件
            if abs(new_min_games - max_games) <= thres:
                # 計算擴分後的差異
                difference = abs(new_min_games - max_games)
                
                # 如果當前的擴分後差異比之前的小，則更新最佳因子
                if difference < min_difference:
                    min_difference = difference
                    best_scaling_factor = factor

        # 如果找到合適的擴分因子，進行擴分
        if best_scaling_factor is not None:
            if min_games == home_total:
                row['home_reduced_total_games'] = round(home_total * best_scaling_factor)
                row['home_reduced_wins'] = round(home_wins * best_scaling_factor) if pd.notna(home_wins) else np.nan
            elif min_games == away_total:
                row['away_reduced_total_games'] = round(away_total * best_scaling_factor)
                row['away_reduced_wins'] = round(away_wins * best_scaling_factor) if pd.notna(away_wins) else np.nan

    return row

# 應用處理函數到 DataFrame
df = df.apply(adjust_reduced_games, axis=1)


# Save the filtered DataFrame to a new CSV file
filtered_file_path = 'expand_data.csv'
df.to_csv(filtered_file_path, index=False)

In [6]:
import pandas as pd
import numpy as np
from datetime import datetime
from math import gcd

# Load the filtered data for further processing
filtered_file_path = 'expand_data.csv'
df = pd.read_csv(filtered_file_path)

# Function to generate unfilled game sequences
def generate_unfilled_game_sequences(df):
    combined_games_dict = {}
    for year in df['season'].unique():
        year_data = df[df['season'] == year]
        for team in pd.concat([year_data['home_team_abbr'], year_data['away_team_abbr']]).unique():
            # Find rows where the team is either the home or away team
            home_games = year_data[year_data['home_team_abbr'] == team][['id', 'home_reduced_wins', 'home_reduced_total_games']]
            away_games = year_data[year_data['away_team_abbr'] == team][['id', 'away_reduced_wins', 'away_reduced_total_games']]
            
            # Rename columns for home and away games to have uniform naming
            home_games = home_games.rename(columns={'home_reduced_wins': 'wins', 'home_reduced_total_games': 'games'})
            away_games = away_games.rename(columns={'away_reduced_wins': 'wins', 'away_reduced_total_games': 'games'})
            
            # Add columns to indicate if the game was won or lost and the opponent team
            home_games['result'] = -1
            home_games['opponent'] = year_data.loc[year_data['home_team_abbr'] == team, 'away_team_abbr'].values
            away_games['result'] = -1
            away_games['opponent'] = year_data.loc[year_data['away_team_abbr'] == team, 'home_team_abbr'].values

            # Concatenate home and away games
            combined_games = pd.concat([home_games, away_games])
            
            # Sort by total games played, with NaN values at the top
            combined_games = combined_games.sort_values(by=['games', 'id'], ascending=[True, True], na_position='first').reset_index(drop=True)
            
            # Store the combined game data for later processing
            combined_games_dict[(team, year)] = combined_games
    
    return combined_games_dict

# Function to print details about combined_games_dict
def print_combined_games_details(combined_games_dict):
    for (team, year), games_df in combined_games_dict.items():
        num_rows = games_df.shape[0]
        max_games = games_df['games'].max()
        num_nan_wins = games_df['wins'].isna().sum()
        num_nan_games = games_df['games'].isna().sum()
        print(f"Year: {year}, Team: {team}")
        print(f"  Number of rows: {num_rows}")
        print(f"  Max games value: {max_games}")
        print(f"  Number of NaN in 'wins': {num_nan_wins}")
        print(f"  Number of NaN in 'games': {num_nan_games}")
        print("-" * 40)

# Generate unfilled game sequences
combined_games_dict = generate_unfilled_game_sequences(df)

# Print details
print_combined_games_details(combined_games_dict)

Year: 2024, Team: DPS
  Number of rows: 161
  Max games value: 161.0
  Number of NaN in 'wins': 1
  Number of NaN in 'games': 1
----------------------------------------
Year: 2024, Team: JEM
  Number of rows: 161
  Max games value: 161.0
  Number of NaN in 'wins': 1
  Number of NaN in 'games': 1
----------------------------------------
Year: 2024, Team: MZG
  Number of rows: 161
  Max games value: 161.0
  Number of NaN in 'wins': 1
  Number of NaN in 'games': 1
----------------------------------------
Year: 2024, Team: GKO
  Number of rows: 161
  Max games value: 161.0
  Number of NaN in 'wins': 1
  Number of NaN in 'games': 1
----------------------------------------
Year: 2024, Team: UPV
  Number of rows: 161
  Max games value: 161.0
  Number of NaN in 'wins': 0
  Number of NaN in 'games': 0
----------------------------------------
Year: 2024, Team: VQC
  Number of rows: 161
  Max games value: 161.0
  Number of NaN in 'wins': 2
  Number of NaN in 'games': 2
---------------------------

In [7]:
from sympy import divisors
from math import gcd

# Backtracking function to fill game sequence
def backtrack_fill_sequence(current_games, current_sequence, remaining_values, upper_bound):
    if current_games == 1:
        new_sequence = current_sequence + [(remaining_values[-1]['wins'], remaining_values[-1]['games'])]
        yield new_sequence
        return
    #print(current_games, len(remaining_values))
    assert(current_games == len(remaining_values))
    #print(f'\rCurrent games: {current_games}', end='', flush=True)  # 覆蓋同一行輸出
    #print(remaining_values[-1]['games'], current_games)
    # Special case for the first index to ensure the last game is always (?, 161)
    if remaining_values[-1]['games'] == current_games:
        if current_games != upper_bound:
            if remaining_values[-1]['wins'] != current_sequence[-1][0] and remaining_values[-1]['wins'] + 1 != current_sequence[-1][0]:
                return
        #print("here")
        new_sequence = current_sequence + [(remaining_values[-1]['wins'], remaining_values[-1]['games'])]
        yield from backtrack_fill_sequence(current_games - 1, new_sequence, remaining_values[:-1], upper_bound)
        return
    else:
        if not (remaining_values[-1]['games'] < current_games):
            return
        if not (current_games + 1 == current_sequence[-1][1]):
            return
        candidates = [] # 找出符合條件的候選者

        if  current_sequence[-1][0] - remaining_values[-1]['wins'] == current_sequence[-1][1] - remaining_values[-1]['games']:
            # 連勝情況
            val = {'wins': int(current_sequence[-1][0] - 1), 'games': current_games}
            factor = gcd(val['wins'], val['games'])
            if factor == 1:
                return
            val['wins'] //= factor
            val['games'] //= factor
            if val not in remaining_values:
                return
            candidates.append(val)
        elif current_sequence[-1][0] == remaining_values[-1]['wins']:
            # 連敗情況
            val = {'wins': int(current_sequence[-1][0]), 'games': current_games}
            factor = gcd(val['wins'], val['games'])
            if factor == 1:
                return
            val['wins'] //= factor
            val['games'] //= factor
            if val not in remaining_values:
                return
            candidates.append(val)
        else:
            seen_pairs = set()
            for val in reversed(remaining_values):
                pair = (int(val['wins']), int(val['games']))
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                else:
                    continue
                
                if current_games % int(val['games']) == 0:
                    multiplier = current_games // int(val['games'])
                    if multiplier * int(val['wins']) > current_sequence[-1][0]:
                        continue
                    if multiplier * int(val['wins']) + 1 < current_sequence[-1][0]:
                        continue
                    candidates.append(val)

        # 遍歷候選者進行回溯
        for candidate in candidates:
            multiplier = current_games // int(candidate['games'])
            new_sequence = current_sequence + [(candidate['wins'] * multiplier, candidate['games'] * multiplier,)]
            new_remaining_values = remaining_values.copy()
            new_remaining_values.remove(candidate)
            yield from backtrack_fill_sequence(current_games - 1, new_sequence, new_remaining_values, upper_bound)
        return

# Generate unfilled game sequences
combined_games_dict = generate_unfilled_game_sequences(df)

# Select the data for UPV in 2024
games_data = combined_games_dict[('RLJ', 2024)][['wins', 'games']]
target_length = 161
remaining_values = games_data.to_dict(orient='records')
print(len(remaining_values))
# Start backtracking to find all possible sequences
all_possible_sequences = list(backtrack_fill_sequence(target_length, [], remaining_values, target_length))

# Print the results
print("Remaining Values and All Possible Sequences (One-to-One Comparison):")
for idx, sequence in enumerate(all_possible_sequences):
    print(f"Possible Sequence {idx + 1}:")
    for val, (wins, games) in zip(remaining_values, sequence):
        # Simplify the wins and games by finding the GCD
        gcd_value = gcd(int(wins), int(games))
        simplified_wins = wins // gcd_value
        simplified_games = games // gcd_value
        # Print the remaining values and sequence in the same line for comparison
        print(f"Remaining Values - Wins: {val['wins']}, Games: {val['games']} | Sequence - Wins: {wins}, Games: {games} -> Simplified: Wins: {simplified_wins}, Games: {simplified_games}")
    print("-" * 40)  # Separator for readability

161
Remaining Values and All Possible Sequences (One-to-One Comparison):
Possible Sequence 1:
Remaining Values - Wins: 1.0, Games: 1.0 | Sequence - Wins: 80.0, Games: 161.0 -> Simplified: Wins: 80.0, Games: 161.0
Remaining Values - Wins: 1.0, Games: 2.0 | Sequence - Wins: 80.0, Games: 160.0 -> Simplified: Wins: 1.0, Games: 2.0
Remaining Values - Wins: 1.0, Games: 2.0 | Sequence - Wins: 80.0, Games: 159.0 -> Simplified: Wins: 80.0, Games: 159.0
Remaining Values - Wins: 1.0, Games: 2.0 | Sequence - Wins: 80.0, Games: 158.0 -> Simplified: Wins: 40.0, Games: 79.0
Remaining Values - Wins: 1.0, Games: 2.0 | Sequence - Wins: 79.0, Games: 157.0 -> Simplified: Wins: 79.0, Games: 157.0
Remaining Values - Wins: 1.0, Games: 3.0 | Sequence - Wins: 78.0, Games: 156.0 -> Simplified: Wins: 1.0, Games: 2.0
Remaining Values - Wins: 3.0, Games: 5.0 | Sequence - Wins: 77.0, Games: 155.0 -> Simplified: Wins: 77.0, Games: 155.0
Remaining Values - Wins: 4.0, Games: 6.0 | Sequence - Wins: 76.0, Games: 154.0 -

In [8]:
# Function to calculate win rate given reduced wins and games for a specific team
def calculate_win_rate(reduced_wins, reduced_games, team):
    # Generate unfilled game sequences
    combined_games_dict = generate_unfilled_game_sequences(df)

    # Select the data for the given team in 2024
    if (team, 2024) not in combined_games_dict:
        print(f"No data found for team {team} in 2024.")
        return None
    games_data = combined_games_dict[(team, 2024)][['wins', 'games']]
    target_length = 161 if team != 'PDF' else 159
    remaining_values = games_data.to_dict(orient='records')
    
    # Start backtracking to find all possible sequences
    all_possible_sequences = list(backtrack_fill_sequence(target_length, [], remaining_values, target_length))
    if not all_possible_sequences:
        print("No possible sequences found.")
        return None
    
    # Take the first sequence of results
    game_results = all_possible_sequences[0]
    
    total_games = 0
    # Calculate win rate
    total_wins = 0
    # Find all possible matches with the given reduced wins and games, and calculate the average win rate
    for idx, (wins, games) in enumerate(game_results):
        if idx == len(game_results) - 1:
            break
        gcd_value = gcd(int(wins), int(games))
        simplified_wins = int(wins) // gcd_value
        simplified_games = int(games) // gcd_value
        if simplified_wins == reduced_wins and simplified_games == reduced_games:
            previous_wins = wins
            current_wins = game_results[idx + 1][0]
            if current_wins > previous_wins:
                total_wins += 1
    
    win_rate = total_wins / total_games if total_games > 0 else 0
    return win_rate

"""
calculate_win_rate(35,68, 'RLJ')

for team in ['UPV', 'YHA', 'STC', 'RLJ', 'UPV', 'YHA', 'STC', 'RLJ', 'MOO', 'PDF', 'JBM', 'XFB']:
    
    calculate_win_rate(35,68, team)
"""

"\ncalculate_win_rate(35,68, 'RLJ')\n\nfor team in ['UPV', 'YHA', 'STC', 'RLJ', 'UPV', 'YHA', 'STC', 'RLJ', 'MOO', 'PDF', 'JBM', 'XFB']:\n    \n    calculate_win_rate(35,68, team)\n"

In [9]:
# Function to fill missing game sequences
def fill_missing_games(combined_games_dict, without=[]):#['UPV', 'YHA', 'STC', 'RLJ', 'UPV', 'YHA', 'STC', 'RLJ', 'MOO', 'PDF', 'JBM', 'XFB']):
    game_sequences = {}
    for (team, year), combined_games in combined_games_dict.items():
        if team in without:
            continue
        filled_games = fill_missing_game_sequence(combined_games)
        game_sequences[(team, year)] = list(filled_games.itertuples(index=False, name=None))
    
    return game_sequences

# Helper function to fill missing game sequences
# Also tries to reduce and match any new data to existing rows
# If found, replace and delete the original data from the sequence
def fill_missing_game_sequence(combined_games):
    filled_games = combined_games.copy()
    games_list = filled_games.to_records(index=False)
    i = len(games_list) - 2
    while i >= 0:
        head = i
        tail = i + 1
        if tail == len(games_list) - 1 or games_list[tail+1].games != games_list[tail].games:
            if head == 0 or games_list[head-1].games != games_list[head].games:
                if not np.isnan(games_list[head].games) and not np.isnan(games_list[tail].games) and games_list[head].games + 2 <= games_list[tail].games:
                    if games_list[tail].wins - games_list[head].wins == games_list[tail].games - games_list[head].games:
                        # winning streak
                        # Fill in the missing games as wins (winning streak)
                        missing_count = int(games_list[tail].games - games_list[head].games)
                        for j in range(1, missing_count):
                            new_wins = int(games_list[head].wins + j)
                            new_games = int(games_list[head].games + j)
                            reduced_wins = new_wins // gcd(new_wins, new_games)
                            reduced_games = new_games // gcd(new_wins, new_games)
                            match = filled_games[(filled_games['wins'] == reduced_wins) & (filled_games['games'] == reduced_games)]
                            if not match.empty:
                                matched_id = match.iloc[0]['id']
                                matched_index = match.index[0]
                            else:
                                matched_id = filled_games['id'][0]
                                matched_index = 0
                            # 1. Append a new row
                            new_row = pd.DataFrame({'id': [matched_id], 'wins': [new_wins], 'games': [new_games], 'result': [1], 'opponent': [filled_games.iloc[tail]['opponent']]})
                            filled_games = pd.concat([filled_games, new_row], ignore_index=True)
                            # 2. Drop a specific row (假設 match.index[0] 是你要刪除的列)
                            filled_games = filled_games.drop(matched_index).reset_index(drop=True)
                            # 3. Sort by 'games' column and then by 'id' (ascending order), with NaN values at the beginning
                            filled_games = filled_games.sort_values(by=['games', 'id'], ascending=[True, True], na_position='first').reset_index(drop=True)
                            # Update the opponent's sequence
                            opponent_team = filled_games.iloc[tail]['opponent']
                            opponent_game = filled_games.iloc[tail]
                            opponent_idx = filled_games[(filled_games['id'] == opponent_game['id']) & (filled_games['games'] == opponent_game['games'])].index[0]
                            filled_games.at[opponent_idx, 'result'] = 0
                    elif games_list[tail].wins == games_list[head].wins:
                        # Fill in the missing games as losses (losing streak)
                        missing_count = int(games_list[tail].games - games_list[head].games)
                        for j in range(1, missing_count):
                            new_wins = int(games_list[head].wins)
                            new_games = int(games_list[head].games + j)
                            reduced_wins = new_wins // gcd(new_wins, new_games)
                            reduced_games = new_games // gcd(new_wins, new_games)
                            match = filled_games[(filled_games['wins'] == reduced_wins) & (filled_games['games'] == reduced_games)]
                            if not match.empty:
                                matched_id = match.iloc[0]['id']
                                matched_index = match.index[0]
                            else:
                                matched_id = filled_games['id'][0]
                                matched_index = 0
                            # 1. Append a new row
                            new_row = pd.DataFrame({'id': [matched_id], 'wins': [new_wins], 'games': [new_games], 'result': [0], 'opponent': [filled_games.iloc[tail]['opponent']]})
                            filled_games = pd.concat([filled_games, new_row], ignore_index=True)
                            # 2. Drop a specific row (假設 match.index[0] 是你要刪除的列)
                            filled_games = filled_games.drop(matched_index).reset_index(drop=True)
                            # 3. Sort by 'games' column and then by 'id' (ascending order), with NaN values at the beginning
                            filled_games = filled_games.sort_values(by=['games', 'id'], ascending=[True, True], na_position='first').reset_index(drop=True)
                            # Update the opponent's sequence
                            opponent_team = filled_games.iloc[tail]['opponent']
                            opponent_game = filled_games.iloc[tail]
                            opponent_idx = filled_games[(filled_games['id'] == opponent_game['id']) & (filled_games['games'] == opponent_game['games'])].index[0]
                            filled_games.at[opponent_idx, 'result'] = 1
                    elif games_list[head].games + 2 == games_list[tail].games:
                        # Fill in the missing games as a draw
                        new_wins = int(games_list[head].wins + games_list[tail].wins) // 2
                        new_games = int(games_list[head].games + 1)
                        reduced_wins = new_wins // gcd(new_wins, new_games)
                        reduced_games = new_games // gcd(new_wins, new_games)
                        match = filled_games[(filled_games['wins'] == reduced_wins) & (filled_games['games'] == reduced_games)]
                        if not match.empty:
                            matched_id = match.iloc[0]['id']
                            matched_index = match.index[0]
                        else:
                            matched_id = filled_games['id'][0]
                            matched_index = 0
                        # 1. Append a new row
                        new_row = pd.DataFrame({'id': [matched_id], 'wins': [new_wins], 'games': [new_games], 'result': [0.5], 'opponent': [filled_games.iloc[tail]['opponent']]})
                        filled_games = pd.concat([filled_games, new_row], ignore_index=True)
                        # 2. Drop a specific row (假設 match.index[0] 是你要刪除的列)
                        filled_games = filled_games.drop(matched_index).reset_index(drop=True)
                        # 3. Sort by 'games' column and then by 'id' (ascending order), with NaN values at the beginning
                        filled_games = filled_games.sort_values(by=['games', 'id'], ascending=[True, True], na_position='first').reset_index(drop=True)
                        # Update the opponent's sequence
                        opponent_team = filled_games.iloc[tail]['opponent']
                        opponent_game = filled_games.iloc[tail]
                        opponent_idx = filled_games[(filled_games['id'] == opponent_game['id']) & (filled_games['games'] == opponent_game['games'])].index[0]
                        filled_games.at[opponent_idx, 'result'] = 0
        i -= 1
    return filled_games

# Fill missing values in all game sequences
game_sequences = fill_missing_games(combined_games_dict)

# Determine the largest games value with repeated games count
def find_max_repeated_games(game_sequences):
    max_games_with_repeats = 0
    for sequence in game_sequences.values():
        games_counter = {}
        for _, _, games, *_ in sequence:
            if games in games_counter:
                games_counter[games] += 1
            else:
                games_counter[games] = 1
        for games, count in games_counter.items():
            if count > 1 and games > max_games_with_repeats:
                max_games_with_repeats = games
    return max_games_with_repeats

max_repeated_games = find_max_repeated_games(game_sequences)
print(f'The maximum games value with repeated games: {max_repeated_games}')

# Define a function to get the precomputed game sequence
def get_team_game_sequence(team, year):
    return game_sequences.get((team, year), [])

The maximum games value with repeated games: 79.0


In [10]:
import pandas as pd
import numpy as np
from datetime import datetime
from math import gcd

# Load the filtered data for further processing
filtered_file_path = 'expand_data.csv'
df = pd.read_csv(filtered_file_path)

# Precompute game sequences for all teams and seasons

def precompute_team_game_sequences(df):
    game_sequences = {}
    for year in df['season'].unique():
        year_data = df[df['season'] == year]
        for team in pd.concat([year_data['home_team_abbr'], year_data['away_team_abbr']]).unique():
            # Find rows where the team is either the home or away team
            home_games = year_data[year_data['home_team_abbr'] == team][['id', 'home_reduced_wins', 'home_reduced_total_games']]
            away_games = year_data[year_data['away_team_abbr'] == team][['id', 'away_reduced_wins', 'away_reduced_total_games']]
            
            # Rename columns for home and away games to have uniform naming
            home_games = home_games.rename(columns={'home_reduced_wins': 'wins', 'home_reduced_total_games': 'games'})
            away_games = away_games.rename(columns={'away_reduced_wins': 'wins', 'away_reduced_total_games': 'games'})
            
            # Concatenate home and away games
            combined_games = pd.concat([home_games, away_games])
            
            # Sort by total games played, with NaN values at the top
            combined_games = combined_games.sort_values(by=['games', 'id'], ascending=[True, True], na_position='first').reset_index(drop=True)
            
            # Fill missing values in the game sequence based on the winning streak or losing streak pattern
            filled_games = fill_missing_game_sequence(combined_games)
            
            # Store the game sequence
            game_sequences[(team, year)] = list(filled_games.itertuples(index=False, name=None))
    return game_sequences

# Helper function to fill missing game sequences
# Also tries to reduce and match any new data to existing rows
# If found, replace and delete the original data from the sequence
def fill_missing_game_sequence(combined_games):
    filled_games = combined_games.copy()
    games_list = filled_games.to_records(index=False)
    i = len(games_list) - 2
    print(combined_games)
    while i >= 0:
        print(i)
        # Skip if games_list[i].id is NaN
        if pd.isna(games_list[i].id) or pd.isna(games_list[i].wins):
            i -= 1
            continue
        # Check if the current games_list[i] has a label for home_team_win
        current_id = games_list[i].id
        # Find the corresponding row in the original df to check home_team_win
        current_row = df[df['id'] == current_id]
        if not current_row.empty and pd.notna(current_row.iloc[0]['home_team_win']):
            if current_row.iloc[0]['home_team_win'] == True:
                # Add a new winning data point
                new_wins = int(games_list[i].wins - 1)
                new_games = int(games_list[i].games - 1)
            elif current_row.iloc[0]['home_team_win'] == False:
                # Add a new losing data point
                new_wins = int(games_list[i].wins)
                new_games = int(games_list[i].games - 1)
            # Check if this point already exists
            reduced_wins = new_wins // gcd(new_wins, new_games) if new_games > 0 else new_wins
            reduced_games = new_games // gcd(new_wins, new_games) if new_games > 0 else new_games
            match = filled_games[(filled_games['wins'] == reduced_wins) & (filled_games['games'] == reduced_games)]
            if not match.empty:
                matched_id = match.iloc[0]['id']
                matched_index = match.index[0]
            else:
                matched_id = filled_games['id'][0]
                matched_index = 0
            # Append a new row
            new_row = pd.DataFrame({'id': [matched_id], 'wins': [new_wins], 'games': [new_games]})
            filled_games = pd.concat([filled_games, new_row], ignore_index=True)
            # Drop the original matched row
            filled_games = filled_games.drop(matched_index).reset_index(drop=True)
            # Sort by 'games' column and then by 'id' (ascending order), with NaN values at the beginning
            filled_games = filled_games.sort_values(by=['games', 'id'], ascending=[True, True], na_position='first').reset_index(drop=True)
            
            i -= 1
            continue
        head = i
        tail = i + 1
        if tail == len(games_list) - 1 or games_list[tail+1].games != games_list[tail].games:
            if head == 0 or games_list[head-1].games != games_list[head].games:
                if not np.isnan(games_list[head].games) and not np.isnan(games_list[tail].games) and games_list[head].games != games_list[tail].games:
                    if games_list[tail].wins - games_list[head].wins == games_list[tail].games - games_list[head].games:
                        # winning streak
                        # Fill in the missing games as wins (winning streak)
                        missing_count = int(games_list[tail].games - games_list[head].games)
                        for j in range(1, missing_count):
                            new_wins = int(games_list[head].wins + j)
                            new_games = int(games_list[head].games + j)
                            reduced_wins = new_wins // gcd(new_wins, new_games)
                            reduced_games = new_games // gcd(new_wins, new_games)
                            match = filled_games[(filled_games['wins'] == reduced_wins) & (filled_games['games'] == reduced_games)]
                            if not match.empty:
                                matched_id = match.iloc[0]['id']
                                matched_index = match.index[0]
                            else:
                                matched_id = filled_games['id'][0]
                                matched_index = 0
                            # 1. Append a new row
                            new_row = pd.DataFrame({'id': [matched_id], 'wins': [new_wins], 'games': [new_games]})
                            filled_games = pd.concat([filled_games, new_row], ignore_index=True)
                            # 2. Drop a specific row (假設 match.index[0] 是你要刪除的列)
                            filled_games = filled_games.drop(matched_index).reset_index(drop=True)
                            # 3. Sort by 'games' column and then by 'id' (ascending order), with NaN values at the beginning
                            filled_games = filled_games.sort_values(by=['games', 'id'], ascending=[True, True], na_position='first').reset_index(drop=True)
                    elif games_list[tail].wins == games_list[head].wins:
                        # Fill in the missing games as losses (losing streak)
                        missing_count = int(games_list[tail].games - games_list[head].games)
                        for j in range(1, missing_count):
                            new_wins = int(games_list[head].wins)
                            new_games = int(games_list[head].games + j)
                            reduced_wins = new_wins // gcd(new_wins, new_games)
                            reduced_games = new_games // gcd(new_wins, new_games)
                            match = filled_games[(filled_games['wins'] == reduced_wins) & (filled_games['games'] == reduced_games)]
                            if not match.empty:
                                matched_id = match.iloc[0]['id']
                                matched_index = match.index[0]
                            else:
                                matched_id = filled_games['id'][0]
                                matched_index = 0
                            # 1. Append a new row
                            new_row = pd.DataFrame({'id': [matched_id], 'wins': [new_wins], 'games': [new_games]})
                            filled_games = pd.concat([filled_games, new_row], ignore_index=True)
                            # 2. Drop a specific row (假設 match.index[0] 是你要刪除的列)
                            filled_games = filled_games.drop(matched_index).reset_index(drop=True)
                            # 3. Sort by 'games' column and then by 'id' (ascending order), with NaN values at the beginning
                            filled_games = filled_games.sort_values(by=['games', 'id'], ascending=[True, True], na_position='first').reset_index(drop=True)
        i -= 1
    print(filled_games)
    return filled_games

# Precompute all game sequences
game_sequences = precompute_team_game_sequences(df)

# Define a function to get the precomputed game sequence
def get_team_game_sequence(team, year):
    return game_sequences.get((team, year), [])

# Fill missing values for testing data in 'home_team_win'
for index, row in df[df['id'] <= testing_max_id].iterrows():
    home_team = row['home_team_abbr']
    away_team = row['away_team_abbr']
    year = row['season']
    game_id = row['id']
    
    if year == 0:
        df.at[index, 'home_team_win'] = row['home_team_wins_mean_original'] > row['away_team_wins_mean_original']
        continue
    
    # Get sequences for home and away teams
    home_sequence = get_team_game_sequence(home_team, year)
    away_sequence = get_team_game_sequence(away_team, year)

    # Find the relevant game information for the current testing data id
    home_game_data = next((item for item in home_sequence if item[0] == game_id), None)
    away_game_data = next((item for item in away_sequence if item[0] == game_id), None)
    
    # Helper function to calculate win rate from previous group
    def calculate_win_rate_from_previous(group, current_wins, current_games):
        future_group = [g for g in group if g[2] != np.nan and g[2] > current_games]
        if not future_group:
            return np.nan
        
        closest_games = min([g[2] for g in future_group])  # Find the closest but smaller game count
        group_with_closest_games = [g for g in future_group if g[2] == closest_games]  # Get all entries with that closest game count
        avg_wins = np.mean([g[1] for g in group_with_closest_games])
        return (avg_wins - current_wins) / (closest_games - current_games)

    # Calculate win rate for home and away teams if data is available
    home_win_rate = calculate_win_rate_from_previous(home_sequence, home_game_data[1], home_game_data[2]) if home_game_data else np.nan
    away_win_rate = calculate_win_rate_from_previous(away_sequence, away_game_data[1], away_game_data[2]) if away_game_data else np.nan

    # Determine the correct prediction
    if home_win_rate in [0, 1]:
        df.at[index, 'home_team_win'] = (home_win_rate == 1)
    elif away_win_rate in [0, 1]:
        df.at[index, 'home_team_win'] = (away_win_rate == 0)
    elif pd.notnull(home_win_rate) and home_win_rate != 0.5:
        df.at[index, 'home_team_win'] = (home_win_rate > 0.5)
    elif pd.notnull(away_win_rate) and away_win_rate != 0.5:
        df.at[index, 'home_team_win'] = (away_win_rate < 0.5)
    else:
        df.at[index, 'home_team_win'] = row['home_team_wins_mean_original'] > row['away_team_wins_mean_original']

# Generate timestamp for file names
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Save the updated DataFrame
df.to_csv(f'processed_data_{timestamp}.csv', index=False)

# Save testing sample IDs and home_team_win values
testing_sample = df[df['id'] <= testing_max_id][['id', 'home_team_win']]
testing_sample.to_csv(f'submit_{timestamp}.csv', index=False)

       id  wins  games
0     730   NaN    NaN
1     927   1.0    1.0
2     895   2.0    2.0
3    1306   2.0    3.0
4    1748   3.0    4.0
..    ...   ...    ...
156  1811  74.0  157.0
157  1513  75.0  158.0
158  1576  75.0  159.0
159   706  76.0  160.0
160  1749  77.0  161.0

[161 rows x 3 columns]
159
158
157
156
155
154
153
152
151
150
149
148
147
146
145
144
143
142
141
140
139
138
137
136
135
134
133
132
131
130
129
128
127
126
125
124
123
122
121
120
119
118
117
116
115
114
113
112
111
110
109
108
107
106
105
104
103
102
101
100
99
98
97
96
95
94
93
92
91
90
89
88
87
86
85
84
83
82
81
80
79
78
77
76
75
74
73
72
71
70
69
68
67
66
65
64
63
62
61
60
59
58
57
56
55
54
53
52
51
50
49
48
47
46
45
44
43
42
41
40
39
38
37
36
35
34
33
32
31
30
29
28
27
26
25
24
23
22
21
20
19
18
17
16
15
14
13
12
11
10
9
8
7
6
5
4
3
2
1
0
         id  wins  games
0     730.0   NaN    NaN
1     927.0   1.0    1.0
2     895.0   2.0    2.0
3    1748.0   3.0    4.0
4    1773.0   3.0    5.0
..      ...   ...   

  df.at[index, 'home_team_win'] = (home_win_rate == 1)


In [11]:
# Concatenate the filtered rows with the remaining rows
df = pd.concat([filtered_rows, df]).reset_index(drop=True)
df = df.sort_values(by='id').reset_index(drop=True)

df['home_team_win'] = df['home_team_win'].fillna(False)

# Generate timestamp for file names
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Save the updated DataFrame
df.to_csv(f'processed_data_{timestamp}.csv', index=False)

# Save testing sample IDs and home_team_win values
testing_sample = df[df['id'] <= testing_max_id][['id', 'home_team_win']]
testing_sample.to_csv(f'submit_{timestamp}.csv', index=False)

testing_sample['home_team_win'] = testing_sample['home_team_win'].apply(lambda x: not x)
testing_sample.to_csv(f'submit_flip_{timestamp}.csv', index=False)

  df['home_team_win'] = df['home_team_win'].fillna(False)
