# 基于广义Shapley值的baseline

# 数据处理

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def data_load(data_path, nrows = None):
    data = pd.read_csv(data_path, nrows = nrows, index_col=0)
    print(data.info())
    print(data.columns)
    return data

def data_prerpocess(df, training = True):
    used_features = ['eFG%', 'TOV%', 'ORB%', 'FT%']
    total_over_features = []
    for i in range(36):
        for feature in used_features:
            feature = feature + '_' + str(i)
            total_over_features.append(feature)

    # 去除df中无法转化为float的数据
    rows_with_special_character = df[df['BPM_0'].astype(str).str.contains(':')]
    # print(df['BPM_0'].astype(str).str.contains(':'))
    print('异常数据：',len(rows_with_special_character))
    # X = df[df[total_over_features].apply(lambda x: pd.to_numeric(x, errors='coerce')).notnull()]
    X = df[~df[total_over_features].apply(lambda x: x.astype(str).str.contains(':')).any(axis=1)]
    X = X[total_over_features].astype(float)
    final_features = []
    for feat in used_features:
        off_columns = [feat+'_'+str(i) for i in range(1,18)]
        X[feat+'_off'] = X[off_columns].sum(axis=1)
        def_columns = [feat+'_'+str(i) for i in range(18,36)]
        X[feat+'_def'] = X[def_columns].mean(axis=1)
        final_features.append(feat+'_off')
        final_features.append(feat+'_def')
    X = X[final_features]
    # 假设用平均值填充缺失值
    X = X.fillna(X.mean())
    rows_with_nan = X[X.isnull().any(axis=1)]
    print(rows_with_nan)    
    print('处理后数据：',X[:5])
    Y = df[~df[total_over_features+['result']].apply(lambda x: x.astype(str).str.contains(':')).any(axis=1)]['result'].astype(int)
    #Y2 = df['score_ourside'] - df['score_opposite']
    if training:
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 
        return X_train, X_test, Y_train, Y_test
    else:
        return X, Y

data_path = './data/nba_mvp.csv'
data = data_load(data_path)
X_train, X_test, y_train, y_test = data_prerpocess(data)


# 训练逻辑回归胜率模型

In [None]:
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score


# 特征标准化
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 初始化逻辑回归模型
model = LogisticRegression()

# 训练模型
model.fit(X_train, y_train)

# 保存模型
with open('model/nba_mvp_baseline_model.pkl', 'wb') as file:
    pickle.dump(model, file)
    
# 预测
y_pred = model.predict(X_test)
print(y_pred[:5])
# 预测
y_pred_proba = model.predict_proba(X_test)
print(y_pred_proba[:5])
# 评估模型
accuracy = accuracy_score(y_test, y_pred)
print("准确率:", accuracy)

[1 0 0 0 1]
[[0.28137574 0.71862426]
 [0.59909135 0.40090865]
 [0.57905903 0.42094097]
 [0.84426377 0.15573623]
 [0.21962385 0.78037615]]
准确率: 0.6950011883070586


# 广义Shapley值计算球员价值

In [22]:
import pickle
import numpy as np
import pandas as pd
# 加载模型
with open('model/nba_mvp_baseline_model.pkl', 'rb') as file:
    model = pickle.load(file)

#加载球员数据
def data_load(data_path, nrows = None):
    data = pd.read_csv(data_path, nrows = nrows, index_col=0)
    print(data.info())
    print(data.columns)
    return data

def data_prerpocess(df, training = True):
    used_features = ['starters','eFG%', 'TOV%', 'ORB%', 'FT%']
    total_over_features = []
    for i in range(36):
        for feature in used_features:
            feature = feature + '_' + str(i)
            total_over_features.append(feature)
    float_features = ['eFG%', 'TOV%', 'ORB%', 'FT%']
    total_float_features = []
    for i in range(36):
        for feature in float_features:
            feature = feature + '_' + str(i)        
            total_float_features.append(feature)
    # 去除df中无法转化为float的数据
    rows_with_special_character = df[df['BPM_0'].astype(str).str.contains(':')]
    # print(df['BPM_0'].astype(str).str.contains(':'))
    print('异常数据：',len(rows_with_special_character))
    # X = df[df[total_over_features].apply(lambda x: pd.to_numeric(x, errors='coerce')).notnull()]
    X = df[~df[total_over_features].apply(lambda x: x.astype(str).str.contains(':')).any(axis=1)]
    X[total_float_features] = X[total_float_features].astype(float)
    X = X[total_over_features]
    # 假设用平均值填充缺失值
    X.fillna(0, inplace=True)
   
    print('处理后数据：',X[:5])
    Y = df[~df[total_over_features+['result']].apply(lambda x: x.astype(str).str.contains(':')).any(axis=1)]['result'].astype(int)
    
    return X, Y, total_over_features

data_path = './data/nba_mvp.csv'
features = ['starters', 'team', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT',
       'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', '+/-', 'TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%',
       'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg', 'BPM',
       'file_name']
# 连接所有球员数据
new_features = []
for i in range(36):
    for feat in features:
        new_feat = feat + '_' + str(i)
        new_features.append(new_feat)

result_features = new_features + ['score_ourside', 'score_opposite', 'result']

year = 2022

data = data_load(data_path)
if year == 2024:
        data = data[data.apply(lambda x: x.astype(str).str.contains('{}_10|{}_11|{}_12|{}_01|{}_02|{}_03|{}_04_0|2024_04_10|2024_04_11|2024_04_12|2024_04_13|2024_04_14'
                    .format(year-1,year-1,year-1,year,year,year,year), regex=True)).any(axis=1)]
if year == 2023:
    data = data[data.apply(lambda x: x.astype(str).str.contains('{}_10|{}_11|{}_12|{}_01|{}_02|{}_03|{}_04_0'
                    .format(year-1,year-1,year-1,year,year,year,year), regex=True)).any(axis=1)]
if year == 2022:
    data = data[data.apply(lambda x: x.astype(str).str.contains('{}_10|{}_11|{}_12|{}_01|{}_02|{}_03|{}_04_0|2022_04_10'
                    .format(year-1,year-1,year-1,year,year,year,year), regex=True)).any(axis=1)]    
# data = data[data.apply(lambda x: x.astype(str).str.contains('{}_10|{}_11|{}_12|{}_01|{}_02|{}_03|{}_04|{}_05|{}_06|{}_07|{}_08|{}_09'
#                 .format(year-1,year-1,year-1,year,year,year,year,year,year,year,year,year), regex=True)).any(axis=1)]
# data = data[~data[result_features].apply(lambda x: x.astype(str).str.contains(':')).any(axis=1)]
print('{}比赛数据：'.format(year),len(data))
X, Y, total_over_features = data_prerpocess(data)
print(X.shape)
print(total_over_features)


  data = pd.read_csv(data_path, nrows = nrows, index_col=0)


<class 'pandas.core.frame.DataFrame'>
Index: 63162 entries, 0 to 63161
Columns: 1371 entries, starters_0 to result
dtypes: float64(1200), int64(1), object(170)
memory usage: 661.2+ MB
None
Index(['starters_0', 'team_0', 'MP_0', 'FG_0', 'FGA_0', 'FG%_0', '3P_0',
       '3PA_0', '3P%_0', 'FT_0',
       ...
       'BLK%_35', 'TOV%_35', 'USG%_35', 'ORtg_35', 'DRtg_35', 'BPM_35',
       'file_name_35', 'score_ourside', 'score_opposite', 'result'],
      dtype='object', length=1371)
2022比赛数据： 2460
异常数据： 0


  X.fillna(0, inplace=True)


处理后数据：       starters_0  eFG%_0  TOV%_0  ORB%_0  FT%_0          starters_1  eFG%_1  \
46  John Konchar   0.682    13.2    14.1  0.400  Xavier Tillman Sr.   0.200   
47  Terry Rozier   0.694     5.3     0.0  0.000         LaMelo Ball   0.688   
48  Jayson Tatum   1.000    24.6     0.0  1.000        Daniel Theis   0.286   
49      Naz Reid   0.700    16.7    11.0  0.000     Anthony Edwards   0.375   
50   Ayo Dosunmu   0.583    24.9     5.5  0.714    Patrick Williams   0.548   

    TOV%_1  ORB%_1  FT%_1  ... starters_34  eFG%_34  TOV%_34  ORB%_34  FT%_34  \
46     8.7     6.7  1.000  ...           0      0.0      0.0      0.0     0.0   
47    32.2     9.1  1.000  ...           0      0.0      0.0      0.0     0.0   
48    22.2     4.6  1.000  ...           0      0.0      0.0      0.0     0.0   
49    10.1     0.0  1.000  ...           0      0.0      0.0      0.0     0.0   
50    15.5     0.0  0.857  ...           0      0.0      0.0      0.0     0.0   

   starters_35  eFG%_35  TOV%_3

In [23]:
from tqdm import tqdm
import random
from collections import defaultdict

print(len(X),X.iloc[0][['starters_1','starters_15']])
player_value_dict = defaultdict(list)
for i  in tqdm(range(len(X))): 
    data = X.iloc[i].copy()
    for num in range(18):
        if data['starters_{}'.format(num)]==0:
            break
    #对每场比赛一个队员求Shapley
    for j in range(num):
        #随机采样取出包含j的五个球员
        v = 0
        epoch = 20
        for _ in range(epoch):
            sampled_numbers = random.sample(list(range(num)), 5)
            if j in sampled_numbers:
                1
            else:
                sampled_numbers[random.randint(0, 4)] = j  # 用j替换随机选中的一个数
         
            used_features = ['eFG%', 'TOV%', 'ORB%', 'FT%']
            final_features = []
            for feat in used_features:
                off_columns = [feat+'_'+str(i) for i in sampled_numbers]
                data[feat+'_off'] = data[off_columns].sum(axis=0)
                def_columns = [feat+'_'+str(i) for i in range(18,36)]
                data[feat+'_def'] = data[def_columns].mean(axis=0)
                final_features.append(feat+'_off')
                final_features.append(feat+'_def')
            #计算当前阵容胜率效用值
            v += model.predict_proba([list(data[final_features])])[0,0]
        player_value_dict[data['starters_{}'.format(j)]].append(v/epoch)


2460 starters_1     Xavier Tillman Sr.
starters_15                     0
Name: 46, dtype: object


100%|██████████| 2460/2460 [22:10<00:00,  1.85it/s]


In [24]:
import pickle
print(player_value_dict)
# 计算每个键对应值列表的平均值
player_value = {key: sum(values) / len(values) for key, values in player_value_dict.items()}
print(len(player_value),player_value)
sorted_value = dict(sorted(player_value.items(), key=lambda item: item[1], reverse=True))
print(sorted_value)
# 将字典保存到文件
with open('shap/{}_baseline_mvp.pkl'.format(year), 'wb') as f:
    pickle.dump(sorted_value, f)
    


defaultdict(<class 'list'>, {'John Konchar': [0.987621065104717, 0.934100231778569, 0.7631965824613862, 0.9999925173034512, 0.9819129002183746, 0.8429545618977341, 0.9675633497821992, 0.9553973165894775, 0.9999673481730914, 0.9265579450684196, 0.8574794354183208, 0.6519665269603723, 0.7119052675171458, 0.9523569851196323, 0.7518309386973614, 0.9405483027295223, 0.7469056374066054, 0.9999999999999701, 0.8737573823669699, 0.9488743655307509, 0.713014601158531, 0.9999957926644829, 0.9169683985952949, 0.9999919732642679, 0.6622889398755272, 0.9763368390089159, 0.8147669164009199, 0.6018868070865278, 0.9270570469319385, 0.9967599940144691, 0.8174861740150184, 0.7225387640206392, 0.8319255446946228, 0.72352444608737, 0.6895490225090091, 0.444746195897291, 0.6224545466377474, 0.5916064459983683, 0.832832708643784, 0.9999994009332749, 0.6008861780436667, 0.8200466281188202, 0.9527013529440772, 0.9285065193226162, 0.9642625122288692, 0.852632928826314, 0.8975346119210759, 0.7042529808183705, 0.

# 评估baseline

In [75]:
import numpy as np
from sklearn.metrics import ndcg_score, average_precision_score
import pandas as pd
from scipy.stats import spearmanr
 
year = 2024
#2024
if year == 2024:
    mvp_voting = ['Nikola Jokić', 'Shai Gilgeous-Alexander', 'Luka Dončić','Giannis Antetokounmpo',
                  'Jalen Brunson','Jayson Tatum', 'Anthony Edwards', 'Domantas Sabonis', 'Kevin Durant']
#2023
if year == 2023:
    mvp_voting = ['Joel Embiid', 'Nikola Jokić', 'Giannis Antetokounmpo', 'Jayson Tatum', 
                  'Shai Gilgeous-Alexander', 'Donovan Mitchell', 'Domantas Sabonis', 'Luka Dončić'
                  , 'Stephen Curry', 'Jimmy Butler', "De'Aaron Fox", 'Jalen Brunson', 'Ja Morant']
#2022
if year == 2022:
    mvp_voting = ['Nikola Jokić', 'Joel Embiid', 'Giannis Antetokounmpo', 'Devin Booker',
                  'Luka Dončić', 'Jayson Tatum', 'Ja Morant', 'Stephen Curry', 'Chris Paul',
                  'DeMar DeRozan', 'Kevin Durant', 'LeBron James']
topk = 3
# topk = len(mvp_voting)

In [76]:
# 从文件中加载字典
with open('shap/{}_baseline_mvp.pkl'.format(year), 'rb') as f:
    sorted_value = pickle.load(f)
    
result_df =  pd.DataFrame(columns=['starters', 'score'])
#最低场次限制
# {24:130, 23:110, 22:109, 21:87, 20:113, 19:109, 18:100, 17:122
# 16:130, 15:127, 14:123, 13:130, 12:77, 11:130, 10:130, 09:130
# 08:130, 07:79, 06:130}
data_value = result_df
result_df = \
data_value[data_value['starters'].
            map(data_value['starters'].value_counts()) > 130]
for key, value in sorted_value.items():
    new_row = {'starters': key, 'score': value}
    result_df = pd.concat([result_df, pd.DataFrame([new_row])], ignore_index=True)
    # print(key, ":", value)

#球员shapley值平均 'starters', 'score'
mvp_by_shap = result_df.groupby('starters')['score'].mean().sort_values(ascending=False)
print(mvp_by_shap[:10])

# print('mvp:',mvp) # Nikola Jokić 尼古拉·约基奇:1 ,Joel Embiid 乔尔·恩比德:3
mvp_rank = mvp_by_shap.rank(ascending=False)
print(mvp_rank[:10])

rank_mean_err = 0
predicted_labels = []
for i in range(topk):
    predicted_labels.append(mvp_rank[mvp_voting[i]]-1)
    rank_mean_err += abs(mvp_rank[mvp_voting[i]]-i-1)
print('rank_mean_err:', rank_mean_err/topk)
actual_labels = list(range(1, 1+topk))
print(actual_labels)
print(predicted_labels)
# 计算Spearman相关系数
correlation, p_value = spearmanr(actual_labels, predicted_labels)
print("Spearman相关系数:", correlation)
# print("p值:", p_value)
#前13名计算召回率=精度：
count = sum(1 for i in predicted_labels if i < topk)
print("Precision or recall:", count/topk)

starters
Jaylen Martin        0.995066
Wenyen Gabriel       0.989646
Robert Williams      0.980531
Scoot Henderson      0.976247
Scotty Pippen Jr.    0.974644
Keyonte George       0.973621
Killian Hayes        0.968369
Kelly Olynyk         0.966635
Darius Bazley        0.965764
Zach Collins         0.965036
Name: score, dtype: float64
starters
Jaylen Martin         1.0
Wenyen Gabriel        2.0
Robert Williams       3.0
Scoot Henderson       4.0
Scotty Pippen Jr.     5.0
Keyonte George        6.0
Killian Hayes         7.0
Kelly Olynyk          8.0
Darius Bazley         9.0
Zach Collins         10.0
Name: score, dtype: float64
rank_mean_err: 297.0
[1, 2, 3]
[250.0, 409.0, 235.0]
Spearman相关系数: -0.5
Precision or recall: 0.0


  result_df = pd.concat([result_df, pd.DataFrame([new_row])], ignore_index=True)


# 基于规则方案的baseline

In [63]:
#API：各高阶特征效率值加权平均
#Sports analytics — Evaluation ofbasketball players and team performance
import pickle
import numpy as np
import pandas as pd

#加载球员数据
def data_load(data_path, nrows = None):
    data = pd.read_csv(data_path, nrows = nrows, index_col=0)
    print(data.info())
    print(data.columns)
    return data

def data_prerpocess(df, training = True):
    used_features = ['starters','TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%',
       'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg', 'BPM']
    total_over_features = []
    for i in range(36):
        for feature in used_features:
            feature = feature + '_' + str(i)
            total_over_features.append(feature)
    float_features = ['TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%',
       'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg', 'BPM']
    total_float_features = []
    for i in range(36):
        for feature in float_features:
            feature = feature + '_' + str(i)        
            total_float_features.append(feature)
    # 去除df中无法转化为float的数据
    rows_with_special_character = df[df['BPM_0'].astype(str).str.contains(':')]
    # print(df['BPM_0'].astype(str).str.contains(':'))
    print('异常数据：',len(rows_with_special_character))
    # X = df[df[total_over_features].apply(lambda x: pd.to_numeric(x, errors='coerce')).notnull()]
    X = df[~df[total_over_features].apply(lambda x: x.astype(str).str.contains(':')).any(axis=1)]
    X[total_float_features] = X[total_float_features].astype(float)
    X = X[total_over_features]
    # 假设用0填充缺失值
    X.fillna(0, inplace=True)
   
    print('处理后数据：',X[:5])
    Y = df[~df[total_over_features+['result']].apply(lambda x: x.astype(str).str.contains(':')).any(axis=1)]['result'].astype(int)
    
    return X, Y, total_over_features

data_path = './data/nba_mvp.csv'
features = ['starters', 'team', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT',
       'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', '+/-', 'TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%',
       'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg', 'BPM',
       'file_name']
# 连接所有球员数据
new_features = []
for i in range(36):
    for feat in features:
        new_feat = feat + '_' + str(i)
        new_features.append(new_feat)

result_features = new_features + ['score_ourside', 'score_opposite', 'result']

year = 2024
data = data_load(data_path)
if year == 2024:
        data = data[data.apply(lambda x: x.astype(str).str.contains('{}_10|{}_11|{}_12|{}_01|{}_02|{}_03|{}_04_0|2024_04_10|2024_04_11|2024_04_12|2024_04_13|2024_04_14'
                    .format(year-1,year-1,year-1,year,year,year,year), regex=True)).any(axis=1)]
if year == 2023:
    data = data[data.apply(lambda x: x.astype(str).str.contains('{}_10|{}_11|{}_12|{}_01|{}_02|{}_03|{}_04_0'
                    .format(year-1,year-1,year-1,year,year,year,year), regex=True)).any(axis=1)]
if year == 2022:
    data = data[data.apply(lambda x: x.astype(str).str.contains('{}_10|{}_11|{}_12|{}_01|{}_02|{}_03|{}_04_0|2022_04_10'
                    .format(year-1,year-1,year-1,year,year,year,year), regex=True)).any(axis=1)]    
# data = data[data.apply(lambda x: x.astype(str).str.contains('{}_10|{}_11|{}_12|{}_01|{}_02|{}_03|{}_04|{}_05|{}_06|{}_07|{}_08|{}_09'
#                 .format(year-1,year-1,year-1,year,year,year,year,year,year,year,year,year), regex=True)).any(axis=1)]
# data = data[~data[result_features].apply(lambda x: x.astype(str).str.contains(':')).any(axis=1)]
print('{}比赛数据：'.format(year),len(data))
X, Y, total_over_features = data_prerpocess(data)
print(X.shape)
print(total_over_features)


  data = pd.read_csv(data_path, nrows = nrows, index_col=0)


<class 'pandas.core.frame.DataFrame'>
Index: 63162 entries, 0 to 63161
Columns: 1371 entries, starters_0 to result
dtypes: float64(1200), int64(1), object(170)
memory usage: 661.2+ MB
None
Index(['starters_0', 'team_0', 'MP_0', 'FG_0', 'FGA_0', 'FG%_0', '3P_0',
       '3PA_0', '3P%_0', 'FT_0',
       ...
       'BLK%_35', 'TOV%_35', 'USG%_35', 'ORtg_35', 'DRtg_35', 'BPM_35',
       'file_name_35', 'score_ourside', 'score_opposite', 'result'],
      dtype='object', length=1371)
2024比赛数据： 2462
异常数据： 0
处理后数据：           starters_0  TS%_0  eFG%_0  3PAr_0  FTr_0  ORB%_0  DRB%_0  TRB%_0  \
0        Isaac Okoro  0.724   0.692   0.538  0.385     5.2     2.7     4.0   
1         Tyus Jones  0.458   0.458   0.500  0.000     3.2     4.1     3.6   
2  De'Anthony Melton  0.793   0.769   0.538  0.154     2.8    11.6     7.2   
3     Paolo Banchero  0.395   0.310   0.381  0.333     0.0    24.8    12.5   
4    Khris Middleton  0.715   0.694   0.389  0.111     4.0     6.1     5.2   

   AST%_0  STL%_0  

In [64]:
from tqdm import tqdm
import random
from collections import defaultdict

used_features = ['starters','TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%',
        'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg', 'BPM']

adv_info = []
for i  in tqdm(range(len(X))): 
    data = X.iloc[i].copy()
    #重构df为原数据
    for j in range(36):
        columns = [feat+'_'+str(j) for feat in used_features]
        adv_info.append(data[columns].values)

player_adv_info =  pd.DataFrame(adv_info, columns=used_features) 
print(player_adv_info[:5]) 
            
    

100%|██████████| 2462/2462 [00:17<00:00, 140.02it/s]


           starters    TS%   eFG%   3PAr    FTr  ORB%  DRB%  TRB%  AST%  STL%  \
0       Isaac Okoro  0.724  0.692  0.538  0.385   5.2   2.7   4.0   4.0   0.0   
1     Jarrett Allen  0.611  0.529  0.000  0.353  25.4  41.2  33.1  29.2   2.8   
2         Max Strus  0.400  0.400  0.800  0.000   3.0  12.4   7.6  24.0   1.5   
3         Dean Wade  0.300  0.300  1.000  0.000   3.1  22.8  12.8   0.0   0.0   
4  Craig Porter Jr.  0.379  0.364  0.091  0.182  13.5  28.2  20.7  33.9   4.9   

   BLK%  TOV%  USG%   ORtg   DRtg  BPM  
0   2.4  11.6  18.1  132.0  117.0  0.8  
1   0.0  16.9  27.1  128.0  105.0  8.2  
2   2.8  16.7  14.5  101.0  111.0 -3.7  
3   0.0   0.0   6.3   85.0  115.0 -9.6  
4   3.2  14.4  19.1  108.0  100.0  2.4  


In [65]:
#去除空值
player_adv_info = player_adv_info[player_adv_info['starters']!=0]
# print(len(player_adv_info)) 
# 对每列进行归一化
feat_normalized = ['TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%',
        'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg', 'BPM']
player_adv_info['score'] = player_adv_info[feat_normalized].apply(lambda x: (x - x.min()) / (x.max() - x.min())).mean(axis=1)
print(player_adv_info[:5]) 


           starters    TS%   eFG%   3PAr    FTr  ORB%  DRB%  TRB%  AST%  STL%  \
0       Isaac Okoro  0.724  0.692  0.538  0.385   5.2   2.7   4.0   4.0   0.0   
1     Jarrett Allen  0.611  0.529  0.000  0.353  25.4  41.2  33.1  29.2   2.8   
2         Max Strus  0.400  0.400  0.800  0.000   3.0  12.4   7.6  24.0   1.5   
3         Dean Wade  0.300  0.300  1.000  0.000   3.1  22.8  12.8   0.0   0.0   
4  Craig Porter Jr.  0.379  0.364  0.091  0.182  13.5  28.2  20.7  33.9   4.9   

   BLK%  TOV%  USG%   ORtg   DRtg  BPM     score  
0   2.4  11.6  18.1  132.0  117.0  0.8  0.319202  
1   0.0  16.9  27.1  128.0  105.0  8.2  0.338885  
2   2.8  16.7  14.5  101.0  111.0 -3.7  0.309336  
3   0.0   0.0   6.3   85.0  115.0 -9.6  0.299925  
4   3.2  14.4  19.1  108.0  100.0  2.4  0.292730  


# 评估baseline2

In [71]:
import numpy as np
from sklearn.metrics import ndcg_score, average_precision_score
import pandas as pd
from scipy.stats import spearmanr
 
# year = 2024
#2024
if year == 2024:
    mvp_voting = ['Nikola Jokić', 'Shai Gilgeous-Alexander', 'Luka Dončić','Giannis Antetokounmpo',
                  'Jalen Brunson','Jayson Tatum', 'Anthony Edwards', 'Domantas Sabonis', 'Kevin Durant']
#2023
if year == 2023:
    mvp_voting = ['Joel Embiid', 'Nikola Jokić', 'Giannis Antetokounmpo', 'Jayson Tatum', 
                  'Shai Gilgeous-Alexander', 'Donovan Mitchell', 'Domantas Sabonis', 'Luka Dončić'
                  , 'Stephen Curry', 'Jimmy Butler', "De'Aaron Fox", 'Jalen Brunson', 'Ja Morant']
#2022
if year == 2022:
    mvp_voting = ['Nikola Jokić', 'Joel Embiid', 'Giannis Antetokounmpo', 'Devin Booker',
                  'Luka Dončić', 'Jayson Tatum', 'Ja Morant', 'Stephen Curry', 'Chris Paul',
                  'DeMar DeRozan', 'Kevin Durant', 'LeBron James']
topk = 3
# topk = len(mvp_voting)

In [72]:
#最低场次限制
# {24:130, 23:110, 22:109, 21:87, 20:113, 19:109, 18:100, 17:122
# 16:130, 15:127, 14:123, 13:130, 12:77, 11:130, 10:130, 09:130
# 08:130, 07:79, 06:130}
data_value = player_adv_info
player_adv_info = \
data_value[data_value['starters'].
           map(data_value['starters'].value_counts()) > 130]
mvp_by_shap = player_adv_info.groupby('starters')['score'].mean().sort_values(ascending=False)
print(year,mvp_by_shap[:10])
mvp_rank = mvp_by_shap.rank(ascending=False)
rank_mean_err = 0
predicted_labels = []
for i in range(topk):
    predicted_labels.append(mvp_rank[mvp_voting[i]]-1)
    rank_mean_err += abs(mvp_rank[mvp_voting[i]]-i-1)
print('rank_mean_err:', rank_mean_err/topk)
actual_labels = list(range(1, 1+topk))
print(actual_labels)
print(predicted_labels)
# 计算Spearman相关系数
correlation, p_value = spearmanr(actual_labels, predicted_labels)
print("Spearman相关系数:", correlation)
# print("p值:", p_value)
#前13名计算召回率=精度：.
count = sum(1 for i in predicted_labels if i < topk)
print("Precision or recall:", count/topk)

2024 starters
Luka Dončić          0.335919
Sam Hauser           0.332701
Victor Wembanyama    0.331539
Nikola Jokić         0.329482
Al Horford           0.326633
Andre Drummond       0.326632
Stephen Curry        0.325977
James Harden         0.325794
Chet Holmgren        0.325534
Domantas Sabonis     0.325366
Name: score, dtype: float64
rank_mean_err: 22.333333333333332
[1, 2, 3]
[3.0, 63.0, 0.0]
Spearman相关系数: -0.5
Precision or recall: 0.3333333333333333
