In [None]:
import pandas as pd
import time
import json
import os
from tqdm import tqdm
from nba_api.stats.endpoints import commonplayerinfo

# --- Step 1: Load the data created in the previous step ---
df_use = pd.read_pickle("df_use.pkl")
# Distinct player ids found in the loaded data; these drive the lookups below.
player_ids = df_use['PLAYER_ID'].unique()

# --- Step 2: Read the cache (start from an empty dict when the file is absent) ---
CACHE_PATH = "player_info_cache.json"

player_info_dict = {}
if os.path.exists(CACHE_PATH):
    with open(CACHE_PATH, "r") as cache_file:
        player_info_dict = json.load(cache_file)

# --- Step 3: Helper that fetches a player's height and position from the API ---
def get_player_height_and_position(player_id):
    """Fetch a player's height and position through ``CommonPlayerInfo``.

    Args:
        player_id: Identifier forwarded to the endpoint as ``player_id``.

    Returns:
        tuple: ``(height, position)`` read from the first row of the first
        data frame the endpoint returns, or ``(None, None)`` when the request
        or the lookup fails.
    """
    try:
        info = commonplayerinfo.CommonPlayerInfo(player_id=player_id).get_data_frames()[0]
        height = info['HEIGHT'].values[0]
        position = info['POSITION'].values[0]
    except Exception:
        # Best-effort lookup: any ordinary failure is reported as "unknown".
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # still stop the run instead of being recorded as a failed lookup.
        return None, None
    return height, position

# --- Step 4: Fetch missing players from the API, caching after every player ---
for pid in tqdm(player_ids):
    pid_str = str(pid)  # the cache is stored as JSON, whose object keys are strings
    cached = player_info_dict.get(pid_str)
    # Skip players that already have data. An entry whose values are all None
    # records a failed lookup, so it is retried instead of being kept forever.
    if cached and any(value is not None for value in cached.values()):
        continue
    height, pos = get_player_height_and_position(pid)
    player_info_dict[pid_str] = {'HEIGHT': height, 'POSITION': pos}
    # Write to a temporary file and swap it in, so an interruption in the
    # middle of the write cannot leave a truncated cache file behind.
    tmp_path = CACHE_PATH + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(player_info_dict, f)
    os.replace(tmp_path, CACHE_PATH)
    time.sleep(0.6)  # throttle requests to stay under the API rate limit

# --- Step 5: Turn the cache into a DataFrame and merge it onto the data ---
# One row per cached player; the dict keys become the index, which is then
# moved into a regular column named PLAYER_ID.
info_df = pd.DataFrame.from_dict(player_info_dict, orient='index')
info_df = info_df.reset_index()
info_df = info_df.rename(columns={'index': 'PLAYER_ID'})
# Cache keys are strings; cast back to int before joining on PLAYER_ID.
info_df['PLAYER_ID'] = info_df['PLAYER_ID'].astype(int)

# Left join keeps every row of df_use, including players without cached info.
df_merged = df_use.merge(info_df, on='PLAYER_ID', how='left')

# --- Step 6: Formatting (height converted to cm, position simplified) ---
def height_to_cm(height_str):
    """Convert a ``"feet-inches"`` height string to centimetres.

    Args:
        height_str: Height written as two integers joined by ``-``,
            e.g. ``"6-6"``.

    Returns:
        float | None: The height in centimetres rounded to one decimal place,
        or ``None`` when the value is not a string in that format (for
        example ``None``, NaN or an empty string).
    """
    try:
        feet, inches = map(int, height_str.split('-'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: the value is not a str (None, NaN, ...).
        # ValueError: non-numeric parts, or not exactly two parts.
        return None
    # 12 inches per foot, 2.54 cm per inch.
    return round((feet * 12 + inches) * 2.54, 1)

def simplify_position(pos_str):
    """Collapse a position description to ``'G'``, ``'F'`` or ``'C'``.

    The keywords are tested in the order Guard, Forward, Center and the first
    one found wins.
    NOTE(review): combined labels therefore resolve by this priority rather
    than by which position is listed first (``'Center-Forward'`` gives
    ``'F'``) -- confirm this is the intended rule.

    Args:
        pos_str: Position text such as ``'Guard'`` or ``'Forward-Center'``.

    Returns:
        str | None: The one-letter code, or ``None`` when no keyword is
        present or the value does not support ``in`` (e.g. ``None`` or NaN).
    """
    try:
        for keyword, code in (('Guard', 'G'), ('Forward', 'F'), ('Center', 'C')):
            if keyword in pos_str:
                return code
    except TypeError:
        # Raised by ``in`` for values such as None or float NaN.
        return None
    return None

# Derive the numeric height and the one-letter position for every row.
df_merged['HEIGHT_CM'] = df_merged['HEIGHT'].apply(height_to_cm)
df_merged['POSITION_SIMPLE'] = df_merged['POSITION'].apply(simplify_position)

# --- Step 7: Activity flags (Lv1, Lv2, and both together) ---
# Lv1: GP of at least 41; Lv2: MIN of at least 20.
# NOTE(review): 41 is presumably half of an 82-game season, and MIN is
# presumably minutes per game -- confirm both against the upstream data.
df_merged['Lv1'] = df_merged['GP'].ge(41)
df_merged['Lv2'] = df_merged['MIN'].ge(20)
df_merged['Lv1_and_Lv2'] = df_merged['Lv1'] & df_merged['Lv2']

# --- Step 8: Save ---
df_merged.to_pickle("df_merged.pkl")

100%|██████████| 2652/2652 [08:47<00:00,  5.02it/s]  


In [2]:
# Also export the merged table as CSV, keeping the row index as the first column.
df_merged.to_csv(path_or_buf="df_merged.csv", index=True)

→ nba_api のポジション情報では PG と SG を区別できないため、分類を以下のとおり G / F / C の3区分に変更する。

### リーグで活躍する選手の身長は、昔から今にかけて、Gは高くなり、Cは小さくなり、Fはあまり変わらない

In [1]:
import pandas as pd

# Load the merged table saved by the earlier cell.
df = pd.read_pickle(filepath_or_buffer="df_merged.pkl")

# Normalize every position to one of G / F / C
def to_gfc(pos_str):
    """Map a position description to ``'G'``, ``'F'`` or ``'C'``.

    The keywords are tested in the order Guard, Forward, Center and the first
    one found wins.
    NOTE(review): combined labels therefore resolve by this priority rather
    than by which position is listed first (``'Center-Forward'`` gives
    ``'F'``) -- confirm this is the intended rule.

    Args:
        pos_str: Position text such as ``'Guard'`` or ``'Forward-Center'``.

    Returns:
        str | None: The one-letter code, or ``None`` when no keyword is
        present or the value does not support ``in`` (e.g. ``None`` or NaN).
    """
    try:
        for keyword, code in (('Guard', 'G'), ('Forward', 'F'), ('Center', 'C')):
            if keyword in pos_str:
                return code
    except TypeError:
        # Raised by ``in`` for values such as None or float NaN.
        return None
    return None

# Reclassify every row; the result is stored in the POSITION_GFC column.
df['POSITION_GFC'] = df['POSITION'].apply(to_gfc)

# Keep only rows where the height is known and the position could be
# classified as G/F/C.
df_valid = df.dropna(subset=['HEIGHT_CM', 'POSITION_GFC']).copy()

# Activity filters (same definitions as before, recomputed on the subset).
df_valid['Lv1'] = df_valid['GP'].ge(41)
df_valid['Lv2'] = df_valid['MIN'].ge(20)
df_valid['Lv1_and_Lv2'] = df_valid['Lv1'] & df_valid['Lv2']

# Save both formats (to be used in the upcoming Step 3).
df_valid.to_pickle("df_valid_gfc.pkl")
df_valid.to_csv("df_valid_gfc.csv")