<a href="https://colab.research.google.com/github/ryo42164/kaggle/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
!pip install catboost



In [47]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,KNNImputer,IterativeImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,cross_val_score,RepeatedStratifiedKFold
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA

In [48]:
sub = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/data/sample_submission.csv")
train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/data/train.csv")
test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/data/test.csv")

In [49]:
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")

In [50]:
test_or_sub = 'sub'

In [51]:
X = train.drop(['Id','Drafted'],axis=1)
y = train['Drafted']
X_test = test.drop('Id',axis=1)

In [52]:
high_missing_num = list(train.isnull().sum()[train.isnull().sum() > 500].index)

group_cols = ['Year','Position_Type']
target_cols = ['Sprint_40yd', 'Vertical_Jump', 'Bench_Press_Reps','Broad_Jump', 'Agility_3cone', 'Shuttle', 'Height', 'Weight']
new_cols = ['SpeedScore','ExplosivePower','AgilityCombo','StrengthIndex','PowerToSize']
#CV = RepeatedStratifiedKFold(n_splits=5,n_repeats=2,random_state=1)
group = None
#group =train['Year']
CV = StratifiedKFold(n_splits=5,shuffle=True,random_state=2)
global_mean = y.mean()

In [53]:
def target_encode_smooth(X,y,X_test,col,global_mean=global_mean,cv=CV,k=10,group=group):
  oof_te = np.zeros(len(X))
  if len(col) == 1:
    single= True
    col = col[0]
  else:
    single = False

  for trn_idx,val_idx in cv.split(X,y,groups=group):
    X_trn,y_trn = X.iloc[trn_idx],y.iloc[trn_idx]
    df_trn = X_trn.copy()
    df_trn['Drafted'] = y_trn

    stats = df_trn.groupby(col)['Drafted'].agg(['mean','count'])
    alpha = stats['count']/(stats['count']+k)
    smoothed = alpha * stats['mean'] + (1 - alpha) *global_mean

    if single:
      oof_te[val_idx] = X.iloc[val_idx][col].map(smoothed)
    else:
      idx = list(zip(*(X.iloc[val_idx][c]for c in col)))
      oof_te[val_idx] = pd.Series(idx).map(smoothed)
  oof_te = np.where(np.isnan(oof_te),global_mean,oof_te)

  df_full = X.copy()
  df_full['Drafted'] = y
  stats_full = df_full.groupby(col)['Drafted'].agg(['mean','count'])
  alpha_full = stats_full['count']/(stats_full['count']+k)
  smoothed_full = alpha_full * stats_full['mean'] + (1-alpha_full)*global_mean
  if single:
    X[col+'_te_smoothed'] = oof_te
    X_test[col+'_te_smoothed'] = X_test[col].map(smoothed_full).fillna(global_mean)
  else:
    key = '_'.join(col)
    X[key + '_te_smoothed'] = oof_te

    idx_test = list(zip(*(X_test[c] for c in col)))
    X_test[key+'_te_smoothed'] = pd.Series(idx_test).map(smoothed_full).fillna(global_mean)
  return X,X_test

In [54]:


FEATURE_FLAGS = {
    'School_counts': False,
    'Missing_columns': True,
    'Age_missing':True,
    'School_te':False,
    'Position_te':False,
    'Position_smoothed_te':False,
    'Position_Type_smoothed_te':False,
    'PosStats':True,
    'BMI':True,
    'SpeedScore': True,
    'ExplosivePower': False,
    'AgilityCombo': True,
    'StrengthIndex': False,
    'PowerToSize': True,
    'SchoolPos_te':True,
    'Year_smoothed_te':False,
    'Year_counts':True,
    'Year_PositionType_te':False,
    'missing_rate_by_year':True,
    'Year_PositionType_count':True,
    'missing_cluster':True,
    'missing_PT':True,
    'School_embedding':True,
    'School_cluster':True,
    'School_stats':True
}
def build_features(X, X_test, FEATURE_FLAGS = FEATURE_FLAGS):
  if FEATURE_FLAGS['School_counts']:
    all_school_counts = pd.concat([train['School'],test['School']]).value_counts()
    oof = np.zeros(len(X))
    for tr_idx, val_idx in CV.split(X, y,groups=group):
      X_trn, y_trn = X.iloc[tr_idx], y[tr_idx]
      school_counts = X_trn['School'].value_counts()
      oof[val_idx] = X.iloc[val_idx]['School'].map(school_counts).fillna(0)
    X['School_count'] = oof
    X_test['School_count'] = test['School'].map(all_school_counts).fillna(0)

  if FEATURE_FLAGS['Missing_columns']:
    for col in high_missing_num+['Sprint_40yd']:
      if not col == 'Age':
        X[col+'_missing'] = X[col].isnull().astype(int)
        X_test[col+'_missing'] = X_test[col].isnull().astype(int)

  if FEATURE_FLAGS['Age_missing']:
    X['Age_missing'] = X['Age'].isnull().astype(int)
    X_test['Age_missing'] = X_test['Age'].isnull().astype(int)

  if FEATURE_FLAGS['School_te']:
    X,X_test = target_encode_smooth(X,y,X_test,['School'])

  if FEATURE_FLAGS['Position_te']:
    oof_te = np.zeros(len(X))

    for trn_idx,val_idx in CV.split(X,y):
      X_trn,y_trn = X.iloc[trn_idx],y.iloc[trn_idx]
      df_trn = X_trn.copy()
      df_trn['Drafted'] = y_trn

      te_map = df_trn.groupby('Position')['Drafted'].mean()
      oof_te[val_idx] = X.iloc[val_idx]['Position'].map(te_map)
    oof_te  = np.where(np.isnan(oof_te),global_mean,oof_te)
    df_full = X.copy()
    df_full['Drafted'] = y
    map_te_all = df_full.groupby('Position')['Drafted'].mean()
    X['Position_te'] = oof_te
    X_test['Position_te'] = X_test['Position'].map(map_te_all).fillna(global_mean)

  if FEATURE_FLAGS['Position_smoothed_te']:
    X,X_test = target_encode_smooth(X,y,X_test,['Position'])

  if FEATURE_FLAGS['Position_Type_smoothed_te']:
    X,X_test = target_encode_smooth(X,y,X_test,['Position_Type'])

  if FEATURE_FLAGS['BMI']:
    X['BMI'] = X['Weight'] / (X['Height'] ** 2)
    X_test['BMI'] = X_test['Weight'] /(X_test['Height']**2)

  # SpeedScore: 体重考慮の40ydスコア（有名な指標）
  if FEATURE_FLAGS.get('SpeedScore'):
      X['SpeedScore'] = (X['Weight'] * 200) / (X['Sprint_40yd'] ** 4)
      X_test['SpeedScore'] = (X_test['Weight'] * 200) / (X_test['Sprint_40yd'] ** 4)

  # ExplosivePower: 跳躍系を掛け合わせる
  if FEATURE_FLAGS.get('ExplosivePower'):
      X['ExplosivePower'] = X['Vertical_Jump'] * X['Broad_Jump']
      X_test['ExplosivePower'] = X_test['Vertical_Jump'] * X_test['Broad_Jump']

  # AgilityCombo: シャトルと3coneの平均
  if FEATURE_FLAGS.get('AgilityCombo'):
      X['AgilityCombo'] = (X['Agility_3cone'] + X['Shuttle']) / 2
      X_test['AgilityCombo'] = (X_test['Agility_3cone'] + X_test['Shuttle']) / 2

  # StrengthIndex: ベンチ / 体重
  if FEATURE_FLAGS.get('StrengthIndex'):
      X['StrengthIndex'] = X['Bench_Press_Reps'] / X['Weight']
      X_test['StrengthIndex'] = X_test['Bench_Press_Reps'] / X_test['Weight']

  # PowerToSize: 跳躍の爆発力 ÷ 身長
  if FEATURE_FLAGS.get('PowerToSize'):
      X['PowerToSize'] = (X['Vertical_Jump'] * X['Broad_Jump']) / X['Height']
      X_test['PowerToSize'] = (X_test['Vertical_Jump'] * X_test['Broad_Jump']) / X_test['Height']

  if FEATURE_FLAGS['PosStats']:
    posstats_base_cols = []
    for c in target_cols:
      if c in X.columns:
        posstats_base_cols.append(c)

    for c in new_cols:
      if c in X.columns:
        posstats_base_cols.append(c)

    for col in posstats_base_cols:
      grp = X.groupby(group_cols)[col]
      mean = grp.transform('mean')
      std = grp.transform('std')

      X[f'{col}_z_pos'] = (X[col]-mean)/std
      X[f'{col}_rank_pos'] = grp.rank(pct=True)

      stats = X.groupby(group_cols)[col].agg(['mean','std'])
      merged = X_test.merge(stats,how='left',left_on = group_cols,right_index=True)

      X_test[f'{col}_z_pos'] = (X_test[col]-merged['mean'])/merged['std']

      mean_rank = X[f'{col}_rank_pos'].mean()
      grp_rank = X.groupby(group_cols)[f'{col}_rank_pos'].mean()

      X_test = X_test.merge(grp_rank.rename(f'{col}_rank_pos'),how='left',left_on=group_cols,right_index=True)
      X_test[f'{col}_rank_pos'] = X_test[f'{col}_rank_pos'].fillna(mean_rank)


  if FEATURE_FLAGS.get('SchoolPos_te'):
    X,X_test = target_encode_smooth(X,y,X_test,['School','Position'])

  if FEATURE_FLAGS['Year_smoothed_te']:
    X,X_test = target_encode_smooth(X,y,X_test,['Year'],cv=RepeatedStratifiedKFold(n_splits=5,n_repeats=2,random_state=1),group=None)

  if FEATURE_FLAGS.get('Year_counts'):
    year_counts = X['Year'].value_counts()
    X['Year_count'] = X['Year'].map(year_counts)
    all_year_counts = pd.concat([train['Year'], test['Year']]).value_counts()
    X_test['Year_count'] = X_test['Year'].map(all_year_counts).fillna(0)

  if FEATURE_FLAGS.get('Year_te'):
      X, X_test = target_encode_smooth(X, y, X_test, ['Year'])

  if FEATURE_FLAGS.get('Year_PositionType_te'):
      X, X_test = target_encode_smooth(X, y, X_test, ['Year','Position_Type'])

  if FEATURE_FLAGS.get('missing_rate_by_year'):
    for col in high_missing_num:
      miss_rate_by_year = train.groupby('Year')[col].apply(lambda s: s.isnull().mean())
      X[col+'_missing_rate_year'] = X['Year'].map(miss_rate_by_year)
      X_test[col+'_missing_rate_year'] = X_test['Year'].map(miss_rate_by_year)

  if FEATURE_FLAGS.get('Year_PositionType_count'):
      grp = X.groupby(['Year','Position_Type']).size()
      X['Year_PositionType_count'] = list(
          zip(X['Year'], X['Position_Type'])
      )
      X['Year_PositionType_count'] = X['Year_PositionType_count'].map(grp)

      all_grp = pd.concat([
          train[['Year','Position_Type']],
          test[['Year','Position_Type']]
      ]).groupby(['Year','Position_Type']).size()

      X_test['Year_PositionType_count'] = list(
          zip(X_test['Year'], X_test['Position_Type'])
      )
      X_test['Year_PositionType_count'] = X_test['Year_PositionType_count'].map(all_grp).fillna(0)

  if FEATURE_FLAGS.get('missing_cluster'):
    df = pd.concat([train,test])
    missing_matrix = df[high_missing_num].isna().astype(int)
    kmeans = KMeans(n_clusters=5,random_state=1,n_init='auto')
    clusters = kmeans.fit_predict(missing_matrix)
    X['missing_cluster'] = clusters[~df['Drafted'].isnull()]
    X_test['missing_cluster'] = clusters[df['Drafted'].isnull()]
    X['missing_cluster'] = X['missing_cluster'].astype(str)
    X_test['missing_cluster'] = X_test['missing_cluster'].astype(str)

  if FEATURE_FLAGS.get('missing_PT'):
    for col in high_missing_num+['Sprint_40yd']:
      for p in train['Position_Type'].unique():
        X[f'{col}_missing_PT_{p}'] = ((X[f'{col}_missing']==1)& (X['Position_Type']== p)).astype(int)
        X_test[f'{col}_missing_PT_{p}'] = ((X_test[f'{col}_missing']==1)& (X_test['Position_Type']== p)).astype(int)

  if FEATURE_FLAGS.get('missing_rate_year'):
    for col in high_missing_num + ['Sprint_40yd']:
      year_missing_rate = X.groupby('Year')[col].apply(lambda x: x.isna().mean())

      X[f'{col}_missing_rate_year'] = X['Year'].map(year_missing_rate)
      X_test[f'{col}_missing_rate_year'] = X_test['Year'].map(year_missing_rate).fillna(0)
  X['Year'] = X['Year'].astype(str)
  X_test['Year'] = X_test['Year'].astype(str)
  return X,X_test

  if FEATURE_FLAGS('School_embedding'):
    embed_cols = []
    for c in target_cols:
      if c in X.columns:
        embed_cols.append(c)

    for c in new_cols:
      if c in X.columns:
        embed_cols.append(c)
    df_all = pd.concat([X[['School']+embed_cols],X_test[['School']+embed_cols]])
    school_profile = df_all.groupby('School')[embed_cols].mean()
    school_profile.fillna(school_profile.mean())

    scaler = StandardScaler()
    school_profile_scaled = scaler.fit_transform(school_profile.values)

    pca = PCA(n_components=3,random_state=1)
    school_embed = pca.fit_transform(school_profile_scaled)

    school_embed_df = pd.dataFrame(school_embed,
                                   index=school_profile.index,
                                   columns=["School_emb1","School_emb2","School_emb3"])
    X = X.merge(school_embed_df,how='left',left_on = 'School',right_index=True)
    X_test = X_test.merge(school_embed_df,how='left',left_on = 'School',right_index=True)

In [55]:
num_proc = Pipeline(steps = [
    ('imputer',IterativeImputer(random_state=1,max_iter=10,initial_strategy='median')),
    ('scaler',StandardScaler())
])

num_proc_high = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median'))
])

cat_proc = Pipeline(steps = [
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown = 'ignore',sparse_output=False,))
])

missing_flag_proc = 'passthrough'

num_candidates = list(X.select_dtypes(exclude=['object','category']).columns)
missing_flag_cols = [c for c in X.columns if 'missing' in c]
cat_features = [c for c in X.select_dtypes(include=['object','category']).columns if c != 'School']
pos_stats = [c for c in X.columns if c.endswith('_z_pos') or c.endswith('_rank_pos')]
te_cols = [c for c in X.columns if c.endswith('_te_smoothed')]
age_missing=['Age_missing']
exclude_set = set(missing_flag_cols) | set(high_missing_num) | set(pos_stats) | set(te_cols) | set(target_cols) | set(new_cols)
num_features = [c for c in num_candidates if c not in exclude_set]

def make_preprocessor(X,model_name):
  """
  model_name : 'lgbm','xgb','cat'
  """
  missing_flag_proc = 'passthrough'
  high_missing_num = list(train.isnull().sum()[train.isnull().sum() > 500].index)

  num_cols = X.select_dtypes(include=['int', 'float']).columns.tolist()
  cat_cols = X.select_dtypes(include=['object']).columns.tolist()

  missing_flag_cols = [c for c in X.columns if 'missing' in c]
  pos_stats = [c for c in X.columns if c.endswith('_z_pos') or c.endswith('_rank_pos')]
  te_cols = [c for c in X.columns if c.endswith('_te_smoothed')]

  num_main_cat = [c for c in num_cols if c not in missing_flag_cols + pos_stats + te_cols]
  num_main = [c for c in num_main_cat if c not in high_missing_num]

  if model_name == 'cat':
    num_proc_cat = Pipeline([
        ('imputer',SimpleImputer(strategy='median'))
    ])
    pre = ColumnTransformer(
        transformers=[
            ('num',num_proc_cat,num_main_cat),
            ('cat_raw', 'passthrough', cat_cols)
        ],
        remainder='drop'
    )
    return pre

  elif model_name == 'lgbm':
    pre = ColumnTransformer(
        transformers=[
            ('num', num_proc, num_main),
            ('num_high',num_proc_high,high_missing_num),
            ('missing_flag', missing_flag_proc, missing_flag_cols),
            ('cat', cat_proc, cat_cols),
            ('pos_stats', 'passthrough', pos_stats),
            ('te_cols', 'passthrough', te_cols)
        ],
        remainder='drop'
    )
    return pre
  elif model_name == 'xgb':
    pre = ColumnTransformer(
        transformers=[
            ('num', num_proc, num_main),
            ('num_high', num_proc_high, high_missing_num),
            ('missing_flag', missing_flag_proc,missing_flag_cols),
            ('cat', cat_proc, cat_cols),
            #('pos_stats', 'passthrough', pos_stats),
            ('te_cols', 'passthrough', te_cols)
        ],
        remainder='drop'
    )
  return pre
best_params = {'learning_rate': 0.11034679072511581, 'num_leaves': 64, 'max_depth': 12, 'min_child_samples': 69, 'feature_fraction': 0.7189028957238169, 'bagging_fraction': 0.7212431184593704, 'bagging_freq': 6, 'lambda_l1': 2.2629728606140436, 'lambda_l2': 3.208765021602661}

def make_model_pipeline(X,model,model_name,s=None):
  """
  model: LGBMClassifier,CatBoostClassifier, XGBClassifierのインスタンス
  model_name: 'lgbm','cat','xgb'
  """

  preprocessor = make_preprocessor(X,model_name)
  if model_name == 'cat':
    cat = CatBoostClassifier(
        depth=8, learning_rate=0.05, iterations=800,
        loss_function='Logloss', verbose=False
    )
    pre_tmp = make_preprocessor(X,'cat')
    pre_tmp.fit(X,y)

    feat_names = pre_tmp.get_feature_names_out()
    cat_feature_names = set(cat_features+['School'])

    cat_feature_indices = [
        i for i, name in enumerate(feat_names)
        if name.split("__")[-1] in cat_feature_names
    ]
    cat = CatBoostClassifier(depth=8,learning_rate=0.05,
                             iterations=800,random_state=s,
                             loss_function='Logloss', verbose=False,
                             cat_features=cat_feature_indices)
    pre = make_preprocessor(X,'cat')
    pipe = Pipeline(steps=[
        ('preprocessor',pre),
        ('model', cat)
    ])
    return pipe
  else:
    pipe = Pipeline(steps=[
        ('preprocessor',preprocessor),
        ('model',model)
    ])
  return pipe


In [56]:
lgbm = LGBMClassifier(**best_params,random_state=3,verbose=-1)
xgb = XGBClassifier(
    n_estimators=800,
    learning_rate=0.02,
    max_depth=12,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight = 5,
    random_state=3
)
cat = None

def make_oof(X,y,wl=0.6,wx=0.1,wc=0.3,s=1,cv=CV,group=group):
  oof = np.zeros(len(X))
  lgbm_oof = np.zeros(len(X))
  xgb_oof = np.zeros(len(X))
  cat_oof = np.zeros(len(X))


  scores = []
  lgbm_scores = []
  xgb_scores = []
  cat_scores = []
  for trn_idx, val_idx in cv.split(X, y,groups=group):
      X_tr, X_val = X.iloc[trn_idx], X.iloc[val_idx]
      y_tr, y_val = y[trn_idx], y[val_idx]

      # 各モデルのパイプライン作成
      pipe_lgbm = make_model_pipeline(X,lgbm,'lgbm')
      pipe_xgb  = make_model_pipeline(X,xgb,'xgb')
      pipe_cat  = make_model_pipeline(X,cat,'cat',s=s)

      # fit
      pipe_lgbm.fit(X_tr, y_tr)
      pipe_xgb.fit(X_tr, y_tr)
      pipe_cat.fit(X_tr, y_tr)

      # val予測
      pred_l = pipe_lgbm.predict_proba(X_val)[:,1]
      pred_x = pipe_xgb.predict_proba(X_val)[:,1]
      pred_c = pipe_cat.predict_proba(X_val)[:,1]

      # foldごとのアンサンブル
      pred_final = wl*pred_l + wx*pred_x + wc*pred_c

      scores.append(roc_auc_score(y_val,pred_final))
      lgbm_scores.append(roc_auc_score(y_val,pred_l))
      xgb_scores.append(roc_auc_score(y_val,pred_x))
      cat_scores.append(roc_auc_score(y_val,pred_c))
      oof[val_idx] = pred_final
      lgbm_oof[val_idx] = pred_l
      xgb_oof[val_idx] = pred_x
      cat_oof[val_idx] = pred_c

  # CVスコア
  score = roc_auc_score(y, oof)
  lgbm_score = roc_auc_score(y, lgbm_oof)
  xgb_score = roc_auc_score(y, xgb_oof)
  cat_score = roc_auc_score(y,cat_oof)
  score_dict = {'lgbm_score':lgbm_score,'xgb_score':xgb_score,'cat_score':roc_auc_score(y, cat_oof),'final_score':score}
  scores_dict = {'lgbm_scores':lgbm_scores,'xgb_scores':xgb_scores,'cat_scores':cat_scores,'final_scores':scores}
  oof_dict = {'lgbm_oof':lgbm_oof,'xgb_oof':xgb_oof,'cat_oof':cat_oof}
  return score_dict,scores_dict,oof_dict

if test_or_sub == 'test':
  X,X_test = build_features(X,X_test)
  score_dict,scores_dict,oof_dict = make_oof(X,y)
  print(score_dict)

{'lgbm_score': np.float64(0.8290397621777836),
 'xgb_score': np.float64(0.8169790862082851),
 'cat_score': np.float64(0.7509637992575429),
 'final_score': np.float64(0.8317800257920509)}

In [57]:
def find_best_w(oof_dict):
  """
  wl,wx,wcの順番
  """
  oof_lgbm = oof_dict['lgbm_oof']
  oof_xgb  = oof_dict['xgb_oof']
  oof_cat  = oof_dict['cat_oof']

  best_auc = -1
  best_w = None

  ws = np.linspace(0, 1, 101)  # 0.00〜1.00

  for w1 in ws:
      for w2 in ws:
          w3 = 1.0 - w1 - w2
          if w3 < 0:
              continue

          blend = (
              w1 * oof_lgbm +
              w2 * oof_xgb +
              w3 * oof_cat
          )

          auc = roc_auc_score(y, blend)
          if auc > best_auc:
              best_auc = auc
              best_w = (w1, w2, w3)

  print("best_auc:", best_auc)
  print("best_w:", best_w)
  return best_w,best_auc

In [None]:
def val():
  make_oof

In [58]:
best_params = {'learning_rate': 0.11034679072511581, 'num_leaves': 64, 'max_depth': 12, 'min_child_samples': 69, 'feature_fraction': 0.7189028957238169, 'bagging_fraction': 0.7212431184593704, 'bagging_freq': 6, 'lambda_l1': 2.2629728606140436, 'lambda_l2': 3.208765021602661}
def predict(X,y,X_test):
  X,X_test = build_features(X,X_test)
  xgb_preds = []
  lgbm_preds = []
  cat_preds =[]
  xgb_oofs =[]
  lgbm_oofs = []
  cat_oofs = []
  seeds = [1,2,3]
  cat = CatBoostClassifier(
      depth=8, learning_rate=0.05, iterations=800,
      loss_function='Logloss', verbose=False
  )
  pre_tmp = make_preprocessor(X,'cat')
  pre_tmp.fit(X,y)

  feat_names = pre_tmp.get_feature_names_out()
  cat_feature_names = set(cat_features+['School'])

  cat_feature_indices = [
      i for i, name in enumerate(feat_names)
      if name.split("__")[-1] in cat_feature_names
  ]
  for s in seeds:
    lgbm = LGBMClassifier(**best_params,random_state=s,verbose=-1)

    xgb = XGBClassifier(
        n_estimators=800,
        learning_rate=0.02,
        max_depth=12,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight = 5,
        random_state=s
    )

    cat = CatBoostClassifier(depth=8,learning_rate=0.05,
                             iterations=800,random_state=s,
                             loss_function='Logloss', verbose=False,
                             cat_features=cat_feature_indices)

    pipe_cat = make_model_pipeline(X,cat,'cat')
    pipe_xgb = make_model_pipeline(X,xgb,'xgb')
    pipe_lgbm = make_model_pipeline(X,lgbm,'lgbm')

    pipe_cat.fit(X,y)
    pipe_xgb.fit(X,y)
    pipe_lgbm.fit(X,y)

    cat_preds.append(pipe_cat.predict_proba(X_test)[:, 1])
    xgb_preds.append(pipe_xgb.predict_proba(X_test)[:, 1])
    lgbm_preds.append(pipe_lgbm.predict_proba(X_test)[:,1])

    _,_,oof_dict = make_oof(X,y,s=s)
    lgbm_oofs.append(oof_dict['lgbm_oof'])
    cat_oofs.append(oof_dict['cat_oof'])
    xgb_oofs.append(oof_dict['xgb_oof'])

  lgbm_oof = np.mean(lgbm_oofs,axis=0)
  xgb_oof = np.mean(xgb_oofs,axis=0)
  cat_oof = np.mean(cat_oofs,axis=0)
  oof_dict = {'lgbm_oof':lgbm_oof,'xgb_oof':xgb_oof,'cat_oof':cat_oof}
  (wl,wx,wc),score = find_best_w(oof_dict)

  xgb_pred = np.mean(xgb_preds,axis=0)
  lgbm_pred = np.mean(lgbm_preds,axis=0)
  cat_pred = np.mean(cat_preds,axis=0)
  pred_proba = np.zeros(len(X_test))

  final_pred = wl*lgbm_pred + wx*xgb_pred + wc* cat_pred
  print(score)
  return final_pred



In [59]:
if test_or_sub == 'sub':
  final_pred = predict(X,y,X_test)
  sub['Drafted'] = final_pred
  sub.to_csv("/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/outputs/submission.csv")



best_auc: 0.8358059221905776
best_w: (np.float64(0.73), np.float64(0.07), np.float64(0.2))
0.8358059221905776


In [60]:
import os
import json
from datetime import datetime
def run_experiment(feature_flags_override):
  """
  feature_flags_override:dict
    ex: {Posstats:False,'missing_PT:False}
  """
  flags = FEATURE_FLAGS.copy()
  flags.update(feature_flags_override)

  X_exp = train.drop(['Id','Drafted'],axis=1)
  X_test_exp = test.drop('Id',axis=1)
  X_exp,X_test_exp = build_features(X_exp,X_test_exp,FEATURE_FLAGS=flags)
  scores,_,oof_dict = make_oof(X_exp,y)
  best_w,best_score = find_best_w(oof_dict)
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  log_path = f"/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/logs/exp_{timestamp}.json"

  log_data = {
      "disabled_flags": [k for k,v in feature_flags_override.items() if v is False],
      "cv_scores": scores,
      "wl,wx,wc,best_score":[best_w,best_score]
  }

  with open(log_path, "w") as f:
      json.dump(log_data, f, indent=2)

  print(f"Saved log to {log_path}")
  return scores,best_score



In [61]:

feature_blocks = {
    "missing": [
        "Missing_columns",
        "Age_missing",
        "missing_cluster",
        "missing_PT",
        "missing_rate_by_year"
    ],

    "year": [
        "Year_smoothed_te",
        "Year_counts",
        "Year_PositionType_te",
        "Year_PositionType_count"
    ],

    "school": [
        "School_te",
        "School_counts",
        "SchoolPos_te",
        'School_embedding',
        'School_cluster',
        'School_stats'
    ],
    "pos_stats": [
        "PosStats"
    ],

    "physics": [
        "SpeedScore",
        "ExplosivePower",
        "AgilityCombo",
        "StrengthIndex",
        "PowerToSize"
    ],

    "te": [
        "Position_te",
        "Position_smoothed_te",
        "Position_Type_smoothed_te",
        "Year_te"
    ]
}
if test_or_sub == 'log':
  results={}
  for block_name,flags in feature_blocks.items():
    disable_dict = {f:False for f in flags}
    scores = run_experiment(disable_dict)
    results[block_name] = scores
  print("\n=== Block Ablation Summary ===")
  for k, v in results.items():
    print(k, v)

In [62]:
blocks_phase2 = {
    "school": ["School_counts", "School_te", "SchoolPos_te"],
    "physics": ["SpeedScore", "ExplosivePower", "AgilityCombo", "StrengthIndex", "PowerToSize"]
}

import itertools

def generate_subsets(feature_list):
    """
    feature_list 内の要素を 0〜全部OFF まで全部列挙する。
    例: ["A","B","C"] → [], ["A"], ["B"], ["C"], ["A","B"], ...
    """
    subsets = []
    n = len(feature_list)
    for r in range(n+1):
        for comb in itertools.combinations(feature_list, r):
            subsets.append(list(comb))
    return subsets

def search_block(block_name, feature_list):
    print(f"\n=== Searching combinations for block: {block_name} ===")

    subsets = generate_subsets(feature_list)
    results = {}

    for subset in subsets:
        disable_dict = {feat: False for feat in subset}  # subset内の特徴量OFF

        print(f" -> Testing OFF: {subset}")
        scores = run_experiment(disable_dict)
        results[tuple(subset)] = scores

    return results
"""
block_results = {}
if test_or_sub == 'log':
  for block_name, feats in blocks_phase2.items():
      res = search_block(block_name, feats)
      block_results[block_name] = res
"""
def summarize_results(block_results):
    print("\n===== Summary =====")
    for block, res in block_results.items():
        print(f"\n### Block: {block}")

        for subset, scores in res.items():
            fs = float(scores["final_score"])
            print(f"OFF={subset}  → final={fs:.6f}")



In [None]:
summarize_results(block_results)


In [None]:
from sklearn.model_selection import StratifiedKFold

CV_REPRO = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
import numpy as np

def reproducibility_check(feature_flags_override, n_runs=2, cv=CV_REPRO):
    scores = []
    for i in range(n_runs):
        s = run_experiment(feature_flags_override)  # run_experimentがcv受け取れるなら
        scores.append(float(s[1]))             # または最適化スコアを返すならそっち
    return np.mean(scores), np.std(scores), scores

flags_row20 = {'ExplosivePower':False, 'StrengthIndex':False}
flags_row0 = {'ExplosivePower':True, 'AgilityCombo':True, 'StrengthIndex':True, 'PowerToSize':True}

mean20, std20, raw20 = reproducibility_check(flags_row20, n_runs=2)
mean0,  std0,  raw0  = reproducibility_check(flags_row0,  n_runs=2)

print("row20:", mean20, std20, raw20)
print("row0 :", mean0,  std0,  raw0)
