<a href="https://colab.research.google.com/github/ryo42164/kaggle/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [70]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
!pip install catboost



In [72]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,KNNImputer,IterativeImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,cross_val_score,RepeatedStratifiedKFold
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [73]:
sub = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/data/sample_submission.csv")
train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/data/train.csv")
test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/data/test.csv")

In [74]:
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")

In [75]:
test_or_sub = 'sub'

In [76]:
X = train.drop(['Id','Drafted'],axis=1)
y = train['Drafted']
X_test = test.drop('Id',axis=1)

In [77]:
high_missing_num = list(train.isnull().sum()[train.isnull().sum() > 500].index)

group_cols = ['Year','Position_Type']
target_cols = ['Sprint_40yd', 'Vertical_Jump', 'Bench_Press_Reps','Broad_Jump', 'Agility_3cone', 'Shuttle', 'Height', 'Weight']
CV = RepeatedStratifiedKFold(n_splits=5,n_repeats=2,random_state=1)
global_mean = y.mean()

In [78]:
def target_encode_smooth(X,y,X_test,col,global_mean=global_mean,cv=CV,k=10,group=None):
  oof_te = np.zeros(len(X))
  if len(col) == 1:
    single= True
    col = col[0]
  else:
    single = False

  for trn_idx,val_idx in cv.split(X,y,groups=group):
    X_trn,y_trn = X.iloc[trn_idx],y.iloc[trn_idx]
    df_trn = X_trn.copy()
    df_trn['Drafted'] = y_trn

    stats = df_trn.groupby(col)['Drafted'].agg(['mean','count'])
    alpha = stats['count']/(stats['count']+k)
    smoothed = alpha * stats['mean'] + (1 - alpha) *global_mean

    if single:
      oof_te[val_idx] = X.iloc[val_idx][col].map(smoothed)
    else:
      idx = list(zip(*(X.iloc[val_idx][c]for c in col)))
      oof_te[val_idx] = pd.Series(idx).map(smoothed)
  oof_te = np.where(np.isnan(oof_te),global_mean,oof_te)

  df_full = X.copy()
  df_full['Drafted'] = y
  stats_full = df_full.groupby(col)['Drafted'].agg(['mean','count'])
  alpha_full = stats_full['count']/(stats_full['count']+k)
  smoothed_full = alpha_full * stats_full['mean'] + (1-alpha_full)*global_mean
  if single:
    X[col+'_te_smoothed'] = oof_te
    X_test[col+'_te_smoothed'] = X_test[col].map(smoothed_full).fillna(global_mean)
  else:
    key = '_'.join(col)
    X[key + '_te_smoothed'] = oof_te

    idx_test = list(zip(*(X_test[c] for c in col)))
    X_test[key+'_te_smoothed'] = pd.Series(idx_test).map(smoothed_full).fillna(global_mean)
  return X,X_test

In [79]:

FEATURE_FLAGS = {
    'School_counts': True,
    'Missing_columns': True,
    'Age_missing':True,
    'School_te':True,
    'Position_te':False,
    'Position_smoothed_te':False,
    'Position_Type_smoothed_te':True,
    'PosStats':True,
    'BMI':True,
    'SpeedScore': True,
    'ExplosivePower': True,
    'AgilityCombo': True,
    'StrengthIndex': True,
    'PowerToSize': True,
    'SchoolPos_te':False
}

if FEATURE_FLAGS['School_counts']:
  all_school_counts = pd.concat([train['School'],test['School']]).value_counts()
  oof = np.zeros(len(X))
  for tr_idx, val_idx in CV.split(X, y):
    X_trn, y_trn = X.iloc[tr_idx], y[tr_idx]
    school_counts = X_trn['School'].value_counts()
    oof[val_idx] = X.iloc[val_idx]['School'].map(school_counts).fillna(0)
  X['School_count'] = oof
  X_test['School_count'] = test['School'].map(all_school_counts).fillna(0)

if FEATURE_FLAGS['Missing_columns']:
  for col in high_missing_num:
    if not col == 'Age':
      X[col+'_missing'] = X[col].isnull().astype(int)
      X_test[col+'_missing'] = X_test[col].isnull().astype(int)

if FEATURE_FLAGS['Age_missing']:
  X['Age_missing'] = X['Age'].isnull().astype(int)
  X_test['Age_missing'] = X_test['Age'].isnull().astype(int)

if FEATURE_FLAGS['School_te']:
  X,X_test = target_encode_smooth(X,y,X_test,['School'])

if FEATURE_FLAGS['Position_te']:
  oof_te = np.zeros(len(X))

  for trn_idx,val_idx in CV.split(X,y):
    X_trn,y_trn = X.iloc[trn_idx],y.iloc[trn_idx]
    df_trn = X_trn.copy()
    df_trn['Drafted'] = y_trn

    te_map = df_trn.groupby('Position')['Drafted'].mean()
    oof_te[val_idx] = X.iloc[val_idx]['Position'].map(te_map)
  oof_te  = np.where(np.isnan(oof_te),global_mean,oof_te)
  df_full = X.copy()
  df_full['Drafted'] = y
  map_te_all = df_full.groupby('Position')['Drafted'].mean()
  X['Position_te'] = oof_te
  X_test['Position_te'] = X_test['Position'].map(map_te_all).fillna(global_mean)

if FEATURE_FLAGS['Position_smoothed_te']:
  X,X_test = target_encode_smooth(X,y,X_test,['Position'])

if FEATURE_FLAGS['Position_Type_smoothed_te']:
  X,X_test = target_encode_smooth(X,y,X_test,['Position_Type'])

if FEATURE_FLAGS['BMI']:
  X['BMI'] = X['Weight'] / (X['Height'] ** 2)
  X_test['BMI'] = X_test['Weight'] /(X_test['Height']**2)

if FEATURE_FLAGS['PosStats']:
  for col in target_cols:
    grp = X.groupby(group_cols)[col]
    mean = grp.transform('mean')
    std = grp.transform('std')

    X[f'{col}_z_pos'] = (X[col]-mean)/std
    X[f'{col}_rank_pos'] = grp.rank(pct=True)

    stats = X.groupby(group_cols)[col].agg(['mean','std'])
    merged = X_test.merge(stats,how='left',left_on = group_cols,right_index=True)

    X_test[f'{col}_z_pos'] = (X_test[col]-merged['mean'])/merged['std']

    mean_rank = X[f'{col}_rank_pos'].mean()
    grp_rank = X.groupby(group_cols)[f'{col}_rank_pos'].mean()

    X_test = X_test.merge(grp_rank.rename(f'{col}_rank_pos'),how='left',left_on=group_cols,right_index=True)
    X_test[f'{col}_rank_pos'] = X_test[f'{col}_rank_pos'].fillna(mean_rank)
# SpeedScore: 体重考慮の40ydスコア（有名な指標）
if FEATURE_FLAGS.get('SpeedScore'):
    X['SpeedScore'] = (X['Weight'] * 200) / (X['Sprint_40yd'] ** 4)
    X_test['SpeedScore'] = (X_test['Weight'] * 200) / (X_test['Sprint_40yd'] ** 4)

# ExplosivePower: 跳躍系を掛け合わせる
if FEATURE_FLAGS.get('ExplosivePower'):
    X['ExplosivePower'] = X['Vertical_Jump'] * X['Broad_Jump']
    X_test['ExplosivePower'] = X_test['Vertical_Jump'] * X_test['Broad_Jump']

# AgilityCombo: シャトルと3coneの平均
if FEATURE_FLAGS.get('AgilityCombo'):
    X['AgilityCombo'] = (X['Agility_3cone'] + X['Shuttle']) / 2
    X_test['AgilityCombo'] = (X_test['Agility_3cone'] + X_test['Shuttle']) / 2

# StrengthIndex: ベンチ / 体重
if FEATURE_FLAGS.get('StrengthIndex'):
    X['StrengthIndex'] = X['Bench_Press_Reps'] / X['Weight']
    X_test['StrengthIndex'] = X_test['Bench_Press_Reps'] / X_test['Weight']

# PowerToSize: 跳躍の爆発力 ÷ 身長
if FEATURE_FLAGS.get('PowerToSize'):
    X['PowerToSize'] = (X['Vertical_Jump'] * X['Broad_Jump']) / X['Height']
    X_test['PowerToSize'] = (X_test['Vertical_Jump'] * X_test['Broad_Jump']) / X_test['Height']

if FEATURE_FLAGS.get('SchoolPos_te'):
  X,X_test = target_encode_smooth(X,y,X_test,['School','Position'])


In [80]:
num_features

['Year',
 'Age',
 'School_count',
 'BMI',
 'SpeedScore',
 'ExplosivePower',
 'AgilityCombo',
 'StrengthIndex',
 'PowerToSize']

In [81]:
num_proc = Pipeline(steps = [
    ('imputer',IterativeImputer(random_state=1,max_iter=10,initial_strategy='median')),
    ('scaler',StandardScaler())
])

num_proc_high = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median'))
])

cat_proc = Pipeline(steps = [
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown = 'ignore',sparse_output=False,))
])

missing_flag_proc = 'passthrough'

num_candidates = list(X.select_dtypes(exclude=['object','category']).columns)
missing_flag_cols = [c for c in X.columns if c.endswith('_missing')]
cat_features = [c for c in X.select_dtypes(include=['object','category']).columns if c != 'School']
pos_stats = [c for c in X.columns if c.endswith('_z_pos') or c.endswith('_rank_pos')]
te_cols = [c for c in X.columns if c.endswith('_te_smoothed')]
age_missing=['Age_missing']
exclude_set = set(missing_flag_cols) | set(high_missing_num) | set(pos_stats) | set(te_cols) | set(target_cols)
num_features = [c for c in num_candidates if c not in exclude_set]

def make_preprocessor(model_name):
  """
  model_name : 'lgbm','xgb','cat'
  """
  if model_name == 'cat':
    num_proc_cat = Pipeline([
        ('imputer',SimpleImputer(strategy='median'))
    ])
    cat_features_local = cat_features + ['School']
    num_cat_features = num_features.copy()
    num_cat_features.remove('School_count')
    pre = ColumnTransformer(
        transformers=[
            ('num',num_proc_cat,num_cat_features),
            #('num_high',num_proc_high,high_missing_num),
            #('missing_flag', 'passthrough', missing_flag_cols),
            #('pos_stats', 'passthrough', pos_stats),
            #('te_cols', 'passthrough', te_cols),
            #('age_missing','passthrough',age_missing),
            ('cat_raw', 'passthrough', cat_features_local)
        ],
        remainder='drop'
    )
    return pre
  else:
    pre = ColumnTransformer(
        transformers=[
            ('num', num_proc, num_features),
            ('num_high', num_proc_high, high_missing_num),
            ('missing_flag', missing_flag_proc, missing_flag_cols),
            ('cat', cat_proc, cat_features),
            ('pos_stats', 'passthrough', pos_stats),
            ('te_cols', 'passthrough', te_cols)
        ],
        remainder='drop'
    )
  return pre
best_params = {'learning_rate': 0.11034679072511581, 'num_leaves': 64, 'max_depth': 12, 'min_child_samples': 69, 'feature_fraction': 0.7189028957238169, 'bagging_fraction': 0.7212431184593704, 'bagging_freq': 6, 'lambda_l1': 2.2629728606140436, 'lambda_l2': 3.208765021602661}

def make_model_pipeline(model,model_name):
  """
  model: LGBMClassifier,CatBoostClassifier, XGBClassifierのインスタンス
  model_name: 'lgbm','cat','xgb'
  """

  preprocessor = make_preprocessor(model_name)

  pipe = Pipeline(steps=[
      ('preprocessor',preprocessor),
      ('model',model)
  ])
  return pipe


In [82]:
cat = CatBoostClassifier(
    depth=8, learning_rate=0.05, iterations=800,
    loss_function='Logloss', verbose=False
)
pre_tmp = make_preprocessor('cat')
pre_tmp.fit(X,y)

feat_names = pre_tmp.get_feature_names_out()
cat_feature_names = set(cat_features+['School'])

cat_feature_indices = [
    i for i, name in enumerate(feat_names)
    if name.split("__")[-1] in cat_feature_names
]
cat = CatBoostClassifier(
    depth=8, learning_rate=0.05, iterations=800,
    loss_function='Logloss',
    verbose=False,
    cat_features=cat_feature_indices,
)
pre = make_preprocessor('cat')
pipe_cat = Pipeline(steps=[
    ('preprocessor',pre),
    ('model', cat)
])

pipe_cat.fit(X, y)
cat_pred = pipe_cat.predict_proba(X_test)[:,1]

In [83]:
best_params = {'learning_rate': 0.11034679072511581, 'num_leaves': 64, 'max_depth': 12, 'min_child_samples': 69, 'feature_fraction': 0.7189028957238169, 'bagging_fraction': 0.7212431184593704, 'bagging_freq': 6, 'lambda_l1': 2.2629728606140436, 'lambda_l2': 3.208765021602661}
xgb_preds = []
lgbm_preds = []
seeds = [1,2,3]
for s in seeds:
  lgbm = LGBMClassifier(**best_params,random_state=s,verbose=-1)
  xgb = XGBClassifier(
      n_estimators=800,
      learning_rate=0.05,
      max_depth=8,
      subsample=0.8,
      colsample_bytree=0.8,
      random_state=s
  )
  pipe_xgb = make_model_pipeline(xgb,'xgb')
  pipe_lgbm = make_model_pipeline(lgbm,'lgbm')
  pipe_xgb.fit(X,y)
  pipe_lgbm.fit(X,y)
  xgb_preds.append(pipe_xgb.predict_proba(X_test)[:, 1])
  lgbm_preds.append(pipe_lgbm.predict_proba(X_test)[:,1])





In [84]:
xgb_pred = np.mean(xgb_preds,axis=0)
lgbm_pred = np.mean(lgbm_preds,axis=0)

In [85]:

oof_pred = np.zeros(len(X))
pred_proba = np.zeros(len(X_test))

final_pred = 0.4*lgbm_pred + 0.3*xgb_pred + 0.3* cat_pred


In [95]:
#test_or_sub = 'test'

In [96]:
if test_or_sub == 'test':
  oof_cat = np.zeros(len(train))
  for tr_idx,val_idx in CV.split(X,y):
    X_trn,y_trn = X.iloc[tr_idx],y.iloc[tr_idx]
    X_val,y_val = X.iloc[val_idx],y.iloc[val_idx]
    pipe_cat.fit(X_trn,y_trn)
    oof_cat[val_idx] = pipe_cat.predict_proba(X_val)[:,1]
    print(f'roc_auc_score:{roc_auc_score(y_val,oof_cat[val_idx])}')
  cat_score = roc_auc_score(y,oof_cat)

roc_auc_score:0.723896206682119
roc_auc_score:0.7311740890688259
roc_auc_score:0.7538887705092692
roc_auc_score:0.7335884353741496
roc_auc_score:0.7594671201814058
roc_auc_score:0.758988637006049
roc_auc_score:0.7605937921727395
roc_auc_score:0.7318559556786706
roc_auc_score:0.7121456916099773
roc_auc_score:0.696655328798186


In [99]:
if test_or_sub == 'test':
  oof_lgbm = np.zeros(len(train))
  lgbm_scores = []
  for tr_idx,val_idx in CV.split(X,y):
    X_trn,y_trn = X.iloc[tr_idx],y.iloc[tr_idx]
    X_val,y_val = X.iloc[val_idx],y.iloc[val_idx]
    pipe_lgbm.fit(X_trn,y_trn)
    oof_lgbm[val_idx] = pipe_lgbm.predict_proba(X_val)[:,1]
    lgbm_scores.append(roc_auc_score(y_val,oof_lgbm[val_idx]))
  for score in lgbm_scores:
    print(f'roc_auc_score:{score}')



roc_auc_score:0.8037904912657583
roc_auc_score:0.8112934157255487
roc_auc_score:0.8608139782655019
roc_auc_score:0.8219671201814058
roc_auc_score:0.8482426303854875
roc_auc_score:0.8570721917575894
roc_auc_score:0.8377015413026494
roc_auc_score:0.8483415015270972
roc_auc_score:0.7951955782312925
roc_auc_score:0.7768990929705215


In [102]:
if test_or_sub == 'test':
  xgb_scores = []
  oof_xgb = np.zeros(len(train))
  for tr_idx,val_idx in CV.split(X,y):
    X_trn,y_trn = X.iloc[tr_idx],y.iloc[tr_idx]
    X_val,y_val = X.iloc[val_idx],y.iloc[val_idx]
    pipe_xgb.fit(X_trn,y_trn)
    oof_xgb[val_idx] = pipe_xgb.predict_proba(X_val)[:,1]
    xgb_scores.append(roc_auc_score(y_val,oof_pred[val_idx]))
  for score in xgb_scores:
    print(f'roc_auc_score:{score}')



roc_auc_score:0.7966815535078297
roc_auc_score:0.7949002059805383
roc_auc_score:0.8598053839051069
roc_auc_score:0.8015164399092971
roc_auc_score:0.8454790249433107
roc_auc_score:0.8418932670020917
roc_auc_score:0.8328574472618794
roc_auc_score:0.8476738404716244
roc_auc_score:0.7812358276643991
roc_auc_score:0.7905895691609978


In [106]:
roc_auc_score(y,oof_xgb)

np.float64(0.8198741701798977)

In [89]:
if test_or_sub == 'test':
  oof = np.zeros(len(X))
  for trn_idx, val_idx in CV.split(X, y):
      X_tr, X_val = X.iloc[trn_idx], X.iloc[val_idx]
      y_tr, y_val = y[trn_idx], y[val_idx]

      # 各モデルのパイプライン作成
      pipe_lgbm = make_model_pipeline(lgbm,'lgbm')
      pipe_xgb  = make_model_pipeline(xgb,'xgb')
      pipe_cat  = make_model_pipeline(cat,'cat')

      # fit
      pipe_lgbm.fit(X_tr, y_tr)
      pipe_xgb.fit(X_tr, y_tr)
      pipe_cat.fit(X_tr, y_tr)

      # val予測
      pred_l = pipe_lgbm.predict_proba(X_val)[:,1]
      pred_x = pipe_xgb.predict_proba(X_val)[:,1]
      pred_c = pipe_cat.predict_proba(X_val)[:,1]

      # foldごとのアンサンブル
      pred_final = 0.4*pred_l + 0.3*pred_x + 0.3*pred_c

      oof[val_idx] = pred_final

  # CVスコア
  auc = roc_auc_score(y, oof)


In [104]:
np.corrcoef([oof_lgbm, oof_xgb, oof_cat])

array([[1.        , 0.94421442, 0.62904556],
       [0.94421442, 1.        , 0.6068448 ],
       [0.62904556, 0.6068448 , 1.        ]])

In [91]:
auc

np.float64(0.8281346585502236)

In [92]:
test_or_sub = 'sub'

In [93]:
if test_or_sub == 'sub':
  sub['Drafted'] = final_pred
  sub.to_csv("/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/outputs/submission.csv")