<a href="https://colab.research.google.com/github/ryo42164/kaggle/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [196]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [197]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,KNNImputer,IterativeImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,cross_val_score,RepeatedStratifiedKFold
from lightgbm import LGBMClassifier

In [198]:
sub = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/data/sample_submission.csv")
train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/data/train.csv")
test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/data/test.csv")

In [199]:
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")

In [200]:
log_or_sub = ''

In [201]:
X = train.drop(['Id','Drafted'],axis=1)
y = train['Drafted']
X_test = test.drop('Id',axis=1)

In [202]:
high_missing_num = list(train.isnull().sum()[train.isnull().sum() > 500].index)

group_cols = ['Year','Position_Type']
target_cols = ['Sprint_40yd', 'Vertical_Jump', 'Bench_Press_Reps','Broad_Jump', 'Agility_3cone', 'Shuttle', 'Height', 'Weight']
CV = RepeatedStratifiedKFold(n_splits=5,n_repeats=2,random_state=1)
global_mean = y.mean()

In [203]:
def target_encode_smooth(X,y,X_test,col,global_mean=global_mean,cv=CV,k=10,group=None):
  oof_te = np.zeros(len(X))

  for trn_idx,val_idx in cv.split(X,y,groups=group):
    X_trn,y_trn = X.iloc[trn_idx],y.iloc[trn_idx]
    df_trn = X_trn.copy()
    df_trn['Drafted'] = y_trn

    stats = df_trn.groupby(col)['Drafted'].agg(['mean','count'])
    alpha = stats['count']/(stats['count']+k)
    smoothed = alpha * stats['mean'] + (1 - alpha) *global_mean
    oof_te[val_idx] = X.iloc[val_idx][col].map(smoothed)
  oof_te = np.where(np.isnan(oof_te),global_mean,oof_te)

  df_full = X.copy()
  df_full['Drafted'] = y
  stats_full = df_full.groupby(col)['Drafted'].agg(['mean','count'])
  alpha_full = stats_full['count']/(stats_full['count']+k)
  smoothed_full = alpha_full * stats_full['mean'] + (1-alpha_full)*global_mean
  X[col+'_te_smoothed'] = oof_te
  X_test[col+'_te_smoothed'] = X_test[col].map(smoothed_full).fillna(global_mean)
  return X,X_test

In [204]:

FEATURE_FLAGS = {
    'School_counts': True,
    'Missing_columns': True,
    'Age_missing':True,
    'School_te':True,
    'Position_te':False,
    'Position_smoothed_te':False,
    'Position_Type_smoothed_te':True,
    'BMI':True,
    'PosStats':True
    }

if FEATURE_FLAGS['School_counts']:
  all_school_counts = pd.concat([train['School'],test['School']]).value_counts()
  oof = np.zeros(len(X))
  for tr_idx, val_idx in CV.split(X, y):
    X_trn, y_trn = X.iloc[tr_idx], y[tr_idx]
    school_counts = X_trn['School'].value_counts()
    oof[val_idx] = X.iloc[val_idx]['School'].map(school_counts).fillna(0)
  X['School_count'] = oof
  X_test['School_count'] = test['School'].map(all_school_counts).fillna(0)

if FEATURE_FLAGS['Missing_columns']:
  for col in high_missing_num:
    if not col == 'Age':
      X[col+'_missing'] = X[col].isnull().astype(int)
      X_test[col+'_missing'] = X_test[col].isnull().astype(int)

if FEATURE_FLAGS['Age_missing']:
  X['Age_missing'] = X['Age'].isnull().astype(int)
  X_test['Age_missing'] = X_test['Age'].isnull().astype(int)

if FEATURE_FLAGS['School_te']:
  X,X_test = target_encode_smooth(X,y,X_test,'School')

if FEATURE_FLAGS['Position_te']:
  oof_te = np.zeros(len(X))

  for trn_idx,val_idx in CV.split(X,y):
    X_trn,y_trn = X.iloc[trn_idx],y.iloc[trn_idx]
    df_trn = X_trn.copy()
    df_trn['Drafted'] = y_trn

    te_map = df_trn.groupby('Position')['Drafted'].mean()
    oof_te[val_idx] = X.iloc[val_idx]['Position'].map(te_map)
  oof_te  = np.where(np.isnan(oof_te),global_mean,oof_te)
  df_full = X.copy()
  df_full['Drafted'] = y
  map_te_all = df_full.groupby('Position')['Drafted'].mean()
  X['Position_te'] = oof_te
  X_test['Position_te'] = X_test['Position'].map(map_te_all).fillna(global_mean)

if FEATURE_FLAGS['Position_smoothed_te']:
  X,X_test = target_encode_smooth(X,y,X_test,'Position')

if FEATURE_FLAGS['Position_Type_smoothed_te']:
  X,X_test = target_encode_smooth(X,y,X_test,'Position_Type')

if FEATURE_FLAGS['BMI']:
  X['BMI'] = X['Weight'] / (X['Height'] ** 2)
  X_test['BMI'] = X_test['Weight'] /(X_test['Height']**2)

if FEATURE_FLAGS['PosStats']:
  for col in target_cols:
    grp = X.groupby(group_cols)[col]
    mean = grp.transform('mean')
    std = grp.transform('std')

    X[f'{col}_z_pos'] = (X[col]-mean)/std
    X[f'{col}_rank_pos'] = grp.rank(pct=True)

    stats = X.groupby(group_cols)[col].agg(['mean','std'])
    merged = X_test.merge(stats,how='left',left_on = group_cols,right_index=True)

    X_test[f'{col}_z_pos'] = (X_test[col]-merged['mean'])/merged['std']

    mean_rank = X[f'{col}_rank_pos'].mean()
    grp_rank = X.groupby(group_cols)[f'{col}_rank_pos'].mean()

    X_test = X_test.merge(grp_rank.rename(f'{col}_rank_pos'),how='left',left_on=group_cols,right_index=True)
    X_test[f'{col}_rank_pos'] = X_test[f'{col}_rank_pos'].fillna(mean_rank)


In [205]:
num_proc = Pipeline(steps = [
    ('imputer',IterativeImputer(random_state=1,max_iter=10,initial_strategy='median')),
    ('scaler',StandardScaler())
])

num_proc_high = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median'))
])

cat_proc = Pipeline(steps = [
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown = 'ignore',sparse_output=False,))
])

missing_flag_proc = 'passthrough'

num_candidates = list(X.select_dtypes(exclude=['object','category']).columns)
missing_flag_cols = [c for c in X.columns if c.endswith('_missing')]
cat_features = [c for c in X.select_dtypes(include=['object','category']).columns if c != 'School']
pos_stats = [c for c in X.columns if c.endswith('_z_pos') or c.endswith('_rank_pos')]
te_cols = [c for c in X.columns if c.endswith('_te_smoothed')]

exclude_set = set(missing_flag_cols) | set(high_missing_num) | set(pos_stats) | set(te_cols) | set(target_cols)
num_features = [c for c in num_candidates if c not in exclude_set]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_proc, num_features),
        ('num_high', num_proc_high, high_missing_num),
        ('missing_flag', missing_flag_proc, missing_flag_cols),
        ('cat', cat_proc, cat_features),
        ('pos_stats', 'passthrough', pos_stats),
        ('te_cols', 'passthrough', te_cols)
    ],
    remainder='drop'
)

best_params = {'learning_rate': 0.11034679072511581, 'num_leaves': 64, 'max_depth': 12, 'min_child_samples': 69, 'feature_fraction': 0.7189028957238169, 'bagging_fraction': 0.7212431184593704, 'bagging_freq': 6, 'lambda_l1': 2.2629728606140436, 'lambda_l2': 3.208765021602661}
seeds = [1,10,42]
models = []
for s in seeds:
  clf = LGBMClassifier(**best_params,random_state=s,verbose=-1)
  pipe = Pipeline(steps=[
      ('preprocessor',preprocessor),
      ('clf',clf)
  ])
  pipe.fit(X,y)
  models.append(pipe)

In [207]:
oof_pred = np.zeros(len(X))
pred_proba = np.zeros(len(X_test))

groups = X['Year'].values

#cv = StratifiedKFold(n_splits=10,shuffle=True,random_state=2)

preds = []
for model in models:
    preds.append(model.predict_proba(X_test)[:, 1])

final_pred = np.mean(preds, axis=0)

for tr_idx,val_idx in CV.split(X,y):
  X_trn,y_trn = X.iloc[tr_idx],y.iloc[tr_idx]
  X_val,y_val = X.iloc[val_idx],y.iloc[val_idx]
  pipe.fit(X_trn,y_trn)
  oof_pred[val_idx] = pipe.predict_proba(X_val)[:,1]
  print(f'roc_auc_score:{roc_auc_score(y_val,oof_pred[val_idx])}')
  pred_proba += pipe.predict_proba(X_test)[:,1]/CV.get_n_splits()
score = roc_auc_score(y,oof_pred)
print(score)

roc_auc_score:0.80599525128611
roc_auc_score:0.7967185169401235
roc_auc_score:0.8624334114638823
roc_auc_score:0.8155753968253968
roc_auc_score:0.8404336734693877
roc_auc_score:0.8544010401944712
roc_auc_score:0.840670502166347
roc_auc_score:0.8558988564528731
roc_auc_score:0.7799886621315194
roc_auc_score:0.7757086167800454
0.822197609755157


In [208]:
train[train.isnull().sum()>0&train.isnull()].mean().dropna()

Unnamed: 0,0
Age,21.997016
Sprint_40yd,4.764818
Vertical_Jump,83.506349
Bench_Press_Reps,20.236408
Broad_Jump,291.9857
Agility_3cone,7.230447
Shuttle,4.399422


In [209]:
if log_or_sub == 'sub':
  sub['Drafted'] = final_pred
  sub.to_csv("/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/outputs/submission.csv")