<a href="https://colab.research.google.com/github/ryo42164/kaggle/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [129]:
# Mount Google Drive in this Colab runtime; the CSV inputs and the submission
# output used by later cells all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [130]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [131]:
# Load the three competition files from the mounted Drive folder:
# the sample submission template, the training table and the test table.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/data"
sub, train, test = (
    pd.read_csv(f"{DATA_DIR}/{name}.csv")
    for name in ("sample_submission", "train", "test")
)

In [132]:
import warnings
# Silence only warnings whose message starts with
# "X does not have valid feature names"; all other warnings are still shown.
warnings.filterwarnings("ignore", message="X does not have valid feature names")

In [133]:
# Run-mode switch: the submission cell writes the CSV only when this equals 'sub'.
log_or_sub = 'sub'

In [134]:
# Separate the target column from the predictors. 'Id' is dropped from both
# frames so that it is not used as a feature.
y = train['Drafted']
X = train.drop(columns=['Id', 'Drafted'])
X_test = test.drop(columns='Id')

In [135]:
# Names of the training columns that have more than 500 missing values.
missing_per_column = train.isnull().sum()
high_missing_num = missing_per_column[missing_per_column > 500].index.tolist()

In [136]:
# Display the column names of the training frame.
train.columns

Index(['Id', 'Year', 'Age', 'School', 'Height', 'Weight', 'Sprint_40yd',
       'Vertical_Jump', 'Bench_Press_Reps', 'Broad_Jump', 'Agility_3cone',
       'Shuttle', 'Player_Type', 'Position_Type', 'Position', 'Drafted'],
      dtype='object')

In [137]:

# Switches for the optional engineered features; each key gates one
# `if FEATURE_FLAGS[...]` block in this cell.
FEATURE_FLAGS = dict(
    School_counts=True,     # frequency of each School in the training data
    Missing_columns=False,  # 0/1 flags for the high-missing columns
    Age_missing=True,       # 0/1 flag for a missing Age
    School_te=True,         # smoothed target encoding of School
    Position_te=True,       # target encoding of Position
    BMI=True,               # Weight / Height**2
)

if FEATURE_FLAGS['School_counts']:
  # Frequency encoding: number of training rows per school.
  school_counts = train['School'].value_counts()
  # Training rows whose School is missing keep NaN here (no fillna on this side).
  X['School_count'] = train['School'].map(school_counts)
  # Test schools that never occur in the training data get a count of 0.
  test_school_count = test['School'].map(school_counts)
  X_test['School_count'] = test_school_count.fillna(0)

if FEATURE_FLAGS['Missing_columns']:
  # Add a 0/1 "<col>_missing" indicator for every high-missing column,
  # first to the training features and then to the test features.
  for frame in (X, X_test):
    for col in high_missing_num:
      frame[f'{col}_missing'] = frame[col].isna().astype(int)

if FEATURE_FLAGS['Age_missing']:
  # 0/1 indicator: 1 where Age is missing, 0 otherwise.
  for frame in (X, X_test):
    frame['Age_missing'] = frame['Age'].isna().astype(int)

if FEATURE_FLAGS['School_te']:
  # Smoothed target encoding of School.
  # Training rows receive an out-of-fold value (the encoding for a row is
  # computed without that row's own label); test rows receive the encoding
  # fitted on all training rows.
  oof_te = np.zeros(len(X))
  global_mean = y.mean()

  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
  # Smoothing strength: a school with `count` rows puts weight
  # count / (count + k) on its own mean and the rest on the global mean.
  k = 10

  for tr_idx, val_idx in cv.split(X, y):
      # cv.split yields positional indices, so X and y are both sliced with
      # .iloc. Plain y[tr_idx] is a label lookup and only matches positions
      # while y carries a default 0..n-1 index.
      X_trn, y_trn = X.iloc[tr_idx], y.iloc[tr_idx]
      df_tr = X_trn.copy()
      df_tr['Drafted'] = y_trn

      # Per-school mean target and row count on the fitting folds only.
      stats = df_tr.groupby('School')['Drafted'].agg(['mean', 'count'])

      alpha = stats['count'] / (stats['count'] + k)
      smoothed = alpha * stats['mean'] + (1 - alpha) * global_mean

      # Schools absent from the fitting folds map to NaN here.
      oof_te[val_idx] = X.iloc[val_idx]['School'].map(smoothed)

  # Fall back to the global mean for schools unseen in the fitting folds.
  oof_te = np.where(np.isnan(oof_te), global_mean, oof_te)

  # Encoding fitted on the full training set, used for the test rows.
  df_full = X.copy()
  df_full['Drafted'] = y
  stats_all = df_full.groupby('School')['Drafted'].agg(['mean', 'count'])
  alpha_all = stats_all['count'] / (stats_all['count'] + k)
  smoothed_all = alpha_all * stats_all['mean'] + (1 - alpha_all) * global_mean

  X['School_te'] = oof_te
  X_test['School_te'] = X_test['School'].map(smoothed_all).fillna(global_mean)

if FEATURE_FLAGS['Position_te']:
  # Unsmoothed target encoding of Position: out-of-fold means for the
  # training rows, full-training means for the test rows.
  global_mean = y.mean()
  oof_te = np.zeros(len(X))
  cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)

  for trn_idx,val_idx in cv.split(X,y):
    # Mean target per position, computed on the fitting folds only.
    fold_rate = y.iloc[trn_idx].groupby(X['Position'].iloc[trn_idx]).mean()
    oof_te[val_idx] = X['Position'].iloc[val_idx].map(fold_rate)

  # Positions unseen in the fitting folds fall back to the global mean.
  oof_te = np.where(np.isnan(oof_te),global_mean,oof_te)

  # Per-position mean over the whole training set, used for the test rows.
  position_rate_all = y.groupby(X['Position']).mean()

  X['Position_te'] = oof_te
  X_test['Position_te'] = X_test['Position'].map(position_rate_all).fillna(global_mean)

if FEATURE_FLAGS['BMI']:
  # BMI-style ratio: Weight divided by Height squared.
  # NOTE(review): the units of Weight and Height are not visible here — confirm.
  for frame in (X, X_test):
    frame['BMI'] = frame['Weight'] / frame['Height'].pow(2)

In [138]:
# Regular numeric columns: fill gaps from the 5 nearest neighbours, then standardise.
num_proc = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
])

# High-missing numeric columns: median fill only, no scaling.
num_proc_high = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored at transform time.
cat_proc = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Indicator columns are handed to the model unchanged.
missing_flag_proc = 'passthrough'

# Columns whose name ends in '_missing' are the 0/1 indicator flags.
missing_flag_cols = [c for c in X.columns if c.endswith('_missing')]

# Regular numeric columns = every numeric column that is neither a
# high-missing column nor an indicator flag.
# The list is built in X's own column order: a set difference would give an
# order that changes between kernel restarts (string hashing is randomised),
# which changes the feature order seen by the model and makes runs
# non-reproducible.
excluded_from_num = set(high_missing_num) | set(missing_flag_cols)
num_features = [
    c for c in X.select_dtypes(exclude=['object','category']).columns
    if c not in excluded_from_num
]

# Categorical columns for one-hot encoding. School is deliberately left out:
# it reaches the model only through the School_count / School_te features
# (when those flags are on), and the raw column is not routed to any transformer.
cat_features = list(X.select_dtypes(include=['object','category']).columns)
cat_features.remove('School')

# Route each column group to its own preprocessing branch. Any column not
# listed in one of the groups is dropped.
column_branches = [
    ('num', num_proc, num_features),
    ('num_high', num_proc_high, high_missing_num),
    ('missing_flag', missing_flag_proc, missing_flag_cols),
    ('cat', cat_proc, cat_features),
]
preprocessor = ColumnTransformer(column_branches, remainder='drop')
# LightGBM hyperparameters.
# NOTE(review): presumably the result of an earlier tuning run — the search is not shown here.
best_params = {
    'learning_rate': 0.11034679072511581,
    'num_leaves': 64,
    'max_depth': 12,
    'min_child_samples': 69,
    'feature_fraction': 0.7189028957238169,
    'bagging_fraction': 0.7212431184593704,
    'bagging_freq': 6,
    'lambda_l1': 2.2629728606140436,
    'lambda_l2': 3.208765021602661,
}

# Seed ensemble: one full pipeline per random seed, each fitted on all training rows.
seeds = [1,10,42]
models = []
for s in seeds:
  clf = LGBMClassifier(**best_params,random_state=s,verbose=-1)
  # Pipeline does not copy its steps. Give every ensemble member its own
  # unfitted copy of the preprocessor so that refitting one pipeline later
  # cannot silently change the fitted state used by the others.
  pipe = Pipeline(steps=[
      ('preprocessor',clone(preprocessor)),
      ('clf',clf)
  ])
  pipe.fit(X,y)
  models.append(pipe)

In [139]:
oof_pred = np.zeros(len(X))
pred_proba = np.zeros(len(X_test))

# NOTE(review): `groups` is not used by the StratifiedKFold split below.
groups = X['Year'].values

# Same splitter settings as the target-encoding blocks (5 folds, shuffled, seed 1).
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)

# Submission prediction: average the positive-class probability over the
# seed models, each of which was fitted on the full training set.
preds = []
for model in models:
    preds.append(model.predict_proba(X_test)[:, 1])

final_pred = np.mean(preds, axis=0)

# Cross-validated AUC for the last seed's configuration.
# Each fold fits a fresh clone, so the already-fitted pipelines in `models`
# are never refitted on a subset of the data; predictions from `models`
# therefore do not depend on whether they are made before or after this loop.
for tr_idx,val_idx in cv.split(X,y):
  X_trn,y_trn = X.iloc[tr_idx],y.iloc[tr_idx]
  X_val,y_val = X.iloc[val_idx],y.iloc[val_idx]
  fold_pipe = clone(models[-1])
  fold_pipe.fit(X_trn,y_trn)
  oof_pred[val_idx] = fold_pipe.predict_proba(X_val)[:,1]
  # Per-fold AUC.
  print(roc_auc_score(y_val,oof_pred[val_idx]))
  # Fold-averaged test prediction; kept for inspection, the submission uses final_pred.
  pred_proba += fold_pipe.predict_proba(X_test)[:,1]/cv.get_n_splits()
# Overall AUC on the out-of-fold predictions.
score = roc_auc_score(y,oof_pred)
print(score)

0.8114930182599356
0.8114922934867532
0.8590666950777754
0.8336026077097506
0.8337726757369615
0.8296193460796423


In [140]:
if log_or_sub == 'sub':
  # Fill the sample-submission template with the seed-averaged probabilities.
  sub['Drafted'] = final_pred
  # index=False keeps the file to the template's own columns; without it
  # pandas writes the row index as an extra unnamed first column.
  sub.to_csv("/content/drive/MyDrive/Colab Notebooks/NFL Draft Prediction/outputs/submission.csv", index=False)

In [141]:
# Display the submission frame as a visual check of the values in 'Drafted'.
sub

Unnamed: 0,Id,Drafted
0,2781,0.827940
1,2782,0.895053
2,2783,0.942672
3,2784,0.862153
4,2785,0.790792
...,...,...
691,3472,0.024298
692,3473,0.718873
693,3474,0.701839
694,3475,0.474310
