## カラムの日本語訳

In [1]:
# Marital status - 配偶者の有無
# Application mode - 応募形態
# Application order - 応募順序
# Course - コース
# Daytime/evening attendance - 昼間/夜間の出席
# Previous qualification - 以前の資格
# Previous qualification (grade) - 以前の資格（成績）
# Nacionality - 国籍
# Mother's qualification - 母親の資格
# Father's qualification - 父親の資格
# Mother's occupation - 母親の職業
# Father's occupation - 父親の職業
# Admission grade - 入学成績
# Displaced - 移住者
# Educational special needs - 教育的特別支援
# Debtor - 債務者
# Tuition fees up to date - 授業料の支払い状況
# Gender - 性別
# Scholarship holder - 奨学生
# Age at enrollment - 入学時の年齢
# International - 国際的
# Curricular units 1st sem (credited) - 1学期の履修単位（認定済み）
# Curricular units 1st sem (enrolled) - 1学期の履修単位（登録済み）
# Curricular units 1st sem (evaluations) - 1学期の履修単位（評価済み）
# Curricular units 1st sem (approved) - 1学期の履修単位（合格済み）
# Curricular units 1st sem (grade) - 1学期の履修単位（成績）
# Curricular units 1st sem (without evaluations) - 1学期の履修単位（未評価）
# Curricular units 2nd sem (credited) - 2学期の履修単位（認定済み）
# Curricular units 2nd sem (enrolled) - 2学期の履修単位（登録済み）
# Curricular units 2nd sem (evaluations) - 2学期の履修単位（評価済み）
# Curricular units 2nd sem (approved) - 2学期の履修単位（合格済み）
# Curricular units 2nd sem (grade) - 2学期の履修単位（成績）
# Curricular units 2nd sem (without evaluations) - 2学期の履修単位（未評価）
# Unemployment rate - 失業率
# Inflation rate - インフレ率
# GDP - 国内総生産

## import

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from optuna.samplers import TPESampler
from sklearn.linear_model import LogisticRegression
import optuna
import warnings

  from .autonotebook import tqdm as notebook_tqdm


## データの読み込み

In [3]:
# Load the training and test tables from the inputs directory (paths are relative to the notebook).
train = pd.read_csv("../inputs/train.csv")
test = pd.read_csv("../inputs/test.csv")

In [4]:
# Split into features and label: "Target" is the label column and "id" is
# dropped from both frames so it is not used as a feature.
x_train = train.drop(columns=["Target", "id"], axis=1)
y_train = train["Target"]
x_test = test.drop(columns=["id"], axis=1)

In [5]:
# Encode the Target labels as integers. `le` is kept because the submission
# cell uses it to map predicted integers back to the original labels.
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)

## 特徴量エンジニアリング

In [6]:
# age_labels = ['0', '1', '2', '3', '4', '5', '6']
# Edges of the age buckets used by add_features; with right=True each bucket
# is the half-open interval (lower, upper].
age_bins = [0, 18, 25, 30, 40, 50, 60, 70]


def add_features(df):
    """Add grade-derived and age-bucket columns to ``df`` in place.

    New columns:
        sum_grade:         1st-semester grade + 2nd-semester grade
        interaction_grade: 1st-semester grade * 2nd-semester grade
        average_grade:     sum_grade / 2
        age_group:         integer index of the ``age_bins`` bucket that
                           contains "Age at enrollment"

    Args:
        df: DataFrame containing "Curricular units 1st sem (grade)",
            "Curricular units 2nd sem (grade)" and "Age at enrollment".

    Returns:
        None. ``df`` itself is modified.
    """
    first_sem = df["Curricular units 1st sem (grade)"]
    second_sem = df["Curricular units 2nd sem (grade)"]
    grade_total = first_sem + second_sem

    df["sum_grade"] = grade_total
    df["interaction_grade"] = first_sem * second_sem
    df["average_grade"] = grade_total / 2
    # labels=False makes pd.cut return the bucket position instead of an Interval.
    df["age_group"] = pd.cut(
        df["Age at enrollment"],
        bins=age_bins,
        labels=False,
        right=True,
    )

# add_features assigns the new columns directly on the frame it is given,
# so both feature tables are extended in place here.
add_features(x_train)
add_features(x_test)

## Stacking

In [7]:
def objective(trial, model):
    """Optuna objective: mean 5-fold CV accuracy of ``model`` with sampled hyperparameters.

    The search space is chosen from the type of ``model`` (XGBoost, LightGBM
    or CatBoost). Sampled values are merged over ``model.get_params()`` and a
    fresh estimator of the same class is built from the result, so ``model``
    itself is never fitted or modified.

    Reads the module-level ``x_train`` / ``y_train`` as the training data.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        model: Template estimator; supplies the class and the fixed parameters.

    Returns:
        Mean accuracy over a shuffled 5-fold split (random_state=42).

    Raises:
        ValueError: If ``model`` is not one of the three supported classifiers.
    """
    base_params = model.get_params()

    # suggest_float(..., log=True) / suggest_float(...) replace the deprecated
    # suggest_loguniform / suggest_uniform; parameter names and ranges are unchanged.
    if isinstance(model, XGBClassifier):
        new_params = {
            'eta': trial.suggest_float('eta', 1e-3, 0.5, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        }
    elif isinstance(model, LGBMClassifier):
        # NOTE(review): LightGBM's `subsample` presumably only takes effect when
        # `subsample_freq` > 0, which is not set here - confirm against the docs.
        new_params = {
            'num_leaves': trial.suggest_int('num_leaves', 100, 500),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 300, 1200),
            'subsample_for_bin': trial.suggest_int('subsample_for_bin', 20000, 300000),
            'min_child_samples': trial.suggest_int('min_child_samples', 20, 500),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
            'subsample': trial.suggest_float('subsample', 0.25, 1.0),
            'max_depth': trial.suggest_int('max_depth', 1, 15),
        }
    elif isinstance(model, CatBoostClassifier):
        new_params = {
            'iterations': trial.suggest_int('iterations', 100, 1000),
            'depth': trial.suggest_int('depth', 1, 10),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10, log=True),
            'border_count': trial.suggest_int('border_count', 1, 255),
            'random_strength': trial.suggest_float('random_strength', 1e-3, 10, log=True),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 1e-3, 10, log=True),
            'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
            'od_wait': trial.suggest_int('od_wait', 10, 50),
        }
    else:
        raise ValueError("Unsupported model type!")

    base_params.update(new_params)
    # Build a new estimator so the template passed in stays untouched between trials.
    model = model.__class__(**base_params)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_val_score(model, x_train, y_train, cv=kf, scoring='accuracy')

    return np.mean(cv_results)

def predict_cv(model, x_train, y_train, x_test):
    """Produce out-of-fold train predictions and fold-aggregated test predictions.

    ``model`` is refitted on each of 5 shuffled folds (random_state=71). Every
    training row is predicted by the fold model that did not see it, and the
    test set is predicted by all fold models.

    The predictions are hard class labels, so the per-fold test predictions
    are combined by majority vote rather than averaged: a mean of encoded
    class ids is not a class, and it would give the test meta-features a
    different value set from the integer out-of-fold train meta-features.
    Ties go to the smallest class id.

    Args:
        model: Classifier exposing ``fit(X, y)`` and ``predict(X)``.
        x_train: Training features (DataFrame; rows are selected with ``.iloc``).
        y_train: Training labels, indexable by position once converted to an array.
        x_test: Test features.

    Returns:
        (pred_train, preds_test): out-of-fold predictions in the original row
        order of ``x_train``, and the majority-vote prediction for each row of
        ``x_test``. Both keep whatever trailing shape ``model.predict`` returns.
    """
    # Positional indexing below must not depend on a pandas index.
    y_values = np.asarray(y_train)

    fold_val_preds = []
    fold_test_preds = []
    fold_val_idx = []

    kf = KFold(n_splits=5, shuffle=True, random_state=71)
    # Wrap the fold iterator in tqdm to show a progress bar.
    folds = tqdm(kf.split(x_train), total=kf.get_n_splits(), desc='CV Progress')

    for tr_idx, va_idx in folds:
        model.fit(x_train.iloc[tr_idx], y_values[tr_idx])

        fold_val_preds.append(model.predict(x_train.iloc[va_idx]))
        fold_test_preds.append(model.predict(x_test))
        fold_val_idx.append(va_idx)

    # Validation folds arrive shuffled; restore the original row order.
    va_idxes = np.concatenate(fold_val_idx)
    preds = np.concatenate(fold_val_preds, axis=0)
    pred_train = preds[np.argsort(va_idxes)]

    # Majority vote across folds: count, per test row, how often each class was predicted.
    stacked = np.stack(fold_test_preds, axis=0)
    classes = np.unique(stacked)
    votes = np.stack([(stacked == cls).sum(axis=0) for cls in classes], axis=0)
    preds_test = classes[votes.argmax(axis=0)]

    return pred_train, preds_test

## Model

In [8]:
# Tune XGBoost with Optuna: 100 trials maximizing the mean CV accuracy returned
# by `objective`. The TPE sampler is seeded (seed=0) with n_startup_trials=30
# and multivariate=True.
initial_model = XGBClassifier(objective='multi:softmax', random_state=0, eval_metric=['merror','mlogloss'], verbosity=0)
study_xgb = optuna.create_study(sampler=TPESampler(n_startup_trials=30, multivariate=True, seed=0), direction="maximize")
study_xgb.optimize(lambda trial: objective(trial, initial_model), n_trials=100)
print('Best value:', study_xgb.best_value)
print('Best trial:', study_xgb.best_trial.params)

[I 2024-06-18 16:35:57,230] A new study created in memory with name: no-name-91a956e0-fe3a-4efe-9022-cf4f318fbe44
  'eta': trial.suggest_loguniform('eta', 1e-3, 0.5),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 1e-8, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-8, 10.0)
[I 2024-06-18 16:36:00,238] Trial 0 finished with value: 0.8276744787527525 and parameters: {'eta': 0.03028525153605885, 'max_depth': 8, 'subsample': 0.801381688035822, 'colsample_bytree': 0.7724415914984484, 'gamma': 2.4504079607415994e-05, 'lambda': 0.006502000785097662, 'alpha': 8.675307976899775e-05}. Best is trial 0 with value: 0.8276744787527525.
  'eta': trial.suggest_loguniform('eta', 1e-3, 0.5),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample

Best value: 0.832274713959752
Best trial: {'eta': 0.23221571189095544, 'max_depth': 5, 'subsample': 0.9272654256205575, 'colsample_bytree': 0.7325163770688324, 'gamma': 3.672084660729236e-05, 'lambda': 0.00013440553631962227, 'alpha': 2.4138932621244575}


In [9]:
# Tune LightGBM with the same Optuna setup as the other base models: 100 trials,
# seeded TPE sampler, maximizing the mean CV accuracy returned by `objective`.
# `initial_model` is reassigned here; it only serves as the template for `objective`.
initial_model = LGBMClassifier(objective='multiclass', random_state=0, device='cpu', verbosity=-1)
study_lgb = optuna.create_study(sampler=TPESampler(n_startup_trials=30, multivariate=True, seed=0), direction="maximize")
study_lgb.optimize(lambda trial: objective(trial, initial_model), n_trials=100)
print('Best value:', study_lgb.best_value)
print('Best trial:', study_lgb.best_trial.params)

[I 2024-06-18 16:39:55,373] A new study created in memory with name: no-name-233ee22e-f201-4142-8981-a3eb53599f18
[I 2024-06-18 16:40:12,439] Trial 0 finished with value: 0.8201468701287828 and parameters: {'num_leaves': 320, 'learning_rate': 0.269388301928541, 'n_estimators': 843, 'subsample_for_bin': 172567, 'min_child_samples': 223, 'reg_alpha': 0.0028770084050677926, 'reg_lambda': 2.3761404778025532e-05, 'colsample_bytree': 0.9350638004692479, 'subsample': 0.9727470703757719, 'max_depth': 6}. Best is trial 0 with value: 0.8201468701287828.
[I 2024-06-18 16:41:41,558] Trial 1 finished with value: 0.8235708773385728 and parameters: {'num_leaves': 417, 'learning_rate': 0.11423254155608371, 'n_estimators': 811, 'subsample_for_bin': 279167, 'min_child_samples': 54, 'reg_alpha': 7.435205853060191e-09, 'reg_lambda': 1.5928833561691813e-09, 'colsample_bytree': 0.8995719073287628, 'subsample': 0.8336175632123879, 'max_depth': 14}. Best is trial 1 with value: 0.8235708773385728.
[I 2024-06-1

Best value: 0.8333855790672603


In [10]:
# Tune CatBoost with the same Optuna setup as the other base models: 100 trials,
# seeded TPE sampler, maximizing the mean CV accuracy returned by `objective`.
# `initial_model` is reassigned here; it only serves as the template for `objective`.
initial_model = CatBoostClassifier(objective='MultiClass', random_seed=0, verbose=0)
study_cat = optuna.create_study(sampler=TPESampler(n_startup_trials=30, multivariate=True, seed=0), direction="maximize")
study_cat.optimize(lambda trial: objective(trial, initial_model), n_trials=100)
print('Best value:', study_cat.best_value)
print('Best trial:', study_cat.best_trial.params)

[I 2024-06-18 17:37:42,464] A new study created in memory with name: no-name-6f9bf779-e42c-411c-83ca-1907134edf15
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10),
  'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 1e-3, 10),
[I 2024-06-18 17:38:19,671] Trial 0 finished with value: 0.8287722770893946 and parameters: {'iterations': 594, 'depth': 8, 'learning_rate': 0.01605191133358762, 'l2_leaf_reg': 0.15119336467641012, 'border_count': 109, 'random_strength': 0.3833332156156664, 'bagging_temperature': 0.0562793204741517, 'od_type': 'Iter', 'od_wait': 25}. Best is trial 0 with value: 0.8287722770893946.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10),
  'random_strength': trial.suggest_loguniform('random_stre

Best value: 0.8321440206316464
Best trial: {'iterations': 898, 'depth': 6, 'learning_rate': 0.04808284430992379, 'l2_leaf_reg': 1.243823890921862, 'border_count': 126, 'random_strength': 0.007112741144396439, 'bagging_temperature': 0.07460997915246893, 'od_type': 'Iter', 'od_wait': 22}


In [8]:
# Hyperparameters hard-coded so the base models can be built without re-running
# the tuning cells.
# NOTE(review): presumably these are the best trials of the three studies; the
# LightGBM values in particular should be checked against study_lgb - TODO confirm.
lgb_best_params = {'num_leaves': 483, 'learning_rate': 0.014485102960977827, 'n_estimators': 914, 'subsample_for_bin': 26089, 'min_child_samples': 379, 'reg_alpha': 0.0021254339166529115, 'reg_lambda': 0.04441464594776136, 'colsample_bytree': 0.4523640107645242, 'subsample': 0.7551685547195315, 'max_depth': 13}
xgb_best_params = {'eta': 0.23221571189095544, 'max_depth': 5, 'subsample': 0.9272654256205575, 'colsample_bytree': 0.7325163770688324, 'gamma': 3.672084660729236e-05, 'lambda': 0.00013440553631962227, 'alpha': 2.4138932621244575}
cat_best_params = {'iterations': 898, 'depth': 6, 'learning_rate': 0.04808284430992379, 'l2_leaf_reg': 1.243823890921862, 'border_count': 126, 'random_strength': 0.007112741144396439, 'bagging_temperature': 0.07460997915246893, 'od_type': 'Iter', 'od_wait': 22}
# Base (level-1) models for stacking: tuned values plus the fixed objective and seed.
lgb_model = LGBMClassifier(**lgb_best_params, objective='multiclass', random_state=0, device='cpu', verbosity=-1)
xgb_model = XGBClassifier(**xgb_best_params,objective='multi:softmax', random_state=0, eval_metric=['merror','mlogloss'])
cat_model = CatBoostClassifier(**cat_best_params, objective='MultiClass', random_seed=0, verbose=0)

In [9]:
# Level-1 predictions: for each base model, out-of-fold predictions on the
# training rows and fold-aggregated predictions on the test rows.
pred_train_lgb, pred_test_lgb = predict_cv(lgb_model, x_train, y_train, x_test)
pred_train_xgb, pred_test_xgb = predict_cv(xgb_model, x_train, y_train, x_test)
pred_train_cat, pred_test_cat = predict_cv(cat_model, x_train, y_train, x_test)

CV Progress: 100%|██████████| 5/5 [00:53<00:00, 10.63s/it]
CV Progress: 100%|██████████| 5/5 [00:02<00:00,  1.91it/s]
CV Progress: 100%|██████████| 5/5 [00:19<00:00,  3.93s/it]


In [1]:
# Out-of-fold accuracy of each base model against the encoded training labels.
# Needs `accuracy_score` from the import cell and the pred_train_* arrays from
# the predict_cv cell, so both must have been run in the current kernel.
print(f"Accuracy LGBM: {accuracy_score(y_train, pred_train_lgb)}")
print(f"Accuracy XGB: {accuracy_score(y_train, pred_train_xgb)}")
print(f"Accuracy CatBoost: {accuracy_score(y_train, pred_train_cat)}")

# Raw out-of-fold predictions, printed for a quick visual check.
print(f"pred lgbm: {pred_train_lgb}")
print(f"pred xgb: {pred_train_xgb}")
print(f"pred cat: {pred_train_cat.flatten()}")


NameError: name 'accuracy_score' is not defined

In [12]:
# Build the level-2 (meta) feature tables: one column per base model.
# The CatBoost predictions are flattened to 1-D first so they can be used as a
# DataFrame column (presumably CatBoost returns a column vector - TODO confirm).
pred_train_cat, pred_test_cat = pred_train_cat.flatten(), pred_test_cat.flatten()
x2_train = pd.DataFrame({'lgb': pred_train_lgb, 'xgb': pred_train_xgb, 'cat': pred_train_cat})
x2_test = pd.DataFrame({'lgb': pred_test_lgb, 'xgb': pred_test_xgb, 'cat': pred_test_cat})

In [13]:
# Level-2 (meta) model: logistic regression on the three base-model predictions.
linear_model = LogisticRegression(
    multi_class='multinomial',  # configured for multi-class classification
    random_state=42
)
cv = KFold(5, shuffle=True, random_state=42)
cv_splits = tqdm(cv.split(x2_train, y_train), total=cv.get_n_splits(), desc='CV Progress')

# Validation accuracy of the meta-model on each fold.
scores = []
for train_idx, val_idx in cv_splits:
    x2_train_fold, x2_val_fold = x2_train.iloc[train_idx], x2_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    linear_model.fit(x2_train_fold, y_train_fold)
    y_pred = linear_model.predict(x2_val_fold)
    fold_accuracy = accuracy_score(y_val_fold, y_pred)
    scores.append(fold_accuracy)

    print(f'score: {fold_accuracy}')

# The loop leaves `linear_model` fitted on the last fold's training split only.
# Refit on every training row so the model used for the submission has seen
# all available data.
linear_model.fit(x2_train, y_train)

print(f"Mean Score ＝ {np.mean(scores):.5f}")

CV Progress:  40%|████      | 2/5 [00:00<00:00,  7.54it/s]

score: 0.8347490852064819
score: 0.8359905906952431


CV Progress:  80%|████████  | 4/5 [00:00<00:00,  8.48it/s]

score: 0.8263199163617355
score: 0.8300986734627197


CV Progress: 100%|██████████| 5/5 [00:00<00:00,  8.46it/s]

score: 0.8321897667124093
Mean Score ＝ 0.83187





## 提出用ファイルの作成

In [14]:
# Predict the test set with the meta-model, map the integer predictions back to
# the original Target labels with the fitted LabelEncoder, and write the
# submission CSV.
submit = pd.read_csv("../inputs/sample_submission.csv")
pred = linear_model.predict(x2_test)
submit['Target'] = le.inverse_transform(pred)
submit.to_csv("../outputs/submission_add_feature_and_stacking.csv", index=False)
submit.head()

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Graduate
4,76522,Enrolled
