## カラムの日本語訳

In [20]:
# Marital status - 配偶者の有無
# Application mode - 応募形態
# Application order - 応募順序
# Course - コース
# Daytime/evening attendance - 昼間/夜間の出席
# Previous qualification - 以前の資格
# Previous qualification (grade) - 以前の資格（成績）
# Nacionality - 国籍
# Mother's qualification - 母親の資格
# Father's qualification - 父親の資格
# Mother's occupation - 母親の職業
# Father's occupation - 父親の職業
# Admission grade - 入学成績
# Displaced - 移住者
# Educational special needs - 教育的特別支援
# Debtor - 債務者
# Tuition fees up to date - 授業料の支払い状況
# Gender - 性別
# Scholarship holder - 奨学生
# Age at enrollment - 入学時の年齢
# International - 国際的
# Curricular units 1st sem (credited) - 1学期の履修単位（認定済み）
# Curricular units 1st sem (enrolled) - 1学期の履修単位（登録済み）
# Curricular units 1st sem (evaluations) - 1学期の履修単位（評価済み）
# Curricular units 1st sem (approved) - 1学期の履修単位（合格済み）
# Curricular units 1st sem (grade) - 1学期の履修単位（成績）
# Curricular units 1st sem (without evaluations) - 1学期の履修単位（未評価）
# Curricular units 2nd sem (credited) - 2学期の履修単位（認定済み）
# Curricular units 2nd sem (enrolled) - 2学期の履修単位（登録済み）
# Curricular units 2nd sem (evaluations) - 2学期の履修単位（評価済み）
# Curricular units 2nd sem (approved) - 2学期の履修単位（合格済み）
# Curricular units 2nd sem (grade) - 2学期の履修単位（成績）
# Curricular units 2nd sem (without evaluations) - 2学期の履修単位（未評価）
# Unemployment rate - 失業率
# Inflation rate - インフレ率
# GDP - 国内総生産

## import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from optuna.samplers import TPESampler
import optuna
import warnings

  from .autonotebook import tqdm as notebook_tqdm


## データの読み込み

In [2]:
train = pd.read_csv("../inputs/train.csv")
test = pd.read_csv("../inputs/test.csv")

In [25]:
x_train = train.drop(columns=["Target", "id"], axis=1)
y_train = train["Target"]
x_test = test.drop(columns=["id"], axis=1)

In [26]:
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)

## optunaによるチューニング

In [31]:
def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 300, 1200),
        'subsample_for_bin': trial.suggest_int('subsample_for_bin', 20000, 300000),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 500),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'subsample': trial.suggest_float('subsample', 0.25, 1.0),
        'max_depth': trial.suggest_int('max_depth', 1, 15)
    }
    
    model = LGBMClassifier(**params, objective='multiclass', random_state=0, device='cpu', verbosity=-1)
    
    # Cross-validation with 5 folds using KFold
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_val_score(model, x_train, y_train, cv=kf, scoring='accuracy')
    
    # We maximize accuracy, so we return the mean accuracy of the cross-validation
    return np.mean(cv_results)

study = optuna.create_study(sampler=TPESampler(n_startup_trials=30, multivariate=True, seed=0), direction="maximize")
study.optimize(objective, n_trials=100)


[I 2024-06-17 21:18:04,177] A new study created in memory with name: no-name-6746b9ac-67f9-4977-96cc-6b922681ea80
[I 2024-06-17 21:18:19,106] Trial 0 finished with value: 0.8201860678794365 and parameters: {'num_leaves': 320, 'learning_rate': 0.269388301928541, 'n_estimators': 843, 'subsample_for_bin': 172567, 'min_child_samples': 223, 'reg_alpha': 0.0028770084050677926, 'reg_lambda': 2.3761404778025532e-05, 'colsample_bytree': 0.9350638004692479, 'subsample': 0.9727470703757719, 'max_depth': 6}. Best is trial 0 with value: 0.8201860678794365.
[I 2024-06-17 21:19:48,598] Trial 1 finished with value: 0.8235447574605447 and parameters: {'num_leaves': 417, 'learning_rate': 0.11423254155608371, 'n_estimators': 811, 'subsample_for_bin': 279167, 'min_child_samples': 54, 'reg_alpha': 7.435205853060191e-09, 'reg_lambda': 1.5928833561691813e-09, 'colsample_bytree': 0.8995719073287628, 'subsample': 0.8336175632123879, 'max_depth': 14}. Best is trial 1 with value: 0.8235447574605447.
[I 2024-06-1

## optunaによるチューニングパラメータを利用してモデルを作成

In [32]:
lgb_model = LGBMClassifier(**study.best_params, objective='multiclass', random_state=0, device='cpu', verbosity=-1)
print('Best value:', study.best_value)
print('Best trial:', study.best_trial.params)

Best value: 0.8339998214837081
Best trial: {'num_leaves': 464, 'learning_rate': 0.01574193258343428, 'n_estimators': 1151, 'subsample_for_bin': 68638, 'min_child_samples': 440, 'reg_alpha': 2.106066174120728e-05, 'reg_lambda': 0.0003416360480277162, 'colsample_bytree': 0.47037718807948126, 'subsample': 0.39772831550793686, 'max_depth': 8}


## クロスバリデーション

In [33]:
cv = KFold(5, shuffle=True, random_state=42)
cv_splits = tqdm(cv.split(x_train, y_train), total=cv.get_n_splits(), desc='CV Progress')

scores = []
for train_idx, val_idx in cv_splits:
    x_train_fold, x_val_fold = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    lgb_model.fit(x_train_fold, y_train_fold)
    y_pred = lgb_model.predict(x_val_fold)
    r2 = accuracy_score(y_val_fold, y_pred)
    scores.append(r2)
    
    print(f'score: {r2}')

print(f"Mean Score ＝ {np.mean(scores):.5f}") 

CV Progress:  20%|██        | 1/5 [00:07<00:29,  7.49s/it]

score: 0.8359252483010977


CV Progress:  40%|████      | 2/5 [00:15<00:22,  7.54s/it]

score: 0.8380815473078934


CV Progress:  60%|██████    | 3/5 [00:22<00:15,  7.69s/it]

score: 0.8296523784631469


CV Progress:  80%|████████  | 4/5 [00:30<00:07,  7.71s/it]

score: 0.8314056067437757


CV Progress: 100%|██████████| 5/5 [00:38<00:00,  7.64s/it]

score: 0.8349343266026269
Mean Score ＝ 0.83400





## 提出用ファイルの作成

In [34]:
submit = pd.read_csv("inputs/sample_submission.csv")
pred = lgb_model.predict(x_test)
submit['Target'] = le.inverse_transform(pred)
submit.to_csv("outputs/submission_add_feature.csv", index=False)
submit.head()

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Graduate
4,76522,Enrolled
