## カラムの日本語訳

In [15]:
# Marital status - 配偶者の有無
# Application mode - 応募形態
# Application order - 応募順序
# Course - コース
# Daytime/evening attendance - 昼間/夜間の出席
# Previous qualification - 以前の資格
# Previous qualification (grade) - 以前の資格（成績）
# Nacionality - 国籍
# Mother's qualification - 母親の資格
# Father's qualification - 父親の資格
# Mother's occupation - 母親の職業
# Father's occupation - 父親の職業
# Admission grade - 入学成績
# Displaced - 移住者
# Educational special needs - 教育的特別支援
# Debtor - 債務者
# Tuition fees up to date - 授業料の支払い状況
# Gender - 性別
# Scholarship holder - 奨学生
# Age at enrollment - 入学時の年齢
# International - 国際的
# Curricular units 1st sem (credited) - 1学期の履修単位（認定済み）
# Curricular units 1st sem (enrolled) - 1学期の履修単位（登録済み）
# Curricular units 1st sem (evaluations) - 1学期の履修単位（評価済み）
# Curricular units 1st sem (approved) - 1学期の履修単位（合格済み）
# Curricular units 1st sem (grade) - 1学期の履修単位（成績）
# Curricular units 1st sem (without evaluations) - 1学期の履修単位（未評価）
# Curricular units 2nd sem (credited) - 2学期の履修単位（認定済み）
# Curricular units 2nd sem (enrolled) - 2学期の履修単位（登録済み）
# Curricular units 2nd sem (evaluations) - 2学期の履修単位（評価済み）
# Curricular units 2nd sem (approved) - 2学期の履修単位（合格済み）
# Curricular units 2nd sem (grade) - 2学期の履修単位（成績）
# Curricular units 2nd sem (without evaluations) - 2学期の履修単位（未評価）
# Unemployment rate - 失業率
# Inflation rate - インフレ率
# GDP - 国内総生産

## import

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from optuna.samplers import TPESampler
import optuna
import warnings
from sklearn.manifold import TSNE

## データの読み込み

In [32]:
train = pd.read_csv("../inputs/train.csv")
test = pd.read_csv("../inputs/test.csv")

In [33]:
x_train = train.drop(columns=["Target", "id"], axis=1)
y_train = train["Target"]
x_test = test.drop(columns=["id"], axis=1)

In [34]:
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)

## T-SNE２よる次元削減特徴量

In [36]:
features = x_train.columns
x_train_for_tsne = x_train.copy()
x_test_for_tsne = x_test.copy()

scaler = StandardScaler()
x_train_for_tsne = scaler.fit_transform(x_train_for_tsne)
x_test_for_tsne = scaler.transform(x_test_for_tsne)

tsne = TSNE(n_components=2, random_state=42)
train_tsne = tsne.fit_transform(x_train_for_tsne)
test_tsne = tsne.fit_transform(x_test_for_tsne)

# 新たな特徴量として追加
train_tsne_df = pd.DataFrame(train_tsne, columns=['tsne_1', 'tsne_2'])
test_tsne_df = pd.DataFrame(test_tsne, columns=['tsne_1', 'tsne_2'])

x_train = pd.concat([x_train, train_tsne_df], axis=1)
x_test = pd.concat([x_test, test_tsne_df], axis=1)

## 特徴量エンジニアリング

In [39]:
age_labels = ['0', '1', '2', '3', '4', '5', '6']
age_bins = [0, 18, 25, 30, 40, 50, 60, 70]
def add_features(df):
    df["sum_grade"] = df["Curricular units 1st sem (grade)"] + df["Curricular units 2nd sem (grade)"]
    df["interaction_grade"] = df["Curricular units 1st sem (grade)"] * df["Curricular units 2nd sem (grade)"]
    df["average_grade"] = df["sum_grade"] / 2
    
    df["sum_approved"] = df["Curricular units 1st sem (approved)"] + df["Curricular units 2nd sem (approved)"]
    df["interaction_approved"] = df["Curricular units 1st sem (approved)"] * df["Curricular units 2nd sem (approved)"]
    df["average_approved"] = df["sum_approved"] / 2
    
    df['age_group'] = pd.cut(df['Age at enrollment'], bins=age_bins, labels=age_labels, right=True)

add_features(x_train)
add_features(x_test)

In [40]:
x_train

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,GDP,tsne_1,tsne_2,sum_grade,interaction_grade,average_grade,sum_approved,interaction_approved,average_approved,age_group
0,1,1,1,9238,1,1,126.0,1,1,19,...,2.02,108.181633,1.579130,26.928571,180.214286,13.464286,12,36,6.0,0
1,1,17,1,9238,1,1,125.0,1,19,19,...,2.02,-35.984982,-23.366604,11.600000,0.000000,5.800000,4,0,2.0,0
2,1,17,2,9254,1,1,137.0,1,3,19,...,-0.92,-71.786469,-5.816445,0.000000,0.000000,0.000000,0,0,0.0,0
3,1,1,3,9500,1,1,131.0,1,19,3,...,2.02,97.711197,-45.690197,25.411250,161.419825,12.705625,14,49,7.0,0
4,1,1,2,9500,1,1,132.0,1,19,37,...,0.32,-44.124130,-85.912933,25.866667,167.271111,12.933333,12,36,6.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,1,17,1,9254,1,1,121.0,1,19,1,...,0.79,86.098816,-64.980354,21.266667,113.066667,10.633333,11,30,5.5,0
76514,1,1,6,9254,1,1,125.0,1,1,38,...,-3.12,-67.259239,-57.417076,26.875000,180.375000,13.437500,10,24,5.0,1
76515,5,17,1,9085,1,1,138.0,1,37,37,...,-3.12,-89.092697,74.283813,23.900000,142.500000,11.950000,9,20,4.5,1
76516,1,1,3,9070,1,1,136.0,1,38,37,...,0.32,-87.191727,-4.385127,0.000000,0.000000,0.000000,0,0,0.0,0


## optunaによるチューニング

In [41]:
def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 300, 1200),
        'subsample_for_bin': trial.suggest_int('subsample_for_bin', 20000, 300000),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 500),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'subsample': trial.suggest_float('subsample', 0.25, 1.0),
        'max_depth': trial.suggest_int('max_depth', 1, 15)
    }
    
    model = LGBMClassifier(**params, objective='multiclass', random_state=0, device='cpu', verbosity=-1)
    
    # Cross-validation with 5 folds using KFold
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_val_score(model, x_train, y_train, cv=kf, scoring='accuracy')
    
    # We maximize accuracy, so we return the mean accuracy of the cross-validation
    return np.mean(cv_results)

study = optuna.create_study(sampler=TPESampler(n_startup_trials=30, multivariate=True, seed=0), direction="maximize")
study.optimize(objective, n_trials=100)


[I 2024-06-19 13:21:29,573] A new study created in memory with name: no-name-63086e16-f6d5-4487-886a-e8f7a8afa805
[I 2024-06-19 13:21:46,095] Trial 0 finished with value: 0.8197156154513126 and parameters: {'num_leaves': 320, 'learning_rate': 0.269388301928541, 'n_estimators': 843, 'subsample_for_bin': 172567, 'min_child_samples': 223, 'reg_alpha': 0.0028770084050677926, 'reg_lambda': 2.3761404778025532e-05, 'colsample_bytree': 0.9350638004692479, 'subsample': 0.9727470703757719, 'max_depth': 6}. Best is trial 0 with value: 0.8197156154513126.
[I 2024-06-19 13:23:18,245] Trial 1 finished with value: 0.8242896957670427 and parameters: {'num_leaves': 417, 'learning_rate': 0.11423254155608371, 'n_estimators': 811, 'subsample_for_bin': 279167, 'min_child_samples': 54, 'reg_alpha': 7.435205853060191e-09, 'reg_lambda': 1.5928833561691813e-09, 'colsample_bytree': 0.8995719073287628, 'subsample': 0.8336175632123879, 'max_depth': 14}. Best is trial 1 with value: 0.8242896957670427.
[I 2024-06-1

## optunaによるチューニングパラメータを利用してモデルを作成

In [42]:
# params = {'num_leaves': 489, 'learning_rate': 0.012116445295938309, 'n_estimators': 826, 'subsample_for_bin': 99575, 'min_child_samples': 375, 'reg_alpha': 0.0009319658505052182, 'reg_lambda': 9.808198625785853e-05, 'colsample_bytree': 0.49847769193736885, 'subsample': 0.9517501312160254, 'max_depth': 15}

lgb_model = LGBMClassifier(**study.best_params, objective='multiclass', random_state=0, device='cpu', verbosity=-1)
print('Best value:', study.best_value)
print('Best trial:', study.best_trial.params)

Best value: 0.8334509342711278
Best trial: {'num_leaves': 254, 'learning_rate': 0.012341088279315678, 'n_estimators': 1150, 'subsample_for_bin': 112655, 'min_child_samples': 409, 'reg_alpha': 5.04266093431367e-07, 'reg_lambda': 0.01889696770827688, 'colsample_bytree': 0.4472356451830753, 'subsample': 0.5719969608388208, 'max_depth': 11}


## クロスバリデーション

In [43]:
cv = KFold(5, shuffle=True, random_state=42)
cv_splits = tqdm(cv.split(x_train, y_train), total=cv.get_n_splits(), desc='CV Progress')

scores = []
for train_idx, val_idx in cv_splits:
    x_train_fold, x_val_fold = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    lgb_model.fit(x_train_fold, y_train_fold)
    y_pred = lgb_model.predict(x_val_fold)
    r2 = accuracy_score(y_val_fold, y_pred)
    scores.append(r2)
    
    print(f'score: {r2}')

print(f"Mean Score: {np.mean(scores):.5f}") 

CV Progress:  20%|██        | 1/5 [00:10<00:43, 10.86s/it]

score: 0.8351411395713539


CV Progress:  40%|████      | 2/5 [00:21<00:32, 10.89s/it]

score: 0.8380162049137481


CV Progress:  60%|██████    | 3/5 [00:32<00:21, 10.94s/it]

score: 0.8286069001568217


CV Progress:  80%|████████  | 4/5 [00:43<00:10, 10.84s/it]

score: 0.8309481800954062


CV Progress: 100%|██████████| 5/5 [00:54<00:00, 10.83s/it]

score: 0.8345422466183101
Mean Score: 0.83345





In [44]:
from sklearn import datasets

feature_name = x_train.columns
importance = pd.DataFrame(lgb_model.feature_importances_, columns=['importance'], index=feature_name)
display(importance.sort_values('importance', ascending=False))

Unnamed: 0,importance
Admission grade,20198
tsne_2,18730
tsne_1,18232
Previous qualification (grade),13588
Curricular units 2nd sem (grade),12695
Curricular units 1st sem (grade),11878
Course,11227
sum_grade,11085
interaction_grade,10329
Age at enrollment,9127


## 提出用ファイルの作成

In [45]:
submit = pd.read_csv("../inputs/sample_submission.csv")
pred = lgb_model.predict(x_test)
submit['Target'] = le.inverse_transform(pred)
submit.to_csv("../outputs/submission_add_feature_tsne.csv", index=False)
submit.head()

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Graduate
4,76522,Enrolled
