## カラムの日本語訳

In [1]:
# Marital status - 配偶者の有無
# Application mode - 応募形態
# Application order - 応募順序
# Course - コース
# Daytime/evening attendance - 昼間/夜間の出席
# Previous qualification - 以前の資格
# Previous qualification (grade) - 以前の資格（成績）
# Nacionality - 国籍
# Mother's qualification - 母親の資格
# Father's qualification - 父親の資格
# Mother's occupation - 母親の職業
# Father's occupation - 父親の職業
# Admission grade - 入学成績
# Displaced - 移住者
# Educational special needs - 教育的特別支援
# Debtor - 債務者
# Tuition fees up to date - 授業料の支払い状況
# Gender - 性別
# Scholarship holder - 奨学生
# Age at enrollment - 入学時の年齢
# International - 国際的
# Curricular units 1st sem (credited) - 1学期の履修単位（認定済み）
# Curricular units 1st sem (enrolled) - 1学期の履修単位（登録済み）
# Curricular units 1st sem (evaluations) - 1学期の履修単位（評価済み）
# Curricular units 1st sem (approved) - 1学期の履修単位（合格済み）
# Curricular units 1st sem (grade) - 1学期の履修単位（成績）
# Curricular units 1st sem (without evaluations) - 1学期の履修単位（未評価）
# Curricular units 2nd sem (credited) - 2学期の履修単位（認定済み）
# Curricular units 2nd sem (enrolled) - 2学期の履修単位（登録済み）
# Curricular units 2nd sem (evaluations) - 2学期の履修単位（評価済み）
# Curricular units 2nd sem (approved) - 2学期の履修単位（合格済み）
# Curricular units 2nd sem (grade) - 2学期の履修単位（成績）
# Curricular units 2nd sem (without evaluations) - 2学期の履修単位（未評価）
# Unemployment rate - 失業率
# Inflation rate - インフレ率
# GDP - 国内総生産

## import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from optuna.samplers import TPESampler
import optuna
import warnings

  from .autonotebook import tqdm as notebook_tqdm


## データの読み込み

In [2]:
train = pd.read_csv("inputs/train.csv")
test = pd.read_csv("inputs/test.csv")

In [None]:
columns = train.columns.tolist()
for column in columns:
    print(f"{column}: {train[column].dtype}")
    print(f"ユニークな値: {train[column].unique()}")
    print(f"統計情報: {train[column].describe()}\n")

In [3]:
x_train = train.drop(columns=["Target", "id"], axis=1)
y_train = train["Target"]
x_test = test.drop(columns=["id"], axis=1)

In [4]:
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)

In [24]:
x_train.head(5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,1,1,1,9238,1,1,126.0,1,1,19,...,0,0,6,7,6,12.428571,0,11.1,0.6,2.02
1,1,17,1,9238,1,1,125.0,1,19,19,...,0,0,6,9,0,0.0,0,11.1,0.6,2.02
2,1,17,2,9254,1,1,137.0,1,3,19,...,0,0,6,0,0,0.0,0,16.2,0.3,-0.92
3,1,1,3,9500,1,1,131.0,1,19,3,...,0,0,8,11,7,12.82,0,11.1,0.6,2.02
4,1,1,2,9500,1,1,132.0,1,19,37,...,0,0,7,12,6,12.933333,0,7.6,2.6,0.32


## 特徴量エンジニアリング

In [5]:
def cleaning(dataset):
    features = dataset.columns.tolist()
    dataset['total'] = dataset[features].sum(axis=1)
    dataset['mean_features'] = 0.1*dataset[features].mean(axis=1)
    dataset['std_features'] = dataset[features].std(axis=1)
    dataset['max_features'] = dataset[features].max(axis=1)
    dataset['min_features'] = dataset[features].min(axis=1)
    dataset['range_features'] = dataset['max_features'] - dataset['min_features']
    dataset['median_features'] = 0.1*dataset[features].median(axis=1)
    dataset['skewness_features'] = dataset[features].skew(axis=1)
    dataset['ptp'] = dataset[features].values.ptp(axis=1)
    dataset['q25'] = dataset[features].quantile(0.25, axis=1)
    dataset['q75'] = dataset[features].quantile(0.75, axis=1)

cleaning(x_train)
cleaning(x_test)

In [6]:
def add_features(df):
    df['ClimateImpact'] = df['MonsoonIntensity'] + df['ClimateChange']
    df['AnthropogenicPressure'] = df['Deforestation'] + df['Urbanization'] + df['AgriculturalPractices'] + df['Encroachments']
    df['InfrastructureQuality'] = df['DamsQuality'] + df['DrainageSystems'] + df['DeterioratingInfrastructure']
    df['CoastalVulnerabilityTotal'] = df['CoastalVulnerability'] + df['Landslides']
    df['PreventiveMeasuresEfficiency'] = df['RiverManagement'] + df['IneffectiveDisasterPreparedness'] + df['InadequatePlanning']
    df['EcosystemImpact'] = df['WetlandLoss'] + df['Watersheds']
    df['SocioPoliticalContext'] = df['PopulationScore'] * df['PoliticalFactors']

add_features(x_train)
add_features(x_test)

## trainとtestの分布を確認

In [None]:
def plot_distribution_pairs(train, test, feature, hue="set", palette=None):
    data_df = train.copy()
    data_df['set'] = 'train'
    data_df = pd.concat([data_df, test.copy()]).fillna('test')
    data_df.replace([np.inf, -np.inf], np.nan, inplace=True)

    f, axes = plt.subplots(1, 2, figsize=(14, 6))
    for i, s in enumerate(data_df[hue].unique()):
        selection = data_df.loc[data_df[hue]==s, feature]
        # Filter 'selection' to include only the central 95% of the data
        # q_025, q_975 = np.percentile(selection, [2.5, 97.5])
        # selection_filtered = selection[(selection >= q_025) & (selection <= q_975)]
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=FutureWarning)
            sns.histplot(selection, color=palette[i], ax=axes[0], label=s)
            sns.boxplot(x=hue, y=feature, data=data_df, palette=palette, ax=axes[1])
    axes[0].set_title(f"Paired train/test distributions of {feature}")
    axes[1].set_title(f"Paired train/test boxplots of {feature}")
    axes[0].legend()
    axes[1].legend()
    plt.show()
color_list = ["#A5D7E8", "#576CBC", "#19376D", "#0B2447"]
for feature in x_train.columns:
  plot_distribution_pairs(x_train, x_test, feature, palette=color_list)

## 標準化

In [8]:
# s = StandardScaler()
# s.fit(pd.concat([x_train, x_test]))
# x_train = s.transform(x_train)
# x_test = s.transform(x_test)

## optunaによるチューニング

In [5]:
def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 300, 1200),
        'subsample_for_bin': trial.suggest_int('subsample_for_bin', 20000, 300000),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 500),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'subsample': trial.suggest_float('subsample', 0.25, 1.0),
        'max_depth': trial.suggest_int('max_depth', 1, 15)
    }
    
    model = LGBMClassifier(**params, objective='multiclass', random_state=0, device='cpu', verbosity=-1)
    
    # Cross-validation with 5 folds using KFold
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_val_score(model, x_train, y_train, cv=kf, scoring='accuracy')
    
    # We maximize accuracy, so we return the mean accuracy of the cross-validation
    return np.mean(cv_results)

study = optuna.create_study(sampler=TPESampler(n_startup_trials=30, multivariate=True, seed=0), direction="maximize")
study.optimize(objective, n_trials=100)


[I 2024-06-17 14:06:01,210] A new study created in memory with name: no-name-5576c4ad-4ab8-469c-8dbb-f529365f458d
[I 2024-06-17 14:06:15,296] Trial 0 finished with value: 0.8200292768768216 and parameters: {'num_leaves': 320, 'learning_rate': 0.269388301928541, 'n_estimators': 843, 'subsample_for_bin': 172567, 'min_child_samples': 223, 'reg_alpha': 0.0028770084050677926, 'reg_lambda': 2.3761404778025532e-05, 'colsample_bytree': 0.9350638004692479, 'subsample': 0.9727470703757719, 'max_depth': 6}. Best is trial 0 with value: 0.8200292768768216.
[I 2024-06-17 14:07:37,218] Trial 1 finished with value: 0.8233356797328909 and parameters: {'num_leaves': 417, 'learning_rate': 0.11423254155608371, 'n_estimators': 811, 'subsample_for_bin': 279167, 'min_child_samples': 54, 'reg_alpha': 7.435205853060191e-09, 'reg_lambda': 1.5928833561691813e-09, 'colsample_bytree': 0.8995719073287628, 'subsample': 0.8336175632123879, 'max_depth': 14}. Best is trial 1 with value: 0.8233356797328909.
[I 2024-06-1

## optunaによるチューニングパラメータを利用してモデルを作成

In [6]:
lgb_model = LGBMClassifier(**study.best_params, objective='multiclass', random_state=0, device='cpu', verbosity=-1)
print('Best value:', study.best_value)
print('Best trial:', study.best_trial.params)

Best value: 0.8339867572747865
Best trial: {'num_leaves': 429, 'learning_rate': 0.025355751209213052, 'n_estimators': 660, 'subsample_for_bin': 122661, 'min_child_samples': 390, 'reg_alpha': 1.149659987277261e-05, 'reg_lambda': 2.3293680332394752e-05, 'colsample_bytree': 0.5758409684476786, 'subsample': 0.5721023355561136, 'max_depth': 9}


## クロスバリデーション

In [8]:
cv = KFold(5, shuffle=True, random_state=42)
cv_splits = tqdm(cv.split(x_train, y_train), total=cv.get_n_splits(), desc='CV Progress')

scores = []
for train_idx, val_idx in cv_splits:
    x_train_fold, x_val_fold = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    lgb_model.fit(x_train_fold, y_train_fold)
    y_pred = lgb_model.predict(x_val_fold)
    r2 = accuracy_score(y_val_fold, y_pred)
    scores.append(r2)
    
    print(f'score: {r2}')

print(f"Mean Score ＝ {np.mean(scores):.5f}") 

CV Progress:  20%|██        | 1/5 [00:04<00:16,  4.20s/it]

score: 0.8360559330893884


CV Progress:  40%|████      | 2/5 [00:08<00:12,  4.13s/it]

score: 0.83840825927862


CV Progress:  60%|██████    | 3/5 [00:12<00:08,  4.10s/it]

score: 0.8288029273392578


CV Progress:  80%|████████  | 4/5 [00:16<00:04,  4.08s/it]

score: 0.8321244200483565


CV Progress: 100%|██████████| 5/5 [00:20<00:00,  4.11s/it]

score: 0.8345422466183101
Mean Score ＝ 0.83399





## 提出用ファイルの作成

In [9]:
submit = pd.read_csv("inputs/sample_submission.csv")
pred = lgb_model.predict(x_test)
submit['Target'] = le.inverse_transform(pred)
submit.to_csv("outputs/submission.csv", index=False)
submit.head()

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Graduate
4,76522,Enrolled
