In [1]:
# ライブラリの読み込み
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, roc_curve, auc


In [2]:
import kagglehub

# Kaggle handle of the dataset to fetch.
dataset_handle = "jaxa623/extrovert-vs-introvert-behavior-data-backup"

# Download the latest version and keep the value returned by kagglehub
# (printed below so the location of the files is visible in the output).
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/extrovert-vs-introvert-behavior-data-backup


In [3]:
# Load the competition train/test splits and the original source dataset.
train_df = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')
original = pd.read_csv('/kaggle/input/extrovert-vs-introvert-behavior-data-backup/personality_dataset.csv')

# Columns used as the join key between the competition data and `original`.
# They are listed explicitly instead of relying on pandas' implicit
# "all shared columns" behaviour, so the key cannot change silently.
merge_keys = [
    'Time_spent_Alone',
    'Stage_fear',
    'Social_event_attendance',
    'Going_outside',
    'Drained_after_socializing',
    'Friends_circle_size',
    'Post_frequency',
]

# Rename the original label so it does not collide with the competition's
# own 'Personality' column after the merge.
original = original.rename(columns={'Personality': 'match_p'})

# De-duplicate on the join keys only. Dropping only fully identical rows
# would keep rows that share the same key but carry different labels, and a
# left merge against those would duplicate train/test rows (and their ids).
# When a key maps to several labels, the first occurrence is kept.
original = original.drop_duplicates(subset=merge_keys)

n_train_rows, n_test_rows = len(train_df), len(test_df)

# Left-join so that every competition row is kept; rows without a match in
# `original` get NaN in 'match_p'.
train_df = train_df.merge(original, how='left', on=merge_keys)
test_df = test_df.merge(original, how='left', on=merge_keys)

# The merge must only add columns, never rows.
assert len(train_df) == n_train_rows, "merge changed the number of train rows"
assert len(test_df) == n_test_rows, "merge changed the number of test rows"


In [4]:
# Display the de-duplicated original dataset for inspection.
original


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,match_p
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert
...,...,...,...,...,...,...,...,...
2894,0.0,No,9.0,3.0,No,12.0,,Extrovert
2896,3.0,No,8.0,3.0,No,14.0,9.0,Extrovert
2897,4.0,Yes,1.0,1.0,Yes,4.0,0.0,Introvert
2898,11.0,Yes,1.0,,Yes,2.0,0.0,Introvert


In [5]:
# After merging `original` into train/test:
# turn the matched label ('match_p') into an integer feature.
# Rows with no match in `original` have NaN there and get their own
# 'Unknown' category.
train_match = train_df['match_p'].fillna('Unknown')
test_match = test_df['match_p'].fillna('Unknown')

# Fit on the values of BOTH frames: LabelEncoder.transform raises ValueError
# for a value it has not seen during fit, which would happen if a category
# (e.g. 'Unknown') occurred only in the test set. When both frames contain
# the same set of values the resulting codes are identical to fitting on
# train alone.
le = LabelEncoder()
le.fit(train_match.tolist() + test_match.tolist())

train_df['match_p_encoded'] = le.transform(train_match)
test_df['match_p_encoded'] = le.transform(test_match)


In [6]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import early_stopping, log_evaluation

# =========================================================
# Feature list and target preparation
# =========================================================
# Behavioural columns, in the order they are fed to the model.
behaviour_cols = [
    'Time_spent_Alone',
    'Stage_fear',
    'Drained_after_socializing',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]

# Model inputs: the behavioural columns followed by the encoded matched label.
features = behaviour_cols + ['match_p_encoded']

# Encode Personality as a number (Introvert: 0, Extrovert: 1).
personality_to_int = {'Introvert': 0, 'Extrovert': 1}
train_df['target'] = train_df['Personality'].map(personality_to_int)

# The test set has no label; keep a placeholder column.
test_df['target'] = np.nan

# =========================================================
# Missing-value handling
# =========================================================
# Categorical columns: a missing answer becomes its own 'Unknown' level,
# then the column is converted to pandas 'category' dtype.
for col in ['Stage_fear', 'Drained_after_socializing']:
    train_df[col] = train_df[col].fillna('Unknown').astype('category')
    test_df[col] = test_df[col].fillna('Unknown').astype('category')

# Numeric columns: impute with the mean of the TRAINING data.
# The means are restricted to the feature columns so that non-feature
# columns are left alone -- in particular the NaN placeholder in
# test_df['target'] is not overwritten with the mean of the train labels,
# and a missing train label is never silently imputed.
# They are computed once so train and test receive exactly the same values.
feature_means = train_df[features].mean(numeric_only=True)
train_df = train_df.fillna(feature_means)
test_df = test_df.fillna(feature_means)

# =========================================================
# Build the training and test matrices
# =========================================================
# Columns passed to LightGBM as categorical features.
cat_cols = ['Stage_fear', 'Drained_after_socializing']

# Train and test share the same feature columns; only train has a label.
X_train, X_test = train_df[features], test_df[features]
y_train = train_df['target']

# =========================================================
# LightGBM training with Stratified K-Fold cross-validation
# =========================================================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X_train))   # out-of-fold P(target == 1) for every train row
test_preds = np.zeros(len(X_test))   # P(target == 1) for every test row, averaged over folds

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # NOTE(review): with max_depth=4 a tree has at most 2**4 = 16 leaves, so
    # num_leaves=82 is never reached. bagging_freq=1 has no effect unless
    # bagging_fraction (subsample) is set below 1.0, which it is not here.
    # The values are left untouched so results stay reproducible.
    model = lgb.LGBMClassifier(
        objective='binary',
        random_state=42,
        num_leaves=82,
        max_depth=4,
        learning_rate=0.0476,
        feature_fraction=0.62,
        bagging_freq=1,
        min_data_in_leaf=14,
        reg_alpha=3.27,
        reg_lambda=1.66,
        n_estimators=1000,
        verbosity=-1,
        n_jobs=-1,
    )

    # The validation fold drives early stopping (stop after 50 rounds
    # without improvement) and is logged every 100 rounds.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        categorical_feature=cat_cols,
        callbacks=[
            early_stopping(stopping_rounds=50),
            log_evaluation(period=100)
        ],
    )

    # Column 1 of predict_proba is the probability of class 1.
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    # Accumulate an equal-weight average of the per-fold test predictions.
    test_preds += model.predict_proba(X_test)[:, 1] / skf.n_splits

    # Named `fold_auc` rather than `auc` so the `auc` function imported from
    # sklearn.metrics at the top of the notebook is not overwritten by a float.
    fold_auc = roc_auc_score(y_val, oof_preds[val_idx])
    print(f"Fold {fold} AUC: {fold_auc:.4f}")

# =========================================================
# Overall score, hard predictions, and submission file
# =========================================================
# AUC computed over all out-of-fold predictions at once.
final_auc = roc_auc_score(y_train, oof_preds)
print(f"OOF AUC: {final_auc:.4f}")

# Threshold the fold-averaged probabilities at 0.5 (1 = Extrovert, 0 = Introvert).
preds = (test_preds >= 0.5).astype(int)
personality_labels = np.where(preds == 1, 'Extrovert', 'Introvert')

# One row per test id with the predicted label.
submission = pd.DataFrame({'id': test_df['id'], 'Personality': personality_labels})

submission.to_csv('submission.csv', index=False)
print("[INFO] Submission saved to 'submission.csv'")


Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.119381
[200]	valid_0's binary_logloss: 0.116729
[300]	valid_0's binary_logloss: 0.116131
Early stopping, best iteration is:
[342]	valid_0's binary_logloss: 0.116068
Fold 1 AUC: 0.9750
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.128035
[200]	valid_0's binary_logloss: 0.127124
Early stopping, best iteration is:
[175]	valid_0's binary_logloss: 0.127066
Fold 2 AUC: 0.9725
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.131764
[200]	valid_0's binary_logloss: 0.130413
Early stopping, best iteration is:
[185]	valid_0's binary_logloss: 0.130355
Fold 3 AUC: 0.9691
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.123601
[200]	valid_0's binary_logloss: 0.122335
Early stopping, best iteration is:
[212]	valid_0's binary_logloss: 0.122284
Fold 4 AUC: 0.9719
Trainin