In [69]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input tree,
# then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e8/sample_submission.csv
/kaggle/input/playground-series-s5e8/train.csv
/kaggle/input/playground-series-s5e8/test.csv


In [70]:
# ライブラリの読み込み
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# モデル作成のためのライブラリ
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

# モデル作成に役立つライブラリ
from sklearn.model_selection import train_test_split

# Evaluation metric (ROC AUC)
from sklearn.metrics import roc_auc_score

# 不要な警告を無視する
import warnings
warnings.filterwarnings('ignore')

In [71]:
# Read the competition's training and test tables (in that order) into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s5e8/train.csv",
        "/kaggle/input/playground-series-s5e8/test.csv",
    )
)

In [72]:
# Print the column index of the training table to see which fields are available.
print(train.columns)

Index(['id', 'age', 'job', 'marital', 'education', 'default', 'balance',
       'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign',
       'pdays', 'previous', 'poutcome', 'y'],
      dtype='object')


In [73]:
# Target vector: the label column `y`.
train_y = train['y']
# Feature matrix: every column except the label and the `id` / `day` columns.
train_x = train.drop(columns=['y', 'id', 'day'])

In [74]:
# Count missing values per column in the training features, largest count first.
train_missing = train_x.isna().sum().sort_values(ascending=False)
print('訓練データの欠損値:\n', train_missing, '\n')
# Same check for the test table.
test_missing = test.isna().sum().sort_values(ascending=False)
print('テストデータの欠損値:\n', test_missing)

訓練データの欠損値:
 age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
dtype: int64 

テストデータの欠損値:
 id           0
contact      0
previous     0
pdays        0
campaign     0
duration     0
month        0
day          0
loan         0
age          0
housing      0
balance      0
default      0
education    0
marital      0
job          0
poutcome     0
dtype: int64


In [75]:
# Show dtypes and non-null counts for both tables.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line to the output.
train_x.info()
print()  # blank line between the two reports
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 15 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   age        750000 non-null  int64 
 1   job        750000 non-null  object
 2   marital    750000 non-null  object
 3   education  750000 non-null  object
 4   default    750000 non-null  object
 5   balance    750000 non-null  int64 
 6   housing    750000 non-null  object
 7   loan       750000 non-null  object
 8   contact    750000 non-null  object
 9   month      750000 non-null  object
 10  duration   750000 non-null  int64 
 11  campaign   750000 non-null  int64 
 12  pdays      750000 non-null  int64 
 13  previous   750000 non-null  int64 
 14  poutcome   750000 non-null  object
dtypes: int64(6), object(9)
memory usage: 85.8+ MB
None 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 17 columns):
 #   Column     Non-Null Count   Dtype 


In [76]:
# Month abbreviation -> month number (1-12).
month_to_num = {
    'jan': 1, 'feb': 2, 'mar': 3,
    'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9,
    'oct': 10, 'nov': 11, 'dec': 12
}


def add_half_year(df):
    """Return a copy of `df` with `month` replaced by a 0/1 `half_year` flag.

    `half_year` is 0 for months 1-6 (first half) and 1 for months 7-12
    (second half). The new column is appended at the end; `month` is dropped.

    Args:
        df: DataFrame with a `month` column whose values are keys of `month_to_num`.

    Returns:
        A new DataFrame; `df` itself is not modified.

    Raises:
        ValueError: if `month` contains a value that is not a key of `month_to_num`.
            Such values map to NaN and would otherwise be silently counted as
            second-half months.
    """
    month_num = df['month'].map(month_to_num)
    unmapped = month_num.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, 'month'].astype(str).unique())
        raise ValueError(f"unrecognised month value(s): {unknown}")
    # Vectorised comparison instead of a per-row Python lambda.
    half_year = (month_num > 6).astype('int64')
    return df.drop(columns=['month']).assign(half_year=half_year)


# Apply the identical transformation to both tables.
train_x = add_half_year(train_x)
test = add_half_year(test)


In [77]:
# Collapse the job label into one of three integer group codes.
def group_job(job):
    """Map a job label to a group code.

    Returns 0 for 'blue-collar', 'entrepreneur', 'services' and 'housemaid';
    1 for 'admin.', 'technician', 'unknown', 'self-employed', 'management'
    and 'unemployed'; 2 for every other value (e.g. 'retired', 'student').
    """
    group_zero_jobs = ('blue-collar', 'entrepreneur', 'services', 'housemaid')
    group_one_jobs = ('admin.', 'technician', 'unknown', 'self-employed', 'management', 'unemployed')
    if job in group_zero_jobs:
        return 0
    return 1 if job in group_one_jobs else 2

# Add the job-group code as a new column on both tables.
train_x['job_group'] = train_x['job'].map(group_job)
test['job_group'] = test['job'].map(group_job)

In [78]:
# Remove the raw job strings from both tables.
train_x = train_x.drop(columns='job')
test = test.drop(columns='job')


In [79]:
# One-hot encode marital status.
# Both tables are encoded against one shared category list (taken from the training
# data) so they always receive the same dummy columns in the same order, even when a
# value is absent from one of them. A test value not present in training becomes
# all-zero dummies instead of creating an extra column.
marital_dtype = pd.CategoricalDtype(sorted(train_x['marital'].dropna().unique()))
marital_dummies = pd.get_dummies(train_x['marital'].astype(marital_dtype), prefix='marital')
train_x = pd.concat([train_x.drop(columns=['marital']), marital_dummies], axis=1)
marital_dummies = pd.get_dummies(test['marital'].astype(marital_dtype), prefix='marital')
test = pd.concat([test.drop(columns=['marital']), marital_dummies], axis=1)



In [80]:
# Explicit list of every education level. The order carries no meaning; what matters
# is that train and test are encoded against the same complete list.
categories = ['primary', 'secondary', 'tertiary', 'unknown']

# Cast the column to a categorical with those levels, then one-hot encode it.
train_x = pd.get_dummies(
    train_x.assign(education=pd.Categorical(train_x['education'], categories=categories)),
    columns=['education'],
    drop_first=False,
)
test = pd.get_dummies(
    test.assign(education=pd.Categorical(test['education'], categories=categories)),
    columns=['education'],
    drop_first=False,
)




In [81]:
# Columns holding 'yes'/'no' strings that are converted to 1/0.
binary_cols = ['default', 'housing', 'loan']

# Same conversion for the training features first, then the test table;
# a column that is not present in a table is skipped.
yes_no_to_int = {'yes': 1, 'no': 0}
for frame in (train_x, test):
    for col in binary_cols:
        if col in frame.columns:
            frame[col] = frame[col].map(yes_no_to_int)


In [82]:
# One-hot encode the contact type.
# The category list is fixed from the training data and shared by both tables, so the
# dummy columns are identical even if a value is missing from one of them. A test value
# not present in training becomes all-zero dummies instead of an extra column.
contact_dtype = pd.CategoricalDtype(sorted(train_x['contact'].dropna().unique()))
train_x = pd.get_dummies(train_x.astype({'contact': contact_dtype}), columns=['contact'], drop_first=False)
test = pd.get_dummies(test.astype({'contact': contact_dtype}), columns=['contact'], drop_first=False)



In [83]:
# 1/0 flag: 1 where poutcome is 'success', 0 otherwise.
# The flag is derived from train_x itself (which still carries `poutcome` at this
# point) rather than from the raw `train` frame, so it no longer depends on the two
# frames sharing the same index.
train_x['poutcome_success'] = train_x['poutcome'].eq('success').astype(int)
test['poutcome_success'] = test['poutcome'].eq('success').astype(int)


In [84]:
# Remove the raw poutcome strings from both tables.
train_x = train_x.drop(columns='poutcome')
test = test.drop(columns='poutcome')



In [85]:
# Drop `id` and `day` from the test table, matching the columns removed from train_x.
test = test.drop(columns=['id', 'day'])


In [86]:
# Sanity check of the preprocessing results.
def debug_check(train_x, test, train_y):
    """Print a short preprocessing report.

    Reports, in order: column names present in only one of `train_x` / `test`,
    the total number of missing values in `train_x` and in `test`, the count of
    columns per dtype in `train_x`, and the normalised value distribution of
    `train_y`. Returns None.
    """
    report = (
        ("🔸 カラム差分:", train_x.columns.symmetric_difference(test.columns)),
        ("🔸 欠損数（train）:", train_x.isnull().sum().sum()),
        ("🔸 欠損数（test）:", test.isnull().sum().sum()),
        ("🔸 データ型:", train_x.dtypes.value_counts()),
        ("🔸 目的変数の分布:\n", train_y.value_counts(normalize=True)),
    )
    for label, value in report:
        print(label, value)


# Run the preprocessing sanity report on the prepared tables.
debug_check(train_x=train_x, test=test, train_y=train_y)


🔸 カラム差分: Index([], dtype='object')
🔸 欠損数（train）: 0
🔸 欠損数（test）: 0
🔸 データ型: int64    12
bool     10
Name: count, dtype: int64
🔸 目的変数の分布:
 y
0    0.879349
1    0.120651
Name: proportion, dtype: float64


In [87]:
# Hold out 20% of the rows for validation; stratifying on the label keeps the
# class ratio the same in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=train_y)
X_train, X_valid, y_train, y_valid = train_test_split(train_x, train_y, **split_options)


In [88]:
# Row/column counts of the two parts, followed by the label proportions in each.
print(X_train.shape, X_valid.shape)
for labels in (y_train, y_valid):
    print(labels.value_counts(normalize=True))



(600000, 22) (150000, 22)
y
0    0.87935
1    0.12065
Name: proportion, dtype: float64
y
0    0.879347
1    0.120653
Name: proportion, dtype: float64


In [89]:
# LightGBM binary classifier with fixed hyper-parameters.
lgbm_params = dict(
    objective='binary',
    random_state=42,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=-1,
)
model = LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train)

# Score the hold-out rows with column 1 of predict_proba and report ROC AUC.
y_pred = model.predict_proba(X_valid)[:, 1]
auc = roc_auc_score(y_valid, y_pred)
print(f"Validation ROC AUC: {auc:.4f}")

[LightGBM] [Info] Number of positive: 72390, number of negative: 527610
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032037 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120650 -> initscore=-1.986289
[LightGBM] [Info] Start training from score -1.986289
Validation ROC AUC: 0.9583


In [90]:
# Score the test set. The columns are selected in the exact order of the training
# matrix, so the prediction does not depend on the separate train/test preprocessing
# steps having produced the columns in the same order; a column missing from `test`
# raises KeyError here instead of surfacing later.
test_pred = model.predict_proba(test[X_train.columns])[:, 1]

# Load the sample submission and overwrite its `y` column with the predictions.
submission = pd.read_csv("/kaggle/input/playground-series-s5e8/sample_submission.csv")
submission["y"] = test_pred

# Save as the CSV file to submit.
submission.to_csv("submission.csv", index=False)