# In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Load the tab-separated training and test sets from the working directory.
train_data = pd.read_csv('train.tsv', delimiter='\t')
test_data = pd.read_csv('test.tsv', delimiter='\t')

# Separate the target (Y) from the explanatory variables (X).
# 'id' is dropped from the features of both sets; 'Y' is the target column
# and is also removed from the training features.
y_train = train_data['Y']
X_train = train_data.drop(['id', 'Y'], axis=1)
X_test = test_data.drop(['id'], axis=1)

# Preprocessing pipeline for the categorical variables.
# Every column of X_train is handed to the categorical branch.
categorical_features = X_train.columns

# Step 1: fill missing values with the most frequent value of the column.
# Step 2: one-hot encode; categories unseen during fit are ignored rather
# than raising at transform time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_pipeline, categorical_features)]
)

# Build and train the model: the preprocessing step feeds a random forest.
# random_state is fixed so repeated runs build the same forest.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', classifier),
]
model = Pipeline(steps=pipeline_steps)

model.fit(X_train, y_train)

# Predict on the test data.
y_pred = model.predict(X_test)

# Collect the results into a DataFrame: the test ids first, then the
# predicted label for each row in the same order.
submit_df = pd.DataFrame({'id': test_data['id']})
submit_df['Y'] = y_pred

# Write to submit.csv without the DataFrame index column.
submit_df.to_csv('submit.csv', index=False)
