In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
TRAIN_PATH = "/workspace/data/house_prices/train.csv"
TEST_PATH = "/workspace/data/house_prices/test.csv"

train_df = pd.read_csv(TRAIN_PATH)
test_x = pd.read_csv(TEST_PATH)

In [3]:
# 数値データは0で穴埋め / 文字データは'0'で穴埋め# # デフォルトカラムのうち、intとfloatのみ選択
type_df = pd.DataFrame(train_df.dtypes.reset_index())
type_df = type_df.rename(columns={0:"type"})
type_df['type'] = type_df['type'].apply(lambda x: str(x))
str_columns = list(type_df.query("type in 'object'")["index"].unique())

train_df = train_df.fillna(0)
test_x = test_x.fillna(0)

for column in str_columns:
    train_df[column] = train_df[column].apply(lambda x: '0' if x == 0 else x)
    test_x[column] = test_x[column].apply(lambda x: '0' if x == 0 else x)

In [4]:
# testデータにしかないデータは最頻値にて穴埋め
for c in str_columns:
    train_c_data = set(train_df[c].unique())
    test_c_data = set(test_x[c].unique())
    
    diff = list(test_c_data - train_c_data)
    if diff:
        print(c, "testにしかないデータ数:", test_x.query(f"{c} in @diff").shape[0])
        
        train_max = train_df.groupby(f"{c}").size().idxmax()
        test_x[c] = test_x[c].apply(lambda x: train_max if x in diff else x)
        
# label Encoder
for c in str_columns:
    # 学習データに基づいて定義する
    le = LabelEncoder()
    le.fit(train_df[c])
    train_df[c] = le.transform(train_df[c])
    test_x[c] = le.transform(test_x[c])

MSZoning testにしかないデータ数: 4
Utilities testにしかないデータ数: 2
Exterior1st testにしかないデータ数: 1
Exterior2nd testにしかないデータ数: 1
KitchenQual testにしかないデータ数: 1
Functional testにしかないデータ数: 2
SaleType testにしかないデータ数: 1


In [5]:
train_x = train_df.drop(["Id", "SalePrice"], axis=1)
train_y = train_df[["SalePrice"]]
test_x = test_x.drop("Id", axis=1)

kf = KFold(n_splits=4, shuffle=True, random_state=71)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

In [6]:
clf = RandomForestClassifier(random_state=71)
clf.fit(tr_x, tr_y)

  clf.fit(tr_x, tr_y)


RandomForestClassifier(random_state=71)

In [7]:
clf.score(va_x, va_y)

0.0273972602739726

In [8]:
pred = pd.DataFrame({"SalePrice":clf.predict(test_x)})
# Idを合わせる
pred = pred.reset_index()
pred = pred.rename(columns={'index':'Id'})
pred['Id'] = pred['Id'].apply(lambda x: x + 1461)

In [9]:
path = "/workspace/data/house_prices/result_RandomForest.csv"
pred.to_csv(path, index=False, encoding='utf-8-sig')