In [1]:
!pip install category_encoders

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import category_encoders as ce

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

%matplotlib inline



In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
# separete train data to X, y
X_train = df_train.drop("SalePrice", axis=1)
y_train = df_train["SalePrice"]

In [4]:
# def separate function(separate data in dtype, object,int,float)
def separate_data(df):
  obj = pd.DataFrame()
  df_int = pd.DataFrame()
  flo = pd.DataFrame()

  for col in df.columns:
    if df[col].dtype == "O":
      obj = pd.concat([obj, df[col]], axis=1)
    elif df[col].dtype == "int64":
      df_int = pd.concat([df_int, df[col]], axis=1)
    else:
      flo = pd.concat([flo, df[col]], axis=1)

  return obj, df_int, flo

In [5]:
X_train_obj, X_train_int, X_train_flo = separate_data(X_train)

In [6]:
X_train_obj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 43 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSZoning       1460 non-null   object
 1   Street         1460 non-null   object
 2   Alley          91 non-null     object
 3   LotShape       1460 non-null   object
 4   LandContour    1460 non-null   object
 5   Utilities      1460 non-null   object
 6   LotConfig      1460 non-null   object
 7   LandSlope      1460 non-null   object
 8   Neighborhood   1460 non-null   object
 9   Condition1     1460 non-null   object
 10  Condition2     1460 non-null   object
 11  BldgType       1460 non-null   object
 12  HouseStyle     1460 non-null   object
 13  RoofStyle      1460 non-null   object
 14  RoofMatl       1460 non-null   object
 15  Exterior1st    1460 non-null   object
 16  Exterior2nd    1460 non-null   object
 17  MasVnrType     588 non-null    object
 18  ExterQual      1460 non-null

In [7]:
# set encoding function(how to chage object to int)
list_cols = X_train_obj.columns
encoding = ce.OrdinalEncoder(cols=list_cols)

In [8]:
# fit X_train_obj to encoding function
X_train_obj_ec = encoding.fit_transform(X_train_obj)
X_train_obj_ec

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,1,1,3,1,1,1,1,1,1,1,...,1,1,1,1,1,4,5,5,1,1
1,1,1,3,1,1,1,2,1,2,2,...,1,1,1,1,1,4,5,5,1,1
2,1,1,3,2,1,1,1,1,1,1,...,1,1,1,1,1,4,5,5,1,1
3,1,1,3,2,1,1,3,1,3,1,...,2,2,1,1,1,4,5,5,1,2
4,1,1,3,2,1,1,2,1,4,1,...,1,1,1,1,1,4,5,5,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1,1,3,1,1,1,1,1,18,1,...,1,1,1,1,1,4,5,5,1,1
1456,1,1,3,1,1,1,1,1,7,1,...,1,2,1,1,1,4,1,5,1,1
1457,1,1,3,1,1,1,1,1,3,1,...,1,1,1,1,1,4,3,1,1,1
1458,1,1,3,1,1,1,1,1,12,1,...,1,2,1,1,1,4,5,5,1,1


In [9]:
X_train_int['Id']

0          1
1          2
2          3
3          4
4          5
        ... 
1455    1456
1456    1457
1457    1458
1458    1459
1459    1460
Name: Id, Length: 1460, dtype: int64

In [10]:
X_train_coc = pd.concat([X_train_obj_ec, X_train_int, X_train_flo], axis=1)

In [11]:
import lightgbm as lgb

In [12]:
# LightGBMのモデル構築
params = {
    "objective": "regression",
    "metric": "rmse",  # RMSEを評価指標に使用
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": 0
}

In [13]:
X_train_pra, X_train_val, y_train_pra, y_train_val = train_test_split(X_train_coc, y_train, test_size=0.2)

In [14]:
train_data = lgb.Dataset(X_train_pra, label=y_train_pra)
eval_data = lgb.Dataset(X_train_val, label=y_train_val, reference=train_data)

In [15]:
model = lgb.train(params, train_data, valid_sets=[eval_data], num_boost_round=1000)

In [16]:
X_test_obj, X_test_int, X_test_flo = separate_data(df_test)

In [17]:
X_test_obj_ec = encoding.transform(X_test_obj)

In [18]:
X_test_coc = pd.concat([X_test_obj_ec, X_test_int, X_test_flo], axis=1)

In [19]:
X_test_coc['Id']

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64

In [20]:
y_test_pred = model.predict(X_test_coc)
y_test_pred

array([139703.25354592, 210051.45850175, 195396.18752773, ...,
       189123.61938817, 182127.30408728, 234525.13854312])

In [21]:
submission = pd.DataFrame({'Id': X_test_coc['Id'], 'SalePrice': y_test_pred})
submission.to_csv('submission_Advanced_Housing_Price_05.csv', index=False)