# 範例 : (Kaggle)房價預測

## [教學目標]
- 以下用房價預測資料, 觀察群聚編碼的效果

## [範例重點]
- 了解群聚編碼的寫作方式(In[3], Out[3])
- 觀察群聚編碼, 搭配線性迴歸以及隨機森林分別有什麼影響 (In[6]~In[9], Out[6]~Out[9])

In [1]:
# 請先確認您的 sklearn 版本是否相同, 如果版本不是 0.21.1 版, 有可能跑出與本範例不同的結果
import sklearn
sklearn.__version__

'0.19.1'

In [2]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import copy
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

data_path = 'data/'
df = pd.read_csv(data_path + 'house_train.csv.gz')

train_Y = np.log1p(df['SalePrice'])
df = df.drop(['Id', 'SalePrice'] , axis=1)
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [3]:
# 生活總面積(GrLivArea)對販售條件(SaleCondition)做群聚編碼
# 寫法類似均值編碼, 只是對另一個特徵, 而非目標值
df['SaleCondition'] = df['SaleCondition'].fillna('None')
mean_df = df.groupby(['SaleCondition'])['GrLivArea'].mean().reset_index()
mode_df = df.groupby(['SaleCondition'])['GrLivArea'].apply(lambda x: x.mode()[0]).reset_index()
median_df = df.groupby(['SaleCondition'])['GrLivArea'].median().reset_index()
max_df = df.groupby(['SaleCondition'])['GrLivArea'].max().reset_index()
temp = pd.merge(mean_df, mode_df, how='left', on=['SaleCondition'])
temp = pd.merge(temp, median_df, how='left', on=['SaleCondition'])
temp = pd.merge(temp, max_df, how='left', on=['SaleCondition'])
temp.columns = ['SaleCondition', 'Area_Sale_Mean', 'Area_Sale_Mode', 'Area_Sale_Median', 'Area_Sale_Max']
temp

Unnamed: 0,SaleCondition,Area_Sale_Mean,Area_Sale_Mode,Area_Sale_Median,Area_Sale_Max
0,Abnorml,1436.128713,864,1302.0,4476
1,AdjLand,1112.5,980,1143.0,1184
2,Alloca,1701.75,1535,1439.5,3194
3,Family,1480.95,948,1390.5,2526
4,Normal,1492.96828,864,1456.0,4316
5,Partial,1795.696,1456,1646.0,5642


In [4]:
df = pd.merge(df, temp, how='left', on=['SaleCondition'])
df = df.drop(['SaleCondition'] , axis=1)
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,Area_Sale_Mean,Area_Sale_Mode,Area_Sale_Median,Area_Sale_Max
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,,,0,2,2008,WD,1492.96828,864,1456.0,4316
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,,,0,5,2007,WD,1492.96828,864,1456.0,4316
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,,,0,9,2008,WD,1492.96828,864,1456.0,4316
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,,,0,2,2006,WD,1436.128713,864,1302.0,4476
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,,,0,12,2008,WD,1492.96828,864,1456.0,4316


In [5]:
#只取 int64, float64 兩種數值型欄位, 存於 num_features 中
num_features = []
for dtype, feature in zip(df.dtypes, df.columns):
    if dtype == 'float64' or dtype == 'int64':
        num_features.append(feature)
print(f'{len(num_features)} Numeric Features : {num_features}\n')

# 削減文字型欄位, 只剩數值型欄位
df = df[num_features]
df = df.fillna(-1)
MMEncoder = MinMaxScaler()
df.head()

40 Numeric Features : ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'Area_Sale_Mean', 'Area_Sale_Mode', 'Area_Sale_Median', 'Area_Sale_Max']



Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,Area_Sale_Mean,Area_Sale_Mode,Area_Sale_Median,Area_Sale_Max
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,0,2,2008,1492.96828,864,1456.0,4316
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,0,5,2007,1492.96828,864,1456.0,4316
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,0,9,2008,1492.96828,864,1456.0,4316
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,0,2,2006,1436.128713,864,1302.0,4476
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,0,12,2008,1492.96828,864,1456.0,4316


In [6]:
# 沒有這四個新特徵的 dataframe 稱為 df_minus
df_minus = df.drop(['Area_Sale_Mean', 'Area_Sale_Mode', 'Area_Sale_Median', 'Area_Sale_Max'] , axis=1)

# 原始特徵 + 線性迴歸
train_X = MMEncoder.fit_transform(df_minus)
estimator = LinearRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.8466330257520471

In [7]:
# 新特徵 + 線性迴歸 : 有些微改善
train_X = MMEncoder.fit_transform(df)
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.8490781901921691

In [8]:
# 原始特徵 + 隨機森林
train_X = MMEncoder.fit_transform(df_minus)
estimator = RandomForestRegressor()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.8475178984702921

In [9]:
# 新特徵 + 隨機森林
train_X = MMEncoder.fit_transform(df)
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.8538252717955828

# 作業 : (Kaggle)鐵達尼生存預測

## [作業目標]
- 試著模仿範例寫法, 在鐵達尼生存預測中, 觀察計數編碼與特徵雜湊的效果

## [作業重點]
- 仿造範例, 完成自己挑選特徵的群聚編碼 (In[2], Out[2])
- 觀察群聚編碼, 搭配邏輯斯回歸, 看看有什麼影響 (In[5], Out[5], In[6], Out[6])

## 作業1
* 試著使用鐵達尼號的例子，創立兩種以上的群聚編碼特徵( mean、median、mode、max、min、count 均可 )

In [14]:
# 做完特徵工程前的所有準備 (與前範例相同)
import pandas as pd
import numpy as np
import copy
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

data_path = 'data/'
df = pd.read_csv(data_path + 'titanic_train.csv')

train_Y = df['Survived']
df = df.drop(['PassengerId', 'Survived'] , axis=1)
df.head(30)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [16]:
# 取一個類別型欄位, 與一個數值型欄位, 做群聚編碼
"""
Your Code Here
"""
# Ticket對Fare做群聚編碼
# 寫法類似均值編碼, 只是對另一個特徵, 而非目標值
df['Sex'] = df['Sex'].fillna('None') #對類別型欄位補空值
mean_df = df.groupby(['Sex'])['Fare'].mean().reset_index() #群聚取平均值
mode_df = df.groupby(['Sex'])['Fare'].apply(lambda x: x.mode()[0]).reset_index() #群聚取眾數
median_df = df.groupby(['Sex'])['Fare'].median().reset_index() #群聚取中位數
max_df = df.groupby(['Sex'])['Fare'].max().reset_index() #群聚取最大值
temp = pd.merge(mean_df, mode_df, how='left', on=['Sex'])
temp = pd.merge(temp, median_df, how='left', on=['Sex'])
temp = pd.merge(temp, max_df, how='left', on=['Sex'])
temp.columns = ['Sex', 'Sex_Fare_Mean', 'Sex_Fare_Mode', 'Sex_Fare_Median', 'Sex_Fare_Max']
temp

Unnamed: 0,Sex,Sex_Fare_Mean,Sex_Fare_Mode,Sex_Fare_Median,Sex_Fare_Max
0,female,44.479818,7.75,23.0,512.3292
1,male,25.523893,8.05,10.5,512.3292


In [17]:
df = pd.merge(df, temp, how='left', on=['Sex'])
df = df.drop(['Sex'] , axis=1)
df.head()

Unnamed: 0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_Fare_Mean,Sex_Fare_Mode,Sex_Fare_Median,Sex_Fare_Max
0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,25.523893,8.05,10.5,512.3292
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,44.479818,7.75,23.0,512.3292
2,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,44.479818,7.75,23.0,512.3292
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,44.479818,7.75,23.0,512.3292
4,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,25.523893,8.05,10.5,512.3292


In [18]:
#只取 int64, float64 兩種數值型欄位, 存於 num_features 中
num_features = []
for dtype, feature in zip(df.dtypes, df.columns):
    if dtype == 'float64' or dtype == 'int64':
        num_features.append(feature)
print(f'{len(num_features)} Numeric Features : {num_features}\n')

# 削減文字型欄位, 只剩數值型欄位
df = df[num_features]
df = df.fillna(-1)
MMEncoder = MinMaxScaler()
df.head()

9 Numeric Features : ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_Fare_Mean', 'Sex_Fare_Mode', 'Sex_Fare_Median', 'Sex_Fare_Max']



Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_Fare_Mean,Sex_Fare_Mode,Sex_Fare_Median,Sex_Fare_Max
0,3,22.0,1,0,7.25,25.523893,8.05,10.5,512.3292
1,1,38.0,1,0,71.2833,44.479818,7.75,23.0,512.3292
2,3,26.0,0,0,7.925,44.479818,7.75,23.0,512.3292
3,1,35.0,1,0,53.1,44.479818,7.75,23.0,512.3292
4,3,35.0,0,0,8.05,25.523893,8.05,10.5,512.3292


## 作業2
* 將上述的新特徵，合併原有的欄位做生存率預估，結果是否有改善?

In [19]:
# 沒有這四個新特徵的 dataframe 稱為 df_minus
df_minus = df.drop(['Sex_Fare_Mean', 'Sex_Fare_Mode', 'Sex_Fare_Median', 'Sex_Fare_Max'] , axis=1)

# 原始特徵 + 羅吉斯迴歸
"""
Your Code Here
"""
train_X = MMEncoder.fit_transform(df_minus)
estimator = LogisticRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.7038635542329971

In [20]:
# 新特徵 + 羅吉斯迴歸
"""
Your Code Here
"""
train_X = MMEncoder.fit_transform(df)
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.7822774202448624

In [21]:
# 原始特徵 + 隨機森林
train_X = MMEncoder.fit_transform(df_minus)
estimator = RandomForestRegressor()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.022210187045084328

In [22]:
# 新特徵 + 隨機森林
train_X = MMEncoder.fit_transform(df)
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.3728646813669171