處理非數值欄位有三種方法
1.直接砍掉
2.幫他們編號，並保有排序，例如國外成績A,B,C 轉換成分數90,80,70
3.轉換0和1組成的向量,ex:藍藍黑紅
藍:(1,1,0,0)
黑:(0,0,1,0)
紅:(0,0,0,1)


補充:
2.Ordinal encoding 是將類別變量轉化為有序數值的策略。它將每個類別分別映射到一個整數上，並保留類別之間的大小關係。
3.One-hot encoding 是一種將類別變量轉化為一個由 0 和 1 組成的向量的策略。它將每個類別分別映射到一個新的二元特徵上。

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split


data = pd.read_csv('/Users/Rich/Desktop/Kaggle/KaggleTeamLearn/Rich/Internediate ML/melb_data.csv')


y = data.Price
X = data.drop(['Price'], axis=1)


X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()] 
X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)


low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]


numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]


my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [38]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [39]:
X_train.Rooms.dtype

dtype('int64')

In [40]:
X_train.Distance.dtype

dtype('float64')

In [41]:

s = (X_train.dtypes == 'object')
#這行可以理解成是把X_train的每一個column的type都跟object比較，如果是object的話就會是True，
#如果不是的話就會是False，並且會印出True的column name
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
['Type', 'Method', 'Regionname']


In [42]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [43]:
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop categorical variables):
175703.48185157913


In [44]:
from sklearn.preprocessing import OrdinalEncoder


label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

print("MAE from Approach 2 (Ordinal Encoding):") 
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from Approach 2 (Ordinal Encoding):
165936.40548390493


In [45]:
object_cols

['Type', 'Method', 'Regionname']

In [46]:
from sklearn.preprocessing import OneHotEncoder


#handle_unknown = 'ignore' 這個參數是說如果有新的類別出現的話，就不要報錯，而是直接忽略
#sparse = False 這個參數是說不要用稀疏 matrix，而是用一般的matrix
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

#步驟是先設定好，然後用fit_transform去fit X_train，然後再用transform去fit X_valid
#接著設定index，最後再把兩個dataframe合併起來(因為我們在處理的是非數值col所以我們前面把他切出來)

OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))
#這個方法會把原本的column name拿掉，所以要重新加回來

OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)


OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)




以下是練習

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split


X = pd.read_csv('/Users/Rich/Desktop/Kaggle/KaggleTeamLearn/Rich/Internediate ML/home-data-for-ml-course/train.csv', index_col='Id') 
X_test = pd.read_csv('/Users/Rich/Desktop/Kaggle/KaggleTeamLearn/Rich/Internediate ML/home-data-for-ml-course/test.csv', index_col='Id')


X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)


cols_with_missing = [col for col in X.columns if X[col].isnull().any()] 
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)


X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [48]:
X_train.head

<bound method NDFrame.head of       MSSubClass MSZoning  LotArea Street LotShape LandContour Utilities  \
Id                                                                         
619           20       RL    11694   Pave      Reg         Lvl    AllPub   
871           20       RL     6600   Pave      Reg         Lvl    AllPub   
93            30       RL    13360   Pave      IR1         HLS    AllPub   
818           20       RL    13265   Pave      IR1         Lvl    AllPub   
303           20       RL    13704   Pave      IR1         Lvl    AllPub   
...          ...      ...      ...    ...      ...         ...       ...   
764           60       RL     9430   Pave      Reg         Lvl    AllPub   
836           20       RL     9600   Pave      Reg         Lvl    AllPub   
1217          90       RM     8930   Pave      Reg         Lvl    AllPub   
560          120       RL     3196   Pave      Reg         Lvl    AllPub   
685           60       RL    16770   Pave      IR2        

In [49]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [50]:
#使用刪除法把非數值column刪除
#Hint:使用select_dtypes()
#drop_X_train = X_
#drop_X_train = 

In [51]:
from sklearn.preprocessing import OrdinalEncoder

# Drop categorical columns that will not be encoded
#label_X_train = X_train.drop(bad_label_cols, axis=1)
#label_X_valid = X_valid.drop(bad_label_cols, axis=1)

# 使用編碼法把非數值column填滿
#hint:使用OrdinalEncoder()


In [52]:
from sklearn.preprocessing import OneHotEncoder

# Use as many lines of code as you need!
onehot_encoder = OneHotEncoder()
#OH_X_train = onehot_encoder.fit_transform(X_train)
#使用onehot_encoder填補缺失值
#OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

#OH_X_valid = OH_encoder.fit_transform(X_valid)