**<font size=6>利用Pandas实现one-hot编码</font>**

In [2]:
import pandas as pd
from IPython.display import display

IPython.display可以在Jupyter notebook中输出漂亮的格式

**<font size=4>1.加载数据</font>**

In [27]:
col_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "gender", "capital-gain", "capital-loss", "hours-per-week", "native-country","income"]
data = pd.read_csv("adult.data", header=None, index_col=False, names = col_names)

display(data.head())

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


查看数据的shape

In [28]:
data.shape

(32561, 15)

为了便于说明，我们只用其中的几列

In [29]:
data = data[["age", "workclass", "education", "gender", "hours-per-week", "occupation", "income"]]
data.head()

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


In [30]:
data.shape

(32561, 7)

**<font size=4>2. 检查字符串编码的分类数据</font>**

value_counts()显示唯一值及其出现的次数

In [31]:
data.gender.value_counts()

 Male      21790
 Female    10771
Name: gender, dtype: int64

In [32]:
data.columns

Index(['age', 'workclass', 'education', 'gender', 'hours-per-week',
       'occupation', 'income'],
      dtype='object')

查看当前的列名

In [33]:
list(data.columns)

['age',
 'workclass',
 'education',
 'gender',
 'hours-per-week',
 'occupation',
 'income']

使用pd.get_dummies(data)自动变换所有具有对象类型的列或所有分类的列

In [34]:
data_dummies = pd.get_dummies(data)
data_dummies.head()

Unnamed: 0,age,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,income_ <=50K,income_ >50K
0,39,40,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,50,13,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,38,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,53,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,28,40,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [35]:
data_dummies.shape

(32561, 46)

从原来的7列数据，到现在的46列。连续特征的值没有变，而分类特征的每个值都被编码为一个0/1标志

In [36]:
list(data_dummies.columns)

['age',
 'hours-per-week',
 'workclass_ ?',
 'workclass_ Federal-gov',
 'workclass_ Local-gov',
 'workclass_ Never-worked',
 'workclass_ Private',
 'workclass_ Self-emp-inc',
 'workclass_ Self-emp-not-inc',
 'workclass_ State-gov',
 'workclass_ Without-pay',
 'education_ 10th',
 'education_ 11th',
 'education_ 12th',
 'education_ 1st-4th',
 'education_ 5th-6th',
 'education_ 7th-8th',
 'education_ 9th',
 'education_ Assoc-acdm',
 'education_ Assoc-voc',
 'education_ Bachelors',
 'education_ Doctorate',
 'education_ HS-grad',
 'education_ Masters',
 'education_ Preschool',
 'education_ Prof-school',
 'education_ Some-college',
 'gender_ Female',
 'gender_ Male',
 'occupation_ ?',
 'occupation_ Adm-clerical',
 'occupation_ Armed-Forces',
 'occupation_ Craft-repair',
 'occupation_ Exec-managerial',
 'occupation_ Farming-fishing',
 'occupation_ Handlers-cleaners',
 'occupation_ Machine-op-inspct',
 'occupation_ Other-service',
 'occupation_ Priv-house-serv',
 'occupation_ Prof-specialty',


**<font color="red">这里可以注意：pandas中的切片和列表有所不同</font>**

In [43]:
data.loc[:,"age":"gender"].head()#切片包含了gender这一列

Unnamed: 0,age,workclass,education,gender
0,39,State-gov,Bachelors,Male
1,50,Self-emp-not-inc,Bachelors,Male
2,38,Private,HS-grad,Male
3,53,Private,11th,Male
4,28,Private,Bachelors,Female


提取特征，将特征和目标分离

In [45]:
features = data_dummies.loc[:, "age":"occupation_ Transport-moving"]

In [48]:
X = features.values
X

array([[39, 40,  0, ...,  0,  0,  0],
       [50, 13,  0, ...,  0,  0,  0],
       [38, 40,  0, ...,  0,  0,  0],
       ...,
       [58, 40,  0, ...,  0,  0,  0],
       [22, 20,  0, ...,  0,  0,  0],
       [52, 40,  0, ...,  0,  0,  0]], dtype=int64)

In [50]:
y = data_dummies["income_ >50K"].values

In [51]:
print("X.shape:{}, y.shape:{}".format(X.shape, y.shape))

X.shape:(32561, 44), y.shape:(32561,)


**<font size=4>3. 调用机器学习方法</font>**

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [55]:
logistReg = LogisticRegression(penalty="l2", C=0.2)
logistReg.fit(X_train, y_train)



LogisticRegression(C=0.2, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

查看在测试集上的效果

In [56]:
logistReg.score(X_test, y_test)

0.808991524382754

**<font size=5>对比dummies哑变量和one-hot编码</font>**

**<font size=4>dummies哑变量</font>**

In [6]:
import pandas as pd

demo_df = pd.DataFrame({"Categorical Feature":["socks", "fox", "socks", "box"], "Integer Feature":[0,1,2,1]})
print("Initial data:")
display(demo_df)

#dummies只会编码字符串特征，不会改变整数特征
dummies = pd.get_dummies(demo_df)
print("dummie:")
display(dummies)

Initial data:


Unnamed: 0,Categorical Feature,Integer Feature
0,socks,0
1,fox,1
2,socks,2
3,box,1


dummie:


Unnamed: 0,Integer Feature,Categorical Feature_box,Categorical Feature_fox,Categorical Feature_socks
0,0,0,0,1
1,1,0,1,0
2,2,0,0,1
3,1,1,0,0


**<font size=4>使用OneHotEncoder</font>**

In [7]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(n_values="auto")

In [8]:
demo_df

Unnamed: 0,Categorical Feature,Integer Feature
0,socks,0
1,fox,1
2,socks,2
3,box,1


In [10]:
encoder.fit(demo_df)

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values='auto', sparse=True)

In [15]:
newdf = encoder.transform(demo_df).toarray()

默认对所有的列都实行OneHot编码

In [16]:
newdf

array([[0., 0., 1., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0.]])