# 第3章 特征变换

## 3.1 特征数值化

**基础知识**

In [1]:
import pandas as pd
df = pd.DataFrame({"gene_segA": [1, 0, 0, 1, 1, 1, 0, 0, 1, 0],
                   "gene_segB": [1, 0, 1, 0, 1, 1, 0, 0, 1, 0],
                   "hypertension": ["Y", 'N', 'N', 'N', 'N', 'N', 'Y', 'N', 'Y', 'N'],
                   "Gallstones": ['Y', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y']
                  })
df

Unnamed: 0,gene_segA,gene_segB,hypertension,Gallstones
0,1,1,Y,Y
1,0,0,N,N
2,0,1,N,N
3,1,0,N,N
4,1,1,N,Y
5,1,1,N,Y
6,0,0,Y,Y
7,0,0,N,N
8,1,1,Y,N
9,0,0,N,Y


In [2]:
df.replace({"N": 0, 'Y': 1})

Unnamed: 0,gene_segA,gene_segB,hypertension,Gallstones
0,1,1,1,1
1,0,0,0,0
2,0,1,0,0
3,1,0,0,0
4,1,1,0,1
5,1,1,0,1
6,0,0,1,1
7,0,0,0,0
8,1,1,1,0
9,0,0,0,1


In [3]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit_transform(df['hypertension'])

array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [4]:
le.fit_transform([1, 3, 3, 7, 3, 1])

array([0, 1, 1, 2, 1, 0])

In [5]:
le.inverse_transform([0, 1, 1, 2, 1, 0])

array([1, 3, 3, 7, 3, 1])

**项目案例**

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()    # ①
le.fit(['white', 'green', 'red', 'green', 'white'])    # ②
le.classes_    # ③

array(['green', 'red', 'white'], dtype='<U5')

In [7]:
le.transform(["green", 'green', 'green', 'white'])    # ④

array([0, 0, 0, 2])

In [8]:
le.transform(["green", 'green', 'green', 'blue'])

ValueError: y contains previously unseen labels: ['blue']

**动手练习**

In [9]:
# 第1题
import pandas as pd
path = "/Users/qiwsir/Documents/Codes/DataSet"
cwur = pd.read_csv(path + "/universityrank/cwurData.csv")
cwur.head()

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.5,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012


In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit_transform(cwur['country'])

array([54, 54, 54, ...,  4, 48,  8])

In [11]:
# 第2题
import re
d1 = "I am Laoqi. I am a programmer."
d2 = "Laoqi is in Soochow. It is a beautiful city."
words = re.findall(r"\w+", d1+d2)    # 以正则表达式提炼单词，不是用split()，这样就避免了句点问题
words

['I',
 'am',
 'Laoqi',
 'I',
 'am',
 'a',
 'programmer',
 'Laoqi',
 'is',
 'in',
 'Soochow',
 'It',
 'is',
 'a',
 'beautiful',
 'city']

In [12]:
words = list(set(words))    # 唯一单词保存为列表
[w.lower() for w in words]
words

['It',
 'city',
 'I',
 'Soochow',
 'beautiful',
 'a',
 'programmer',
 'Laoqi',
 'in',
 'am',
 'is']

In [13]:
# 为每句话中的单词出现次数计数
def count_word(document, unique_words):
    count_doc = []
    for word in unique_words:
        n = document.lower().count(word)
        count_doc.append(n)
    return count_doc

count1 = count_word(d1, words)
count2 = count_word(d2, words)
print(count1)
print(count2)

[0, 0, 0, 0, 0, 5, 1, 0, 0, 3, 0]
[0, 1, 0, 0, 1, 3, 0, 0, 1, 0, 2]


In [14]:
# 保存为dataframe
df = pd.DataFrame([count1, count2], columns=words, index=['d1', 'd2'])
df

Unnamed: 0,It,city,I,Soochow,beautiful,a,programmer,Laoqi,in,am,is
d1,0,0,0,0,0,5,1,0,0,3,0
d2,0,1,0,0,1,3,0,0,1,0,2


In [15]:
#第2题目的方法2：
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
tf1 = count_vect.fit_transform([d1, d2])
tf1.shape

(2, 9)

In [16]:
count_vect.get_feature_names()  # 相对前面方法少了2个，因为I 和 a作为常用词停词了。

['am', 'beautiful', 'city', 'in', 'is', 'it', 'laoqi', 'programmer', 'soochow']

In [17]:
tf1.toarray()    # 显示记录数值

array([[2, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 1, 1, 1, 2, 1, 1, 0, 1]], dtype=int64)