## 1.特征选择

### 1.1 移除低方差

In [42]:

# sklearn 文档 https://sklearn.apachecn.org/#/docs/master/14
from sklearn.feature_selection import VarianceThreshold
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
sel=VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(X)



array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

In [112]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
data=load_iris()

X=data.data
y=data.target
trainX,testX,trainY,testY=train_test_split(X,y,random_state=1)
knn=KNeighborsClassifier(5)
knn.fit(trainX,trainY)
knn.score(testX,testY)

1.0

In [3]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
data=load_iris()

X=data.data
y=data.target
sel=VarianceThreshold(threshold=0.2)
X=sel.fit_transform(X)
trainX,testX,trainY,testY=train_test_split(X,y,random_state=1)
knn=KNeighborsClassifier(5)
knn.fit(trainX,trainY)
knn.score(testX,testY)


1.0

In [5]:
sel.variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

### 1.2 单变量特征

In [6]:
# 卡方检验 https://zhuanlan.zhihu.com/p/87008470?ivk_sa=1024320u
# sklearn 文档 https://sklearn.apachecn.org/#/docs/master/14
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
iris = load_iris()
X, y = iris.data, iris.target
print(X.shape)
# SelectKBest 如果预测评价以后分数降低了，可以改变k值和检验方法
X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
print(X_new.shape)

(150, 4)
(150, 2)


In [7]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectPercentile, chi2
X, y = load_digits(return_X_y=True)
print(X.shape)
# 选择百分之10的特征
X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y)
print(X_new.shape)


(1797, 64)
(1797, 7)


### 1.3  递归特征消除法

In [9]:
from sklearn.datasets import load_digits


from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
digits = load_digits()
X, y = digits.data,digits.target
estimator = SVR(kernel="linear")
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, y)
print(selector.support_)
print(selector.ranking_)

[False False False False False False False False  True False False False
 False False False  True  True False False False False False False False
 False False False False False False False  True False False False False
 False False False False False False False False False False False False
 False False False False False False False False  True False False False
 False False False False]
[60 40 42 35 31 56 38 17  1 41 15 27 26 30 28  1  1 50 22 25  4 34 37 11
 48  7  9 23  5 18 24  1 59 10 44 12 45 13 49 58 55 14 57 52  8 32 54 51
  2 47 46 19  3 33 20  6  1 29 43 16 53 21 39 36]


### 1.4 基于 SelectFromModel

In [44]:
# sklearn 文档 https://sklearn.apachecn.org/#/docs/master/14
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
digits = load_digits()
X, y = digits.data,digits.target
print(X.shape)
lsvc = LogisticRegression(penalty="l1").fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
print(X_new.shape)

(1797, 64)




(1797, 54)


In [32]:
#基于tree
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
iris = load_iris()
X, y = iris.data, iris.target
print(X.shape)
clf = ExtraTreesClassifier()
clf = clf.fit(X, y)
print(clf.feature_importances_ ) 
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
print(X_new.shape)


(150, 4)
[0.04369914 0.03766247 0.30852165 0.61011673]
(150, 2)
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)




## 2.特征提取

### 2.1 从字典类型加载特征

In [47]:
#  来自sklearn https://sklearn.apachecn.org/#/docs/master/39
from sklearn.feature_extraction import DictVectorizer
measurements = [
    {'city': 'Dubai', 'temperature': 33.},
   {'city': 'London', 'temperature': 12.},
   {'city': 'San Francisco', 'temperature': 18.},
]
vec = DictVectorizer()
print(vec.fit_transform(measurements).toarray())
print(vec.get_feature_names())


[[ 1.  0.  0. 33.]
 [ 0.  1.  0. 12.]
 [ 0.  0.  1. 18.]]
['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']


### 2.2 文本特征提取

In [61]:
## 实战新闻文本分类 https://www.heywhale.com/notebooks/run/61ee2c1c0ed7c200170a0d1b?label=5c03e1412270c000101d98e8&image=5dab5873359bc2002cc2ee18&notebook=61ee2c1c0ed7c200170a0d20&showNoti=true
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
corpus = [
   'This is the first document.',
   'This is the second second document.',
   'And the third one.',
   'Is this the first document?',
]
X = vectorizer.fit_transform(corpus)
print(X.toarray())
print(vectorizer.get_feature_names())


[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


y_predict:
 [13 15  4 ... 14 10  8]
直接比对真实值和预测值:
 [ True False  True ...  True  True  True]
准确率为：
 0.8546264855687606
y_predict:
 [14 10 16 ...  4 12  3]
直接比对真实值和预测值:
 [ True  True  True ...  True False False]
准确率为：
 0.8537775891341256


[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
