# SVM做分类
## 树形结构作为分类策略，根节点采用置信度最高的分类器
## 训练的时候注意用一个类与其他每个类分别做二分类（一对一，one-vs-one），而不是与其余所有类合在一起做二分类（一对多，one-vs-rest），这样结构风险更小



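$$
\begin{aligned}
\min_{w,\,b,\,\zeta}\quad & \frac{1}{2}\left\| w \right\|^{2} + C_{+}\sum_{i:\,y_{i}=+1} \zeta_{i} + C_{-}\sum_{i:\,y_{i}=-1} \zeta_{i} \\
\text{s.t.}\quad & y_{i}\,(w^{\top} x_{i} + b) \geq 1 - \zeta_{i},\quad \zeta_{i} \geq 0
\end{aligned}
$$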

In [1]:
import pickle
import time

import matplotlib.pyplot as plt
from sklearn.svm import SVC

In [2]:
# NOTE(review): `sklearn.datasets.base` is a private module path that newer
# scikit-learn releases no longer expose; presumably the import is kept so the
# pickled Bunch can be resolved - confirm before changing it.
from sklearn.datasets.base import Bunch
# Load the pickled dataset object; later cells read `Set.matrix` and
# `Set.y_target` from it. pickle.load can execute arbitrary code, so only open
# files that come from a trusted source.
with open("DB/matrix/BOW.pkl", 'rb') as f:
    Set = pickle.load(f)

In [3]:
# Dimensions of the loaded matrix (shown as the cell output).
Set.matrix.shape

(1289812, 469138)

In [4]:
# Number of entries in the label sequence that is later passed as `y`.
len(Set.y_target)

1289812

## 数据预处理

> 进行降维

In [5]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MaxAbsScaler

In [6]:
import sklearn.pipeline
from sklearn.feature_extraction.text import TfidfTransformer

In [7]:
# Build the feature matrix in a single, re-runnable step. TF-IDF is the last
# pipeline stage rather than a separate `Matrix = f(Matrix)` cell, so re-running
# this cell always starts from the raw `Set.matrix` and never re-weights data
# that has already been weighted.
#   1. MaxAbsScaler     - scale each feature by its maximum absolute value.
#   2. SelectKBest      - keep the 10000 features with the highest chi-squared score.
#   3. TfidfTransformer - apply TF-IDF weighting to the selected features.
# NOTE(review): every stage is fitted on the full dataset, before the
# train/test split, so the chi-squared selection also sees the labels of rows
# that later become test data - confirm this leakage is acceptable.
featurePipeline = sklearn.pipeline.Pipeline([
    ('maxAbs', MaxAbsScaler()),
    ('chi2', SelectKBest(chi2, k=10000)),
    ('tfidf', TfidfTransformer()),
])
Matrix = featurePipeline.fit_transform(Set.matrix, Set.y_target)
Matrix

<1289812x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 56540401 stored elements in Compressed Sparse Row format>

In [9]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer its
# replacement and fall back only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Train on a small stratified sample (0.4% of the rows) and evaluate on the
# rest. A fixed seed makes the split, and every number derived from it,
# reproducible across kernel restarts.
trainMatrix, testMatrix, trainTarget, testTarget = train_test_split(
    Matrix,
    Set.y_target,
    train_size=0.004,
    stratify=Set.y_target,
    random_state=42,
)



In [10]:
# Show the shapes of the training and test matrices side by side.
print(*(part.shape for part in (trainMatrix, testMatrix)))

(5159, 10000) (1284653, 10000)


## 训练和测试
> 选择线性核 `linear`（下方代码使用 `kernel='linear'`，并非高斯核 `rbf`），惩罚因子与各类样本比例成反比——`class_weight='balanced'`

In [11]:
# Linear-kernel support vector classifier.
svc = SVC(
    kernel='linear',
    class_weight='balanced',  # per-class penalty inversely proportional to class frequency
    cache_size=20000,         # kernel cache size, in MB
)

In [17]:
# Fit the classifier on the training sample and report how long it took.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments the way time.time() can.
beg = time.perf_counter()
# fit() returns the estimator itself, so `predict` is the trained `svc`.
predict = svc.fit(trainMatrix, trainTarget)
end = time.perf_counter()
print("Train time: %.2f" % (end - beg))

Train time: 14.810357


In [18]:
# Classify the held-out rows with the fitted estimator (`predict`) and report
# how long inference took, using the monotonic perf_counter() clock.
beg = time.perf_counter()
res_svc = predict.predict(testMatrix)
end = time.perf_counter()
print("test time: %.2f" % (end - beg))

test time: 1777.72


In [19]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Display names for the 11 classes.
# NOTE(review): assumes this order matches the sorted class labels found in
# the targets - confirm against the code that built the dataset.
label = [
    '财经', '股票', '教育', '科技', '时政', '体育',
    '游戏', '娱乐', '汽车', '社会', '军事',
]

# Per-class precision / recall / F1 on the held-out rows.
prt_res = classification_report(testTarget, res_svc, target_names=label)
print(prt_res)

precision    recall  f1-score   support

         财经       0.70      0.70      0.70    103095
         股票       0.86      0.84      0.85    153780
         教育       0.90      0.84      0.87    100945
         科技       0.76      0.85      0.80    162277
         时政       0.75      0.74      0.74    104449
         体育       0.95      0.96      0.96    131077
         游戏       0.98      0.87      0.92    105233
         娱乐       0.85      0.92      0.88    105576
         汽车       0.91      0.85      0.88    101296
         社会       0.94      0.98      0.96    116427
         军事       1.00      0.98      0.99    100498

avg / total       0.87      0.87      0.87   1284653



In [20]:
# Persist the plain-text classification report.
# NOTE(review): the file name says "rbf" although the classifier is built with
# kernel='linear' - confirm the intended name before renaming, in case other
# tooling reads this path.
with open('result/SK_rbfSVC.csv', 'w', encoding='utf-8') as f:
    # The with-statement closes the file on exit; no explicit close() is needed.
    f.write(prt_res)