### 1.加载数据集

In [1]:
#first extract the 20 news_group dataset to /scikit_learn_data  
from sklearn.datasets import fetch_20newsgroups  

#all categories  
#newsgroup_train = fetch_20newsgroups(subset='train')  

#part categories  
categories = ['comp.graphics',  
 'comp.os.ms-windows.misc',  
 'comp.sys.ibm.pc.hardware',  
 'comp.sys.mac.hardware',  
 'comp.windows.x']

newsgroup_train = fetch_20newsgroups(subset = 'train',categories = categories)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [2]:
#print category names  
from pprint import pprint  
pprint(list(newsgroup_train.target_names)) 

['comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x']


In [5]:
newsgroup_test = fetch_20newsgroups(subset = 'test',categories = categories)

### 2. 提feature

In [7]:
#newsgroup_train.data is the original documents, but we need to extract the feature vectors inorder to model the text data  
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(stop_words = 'english', non_negative = True, n_features = 10000)  
fea_train = vectorizer.fit_transform(newsgroup_train.data)  
fea_test = vectorizer.fit_transform(newsgroup_test.data)
  
#return feature vector 'fea_train' [n_samples,n_features]  
print ('Size of fea_train:' + repr(fea_train.shape))  
print ('Size of fea_train:' + repr(fea_test.shape))  
#11314 documents, 130107 vectors for all categories  
print ('The average feature sparsity is {0:.3f}%'.format(fea_train.nnz/float(fea_train.shape[0]*fea_train.shape[1])*100))



Size of fea_train:(2936, 10000)
Size of fea_train:(1955, 10000)
The average feature sparsity is 1.002%


### 3. 分类

#### 3.1 Multinomial Naive Bayes Classifier

In [19]:
def calculate_result(actual,pred):  
    m_precision = metrics.precision_score(actual,pred, average='micro')
    m_recall = metrics.recall_score(actual,pred)
    print ('predict info:')  
    print ('precision:{0:.3f}'.format(m_precision))  
    print ('recall:{0:0.3f}'.format(m_recall))
    print ('f1-score:{0:.3f}'.format(metrics.f1_score(actual,pred)))

In [20]:
######################################################  
#Multinomial Naive Bayes Classifier  
print ('*************************\nNaive Bayes\n*************************')  
from sklearn.naive_bayes import MultinomialNB  
from sklearn import metrics

newsgroup_test = fetch_20newsgroups(subset = 'test', categories = categories)
fea_test = vectorizer.fit_transform(newsgroup_test.data)
#create the Multinomial Naive Bayesian Classifier  
clf = MultinomialNB(alpha = 0.01)   
clf.fit(fea_train,newsgroup_train.target) 
pred = clf.predict(fea_test)
calculate_result(newsgroup_test.target,pred)
#notice here we can see that f1_score is not equal to 2*precision*recall/(precision+recall)  
#because the m_precision and m_recall we get is averaged, however, metrics.f1_score() calculates  
#weithed average, i.e., takes into the number of each class into consideration. 

*************************
Naive Bayes
*************************




ValueError: Target is multiclass but average='binary'. Please choose another average setting.

#### 3.2 KNN

In [15]:
######################################################  
#KNN Classifier  
from sklearn.neighbors import KNeighborsClassifier  

print ('*************************\nKNN\n*************************')  
knnclf = KNeighborsClassifier()#default with k=5  
knnclf.fit(fea_train,newsgroup_train.target)  
pred = knnclf.predict(fea_test)
calculate_result(newsgroup_test.target,pred)

*************************
KNN
*************************


ValueError: Target is multiclass but average='binary'. Please choose another average setting.

#### 3.3 SVM

In [16]:
######################################################  
#SVM Classifier  
from sklearn.svm import SVC  

print ('*************************\nSVM\n*************************')  
svclf = SVC(kernel = 'linear')#default with 'rbf'  
svclf.fit(fea_train,newsgroup_train.target)
pred = svclf.predict(fea_test)
calculate_result(newsgroup_test.target,pred)

*************************
SVM
*************************


ValueError: Target is multiclass but average='binary'. Please choose another average setting.