### Loading the 20 newsgroups dataset

In [1]:
from sklearn.datasets import fetch_20newsgroups
# Only these four newsgroups are requested from the 20-newsgroups corpus.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
# shuffle=True with a fixed random_state keeps the shuffled order repeatable.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [2]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [3]:
type(twenty_train)

sklearn.utils.Bunch

In [4]:
type(twenty_train.data)

list

In [5]:
len(twenty_train.data)

2257

In [6]:
len(twenty_train.filenames)

2257

In [7]:
twenty_train.data[1]

"From: ani@ms.uky.edu (Aniruddha B. Deglurkar)\nSubject: help: Splitting a trimming region along a mesh \nOrganization: University Of Kentucky, Dept. of Math Sciences\nLines: 28\n\n\n\n\tHi,\n\n\tI have a problem, I hope some of the 'gurus' can help me solve.\n\n\tBackground of the problem:\n\tI have a rectangular mesh in the uv domain, i.e  the mesh is a \n\tmapping of a 3d Bezier patch into 2d. The area in this domain\n\twhich is inside a trimming loop had to be rendered. The trimming\n\tloop is a set of 2d Bezier curve segments.\n\tFor the sake of notation: the mesh is made up of cells.\n\n\tMy problem is this :\n\tThe trimming area has to be split up into individual smaller\n\tcells bounded by the trimming curve segments. If a cell\n\tis wholly inside the area...then it is output as a whole ,\n\telse it is trivially rejected. \n\n\tDoes any body know how thiss can be done, or is there any algo. \n\tsomewhere for doing this.\n\n\tAny help would be appreciated.\n\n\tThanks, \n\tAni.\

In [8]:
twenty_train.filenames[1]

'C:\\Users\\sun\\scikit_learn_data\\20news_home\\20news-bydate-train\\comp.graphics\\38479'

In [9]:
twenty_train.target[1]

1

In [10]:
print("\n".join(twenty_train.data[1].split("\n")[:3]))

From: ani@ms.uky.edu (Aniruddha B. Deglurkar)
Subject: help: Splitting a trimming region along a mesh 
Organization: University Of Kentucky, Dept. of Math Sciences


In [11]:
print(twenty_train.target_names[twenty_train.target[1]])

comp.graphics


In [12]:
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [13]:
# Show the category name behind each of the first ten integer labels, one per line.
print("\n".join(twenty_train.target_names[t] for t in twenty_train.target[:10]))

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


### Extracting features from text files

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
# Token counts per document, with the built-in English stop-word list applied.
count_vect = CountVectorizer(
    min_df=1,
    stop_words='english',
)
# fit_transform both learns the vocabulary and returns the count matrix.
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Displayed as the cell result: (number of documents, vocabulary size).
X_train_counts.shape

(2257, 35482)

In [16]:
count_vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [17]:
count_vect.vocabulary_.get(u'algorithm')

4683

In [19]:
# TF-IDF: term frequency multiplied by inverse document frequency.
from sklearn.feature_extraction.text import TfidfTransformer
# Learn the IDF weights from the training counts first ...
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
# ... then re-weight those same counts with the fitted transformer.
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

In [20]:
tfidf_transformer

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

#### USE NLTK

In [55]:
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedCountVectorzier(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces.

    The parent class's analyzer is used unchanged; each token it yields is
    then passed through the module-level ``english_stemmer``.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to a generator of stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            # The parent analyzer runs first; stemming is applied lazily per token.
            return (english_stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_tokens
vectorizer = StemmedCountVectorzier(min_df=1, stop_words='english')

In [56]:
vectorizer

StemmedCountVectorzier(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=1,
            ngram_range=(1, 1), preprocessor=None, stop_words='english',
            strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
            tokenizer=None, vocabulary=None)

In [57]:
X_train_stemmed_counts = vectorizer.fit_transform(twenty_train.data)

In [59]:
X_train_stemmed_counts.shape

(2257, 26888)

##### TF-IDF

In [64]:
import numpy as np
import scipy as sp
def tfidf(term, doc, docset):
    """Return the TF-IDF weight of ``term`` in ``doc`` relative to ``docset``.

    Args:
        term: The token to score.
        doc: The document being scored; a sequence of tokens that supports
            ``count``, ``len`` and ``in`` (for example a list of words).
        docset: The collection of all documents, each of the same kind as
            ``doc``.

    Returns:
        ``tf * idf``, where ``tf`` is the number of occurrences of ``term`` in
        ``doc`` divided by the length of ``doc`` and ``idf`` is
        ``log(len(docset) / number of documents containing term)``.
        Returns ``0.0`` when ``doc`` is empty or when no document in
        ``docset`` contains ``term`` (instead of dividing by zero).
    """
    if not doc:
        # An empty document has no term frequencies to speak of.
        return 0.0

    # Document frequency: how many documents mention the term at all.
    docs_with_term = sum(1 for candidate in docset if term in candidate)
    if docs_with_term == 0:
        # The term is unknown to the corpus; avoid a zero denominator in idf.
        return 0.0

    # Term frequency is normalised by the size of *this* document, not by the
    # term's total count across the corpus.
    tf = float(doc.count(term)) / len(doc)
    idf = np.log(float(len(docset)) / docs_with_term)
    return tf * idf

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer stems every token it produces.

    The parent class's analyzer is used unchanged; each token it yields is
    then passed through the module-level ``english_stemmer``.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to a generator of stemmed tokens."""
        # Zero-argument super() starts the lookup at this class's parent.
        # Naming TfidfVectorizer in super(...) would skip TfidfVectorizer itself
        # in the MRO and silently bypass any build_analyzer it defines.
        analyzer = super().build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
tfidf_vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english')

- 预处理阶段将原始帖子变成小写字母形式（父类中完成）
- 在词语切分阶段提取所有单词（父类中完成）
- 将每个词语转换成词干形式

In [70]:
tfidf_vectorizer

StemmedTfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=1,
            ngram_range=(1, 1), norm='l2', preprocessor=None,
            smooth_idf=True, stop_words='english', strip_accents=None,
            sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
            tokenizer=None, use_idf=True, vocabulary=None)

In [72]:
X_train_stemmed_tfidf = tfidf_vectorizer.fit_transform(twenty_train.data)
X_train_stemmed_tfidf.shape

(2257, 26888)

**文本预处理**：

- 切分文本
- 扔掉出现过分频繁，对测试集预测无帮助的词语
- 扔掉出现频率很低、只有很小可能出现在测试集的词语
- 统计剩余词语
- 考虑整个语料集合，从词频统计中计算TF-IDF值

**词袋模型**：简单有效，但存在以下局限性：

- 不能涵盖词语间的关联关系
- 不能正确捕捉否定关系
- 对于拼写错误的词语会处理失败

### Training a classifier

In [21]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [22]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']
# Only transform() is called on the already-fitted count_vect and
# tfidf_transformer, so the vocabulary and IDF weights learned from the
# training data are reused for the new documents.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
# Each prediction is used as an index into target_names to get the label text.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


### Building a pipeline

In [23]:
from sklearn.pipeline import Pipeline
# Chain vectorizer -> tf-idf -> classifier into one estimator. The step names
# are the prefixes used to address step parameters (e.g. 'vect__ngram_range').
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [26]:
type(text_clf)

sklearn.pipeline.Pipeline

In [28]:
text_clf.steps

[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=1,
          ngram_range=(1, 1), preprocessor=None, stop_words=None,
          strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
          tokenizer=None, vocabulary=None)),
 ('tfidf',
  TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
 ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]

In [29]:
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

### Evaluation of the performance on the test set

In [30]:
import numpy as np
# Load the held-out split with the same categories and seed as the training set.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
predicted = text_clf.predict(twenty_test.data)
# Accuracy: fraction of test documents whose predicted label equals the true one.
(predicted == twenty_test.target).mean()

0.8348868175765646

In [33]:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
                target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

           avg / total       0.88      0.83      0.84      1502



In [34]:
metrics.confusion_matrix(twenty_test.target, predicted)

array([[192,   2,   6, 119],
       [  2, 347,   4,  36],
       [  2,  11, 322,  61],
       [  2,   2,   1, 393]], dtype=int64)

### Parameter tuning using grid search

In [43]:
from sklearn.model_selection import GridSearchCV
# Grid-search space; each key is '<pipeline step name>__<parameter name>'.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (0.01, 0.1, 1, 3, 10),
}

In [44]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [45]:
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [46]:
text_clf.get_params().keys()

dict_keys(['memory', 'steps', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__alpha', 'clf__class_prior', 'clf__fit_prior'])

In [47]:
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

In [48]:
gs_clf.best_score_ 

0.93

In [49]:
# Print the selected value of every tuned parameter, in alphabetical key order.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.01
tfidf__use_idf: True
vect__ngram_range: (1, 2)


In [50]:
gs_clf.best_params_

{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [52]:
gs_clf.cv_results_



{'mean_fit_time': array([0.08843104, 0.27639254, 0.08054733, 0.29578638, 0.0934155 ,
        0.30748407, 0.06834888, 0.30781865, 0.07185777, 0.33372124,
        0.07085522, 0.31082638, 0.08104865, 0.30313889, 0.08689825,
        0.31116025, 0.06985323, 0.31884797, 0.07436474, 0.27439658]),
 'mean_score_time': array([0.03313661, 0.060661  , 0.03676494, 0.05865645, 0.03592865,
        0.06951857, 0.02790713, 0.05631638, 0.02606956, 0.07002012,
        0.02640327, 0.06066179, 0.03442494, 0.05765367, 0.03993956,
        0.07018685, 0.02590156, 0.05798761, 0.02857598, 0.05949624]),
 'mean_test_score': array([0.92  , 0.93  , 0.9175, 0.91  , 0.91  , 0.8775, 0.8375, 0.81  ,
        0.7075, 0.6925, 0.5825, 0.5775, 0.5725, 0.5725, 0.48  , 0.485 ,
        0.4775, 0.49  , 0.435 , 0.445 ]),
 'mean_train_score': array([1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 0.97498428, 0.9924812 , 0.94248261, 0.98247491,
        0.76000864, 0.80874471, 0.82873382, 0.89247933,

### Cluster

In [75]:
# One cluster per newsgroup category loaded above.
num_clusters = 4
from sklearn.cluster import KMeans
# NOTE(review): no random_state is passed, and init='random' with n_init=1
# means a single randomly seeded run -- cluster assignments and the inertia
# trace can differ between executions. Consider fixing a seed.
km = KMeans(
    n_clusters=num_clusters,
    init='random',
    n_init=1,
    verbose=1,
)
km.fit(X_train_stemmed_tfidf)

Initialization complete
Iteration  0, inertia 4296.482
Iteration  1, inertia 2187.504
Iteration  2, inertia 2180.463
Iteration  3, inertia 2177.597
Iteration  4, inertia 2176.348
Iteration  5, inertia 2175.475
Iteration  6, inertia 2174.642
Iteration  7, inertia 2173.903
Iteration  8, inertia 2173.165
Iteration  9, inertia 2172.320
Iteration 10, inertia 2171.513
Iteration 11, inertia 2170.827
Iteration 12, inertia 2169.797
Iteration 13, inertia 2169.004
Iteration 14, inertia 2168.392
Iteration 15, inertia 2167.943
Iteration 16, inertia 2167.757
Iteration 17, inertia 2167.484
Iteration 18, inertia 2167.286
Iteration 19, inertia 2167.218
Iteration 20, inertia 2167.145
Iteration 21, inertia 2166.945
Iteration 22, inertia 2165.632
Iteration 23, inertia 2164.079
Iteration 24, inertia 2164.006
Iteration 25, inertia 2164.001
Iteration 26, inertia 2163.988
Iteration 27, inertia 2163.985
Converged at iteration 27: center shift 0.000000e+00 within tolerance 3.637172e-09


KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300,
    n_clusters=4, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=1)

In [76]:
km.labels_

array([0, 2, 3, ..., 2, 0, 2])

In [77]:
km.labels_.shape

(2257,)

> http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

> 《Building Machine Learning Systems with Python》