In [48]:
# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description: 
"""
import sys

sys.path.append('..')
from pytextclassifier import TextClassifier
import numpy as np

from sklearn import metrics


class CharTokenizer:
    def tokenize(self, text):
        """Tokenizes a piece of text."""
        # Char seg Chinese
        return list(text.lower())


m = TextClassifier(model_name='lr')
# model_name 是选择分类器，支持lr, random_forest, xgboost, svm, mlp, ensemble, stack
data = [
    ('education', '名师指导托福语法技巧：名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗？'),
    ('sports', '图文：法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与'),
    ('sports', '米兰客场8战不败国米10年连胜')
]
m.train(data, token_pattern=r"\S+")  # 匹配任何非空白字符

r = m.predict(['福建春季公务员考试报名18日截止 2月6日考试',
               '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'])
print(r)
m.save()
#del m

new_m = TextClassifier(model_name='lr')
new_m.load()
predict_label_prob = new_m.predict_proba(['福建春季公务员考试报名18日截止 2月6日考试'])
print(predict_label_prob)  # [[0.53337174 0.46662826]]
print('classes_: ', new_m.model.classes_)  # the classes ordered as prob
print('sport prob: ', predict_label_prob[0][np.where(np.array(new_m.model.classes_) == 'sports')])

predict_label = new_m.predict(['福建春季公务员考试报名18日截止 2月6日考试',
                               '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'])
print(predict_label)  # ['education', 'sports']

test_data = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
acc_score = new_m.test(test_data)
print(acc_score)  # 1.0


[35m[  DEBUG 20210903 11:57:45 textclassifier:  74] train model[0m
[35m[  DEBUG 20210903 11:57:45 textclassifier:  60] load data_list, X size: 5, label size: 5[0m
[35m[  DEBUG 20210903 11:57:45 textclassifier:  62] num_classes:2[0m
[35m[  DEBUG 20210903 11:57:45 textclassifier:  65] data tokens top 3: ['名师 指导 托福 语法 技巧 ： 名词 的 复数 形式', '中国 高考 成绩 海外 认可 是 “ 狼来了 ” 吗 ？', '图文 ： 法网 孟菲尔 斯 苦战 进 16 强 孟菲尔 斯 怒吼'][0m
[32m[   INFO 20210903 11:57:45 data_utils: 183] save classifier_vectorizer.pkl ok.[0m
[32m[   INFO 20210903 11:57:45 data_utils: 183] save classifier_model.pkl ok.[0m
[32m[   INFO 20210903 11:57:45 textclassifier: 147] save done. vec path: classifier_vectorizer.pkl, model path: classifier_model.pkl[0m
[32m[   INFO 20210903 11:57:45 textclassifier: 161] model loaded from [0m
[35m[  DEBUG 20210903 11:57:45 textclassifier:  92] test model[0m
[35m[  DEBUG 20210903 11:57:45 textclassifier:  60] load data_list, X size: 2, label size: 2[0m
[35m[  DEBUG 20210903 11:57:45 tex

['education' 'sports']
[[0.5 0.5]]
classes_:  ['education' 'sports']
sport prob:  [0.5]
['education' 'sports']
classify_report : 
               precision    recall  f1-score   support

   education       1.00      1.00      1.00         1
      sports       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

confusion_matrix : 
 [[1 0]
 [0 1]]
acc_for_each_class : 
 [1. 1.]
average_accuracy: 1.0000
overall_accuracy: 1.0000
accuracy_score: 1.0000
1.0


In [71]:
from eli5 import show_weights
show_weights(new_m.model, vec=new_m.vectorizer)

Weight?,Feature
+0.136,孟菲尔
+0.136,孟菲尔 斯
+0.136,斯
+0.097,四川
+0.097,举行
+0.097,举行 全国
+0.097,全国
+0.097,全国 长距
+0.097,国米
+0.097,挑战赛 近万人


In [79]:
from eli5 import show_prediction
valid_xs = ['福建春季公务员考试报名18日截止 2月6日考试',
                               '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜']
show_prediction(new_m.model, valid_xs[0], vec=new_m.vectorizer, 
                show_feature_values=True, top=10,target_names=['sports','educ'])

In [11]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42
)
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42
)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline

vec = CountVectorizer()
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(twenty_train.data, twenty_train.target)

from sklearn import metrics

def print_report(pipe):
    y_test = twenty_test.target
    y_pred = pipe.predict(twenty_test.data)
    report = metrics.classification_report(y_test, y_pred,
        target_names=twenty_test.target_names)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

print_report(pipe)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

                        precision    recall  f1-score   support

           alt.atheism       0.91      0.81      0.85       319
         comp.graphics       0.86      0.94      0.90       389
               sci.med       0.92      0.81      0.86       396
soc.religion.christian       0.88      0.98      0.92       398

              accuracy                           0.89      1502
             macro avg       0.89      0.89      0.89      1502
          weighted avg       0.89      0.89      0.89      1502

accuracy: 0.889


In [66]:
import eli5
eli5.show_weights(clf, top=10)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+2.709,x199378,,
+2.601,x938889,,
+1.820,x349126,,
+1.691,x718537,,
+1.562,x242643,,
+1.522,x71928,,
… 53649 more positive …,… 53649 more positive …,,
… 53981 more negative …,… 53981 more negative …,,
-1.694,x683213,,
-1.757,x741207,,

Weight?,Feature
+2.709,x199378
+2.601,x938889
+1.820,x349126
+1.691,x718537
+1.562,x242643
+1.522,x71928
… 53649 more positive …,… 53649 more positive …
… 53981 more negative …,… 53981 more negative …
-1.694,x683213
-1.757,x741207

Weight?,Feature
+3.530,x580586
+1.848,x342790
+1.747,x771885
+1.716,x363686
+1.663,x111283
… 32802 more positive …,… 32802 more positive …
… 32516 more negative …,… 32516 more negative …
-1.643,x1031983
-1.669,x85557
-1.851,x120354

Weight?,Feature
+2.206,x988761
+2.135,x337555
+1.937,x154565
+1.683,x806262
… 46295 more positive …,… 46295 more positive …
… 46148 more negative …,… 46148 more negative …
-1.661,x34701
-1.721,x354651
-1.734,x790864
-1.956,x85557

Weight?,Feature
+3.150,x641063
+2.955,x199709
+2.793,x741207
+2.034,x396081
+1.778,x274863
… 55186 more positive …,… 55186 more positive …
… 55313 more negative …,… 55313 more negative …
-1.983,x672777
-2.066,x443433
-2.087,x199378


In [67]:
eli5.show_weights(clf, vec=vec, top=10,
                  target_names=twenty_test.target_names)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+2.709,x199378,,
+2.601,x938889,,
+1.820,x349126,,
+1.691,x718537,,
+1.562,x242643,,
+1.522,x71928,,
… 53649 more positive …,… 53649 more positive …,,
… 53981 more negative …,… 53981 more negative …,,
-1.694,x683213,,
-1.757,x741207,,

Weight?,Feature
+2.709,x199378
+2.601,x938889
+1.820,x349126
+1.691,x718537
+1.562,x242643
+1.522,x71928
… 53649 more positive …,… 53649 more positive …
… 53981 more negative …,… 53981 more negative …
-1.694,x683213
-1.757,x741207

Weight?,Feature
+3.530,x580586
+1.848,x342790
+1.747,x771885
+1.716,x363686
+1.663,x111283
… 32802 more positive …,… 32802 more positive …
… 32516 more negative …,… 32516 more negative …
-1.643,x1031983
-1.669,x85557
-1.851,x120354

Weight?,Feature
+2.206,x988761
+2.135,x337555
+1.937,x154565
+1.683,x806262
… 46295 more positive …,… 46295 more positive …
… 46148 more negative …,… 46148 more negative …
-1.661,x34701
-1.721,x354651
-1.734,x790864
-1.956,x85557

Weight?,Feature
+3.150,x641063
+2.955,x199709
+2.793,x741207
+2.034,x396081
+1.778,x274863
… 55186 more positive …,… 55186 more positive …
… 55313 more negative …,… 55313 more negative …
-1.983,x672777
-2.066,x443433
-2.087,x199378


In [68]:
twenty_test.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [69]:
twenty_test.data[0]

"As I recall from my bout with kidney stones, there isn't any\nmedication that can do anything about them except relieve the pain.\n\nEither they pass, or they have to be broken up with sound, or they have\nto be extracted surgically.\n\nWhen I was in, the X-ray tech happened to mention that she'd had kidney\nstones and children, and the childbirth hurt less."

In [74]:
eli5.show_prediction(clf, twenty_test.data[0], vec=vec,
                     target_names=twenty_test.target_names)

Contribution?,Feature
0.025,Highlighted in text (sum)
-0.96,<BIAS>

Contribution?,Feature
-0.271,<BIAS>
-0.435,Highlighted in text (sum)

Contribution?,Feature
0.783,Highlighted in text (sum)
-0.558,<BIAS>

Contribution?,Feature
-0.274,Highlighted in text (sum)
-0.667,<BIAS>


In [75]:
eli5.show_prediction(clf, twenty_test.data[0], vec=vec,
                     target_names=twenty_test.target_names,
                     targets=['sci.med'])

Contribution?,Feature
0.783,Highlighted in text (sum)
-0.558,<BIAS>


In [76]:
vec = CountVectorizer(stop_words='english')
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(twenty_train.data, twenty_train.target)

print_report(pipe)

                        precision    recall  f1-score   support

           alt.atheism       0.86      0.76      0.81       319
         comp.graphics       0.85      0.94      0.89       389
               sci.med       0.92      0.85      0.88       396
soc.religion.christian       0.86      0.89      0.87       398

              accuracy                           0.87      1502
             macro avg       0.87      0.86      0.86      1502
          weighted avg       0.87      0.87      0.87      1502

accuracy: 0.868


In [77]:
eli5.show_prediction(clf, twenty_test.data[0], vec=vec,
                     target_names=twenty_test.target_names,
                     targets=['sci.med'])

Contribution?,Feature
1.035,Highlighted in text (sum)
0.176,<BIAS>


# 4. Char-based pipeline


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer(stop_words='english', analyzer='char',
                      ngram_range=(3,5))
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
len(twenty_train.data)


2257

In [25]:
twenty_train.data = twenty_train.data[:20]
twenty_train.target = twenty_train.target[:20]
len(twenty_train.data)

20

In [26]:
pipe.fit(twenty_train.data, twenty_train.target)

print_report(pipe)



                        precision    recall  f1-score   support

           alt.atheism       0.94      0.05      0.10       319
         comp.graphics       0.80      0.15      0.25       389
               sci.med       0.25      0.10      0.14       396
soc.religion.christian       0.30      0.94      0.45       398

              accuracy                           0.32      1502
             macro avg       0.57      0.31      0.23      1502
          weighted avg       0.55      0.32      0.24      1502

accuracy: 0.323


In [27]:
eli5.show_prediction(clf, twenty_test.data[0], vec=vec,
                     target_names=twenty_test.target_names)

Contribution?,Feature
-0.205,<BIAS>
-0.206,Highlighted in text (sum)

Contribution?,Feature
0.271,<BIAS>
-0.405,Highlighted in text (sum)

Contribution?,Feature
0.221,Highlighted in text (sum)
-0.316,<BIAS>

Contribution?,Feature
0.39,Highlighted in text (sum)
0.25,<BIAS>


In [28]:
vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5))
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(twenty_train.data, twenty_train.target)

print_report(pipe)



                        precision    recall  f1-score   support

           alt.atheism       0.86      0.06      0.11       319
         comp.graphics       0.77      0.21      0.33       389
               sci.med       0.28      0.11      0.16       396
soc.religion.christian       0.30      0.93      0.46       398

              accuracy                           0.34      1502
             macro avg       0.55      0.33      0.26      1502
          weighted avg       0.54      0.34      0.27      1502

accuracy: 0.342


In [29]:
eli5.show_prediction(clf, twenty_test.data[0], vec=vec,
                     target_names=twenty_test.target_names)

Contribution?,Feature
-0.206,<BIAS>
-0.235,Highlighted in text (sum)

Contribution?,Feature
0.296,<BIAS>
-0.392,Highlighted in text (sum)

Contribution?,Feature
0.235,Highlighted in text (sum)
-0.291,<BIAS>

Contribution?,Feature
0.393,Highlighted in text (sum)
0.2,<BIAS>


# HashingVectorizer

In [32]:
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=['headers', 'footers'],
)
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=['headers', 'footers'],
)


In [33]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vec = HashingVectorizer(stop_words='english', ngram_range=(1,2))
clf = SGDClassifier(random_state=42)
pipe = make_pipeline(vec, clf)
pipe.fit(twenty_train.data, twenty_train.target)

print_report(pipe)

                        precision    recall  f1-score   support

           alt.atheism       0.92      0.79      0.85       319
         comp.graphics       0.87      0.96      0.92       389
               sci.med       0.93      0.89      0.91       396
soc.religion.christian       0.88      0.93      0.91       398

              accuracy                           0.90      1502
             macro avg       0.90      0.89      0.90      1502
          weighted avg       0.90      0.90      0.90      1502

accuracy: 0.899


In [35]:
eli5.show_prediction(clf, twenty_test.data[0], vec=vec,
                     target_names=twenty_test.target_names,)

Contribution?,Feature
0.025,Highlighted in text (sum)
-0.96,<BIAS>

Contribution?,Feature
-0.271,<BIAS>
-0.435,Highlighted in text (sum)

Contribution?,Feature
0.783,Highlighted in text (sum)
-0.558,<BIAS>

Contribution?,Feature
-0.274,Highlighted in text (sum)
-0.667,<BIAS>


In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    '福建春季公务员考试报名18日截止 2月6日考试',
    '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜',
]
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range= (1,6))
X = vectorizer.fit_transform(corpus)
print([(len(w),w) for w in vectorizer.get_feature_names()])

[(1, ' '), (2, ' 2'), (3, ' 2月'), (4, ' 2月6'), (5, ' 2月6日'), (6, ' 2月6日考'), (2, ' d'), (3, ' do'), (4, ' doc'), (5, ' docu'), (6, ' docum'), (2, ' f'), (3, ' fi'), (4, ' fir'), (5, ' firs'), (6, ' first'), (2, ' i'), (3, ' is'), (4, ' is '), (2, ' s'), (3, ' se'), (4, ' sec'), (5, ' seco'), (6, ' secon'), (2, ' t'), (3, ' th'), (4, ' the'), (5, ' the '), (4, ' thi'), (5, ' this'), (6, ' this '), (2, ' 意'), (3, ' 意甲'), (4, ' 意甲首'), (5, ' 意甲首轮'), (6, ' 意甲首轮补'), (2, ' 福'), (3, ' 福建'), (4, ' 福建春'), (5, ' 福建春季'), (6, ' 福建春季公'), (1, '.'), (2, '. '), (1, '0'), (2, '0年'), (3, '0年连'), (4, '0年连胜'), (5, '0年连胜 '), (1, '1'), (2, '10'), (3, '10年'), (4, '10年连'), (5, '10年连胜'), (6, '10年连胜 '), (2, '18'), (3, '18日'), (4, '18日截'), (5, '18日截止'), (6, '18日截止 '), (1, '2'), (2, '2月'), (3, '2月6'), (4, '2月6日'), (5, '2月6日考'), (6, '2月6日考试'), (1, '6'), (2, '6日'), (3, '6日考'), (4, '6日考试'), (5, '6日考试 '), (1, '8'), (2, '8战'), (3, '8战不'), (4, '8战不败'), (5, '8战不败国'), (6, '8战不败国米'), (2, '8日'), (3, '8日截'), (4, '8日截止'), (5, 

In [82]:
from IPython.display import display
display(eli5.show_weights(new_m.model))
            

Weight?,Feature
+0.136,x35
+0.136,x36
+0.136,x57
+0.097,x27
+0.097,x16
+0.097,x17
+0.097,x18
+0.097,x19
+0.097,x29
+0.097,x56


In [None]:
display(eli5.show_weights(clf2))

In [9]:

import eli5
eli5.show_weights(m, top=10)



In [6]:
eli5.show_weights(new_m, vec = new_m.vectorizer,
                        top=10,
                        target_names = ['education', 'sports'])