In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train',categories=categories, shuffle=True, random_state=42) 

In [2]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 35788)

In [4]:
count_vect.vocabulary_.get(u'algorithm')

4690

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(2257, 35788)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),
])


In [7]:
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [8]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']
predicted = text_clf.predict(docs_new)

In [9]:
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [10]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test',
     categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.8348868175765646

In [11]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),
 ])

text_clf.fit(twenty_train.data, twenty_train.target)  

predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target) 



0.9127829560585885

In [12]:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

             micro avg       0.91      0.91      0.91      1502
             macro avg       0.92      0.91      0.91      1502
          weighted avg       0.92      0.91      0.91      1502



In [13]:
metrics.confusion_matrix(twenty_test.target, predicted)

array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]])

In [14]:
from sklearn.model_selection import GridSearchCV
parameters = {
     'vect__ngram_range': [(1, 1), (1, 2)],
     'tfidf__use_idf': (True, False),
     'clf__alpha': (1e-2, 1e-3),
 }

In [15]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1,verbose=4)

In [16]:
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  40 | elapsed:    2.6s remaining:    6.0s
[Parallel(n_jobs=-1)]: Done  23 out of  40 | elapsed:    2.8s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done  34 out of  40 | elapsed:    3.1s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    3.3s finished


In [17]:
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

In [18]:
gs_clf.best_score_

0.9151349867929058

In [19]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 2)


In [20]:
from pytorch_pretrained_bert.modeling import BertConfig, BertModel

from allennlp.common.testing import ModelTestCase
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField, ListField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers.wordpiece_indexer import PretrainedBertIndexer
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import BertBasicWordSplitter
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.token_embedders.bert_token_embedder import BertEmbedder
from tqdm import tqdm
from torch.autograd import Variable

class MeanEmbeddingVectorizer(object):
    def __init__(self):
      self.token_indexer = PretrainedBertIndexer(pretrained_model="bert-base-cased")
      self.bert_model = BertModel.from_pretrained(pretrained_model_name="bert-base-cased")
      self.token_embedder = BertEmbedder(self.bert_model,top_layer_only=True)
      self.tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
      self.device= "cuda:1"
      self.token_embedder.to(self.device)
    def fit(self, X, y):
      return self

    def softmax(self,x):
      e_x = np.exp(x - np.max(x))
      return e_x / e_x.sum()

    def transform(self, X):
      listofembedds = []
      batchinstances = [] 
      vocab = Vocabulary()
      listofbatches = []
      for sentence1 in tqdm(X,desc="Tokenizing:"):
        tokens1 = self.tokenizer.tokenize(sentence1)
        instance1 = Instance({"tokens": TextField(tokens1, {"bert": self.token_indexer})})
        batchinstances.append(instance1)
        if len(batchinstances)>5:
            listofbatches.append(Batch(batchinstances))
            batchinstances=[]
      listofbatches.append(Batch(batchinstances))
      for batch in tqdm(listofbatches,"Generatin Embedding:"): 
          batch.index_instances(vocab)
          padding_lengths = batch.get_padding_lengths()
          tensor_dict = batch.as_tensor_dict(padding_lengths)
          tokens = tensor_dict["tokens"]
          bert_vectors1 = self.token_embedder(Variable(tokens["bert"]).to(self.device), offsets=Variable(tokens["bert-offsets"]).to(self.device))
          bert_vectors1 = bert_vectors1.detach().cpu().numpy()
          tokens["bert"].to("cpu")
          tokens["bert-offsets"].to("cpu")
          for vectors in bert_vectors1:
            mean = np.mean(vectors,axis=0)
            #print(mean)
            arr = self.softmax(mean)
            listofembedds.append(arr)
      return np.array(listofembedds)


import logging
from torch.multiprocessing import Manager, Process, Queue, get_logger
logger = get_logger()  # pylint: disable=invalid-name
logger.setLevel(logging.ERROR)



In [21]:
from sklearn.pipeline import FeatureUnion

tfidf = Pipeline([('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer())])
featureunion = FeatureUnion([("tfidf",tfidf),("mec",MeanEmbeddingVectorizer())])

text_clf = Pipeline([
     ("fec",featureunion),
     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),
 ])

text_clf.fit(twenty_train.data, twenty_train.target)

01/29/2019 13:47:33 - ERROR - allennlp.data.token_indexers.wordpiece_indexer -   Your BERT model appears to be cased, but your indexer is lowercasing tokens.
01/29/2019 13:47:34 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /home/pbanerj6/.pytorch_pretrained_bert/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
01/29/2019 13:47:34 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz from cache at /home/pbanerj6/.pytorch_pretrained_bert/a803ce83ca27fecf74c355673c434e51c265fb8a3e0e57ac62a80e38ba98d384.681017f415dfb33ec8d0e04fe51a619f3f01532ecea04edbfd48c5d160550d9c
01/29/2019 13:47:34 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /home/pbanerj6/.pytorch_pretrained_bert/a803ce83ca

Pipeline(memory=None,
     steps=[('fec', FeatureUnion(n_jobs=None,
       transformer_list=[('tfidf', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [22]:
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

Tokenizing:: 100%|██████████| 1502/1502 [00:08<00:00, 171.76it/s]
Generatin Embedding:: 100%|██████████| 251/251 [00:22<00:00, 11.34it/s]


0.9141145139813582

In [23]:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.88       319
         comp.graphics       0.88      0.98      0.93       389
               sci.med       0.94      0.89      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

             micro avg       0.91      0.91      0.91      1502
             macro avg       0.92      0.91      0.91      1502
          weighted avg       0.92      0.91      0.91      1502



In [24]:
metrics.confusion_matrix(twenty_test.target, predicted)-np.array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]])

array([[ 1, -2,  0,  1],
       [ 0,  2, -1, -1],
       [ 0,  1, -1,  0],
       [ 0,  0,  0,  0]])