# Traceable n-grams with tf-idf



In [1]:
from jyquickhelper import add_notebook_menu

# NOTE(review): presumably renders a navigation menu built from the
# notebook's section headings - confirm against the jyquickhelper docs.
add_notebook_menu()

## Example with CountVectorizer

### scikit-learn version

In [2]:
import numpy
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: four short sentences followed by one empty document.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
    "",
]
# A flat list of strings already yields a one-dimensional array of length 5.
corpus = numpy.array(documents)

# Count unigrams and bigrams with the stock scikit-learn vectorizer.
mod1 = CountVectorizer(ngram_range=(1, 2))
mod1.fit(corpus)

# Densify the sparse count matrix so that it is displayed in full.
counts_sklearn = mod1.transform(corpus)
counts_sklearn.todense()

matrix([[0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
         0],
        [0, 0, 2, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
         0],
        [1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1,
         0],
        [0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
         1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]], dtype=int64)

In [3]:
# Fitted vocabulary: every n-gram, stored as a single space-joined string,
# mapped to an integer feature index.
mod1.vocabulary_

{'this': 18,
 'is': 6,
 'the': 12,
 'first': 4,
 'document': 2,
 'this is': 20,
 'is the': 7,
 'the first': 13,
 'first document': 5,
 'second': 10,
 'this document': 19,
 'document is': 3,
 'the second': 14,
 'second document': 11,
 'and': 0,
 'third': 16,
 'one': 9,
 'and this': 1,
 'the third': 15,
 'third one': 17,
 'is this': 8,
 'this the': 21}

In [4]:
import numpy
from mlinsights.mlmodel.sklearn_text import TraceableCountVectorizer

# Toy corpus: four short sentences followed by one empty document.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
    "",
]
# A flat list of strings already yields a one-dimensional array of length 5.
corpus = numpy.array(documents)

# Count unigrams and bigrams with the traceable vectorizer from mlinsights.
mod2 = TraceableCountVectorizer(ngram_range=(1, 2))
mod2.fit(corpus)

# Densify the sparse count matrix so that it is displayed in full.
counts_traceable = mod2.transform(corpus)
counts_traceable.todense()

matrix([[0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
         0],
        [0, 0, 2, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
         0],
        [1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1,
         0],
        [0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
         1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]], dtype=int64)

In [5]:
# Fitted vocabulary of the traceable vectorizer: each key is a tuple of
# tokens (one element per token of the n-gram) mapped to an integer index.
mod2.vocabulary_

{('this',): 18,
 ('is',): 6,
 ('the',): 12,
 ('first',): 4,
 ('document',): 2,
 ('this', 'is'): 20,
 ('is', 'the'): 7,
 ('the', 'first'): 13,
 ('first', 'document'): 5,
 ('second',): 10,
 ('this', 'document'): 19,
 ('document', 'is'): 3,
 ('the', 'second'): 14,
 ('second', 'document'): 11,
 ('and',): 0,
 ('third',): 16,
 ('one',): 9,
 ('and', 'this'): 1,
 ('the', 'third'): 15,
 ('third', 'one'): 17,
 ('is', 'this'): 8,
 ('this', 'the'): 21}

The new class does the exact same thing but keeps n-grams in a more explicit form. The original form as a string is sometimes ambiguous, as the next example shows.

## Funny example with TfidfVectorizer

### scikit-learn version

In [6]:
import numpy
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: four short sentences followed by one empty document.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
    "",
]
# A flat list of strings already yields a one-dimensional array of length 5.
corpus = numpy.array(documents)

# The token pattern matches runs of one to four letters or spaces, so a
# token may itself contain spaces.
tfidf_options = {
    "ngram_range": (1, 2),
    "token_pattern": "[a-zA-Z ]{1,4}",
}
mod1 = TfidfVectorizer(**tfidf_options)
mod1.fit(corpus)

# Densify the sparse tf-idf matrix so that it is displayed in full.
tfidf_sklearn = mod1.transform(corpus)
tfidf_sklearn.todense()

matrix([[0.        , 0.        , 0.28423202, 0.28423202, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.28423202, 0.28423202, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.28423202, 0.28423202, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.28423202,
         0.        , 0.        , 0.28423202, 0.28423202, 0.        ,
         0.        , 0.23593826, 0.28423202, 0.        , 0.        ,
         0.        , 0.23593826, 0.        , 0.28423202, 0.        ,
         0.        ],
        [0.24656798, 0.24656798, 0.        , 0.        , 0.24656798,
         0.24656798, 0.24656798, 0.24656798, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.24656798,
         0.24656798, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.24656798, 0.        ,
         0.24656798, 0.24656798, 0.        , 0.        , 0.24656798,
         0.2

In [7]:
# Fitted vocabulary: bigrams are stored as two tokens joined by a space, but
# the tokens produced by this pattern can contain spaces themselves, so the
# boundary between the two tokens is not visible in the key.
mod1.vocabulary_

{'this': 36,
 ' is ': 2,
 'the ': 31,
 'firs': 17,
 't do': 27,
 'cume': 10,
 'nt': 24,
 'this  is ': 38,
 ' is  the ': 3,
 'the  firs': 32,
 'firs t do': 18,
 't do cume': 28,
 'cume nt': 11,
 ' doc': 0,
 'umen': 39,
 't is': 29,
 ' the': 6,
 ' sec': 4,
 'ond ': 25,
 'docu': 14,
 'ment': 23,
 'this  doc': 37,
 ' doc umen': 1,
 'umen t is': 40,
 't is  the': 30,
 ' the  sec': 7,
 ' sec ond ': 5,
 'ond  docu': 26,
 'docu ment': 15,
 'and ': 8,
 'thir': 34,
 'd on': 12,
 'e': 16,
 'and  this': 9,
 'the  thir': 33,
 'thir d on': 35,
 'd on e': 13,
 'is t': 21,
 'his ': 19,
 'is t his ': 22,
 'his  the ': 20}

### mlinsights version

In [8]:
from mlinsights.mlmodel.sklearn_text import TraceableTfidfVectorizer

# Same settings as the scikit-learn model: unigrams and bigrams over tokens
# made of one to four letters or spaces.
tfidf_options = {
    "ngram_range": (1, 2),
    "token_pattern": "[a-zA-Z ]{1,4}",
}
mod2 = TraceableTfidfVectorizer(**tfidf_options)
mod2.fit(corpus)

# Densify the sparse tf-idf matrix so that it is displayed in full.
tfidf_traceable = mod2.transform(corpus)
tfidf_traceable.todense()

matrix([[0.        , 0.        , 0.28423202, 0.28423202, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.28423202, 0.28423202, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.28423202, 0.28423202, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.28423202,
         0.        , 0.        , 0.28423202, 0.28423202, 0.        ,
         0.        , 0.23593826, 0.28423202, 0.        , 0.        ,
         0.        , 0.23593826, 0.        , 0.28423202, 0.        ,
         0.        ],
        [0.24656798, 0.24656798, 0.        , 0.        , 0.24656798,
         0.24656798, 0.24656798, 0.24656798, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.24656798,
         0.24656798, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.24656798, 0.        ,
         0.24656798, 0.24656798, 0.        , 0.        , 0.24656798,
         0.2

In [9]:
# Fitted vocabulary of the traceable vectorizer: keys are tuples of tokens,
# so the boundary between the tokens of a bigram stays explicit.
mod2.vocabulary_

{('this',): 36,
 (' is ',): 2,
 ('the ',): 31,
 ('firs',): 17,
 ('t do',): 27,
 ('cume',): 10,
 ('nt',): 24,
 ('this', ' is '): 38,
 (' is ', 'the '): 3,
 ('the ', 'firs'): 32,
 ('firs', 't do'): 18,
 ('t do', 'cume'): 28,
 ('cume', 'nt'): 11,
 (' doc',): 0,
 ('umen',): 39,
 ('t is',): 29,
 (' the',): 6,
 (' sec',): 4,
 ('ond ',): 25,
 ('docu',): 14,
 ('ment',): 23,
 ('this', ' doc'): 37,
 (' doc', 'umen'): 1,
 ('umen', 't is'): 40,
 ('t is', ' the'): 30,
 (' the', ' sec'): 7,
 (' sec', 'ond '): 5,
 ('ond ', 'docu'): 26,
 ('docu', 'ment'): 15,
 ('and ',): 8,
 ('thir',): 34,
 ('d on',): 12,
 ('e',): 16,
 ('and ', 'this'): 9,
 ('the ', 'thir'): 33,
 ('thir', 'd on'): 35,
 ('d on', 'e'): 13,
 ('is t',): 21,
 ('his ',): 19,
 ('is t', 'his '): 22,
 ('his ', 'the '): 20}

As you can see, the original n-gram with index 30, ``'t is  the'``, is a little bit ambiguous. It is in fact ``('t is', ' the')``, as the *TraceableTfidfVectorizer* lets you know. The original form could have been ``('t', 'is  the')``, ``('t is', '  the')``, ``('t is ', ' the')``, ``('t is  ', 'the')``, ``('t', 'is  ', 'the')``... The regular expression gives some hints, but not enough information to easily guess the right one.