Skip to content

Commit

Permalink
[MRG+1] TST Add test coverage for CountVectorizer with ngram_range > 1 (#9318)
Browse files: browse the repository at this point in the history

* Add coverage countVectorizer

* Add test for analyzer='word'

* remove redundant test

* Update test

* Change index

* Remove indexing
  • Loading branch information
herilalaina authored and amueller committed Jul 21, 2017
1 parent 894fd72 commit d96a462
Showing 1 changed file with 19 additions and 0 deletions.
19 changes: 19 additions & 0 deletions sklearn/feature_extraction/tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,25 @@ def test_char_wb_ngram_analyzer():
assert_equal(cnga(text)[:6], expected)


def test_word_ngram_analyzer():
    """Check word-level n-gram analysis with ``ngram_range=(3, 6)``.

    The expected values pin what the analyzer returns for the sample
    text: the n-grams are lower-cased, the one-letter tokens "a" and "I"
    do not appear in them, the output starts with 3-grams and ends with
    the 6-grams.  An analyzer built with ``input='file'`` must return the
    same n-grams when handed a file-like object wrapping the same text.
    """
    sample = "This \n\tis a test, really.\n\n I met Harry yesterday"

    analyze = CountVectorizer(
        analyzer='word', strip_accents='unicode', ngram_range=(3, 6)
    ).build_analyzer()
    ngrams = analyze(sample)

    # Head of the output: the first three 3-grams.
    leading = ['this is test', 'is test really', 'test really met']
    assert_equal(ngrams[:3], leading)

    # Tail of the output: the last 5-gram followed by both 6-grams.
    trailing = [
        'test really met harry yesterday',
        'this is test really met harry',
        'is test really met harry yesterday',
    ]
    assert_equal(ngrams[-3:], trailing)

    # Same text read through a file-like object.  This analyzer is built
    # without strip_accents; the sample is plain ASCII, so the outputs
    # are still expected to match.
    analyze_stream = CountVectorizer(
        input='file', analyzer='word', ngram_range=(3, 6)
    ).build_analyzer()
    stream = StringIO(sample)
    assert_equal(analyze_stream(stream), ngrams)


def test_countvectorizer_custom_vocabulary():
vocab = {"pizza": 0, "beer": 1}
terms = set(vocab.keys())
Expand Down

0 comments on commit d96a462

Please sign in to comment.