[MRG+1] TST Add test coverage for CountVectorizer with ngram_range > 1 (#9318)

* Add coverage for CountVectorizer
* Add test for analyzer='word'
* Remove redundant test
* Update test
* Change index
* Remove indexing
scikit-learn · Jul 21, 2017 · d96a462 · d96a462
1 parent 894fd72
commit d96a462
Showing 1 changed file with 19 additions and 0 deletions.
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
@@ -223,6 +223,25 @@ def test_char_wb_ngram_analyzer():
     assert_equal(cnga(text)[:6], expected)
 
 
+def test_word_ngram_analyzer():
+    cnga = CountVectorizer(analyzer='word', strip_accents='unicode',
+                           ngram_range=(3, 6)).build_analyzer()
+
+    text = "This \n\tis a test, really.\n\n I met Harry yesterday"
+    expected = ['this is test', 'is test really', 'test really met']
+    assert_equal(cnga(text)[:3], expected)
+
+    expected = ['test really met harry yesterday',
+                'this is test really met harry',
+                'is test really met harry yesterday']
+    assert_equal(cnga(text)[-3:], expected)
+
+    cnga_file = CountVectorizer(input='file', analyzer='word',
+                                ngram_range=(3, 6)).build_analyzer()
+    file = StringIO(text)
+    assert_equal(cnga_file(file), cnga(text))
+
+
 def test_countvectorizer_custom_vocabulary():
     vocab = {"pizza": 0, "beer": 1}
     terms = set(vocab.keys())