### Custom Count Vectorizer example

In [7]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

In [50]:
# some test text with contractions
test_text = np.array(["does, do, doing1234",
                      "isn't doesn't weren't"])

In [63]:
# create a custom vectorizer class that inherits from base class
# add a few more custom preprocessing and tokenization steps
class CustomVectorizer(CountVectorizer):  
    def build_analyzer(self):
        stop_words = self.get_stop_words()
        
        def analyzer(doc):
            
            # preprocess: do any further cleaning here if needed
            # example: remove everything but letters
            cleaned_doc = re.sub(r"[^A-Za-z]", " ", doc)
            
            # instantiate snowball stemmer
            stemmer = SnowballStemmer("english")
            # create tokens
            tokens = [stemmer.stem(d) for d in word_tokenize(cleaned_doc)] 
            
            return(self._word_ngrams(tokens, stop_words))
        return analyzer

In [61]:
# create an instance of your custom vectorizer
cVectorizer = CustomVectorizer(#strip_accents='ascii',
                               #max_df=0.05,
                               #min_df=3,
                               stop_words = 'english'
                               #ngram_range=(1,2)
                              )

# create document term matrix
dtm = cVectorizer.fit_transform(test_text)
tokens = cVectorizer.get_feature_names()

In [62]:
print(tokens)

['doe', 'doesn', 'isn', 't', 'weren']
