# tokens

> Extract token features.

In [None]:
#| default_exp tokens

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from sklearn.base import BaseEstimator, TransformerMixin
from textplumber.store import TextFeatureStore
from textplumber.core import pass_tokens
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
#| export
class TokensVectorizer(BaseEstimator, TransformerMixin):
    """ Scikit-learn pipeline component to extract token features. This component should be used after the SpacyPreprocessor component with the same feature store.
        The component gets the tokens from the feature store and returns a matrix of counts (via CountVectorizer) or Tf-idf scores (via TfidfVectorizer). """
    
    def __init__(self, 
                 feature_store: TextFeatureStore, # the feature store to use - this should be the same feature store used in the SpacyPreprocessor component
                 vectorizer_type:str = 'count', # the type of vectorizer to use - 'count' for CountVectorizer or 'tfidf' for TfidfVectorizer
                 lowercase:bool = False, # whether to lowercase the tokens 
                 min_token_length:int = 0, # the minimum token length to use
                 remove_punctuation:bool = False, # whether to remove punctuation from the tokens
                 remove_numbers:bool = False, # whether to remove numbers from the tokens
                 stop_words:list[str]|None = None, # the stop words to use - passed to CountVectorizer or TfidfVectorizer
                 min_df:float|int = 1, # the minimum document frequency to use - passed to CountVectorizer or TfidfVectorizer
                 max_df:float|int = 1.0, # the maximum document frequency to use - passed to CountVectorizer or TfidfVectorizer
                 max_features:int = 5000, # the maximum number of features to use, setting a default to avoid memory issues - passed to CountVectorizer or TfidfVectorizer
                 ngram_range:tuple = (1, 1), # the ngram range to use (min_n, max_n) - passed to CountVectorizer or TfidfVectorizer
                 vocabulary:list|None = None, # list of tokens to use - passed to CountVectorizer or TfidfVectorizer
                 encoding:str = 'utf-8', # the encoding to use - passed to CountVectorizer or TfidfVectorizer 
                 decode_error:str = 'ignore' # what to do if there is an error decoding 'strict', 'ignore', 'replace' - passed to CountVectorizer or TfidfVectorizer
                ):
        # Parameters are stored unmodified under their own names; validation is
        # deferred to fit(), where an invalid vectorizer_type raises ValueError.
        self.vectorizer_type = vectorizer_type
        self.feature_store = feature_store
        self.lowercase = lowercase
        self.min_token_length = min_token_length
        self.remove_punctuation = remove_punctuation
        self.remove_numbers = remove_numbers
        self.stop_words = stop_words
        self.min_df = min_df
        self.max_df = max_df
        self.max_features = max_features
        self.ngram_range = ngram_range
        self.vocabulary = vocabulary
        self.encoding = encoding
        self.decode_error = decode_error

    def _get_tokens(self, X):
        """ Get the tokens for the texts from the feature store, applying the configured token options.
            Shared by fit and transform so both always request tokens with identical options. """
        return self.feature_store.get_tokens_from_texts(
            X,
            lowercase=self.lowercase,
            min_token_length=self.min_token_length,
            remove_punctuation=self.remove_punctuation,
            remove_numbers=self.remove_numbers,
        )

    def _make_vectorizer(self):
        """ Create the (unfitted) CountVectorizer or TfidfVectorizer selected by vectorizer_type.
            Raises ValueError if vectorizer_type is not 'tfidf' or 'count'. """
        if self.vectorizer_type == 'tfidf':
            vectorizer_class = TfidfVectorizer
        elif self.vectorizer_type == 'count':
            vectorizer_class = CountVectorizer
        else:
            raise ValueError("Invalid vectorizer_type. Use 'tfidf' or 'count'.")
        # Both vectorizers take the same arguments. The vectorizer's own lowercasing
        # and token pattern are switched off and pass_tokens is used as the tokenizer;
        # the lowercase option of this component is passed to the feature store instead.
        return vectorizer_class(
            tokenizer=pass_tokens,
            lowercase=False,
            token_pattern=None,
            stop_words=self.stop_words,
            min_df=self.min_df,
            max_df=self.max_df,
            max_features=self.max_features,
            ngram_range=self.ngram_range,
            vocabulary=self.vocabulary,
            encoding=self.encoding,
            decode_error=self.decode_error,
        )

    def _require_fitted(self):
        """ Raise NotFittedError if fit has not been called yet.
            NotFittedError subclasses both ValueError and AttributeError, so code that
            caught the AttributeError of a missing vectorizer_ keeps working. """
        from sklearn.utils.validation import check_is_fitted
        check_is_fitted(self, 'vectorizer_')

    def fit(self, X, y=None):
        """ Fit the vectorizer to the tokens in the feature store. Returns self. """
        # Build the vectorizer first so an invalid vectorizer_type fails before any tokens are fetched.
        self.vectorizer_ = self._make_vectorizer()
        self.vectorizer_.fit(self._get_tokens(X), y)
        return self
    
    def transform(self, X):
        """ Transform the texts to a matrix of counts or tf-idf scores. """
        self._require_fitted()
        return self.vectorizer_.transform(self._get_tokens(X))
    
    def get_feature_names_out(self, input_features=None):
        """ Get the feature names out from the vectorizer. """
        self._require_fitted()
        return self.vectorizer_.get_feature_names_out(input_features)
    
    

TODO: add an example.

In [None]:
#| hide
import os
from textplumber.preprocess import SpacyPreprocessor
from sklearn.pipeline import Pipeline

In [None]:
#| hide
# Smoke tests for TokensVectorizer. Every pipeline below shares one feature store
# between the SpacyPreprocessor step and the TokensVectorizer step.
feature_store = TextFeatureStore('test_tokens.sqlite')
spacy_preprocessor = SpacyPreprocessor(feature_store=feature_store)

# test case-sensitive counts: a single document yields four features, each counted once
spacy_tokens_vectorizer = TokensVectorizer(feature_store=feature_store, vectorizer_type='count', lowercase=False)
pipeline = Pipeline([
    ('spacy_preprocessor', spacy_preprocessor),
    ('spacy_tokens_vectorizer', spacy_tokens_vectorizer)
])
pipeline.fit(['Hello, world!'])
X = pipeline.transform(['Hello, world!'])
assert tuple(X.toarray()[0]) == tuple([1, 1, 1, 1])

# test lowercasing
spacy_tokens_vectorizer = TokensVectorizer(feature_store=feature_store, vectorizer_type='count', lowercase=True)
pipeline = Pipeline([
    ('spacy_preprocessor', spacy_preprocessor),
    ('spacy_tokens_vectorizer', spacy_tokens_vectorizer)
])
pipeline.fit(['Hello, world!', 'hello, world!'])
X = pipeline.transform(['Hello, world!', 'hello, world!'])
# docs should be equivalent: compare the first document with the second
# (comparing a row with itself would always pass and test nothing)
assert tuple(X.toarray()[0]) == tuple(X.toarray()[1])

# test vocabulary
spacy_tokens_vectorizer = TokensVectorizer(feature_store=feature_store, vectorizer_type='count', lowercase=True, vocabulary=['hello', 'world'])
pipeline = Pipeline([
    ('spacy_preprocessor', spacy_preprocessor),
    ('spacy_tokens_vectorizer', spacy_tokens_vectorizer)
])
pipeline.fit(['Hello, world!', 'hello, world!'])
X = pipeline.transform(['Hello, world!', 'hello, world!'])
assert tuple(X.toarray()[0]) == tuple([1, 1])
assert tuple(X.toarray()[1]) == tuple([1, 1])
# look up the column for 'hello' by name (avoid shadowing the builtin `id`)
hello_idx = spacy_tokens_vectorizer.get_feature_names_out().tolist().index('hello')
assert X.todense()[0, hello_idx] == 1
assert X.todense()[1, hello_idx] == 1
# tokens outside the fixed vocabulary are not counted
X = pipeline.transform(['something else'])
assert tuple(X.toarray()[0]) == tuple([0, 0])

# test empty text
spacy_tokens_vectorizer = TokensVectorizer(feature_store=feature_store, vectorizer_type='count', lowercase=True)
pipeline = Pipeline([
    ('spacy_preprocessor', spacy_preprocessor),
    ('spacy_tokens_vectorizer', spacy_tokens_vectorizer)
])
pipeline.fit(['Hello, world!', 'hello, world!'])
X = pipeline.transform(['Hello, world!', 'hello, world!', ''])
assert tuple(X.toarray()[0]) == tuple([1, 1, 1, 1])
assert tuple(X.toarray()[1]) == tuple([1, 1, 1, 1])
# an empty document produces an all-zero row
assert tuple(X.toarray()[2]) == tuple([0, 0, 0, 0])

# remove the feature store file created for these tests
os.remove('test_tokens.sqlite')

In [None]:
#| hide
# Export the cells marked `#| export` in this notebook via nbdev.
import nbdev
nbdev.nbdev_export()