# pos

> Extract part-of-speech features.

In [None]:
#| default_exp pos

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils.validation import check_is_fitted

from textplumber.core import pass_tokens
from textplumber.store import TextFeatureStore

In [None]:
#| export
class POSVectorizer(BaseEstimator, TransformerMixin):
    """ scikit-learn pipeline component to extract part-of-speech tag features. This component should be used after the SpacyPreprocessor component with the same feature store.
        The component gets the parts of speech for each text from the feature store and returns a matrix of counts (via CountVectorizer). """

    def __init__(self,
                 feature_store: TextFeatureStore, # the feature store to use - this should be the same feature store used in the SpacyPreprocessor component
                 ngram_range: tuple = (1, 1), # the ngram range to use (min_n, max_n) - passed to CountVectorizer
                 vocabulary: list|None = None, # list of tokens to use - passed to CountVectorizer
                 ):
        # Constructor arguments are stored unmodified under their own names,
        # which is what scikit-learn's get_params/set_params/clone rely on.
        self.feature_store = feature_store
        self.ngram_range = ngram_range
        self.vocabulary = vocabulary
        # TODO: optional normalization of the counts (a `scale` flag) is not implemented yet.

    def fit(self, X, y=None):
        """ Fit the underlying CountVectorizer on the parts of speech that the feature store returns for the texts in X. Returns self. """
        self.vectorizer_ = CountVectorizer(
            tokenizer=pass_tokens,  # presumably passes the stored tag sequences through unchanged - confirm in textplumber.core
            lowercase=False,  # keep the tags exactly as stored (e.g. 'NOUN')
            stop_words=None,
            token_pattern=None,  # unused when a custom tokenizer is supplied
            min_df=1,
            max_df=1.0,
            max_features=None,
            ngram_range=self.ngram_range,
            vocabulary=self.vocabulary,
        )
        self.vectorizer_.fit(self.feature_store.get_pos_from_texts(X), y)
        return self

    def transform(self, X):
        """ Return the matrix of part-of-speech counts produced by the fitted CountVectorizer for the texts in X.
            Raises NotFittedError if called before fit. """
        # NotFittedError subclasses AttributeError, so callers that caught the
        # previous bare AttributeError keep working.
        check_is_fitted(self, 'vectorizer_')
        docs_pos = self.feature_store.get_pos_from_texts(X)
        return self.vectorizer_.transform(docs_pos)

    def get_feature_names_out(self, input_features=None):
        """ Return the feature names learned by the fitted CountVectorizer.
            Raises NotFittedError if called before fit. """
        check_is_fitted(self, 'vectorizer_')
        return self.vectorizer_.get_feature_names_out(input_features)
        

TODO: add an example.

In [None]:
#| hide
from textplumber.preprocess import SpacyPreprocessor
from sklearn.pipeline import Pipeline
import os

# Smoke test: run the preprocessor + POSVectorizer pipeline on one short text
# and check the counts for a few expected part-of-speech tags.
db_path = 'test_pos.sqlite'
# Pre-bind the names released in `finally` so cleanup works even if setup fails part-way.
feature_store = None
pipeline = None
try:
    feature_store = TextFeatureStore(db_path)
    spacy_preprocessor = SpacyPreprocessor(feature_store=feature_store)
    spacy_pos_vectorizer = POSVectorizer(feature_store=feature_store)
    pipeline = Pipeline([
        ('spacy_preprocessor', spacy_preprocessor),
        ('spacy_pos_vectorizer', spacy_pos_vectorizer)
    ])
    pipeline.fit(['Hello, world!'])
    X = pipeline.transform(['Hello, world!'])

    # Compute the lookups once instead of once per assertion, and avoid
    # shadowing the builtin `id` in the notebook namespace.
    feature_names = spacy_pos_vectorizer.get_feature_names_out().tolist()
    counts = X.todense()
    expected_counts = {'NOUN': 1, 'PUNCT': 2, 'INTJ': 1}
    for tag, expected in expected_counts.items():
        assert counts[0, feature_names.index(tag)] == expected
finally:
    # Always clean up, even when an assertion fails, so a stale database file
    # cannot leak into later runs. References are dropped before the file is
    # removed, in the same order as the original cleanup.
    del feature_store
    del pipeline

    if os.path.exists(db_path):
        os.remove(db_path)

In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev

nbdev.nbdev_export()