# textstats

> Extract document-level statistics as features.

In [None]:
#| default_exp textstats

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from sklearn.base import BaseEstimator, TransformerMixin
from textplumber.store import TextFeatureStore

In [None]:
#| export
class TextstatsTransformer(BaseEstimator, TransformerMixin):
	""" Scikit-learn pipeline component to extract document-level text statistics based on the textstat library and pre-computed counts. 
		This component should be used after the SpacyPreprocessor component with the same feature store. 
		The statistics currently available are monosyllable count, polysyllable count, token count, sentence count, unique tokens count and average sentence length. 
		Raises a ValueError on construction if `columns` contains a name that is not one of the available statistics. """

	# Every statistic that can be requested, in the default output order. 
	# This is a tuple (not a list) because it doubles as the default for `columns`: 
	# an immutable default cannot be modified through one instance and leak into all others.
	POSSIBLE_COLUMNS = ('monosyll_count', 'polysyll_count', 'token_count', 'sentence_count', 'unique_tokens_count', 'average_sentence_length')

	def __init__(self, 
				feature_store: TextFeatureStore, # the feature store to use
				columns = POSSIBLE_COLUMNS, # names of the statistics to extract, in output order - defaults to all available statistics
				#scale: bool = True, # whether to scale the features - not implemented yet
				):
		self.feature_store = feature_store
		#self.scale = scale
		# fail fast on the first unknown name so that typos surface at construction time
		for col in columns:
			if col not in self.POSSIBLE_COLUMNS:
				raise ValueError(f"Invalid column name: {col}. Possible columns are: {list(self.POSSIBLE_COLUMNS)}")
		# stored exactly as passed (no copy): scikit-learn's clone() requires that 
		# constructor parameters are kept unmodified on the instance
		self.columns = columns

	def fit(self, X, y=None):
		""" Fit is implemented but does nothing. Returns the transformer itself so it can be used in a pipeline. """
		#if self.scale:        
		#	self.scaler_ = StandardScaler()
		#	self.scaler_.fit(self.feature_store.get_textstats_from_texts(X, self.columns))
		return self
	
	def transform(self, X):
		""" Transforms the texts to a matrix of text statistics, as retrieved from the feature store for the selected columns. """
		# hand the store a fresh list so it always receives the same type, whatever sequence type `columns` was given as
		textstats = self.feature_store.get_textstats_from_texts(X, list(self.columns))
		#if self.scale:
		#	textstats = self.scaler_.transform(textstats)
		return textstats
	
	def get_feature_names_out(self, input_features=None):
		""" Get the feature names out from the text statistics. 
			Returns a new list of the selected column names - `input_features` is accepted for scikit-learn compatibility and is not used. """
		# return a copy so callers cannot change the transformer's configuration by mutating the result
		return list(self.columns)


TODO: add an example.

In [None]:
#| hide
from textplumber.preprocess import SpacyPreprocessor
from sklearn.pipeline import Pipeline
import os

# Smoke test: run a one-document pipeline against a throw-away feature store
# and check a single statistic.
store_path = '../test-data/test_textstats'
feature_store = TextFeatureStore(store_path)

try:
    pipeline = Pipeline([
        ('spacy_preprocessor', SpacyPreprocessor(feature_store=feature_store)),
        ('textstats_transformer', TextstatsTransformer(feature_store=feature_store))
    ])

    X = pipeline.fit_transform(['Hello, world!'])

    # Look up the column position by name rather than hard-coding it
    # (and avoid shadowing the builtin `id`).
    feature_names = pipeline.named_steps['textstats_transformer'].get_feature_names_out()
    monosyll_idx = feature_names.index('monosyll_count')

    # the test text is expected to yield a monosyllable count of 1
    assert X[0][monosyll_idx] == 1
finally:
    # Always delete the temporary store, even when the pipeline or the
    # assertion fails, so a failed run does not leave state behind for the next.
    if os.path.exists(store_path):
        os.remove(store_path)

In [None]:
#| hide
# Export this notebook's `#| export` cells to the library module.
import nbdev

nbdev.nbdev_export()