# lexicons

> Extract features from texts based on lexicons.

In [None]:
#| default_exp lexicons

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from sklearn.base import BaseEstimator, TransformerMixin
from textplumber.store import TextFeatureStore
import numpy as np
import requests

In [None]:
#| export
class LexiconCountVectorizer(BaseEstimator, TransformerMixin):
    """A scikit-learn pipeline component that produces document-level counts for one or more lexicons.

    Tokens are read from the feature store, so this component should be used after
    the SpacyPreprocessor component configured with the same feature store.

    Lexicon entries are compared to document tokens exactly as supplied.
    NOTE(review): with `lowercase=True` the document tokens are requested in lowercase,
    so lexicon entries presumably need to be lowercase as well - confirm against
    `TextFeatureStore.get_tokens_from_texts`.
    """
    def __init__(self,
                 feature_store: TextFeatureStore, # the feature store to use - this should be the same feature store used in the SpacyPreprocessor component
                 lexicons: dict, # the lexicons to use - a dictionary with the lexicon name as the key and the lexicon (a list of tokens to count) as the value
                 lowercase: bool = True, # whether to lowercase the tokens
                 ):
        """Store the configuration; raises ValueError if `lexicons` is not a dict."""
        self.feature_store = feature_store

        if not isinstance(lexicons, dict):
            raise ValueError("Lexicons should be a dictionary with lexicon name as the key and the lexicon (a list of tokens to count) as the value.")

        # Stored unmodified so that scikit-learn's get_params/clone see the original object.
        self.lexicons = lexicons
        self.lowercase = lowercase

    def fit(self, X, y=None):
        """No-op fit: the lexicons are fixed, so nothing is learned from `X`. Returns self."""
        return self

    def transform(self, X):
        """Transform the texts to a matrix of counts.

        Returns an integer array of shape (number of texts, number of lexicons).
        Column order follows the iteration order of the `lexicons` dict, which is
        the same order `get_feature_names_out` reports.
        """
        docs_tokens = self.feature_store.get_tokens_from_texts(X, lowercase=self.lowercase)

        # Build each lookup set once per call: membership tests against the raw
        # lists would rescan the whole lexicon for every token of every document.
        lexicon_sets = [frozenset(entries) for entries in self.lexicons.values()]

        counts = [
            [sum(1 for token in doc if token in entries) for entries in lexicon_sets]
            for doc in docs_tokens
        ]

        # Explicit dtype and shape keep the result two-dimensional and integer-typed
        # even when there are no documents (np.array([]) alone is float with shape (0,)).
        return np.array(counts, dtype=int).reshape(len(counts), len(lexicon_sets))

    def get_feature_names_out(self, input_features=None):
        """Return the lexicon names as a list, one per output column of `transform`."""
        return list(self.lexicons.keys())

In [None]:
#| export
def get_empath_lexicons(
    timeout: float = 30, # seconds to wait for the server before giving up
):
    """ Get the empath lexicons from the empath github repo.

    Downloads the categories file and returns a dictionary mapping each category
    name to its list of tokens. The category name is the first token on each line
    and is itself kept in the list as a candidate token.

    Raises a `requests` exception if the download times out or the server answers
    with an HTTP error status, instead of parsing an error page as lexicon data.
    """
    empath_lexicon = 'https://raw.githubusercontent.com/Ejhfast/empath-client/refs/heads/master/empath/data/categories.tsv'

    response = requests.get(empath_lexicon, timeout=timeout)
    response.raise_for_status()

    empath_lexicons = {}
    for line in response.text.strip().split('\n'):
        # str.split() with no separator never yields empty strings, so blank
        # lines simply produce an empty list and are skipped.
        tokens = line.split()
        if tokens:
            # first token is name and a candidate token
            empath_lexicons[tokens[0]] = tokens

    return empath_lexicons

TODO: add an example.

In [None]:
#| hide
from textplumber.preprocess import SpacyPreprocessor
from sklearn.pipeline import Pipeline
import os

# Smoke test: run a small pipeline end to end and check the per-document lexicon counts.
store_path = '../test-data/test_lexicons'
feature_store = TextFeatureStore(store_path)
lexicon = {
    'positive': ['good', 'happy', 'joyful'],
    'negative': ['bad', 'sad', 'angry']
}

pipeline = Pipeline([
    ('spacy_preprocessor', SpacyPreprocessor(feature_store=feature_store)),
    ('lexicon_count_vectorizer', LexiconCountVectorizer(feature_store=feature_store, lexicons=lexicon))
])
docs = ['I am happy', 'I am sad', 'I am angry', 'I am happy happy']

try:
    X = pipeline.fit_transform(docs)
    feature_names = pipeline.named_steps['lexicon_count_vectorizer'].get_feature_names_out()
    counts = X.tolist()

    # 'positive' column: one hit, no hit, and a repeated token counted twice.
    positive_idx = feature_names.index('positive')
    assert counts[0][positive_idx] == 1
    assert counts[1][positive_idx] == 0
    assert counts[3][positive_idx] == 2

    # 'negative' column: absent in the first document, present in the next two.
    negative_idx = feature_names.index('negative')
    assert counts[0][negative_idx] == 0
    assert counts[1][negative_idx] == 1
    assert counts[2][negative_idx] == 1
finally:
    # Remove the store file even when an assertion fails, so a failed run
    # does not leave residue behind for the next one.
    os.remove(store_path)

In [None]:
#| hide
# Run nbdev's export step so the `#| export` cells above are written out to the library module.
import nbdev; nbdev.nbdev_export()