# textcleaner

> Clean text data before feature extraction.

In [None]:
#| default_exp clean

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#| export
from lxml import html
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
#| export
class TextCleaner(BaseEstimator, TransformerMixin):
    """ A component for a scikit-learn pipeline to clean text data, including normalizing characters, removing html tags, stripping whitespace before and after text, and normalizing whitespace. """
    def __init__(self,
                 character_replacements:dict = None, # character replacements for character normalization - a dict with key as the character to replace and value as the replacement character
                 remove_html:bool = False, # whether to remove html tags
                 strip_whitespace:bool = False, # whether to remove whitespace from the start and end of the text
                 normalize_whitespace:bool = False # whether to replace one or more whitespace characters with a single space
                 ):
        self.character_replacements = character_replacements
        self.remove_html = remove_html
        self.strip_whitespace = strip_whitespace
        self.normalize_whitespace = normalize_whitespace

        # Build the translation table eagerly so an invalid replacement dict
        # fails at construction time; transform() rebuilds it on every call.
        if self.character_replacements is not None:
            self._character_replacement_translator(self.character_replacements)

    def _character_replacement_translator(self,
                                          character_replacements:dict = None # a dict with key as the character to replace and value as the replacement
                                          ):
        """ Prepare a translation table for character replacements and store it on the instance """
        # str.maketrans raises a ValueError for string keys that are not exactly one character long
        self.character_replacements_translator_ = str.maketrans(character_replacements)

    def _normalize_characters(self,
                              text:str # the text to normalize
                              ) -> str:
        """ Normalize characters in a text using the prepared translation table """
        return text.translate(self.character_replacements_translator_)

    def _remove_html_from_string(self,
                                 text:str # the string to remove html from
                                 ) -> str:
        """ Remove html tags from a string, keeping only its text content """
        # lxml raises ParserError ("Document is empty") for empty or
        # whitespace-only input; there is no markup to remove in that case,
        # so hand the text back unchanged instead of failing the whole batch.
        if not text.strip():
            return text
        tree = html.fromstring(text)
        return tree.text_content()

    def _strip_whitespace(self,
                          text:str # the text to strip whitespace from
                          ) -> str:
        """ Strip whitespace from before and after text """
        return text.strip()

    def _normalize_whitespace(self,
                              text:str # the text to normalize whitespace in
                              ) -> str:
        """ Normalize whitespace in a text by replacing one or more whitespace characters with a single space """
        # str.split() with no arguments also drops leading and trailing whitespace
        return ' '.join(text.split())

    def fit(self, X, y=None):
        """ Fit is implemented, but does nothing. """
        return self

    def transform(self,
                  X:list, # the text to transform
                  ) -> list: # the transformed text
        """ Applies the enabled transformations to each text, in the order: character normalization, html removal, whitespace stripping, whitespace normalization. """
        transformations = []

        if self.character_replacements is not None:
            # Rebuilt on every call so a replacement dict assigned after
            # construction is the one that gets applied.
            self._character_replacement_translator(self.character_replacements)
            transformations.append(self._normalize_characters)

        if self.remove_html:
            transformations.append(self._remove_html_from_string)

        if self.strip_whitespace:
            transformations.append(self._strip_whitespace)

        if self.normalize_whitespace:
            transformations.append(self._normalize_whitespace)

        # Nothing enabled: return the input object itself, untouched.
        if not transformations:
            return X

        return [self.apply_transformations(text, transformations) for text in X]

    def apply_transformations(self,
                              text: str, # the text to transform
                              transformations # the transformations to apply
                              ) -> str: # the transformed text
        """ Apply a series of transformations to a text, feeding each result into the next. """
        for method in transformations:
            text = method(text)
        return text



Here's an example that removes HTML, strips and normalizes whitespace, and normalizes single quotes.

Only apply what makes sense for your use case.

Note: The `character_replacements` dictionary specifies single-character replacements: each key is the character to replace and each value is its replacement. A `ValueError` is raised if any key is longer than one character.

In [None]:
# Sample inputs, each exercising one of the cleaning options.
documents = [
    "<p>Some text with <b>html</b> tags</p>",
    "Some text with      extra whitespace",
    "Some text with ‘single quotes’",
    "Some text with  \t a tab character",
    "   Some text with whitespace before and after the text  \n ",
]

# Map both curly single quotes to a plain apostrophe.
character_replacements = dict.fromkeys("‘’", "'")

# Enable every cleaning step for the demonstration.
text_cleaner = TextCleaner(
    character_replacements=character_replacements,
    remove_html=True,
    strip_whitespace=True,
    normalize_whitespace=True,
)
cleaned_documents = text_cleaner.fit_transform(documents)

# Show each document next to its cleaned version.
for original, doc in zip(documents, cleaned_documents):
    print(f"Original: {original}")
    print(f"Cleaned: {doc}")
    print()

Original: <p>Some text with <b>html</b> tags</p>
Cleaned: Some text with html tags

Original: Some text with      extra whitespace
Cleaned: Some text with extra whitespace

Original: Some text with ‘single quotes’
Cleaned: Some text with 'single quotes'

Original: Some text with  	 a tab character
Cleaned: Some text with a tab character

Original:    Some text with whitespace before and after the text  
 
Cleaned: Some text with whitespace before and after the text



In [None]:
#| hide
# A multi-character replacement key must be rejected with a ValueError.
# Track whether the exception fired so the check fails if nothing is raised.
raised = False
try:
    text_cleaner = TextCleaner(character_replacements={"123": ""})
    text_cleaner.transform(["123Hello, world! This is a 'test'."])  # Should raise an error
except ValueError:
    raised = True
assert raised, "Expected a ValueError for a multi-character replacement key"

# Character replacements only: straight and curly quotes are removed.
text_cleaner = TextCleaner(character_replacements={"'": "", "’": "", "“": "", "”": "", "‘": "", '"': ""})
assert text_cleaner.transform(["Hello, world! This is a 'test'."]) == ["Hello, world! This is a test."]
assert text_cleaner.transform(["Hello, world! This is a ‘test’."]) == ["Hello, world! This is a test."]

# Stripping only: leading and trailing whitespace is removed.
text_cleaner = TextCleaner(strip_whitespace=True)
assert text_cleaner.transform(["   Hello, world! This is a 'test'.   "]) == ["Hello, world! This is a 'test'."]

# Whitespace normalization only: runs of whitespace collapse to one space.
text_cleaner = TextCleaner(normalize_whitespace=True)
assert text_cleaner.transform(["Hello, world!    This is a 'test'.   "]) == ["Hello, world! This is a 'test'."]

# HTML removal only: tags are dropped and the text content is kept.
text_cleaner = TextCleaner(remove_html=True)
assert text_cleaner.transform(["<p>Hello, world! This is a 'test'.</p>"]) == ["Hello, world! This is a 'test'."]

# All options combined.
text_cleaner = TextCleaner(character_replacements={"'": "", "’": "", "“": "", "”": "", "‘": "", '"': ""},
                            remove_html=True,
                            strip_whitespace=True,
                            normalize_whitespace=True)
assert text_cleaner.transform(["   <p>Hello, world! This is a 'test'.</p>   ", "'Hello,   world!', <br>\n he said!"]) == ["Hello, world! This is a test.", "Hello, world!, he said!"]

In [None]:
#| hide
# Export the cells marked for export in this notebook to the library module.
import nbdev

nbdev.nbdev_export()