<a href="https://colab.research.google.com/github/shyamsparrow/Jan-G4---NLP-Chatbot/blob/main/Text_Preprocessing_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Standard libraries
!pip install pyspellchecker
import os
import re
import string
import logging
import csv
from pathlib import Path
from functools import wraps
from unicodedata import normalize
from typing import List, Optional, Union, Callable

# Third party libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, PunktSentenceTokenizer
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker

nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)


_CUSTOM_SUB_CSV_FILE_PATH = os.path.join(os.path.dirname("__file__"), 'data/custom_substitutions.csv')
_IGNORE_SPELLCHECK_WORD_FILE_PATH = os.path.join(os.path.dirname("__file__"), 'data/ignore_spellcheck_words.txt')

LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.INFO)

class PreProcessing:
  def __init__(self, to_lower = False,remove_url=False,remove_special_character=False, remove_punctuation=False,
               remove_whitespace=False,check_spelling=False,remove_stopword=False,substitute_token=False,lemmatize_word=False,
               tokenize_word=False, tokenize_sentence= False):
    
    self.to_lower = to_lower
    self.remove_url=remove_url
    self.remove_special_character=remove_special_character
    self.remove_punctuation=remove_punctuation
    self.remove_whitespace=remove_whitespace
    self.check_spelling=check_spelling
    self.remove_stopword=remove_stopword
    self.substitute_token=substitute_token
    self.lemmatize_word=lemmatize_word
    self.tokenize_word=tokenize_word
    self.tokenize_sentence=tokenize_sentence

  
  def preprocess(self, text):

    if self.to_lower:
      text = to_lower(text)

    if self.remove_url:
      text = remove_url(text)
    
    if self.remove_special_character:
      text = remove_special_character(text)
    
    if self.remove_punctuation:
      text = self.remove_punctuation(text)

    if self.remove_whitespace:
      text = remove_whitespace(text)
      
    if self.check_spelling:
      text = check_spelling(text)
      
    if self.remove_stopword:
      text = remove_stopword(text)
      
    if self.substitute_token:
      text = substitute_token(text)
      
    if self.lemmatize_word:
      text = lemmatize_word(text)
            
    if self.tokenize_word:
      text = tokenize_word(text)
            
    if self.tokenize_sentence:
      text = tokenize_sentence(text)

    return text



  def _return_empty_string_for_invalid_input(func):
    """ Return empty string if the input is None or empty """
    @wraps(func)
    def wrapper(*args, **kwargs):
        if 'input_text' in kwargs:
            input_text = kwargs['input_text']
        else:
            try:
                input_text = args[0]
            except IndexError as e:
                LOGGER.exception('No appropriate positional argument is provide.')
                raise e
        if input_text is None or len(input_text) == 0:
            return ''
        else:
            return func(*args, **kwargs)
    return wrapper


  def _return_empty_list_for_invalid_input(func):
    """ Return empty list if the input is None or empty """
    @wraps(func)
    def wrapper(*args, **kwargs):
        if 'input_text_or_list' in kwargs:
            input_text_or_list = kwargs['input_text_or_list']
        else:
            try:
                input_text_or_list = args[0]
            except IndexError as e:
                LOGGER.exception('No appropriate positional argument is provide.')
                raise e
        if input_text_or_list is None or len(input_text_or_list) == 0:
            return []
        else:
            return func(*args, **kwargs)
    return wrapper

  @_return_empty_string_for_invalid_input  
  def to_lower(self,text)-> str:
    """ Convert input text to lower case """
    return text.lower()

  @_return_empty_string_for_invalid_input
  def remove_url(self,text)-> str:
    """ Remove url in the input text """
    return re.sub('(www|http)\S+', '', text)

  @_return_empty_string_for_invalid_input
  def remove_punctuation(self,text)-> str:
    """
    Removes all punctuations from a string, as defined by string.punctuation or a custom list.
    For reference, Python's string.punctuation is equivalent to '!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~'
    """
    punctuations = string.punctuation
    processed_text = text.translate(str.maketrans('', '', punctuations))
    return processed_text

  @_return_empty_string_for_invalid_input
  def remove_special_character(self,text)-> str:
    """ Removes special characters """
    special_characters = 'å¼«¥ª°©ð±§µæ¹¢³¿®ä£'
    processed_text = text.translate(str.maketrans('', '', special_characters))
    return processed_text

  @_return_empty_string_for_invalid_input
  def keep_alpha_numeric(self,text):
    """ Remove any character except alphanumeric characters """
    return ''.join(c for c in text if c.isalnum())

  @_return_empty_string_for_invalid_input 
  def remove_whitespace(self,text, remove_duplicate_whitespace: bool = True)-> str:
    """ Removes leading, trailing, and (optionally) duplicated whitespace """
    if remove_duplicate_whitespace:
        return ' '.join(re.split('\s+', input_text.strip(), flags=re.UNICODE))
    return input_text.strip()

  @_return_empty_list_for_invalid_input
  def remove_stopword(input_text_or_list: Union[str, List[str]])-> List[str]:
    """ Remove stop words """

    stop_words = set(stopwords.words('english'))
    if isinstance(stop_words, list):
        stop_words = set(stop_words)
    if isinstance(input_text_or_list, str):
        tokens = word_tokenize(input_text_or_list)
        processed_tokens = [token for token in tokens if token not in stop_words]
    else:
        processed_tokens = [token for token in input_text_or_list
                            if (token not in stop_words and token is not None and len(token) > 0)]
    return processed_tokens
     
  @_return_empty_list_for_invalid_input
  def lemmatize_word(input_text_or_list: Union[str, List[str]])-> List[str]:
    """ Lemmatize each token in a text by finding its base form """
    lemmatizer = WordNetLemmatizer()
    if isinstance(input_text_or_list, str):
        tokens = word_tokenize(input_text_or_list)
        processed_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    else:
        processed_tokens = [lemmatizer.lemmatize(token)
                            for token in input_text_or_list if token is not None and len(token) > 0]
    return processed_tokens
     
  def tokenize_word(input_text: str) -> List[str]:
    """ Converts a text into a list of word tokens """
    if input_text is None or len(input_text) == 0:
        return []
    return word_tokenize(input_text)


  def tokenize_sentence(input_text: str) -> List[str]:
    """ Converts a text into a list of sentence tokens """
    if input_text is None or len(input_text) == 0:
        return []
    tokenizer = PunktSentenceTokenizer()
    return tokenizer.tokenize(input_text)

  def substitute_token(token_list: List[str])-> List[str]:
    """ Substitute each token by another token, e.g., 'vs' -> 'versus' """
    # TODO: add more custom substitutions in the csv file specified by _CUSTOM_SUB_CSV_FILE_PATH
    if token_list is None or len(token_list) == 0:
        return []
    with open(_CUSTOM_SUB_CSV_FILE_PATH, 'r') as f:
            csv_file = csv.reader(f)
            sub_dict = dict(csv_file)
    processed_tokens = list()

    for token in token_list:
        if token in sub_dict:
            processed_tokens.append(sub_dict[token])
        else:
            processed_tokens.append(token)
    return processed_tokens

  def check_spelling(input_text_or_list: Union[str, List[str]], lang='en',
                   ignore_word_file_path: Union[str, Path] = _IGNORE_SPELLCHECK_WORD_FILE_PATH) -> str:
    """ Check and correct spellings of the text list """
    if input_text_or_list is None or len(input_text_or_list) == 0:
        return ''
    spelling_checker = SpellChecker(language=lang, distance=1)
    # TODO: add acronyms into spell checker to ignore auto correction specified by _IGNORE_SPELLCHECK_WORD_FILE_PATH
    spelling_checker.word_frequency.load_text_file(ignore_word_file_path)
    if isinstance(input_text_or_list, str):
        if not input_text_or_list.islower():
            input_text_or_list = input_text_or_list.lower()
        tokens = word_tokenize(input_text_or_list)
    else:
        tokens = [token.lower() for token in input_text_or_list if token is not None and len(token) > 0]
    misspelled = spelling_checker.unknown(tokens)
    for word in misspelled:
        tokens[tokens.index(word)] = spelling_checker.correction(word)
    return ' '.join(tokens).strip()



In [13]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
%%writefile config.py
self, to_lower = True
remove_url=True
remove_special_character=True
remove_punctuation=True
remove_whitespace=True
check_spelling=True
remove_stopword=True
substitute_token=True
lemmatize_word=True
tokenize_word=True
tokenize_sentence= False

Writing config.py


In [None]:
import config

In [None]:
config.remove_whitespaces

False

In [None]:
pp = PreProcessing(remove_whitespaces=config.remove_whitespaces)

pp.preprocess("this is a sample text    ")

'this is a sample text    '