# Preprocessing

Inspired by this [article](https://towardsdatascience.com/nlp-building-text-cleanup-and-preprocessing-pipeline-eba4095245a0).

### Removing HTML tags

In [1]:
# imports
from bs4 import BeautifulSoup
# function to remove HTML tags
def remove_html_tags(text):
    """Strip HTML markup from ``text`` and return only its textual content.

    The input is parsed with BeautifulSoup's ``'html.parser'`` backend and the
    result of ``get_text()`` is returned as-is (surrounding whitespace is not
    trimmed).
    """
    soup = BeautifulSoup(text, 'html.parser')
    plain_text = soup.get_text()
    return plain_text
# call function
remove_html_tags(" <html> \
 <h1>Article Heading</h1> \
 <p>First sentence of some important article. And another one. And then the last one</p></html>")

'  Article Heading First sentence of some important article. And another one. And then the last one'

### Removing Accented Characters

In [2]:
# imports
import unicodedata
# function to remove accented characters
def remove_accented_chars(text):
    """Return ``text`` with accented characters reduced to plain ASCII.

    The string is NFKD-normalized so that accented letters are split into a
    base letter plus combining marks; encoding to ASCII with ``'ignore'`` then
    drops every non-ASCII code point (the marks, and any character that has no
    ASCII decomposition).
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # Non-ASCII code points are silently discarded here.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
# call function
remove_accented_chars('Sómě Áccěntěd těxt. Some words such as résumé, café, prótest, divorcé, coördinate, exposé, latté.')

'Some Accented text. Some words such as resume, cafe, protest, divorce, coordinate, expose, latte.'

### Expanding Contractions

In [3]:
# imports
import contractions

contractions.fix("Y’all i’d contractions you’re expanded don’t think.")

'you all I would contractions you are expanded do not think.'

### Removing Special Characters

In [4]:
# imports
import re
# function to remove special characters
def remove_special_characters(text):
    """Remove every character that is not in the allowed set.

    Characters that are kept: ASCII letters, digits, the punctuation marks
    ``. , ! ? / : ; " '`` and whitespace. Everything else is deleted.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Negated character class: it matches the characters to REMOVE, i.e.
    # anything outside the keep set. The upper-case range must be `A-Z`;
    # `A-z` would also span the ASCII symbols [ \ ] ^ _ ` that sit between
    # 'Z' and 'a', and those would then be kept by mistake.
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, '', text)
 
# call function
remove_special_characters("007 Not sure@ if this % was #fun! 558923 What do# you think** of it.? $500USD!")

'007 Not sure if this  was fun! 558923 What do you think of it.? 500USD!'

### Removing Numbers

In [5]:
# imports
import re
# function to remove numbers (digits); other characters outside the keep pattern are removed as well
def remove_numbers(text):
    """Remove digits (and any other character outside the allowed set).

    Characters that are kept: ASCII letters, the punctuation marks
    ``. , ! ? / : ; " '`` and whitespace. Digits are not in that set, so they
    are deleted, as is any other symbol outside it (e.g. ``@ # $ % *``).

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Negated character class: it matches the characters to REMOVE. The
    # upper-case range must be `A-Z`; `A-z` would also span the ASCII symbols
    # [ \ ] ^ _ ` that sit between 'Z' and 'a' and keep them by mistake.
    pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return re.sub(pattern, '', text)
 
# call function
remove_numbers("007 Not sure@ if this % was #fun! 558923 What do# you think** of it.? $500USD!")

' Not sure if this  was fun!  What do you think of it.? USD!'

### Removing Punctuation

In [6]:
# imports
import string
# function to remove punctuation
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            # punctuation mark: skip it
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)
# call function
remove_punctuation('Article: @First sentence of some, {important} article having lot of ~ punctuations. And another one;!')

'Article First sentence of some important article having lot of  punctuations And another one'

### Stemming

In [7]:
# imports
import nltk
# function for stemming
def get_stem(text):
    """Stem every whitespace-separated word of ``text`` with the Porter stemmer.

    Args:
        text: the string to process.

    Returns:
        The stemmed words joined by single spaces (so runs of whitespace in
        the input are collapsed and leading/trailing whitespace is dropped).
    """
    # Use the documented location of the class; `nltk.porter` only resolves
    # because of star-imports inside the nltk package and is not a public path.
    stemmer = nltk.stem.PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in text.split())
# call function
get_stem("we are eating and swimming ; we have been eating and swimming ; he eats and swims ; he ate and swam ")

'we are eat and swim ; we have been eat and swim ; he eat and swim ; he ate and swam'

### Lemmatization

In [2]:
# imports
import spacy
# Load the spaCy pipeline named "en_core_web_sm"; get_lem below uses it.
# NOTE(review): the recorded output of this cell is OSError E050 (model not
# found), so the model package was not installed in this environment. It
# presumably needs `python -m spacy download en_core_web_sm` first — confirm
# against the installed spaCy version.
nlp = spacy.load("en_core_web_sm")
# function to lemmatize text
def get_lem(text):
    """Return ``text`` with every token replaced by its lemma.

    The text is run through the module-level ``nlp`` pipeline. Tokens whose
    lemma is the literal ``'-PRON-'`` keep their original text instead
    (presumably a pronoun placeholder emitted by the model — confirm for the
    spaCy version in use). The resulting pieces are joined by single spaces.
    """
    doc = nlp(text)
    pieces = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return ' '.join(pieces)
# call function
get_lem("we are eating and swimming ; we have been eating and swimming ; he eats and swims ; he ate and swam ")

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

### Removing Stopwords

In [None]:
# imports
import nltk
from nltk.tokenize import ToktokTokenizer
# Tokenizer instance shared by remove_stopwords below.
tokenizer = ToktokTokenizer()
# English stopword list from NLTK's corpus reader.
# NOTE(review): assumes the NLTK 'stopwords' corpus is already available
# locally — confirm, or download it before running this cell.
stopword_list = nltk.corpus.stopwords.words('english')
# custom: drop 'not' from the list so that remove_stopwords does not filter it out
stopword_list.remove('not')
# function to remove stopwords
def remove_stopwords(text):
    """Return ``text`` with every token found in ``stopword_list`` removed.

    The text is split with the module-level ``tokenizer``; each token is
    stripped of surrounding whitespace and compared case-insensitively
    (via ``lower()``) against ``stopword_list``. Surviving tokens keep their
    original casing and are joined by single spaces.
    """
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        # membership is checked on the lowercase form only
        if token.lower() in stopword_list:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)
# call function
remove_stopwords("i am myself you the stopwords list and this article is not should removed")

### Removing extra whitespaces and tabs

In [None]:
# imports
import re
# function to collapse extra whitespace and tabs
def remove_extra_whitespace_tabs(text):
    """Collapse whitespace runs to single spaces and trim both ends.

    Args:
        text: the string to normalize.

    Returns:
        ``text`` with every run of whitespace (spaces, tabs, newlines, ...)
        replaced by one space and with no leading or trailing whitespace.
    """
    # `\s+` replaces each whitespace run with one space; strip() then removes
    # the single space that may remain at either end. This gives the same
    # result as the earlier `^\s*|\s\s*` pattern without relying on an empty
    # match at the start of the string.
    pattern = r'\s+'
    return re.sub(pattern, ' ', text).strip()
# call function
remove_extra_whitespace_tabs('  This web line  has \t some extra  \t   tabs and whitespaces  ')

### Lowercase

In [None]:
# function to convert text to lowercase
def to_lowercase(text):
    """Return a lowercased copy of ``text`` (delegates to ``text.lower()``)."""
    lowered = text.lower()
    return lowered
# call function
to_lowercase('ConVert THIS string to LOWER cASe.')