In [46]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from collections import Counter
from bs4 import BeautifulSoup
import unicodedata

## Decontraction

In [2]:
def decontraction(text):
    """
    Expand common English contractions in the provided text.

    Parameters:
        text (str): text that may contain contractions such as "isn't" or "they're".

    Returns:
        str: the text with the recognised contractions expanded.

    Notes:
        - "'s" is always expanded to " is", so possessives ("John's") also
          become "John is".
        - "'d" is always expanded to " would" (never " had").
        - Matching is case-insensitive; for the irregular forms (won't, can't,
          shan't) a leading capital letter is carried over to the expansion.
    """
    # Accept both the ASCII apostrophe and the typographic one (U+2019).
    apostrophe = "['\u2019]"

    def keep_case(expansion):
        # Build a replacer that capitalises the expansion when the matched
        # contraction itself started with a capital letter.
        def replace(match):
            if match.group(0)[0].isupper():
                return expansion.capitalize()
            return expansion
        return replace

    # Order matters: the irregular whole-word forms must run before the
    # generic "n't" rule, otherwise "won't" would become "wo not".
    # The trailing \b keeps suffix rules from firing inside quoted text
    # such as "'do it'".
    patterns = [
        (r"\bwon" + apostrophe + r"t\b", keep_case('will not')),
        (r"\bcan" + apostrophe + r"t\b", keep_case('can not')),
        (r"\bshan" + apostrophe + r"t\b", keep_case('shall not')),
        (r"n" + apostrophe + r"t\b", ' not'),
        (apostrophe + r"re\b", ' are'),
        (apostrophe + r"ll\b", ' will'),
        (apostrophe + r"ve\b", ' have'),
        (apostrophe + r"d\b", ' would'),
        (apostrophe + r"s\b", ' is'),
        (apostrophe + r"m\b", ' am'),
    ]

    for pattern, replacer in patterns:
        text = re.sub(pattern, replacer, text, flags=re.IGNORECASE)
    return text

## Lemmatization

In [43]:
def get_wordnet_pos(tag):
    """
    Translate a POS tag into the WordNet POS constant expected by the lemmatizer.

    Parameters:
        tag (str): POS tag string; only its leading letter is inspected
                   (J -> adjective, V -> verb, N -> noun, R -> adverb).

    Returns:
        The matching `wordnet` POS constant. Any tag that does not start with
        one of the letters above falls back to `wordnet.NOUN`, which is the
        lemmatizer's own default.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [4]:
def lemmatize(text):
    """
    Lemmatize every whitespace-separated word of the provided sentence.

    Each word is POS-tagged with `nltk.pos_tag`, the tag is mapped to a
    WordNet POS via `get_wordnet_pos`, and the word is lemmatized with that
    POS.

    Parameters:
        text (str): sentence to lemmatize.

    Returns:
        str: the lemmatized words joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()

    # POS tags for each word in the sentence, as (word, tag) pairs.
    tagged_words = nltk.pos_tag(text.split())

    # `get_wordnet_pos` is a module-level function; this function is not a
    # method, so there is no `self` to look it up on.
    return ' '.join(
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in tagged_words
    )

## Stemming

In [45]:
def stemming(text):
    """
    Reduce every whitespace-separated word of the text to its Porter stem.

    Parameters:
        text (str): text to stem.

    Returns:
        str: the stemmed words joined by single spaces.
    """
    porter = PorterStemmer()
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return ' '.join(stems)

## Removing html tags

In [5]:
def remove_html_tags(text):
    """
    Strip HTML markup from the provided text/sentence.

    The text is parsed with BeautifulSoup using the "lxml" parser and only
    the textual content of the resulting document is returned.

    Parameters:
        text (str): text that may contain HTML tags.

    Returns:
        str: the text content without tags.
    """
    return BeautifulSoup(text, "lxml").get_text()

## Removing Special Characters

In [12]:
def remove_special_characters(text, remove_digits=False):
    """
    Lower-case the text and drop every character that is not allowed.

    Parameters:
        text (str): text to clean.
        remove_digits (bool): when True digits are dropped as well;
                              defaults to False.

    Returns:
        str: the lower-cased text containing only the letters a-z,
        whitespace and (unless remove_digits is True) the digits 0-9.
    """
    # Character class of everything that must be deleted.
    disallowed = r'[^a-z\s]' if remove_digits else r'[^a-z0-9\s]'
    return re.sub(disallowed, '', text.lower())

## Remove Stop Words


In [25]:
STOP_WORDS = stopwords.words('english')

def remove_stop_words(text, words_to_remove_from_stopwords=None):
    """
    Remove stop words from the given text/sentence.

    Parameters:
        text (str): text to filter.
        words_to_remove_from_stopwords (iterable of str, optional): words that
            must NOT be treated as stop words for this call (i.e. they are
            kept in the text). Compared case-insensitively. Defaults to None,
            meaning the full stop-word list is used.

    Returns:
        str: the remaining words joined by single spaces.
    """
    # Words the caller wants to keep; `None` and an empty list both mean "none".
    words_to_keep = {word.lower() for word in (words_to_remove_from_stopwords or ())}

    # Work on a per-call copy so the module-level STOP_WORDS list is never
    # mutated; words that are not stop words are simply ignored.
    active_stop_words = set(STOP_WORDS) - words_to_keep

    return ' '.join(
        word for word in text.split() if word.lower() not in active_stop_words
    )

## Remove Roman Numbers
not working properly

In [41]:
def remove_roman_number(text):
    """
    Replace every stand-alone upper-case Roman numeral in the text with a space.

    A numeral is only matched when it forms a whole word (e.g. "IV" in
    "Chapter IV"), so words that merely contain Roman letters ("CIVIL",
    "MIXED") are left untouched.

    Parameters:
        text (str): text to clean.

    Returns:
        str: the text with each Roman numeral replaced by a single space.

    Notes:
        Upper-case words that are also valid numerals are removed as well,
        e.g. the pronoun "I" or the word "MIX"; they cannot be told apart
        from numerals by pattern alone.
    """
    roman_numeral = re.compile(
        r'\b(?=[MDCLXVI])'            # start of a word that begins with a Roman letter
        r'M{0,3}'                     # thousands
        r'(?:CM|CD|D?C{0,3})'         # hundreds
        r'(?:XC|XL|L?X{0,3})'         # tens
        r'(?:IX|IV|V?I{0,3})'         # units
        r'(?!\w)'                     # must end the word; also rules out an empty match
    )
    return roman_numeral.sub(' ', text)

## Replacing Accent Characters

In [51]:
def replace_accent_characters(text):
    """
    Replace accented characters with their unaccented ASCII base letters.

    The text is decomposed with Unicode NFD normalization, which separates
    base letters from their combining accent marks, and every non-ASCII
    code point is then dropped.

    Parameters:
        text (str or bytes): text to clean; bytes are decoded as UTF-8.

    Returns:
        str: the ASCII-only text.

    Notes:
        Non-ASCII characters that have no decomposition into an ASCII base
        letter are removed entirely rather than transliterated.
    """
    # Byte strings must be decoded first: unicodedata.normalize only accepts str.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    decomposed = unicodedata.normalize('NFD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')

## Finding Most Common Words

In [65]:
def find_most_common_words(text, no_of_words=2):
    """
    Return the most frequent words of the text, compared case-insensitively.

    Parameters:
        text (str): text whose whitespace-separated words are counted.
        no_of_words (int): how many of the top words to return; defaults to 2.

    Returns:
        list of (word, count) tuples, most frequent first, with every word
        lower-cased.
    """
    frequencies = Counter(token.lower() for token in text.split())
    return frequencies.most_common(no_of_words)

## Remove Words

In [67]:
def remove_words(text, words):
    """
    Remove the provided words from the given text, ignoring case.

    Parameters:
        text (str): text to filter.
        words (iterable of str, or a single str): word(s) to remove.

    Returns:
        str: the remaining words joined by single spaces.
    """
    # A bare string is treated as one word rather than as a sequence of characters.
    if isinstance(words, str):
        words = [words]

    # Lower-case both sides so the comparison is case-insensitive.
    words_to_drop = {word.lower() for word in words}

    return ' '.join(
        word for word in text.split() if word.lower() not in words_to_drop
    )

## Remove Line breaks

In [1]:
def remove_line_breaks(text):
    """
    Replace line breaks in the text with spaces.

    Both the escaped two-character sequences (a backslash followed by
    r, n or a double quote) and the real carriage-return / newline
    characters are replaced, each by a single space.

    Parameters:
        text (str): text to clean.

    Returns:
        str: the text without line breaks.
    """
    sequences_to_blank = (
        '\\r', '\\"', '\\n',  # escaped forms left over in the raw text
        '\r', '\n',           # actual control characters
    )
    for sequence in sequences_to_blank:
        text = text.replace(sequence, ' ')
    return text

##  Finding Noun Chunks

In [2]:
import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp("We try to explicitly describe the geometry of the edges of the images.")

# Print each noun chunk; printing the chunk itself shows its text, so `.text`
# is not needed. The loop variable is deliberately not named `np`, which
# would overwrite the numpy alias imported at the top of the notebook.
for chunk in doc.noun_chunks:
    print(chunk)


We
the geometry
the edges
the images

