Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
39 lines (26 sloc) 1.1 KB
"""
This module implements text filtering using NLTK built-ins.
It helps remove punctuation and stop words, lemmatize words, etc.
"""
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
class Colander:
    """Filter free text down to its meaningful, normalized words.

    A sentence is tokenized into alphanumeric words, lowercased, lemmatized,
    and stripped of single-character tokens and English stop words.
    """

    # Alphanumeric runs, optionally followed by one dotted suffix so that
    # names like 'angular.js' stay a single token.  The optional group must
    # be non-capturing: RegexpTokenizer tokenizes with re.findall(), which
    # returns only the group's text when the pattern has a capturing group
    # (yielding '.js' or '' instead of the whole word).
    TOKEN_PATTERN = r'[a-zA-Z0-9]+(?:\.[a-zA-Z0-9]+)?'

    def __init__(self):
        """Build the tokenizer and lemmatizer and load the stop-word set."""
        self.tokenizer = RegexpTokenizer(self.TOKEN_PATTERN)
        self.lemmatizer = WordNetLemmatizer()
        # Load the stop words once, as a set: asking the corpus for the list
        # per token repeats the lookup and makes every membership test a
        # linear scan.
        self.stop_words = frozenset(stopwords.words('english'))

    def process(self, sentence):
        """Filter the sentence and return its meaningful words.

        Args:
            sentence: Text to filter.

        Returns:
            A list of lowercased, lemmatized tokens in their original order,
            excluding single-character tokens and English stop words.
        """
        # select only alphanumeric words (optionally with a dotted suffix)
        tokens = self.tokenizer.tokenize(sentence)
        # Lowercase *before* lemmatizing: the WordNet lookup is
        # case-sensitive, so 'Dogs' would otherwise be left unlemmatized.
        lemmas = (self.lemmatizer.lemmatize(token.lower()) for token in tokens)
        # drop single characters and stop words
        return [
            lemma for lemma in lemmas
            if len(lemma) > 1 and lemma not in self.stop_words
        ]