In [1]:
import nltk
import string
import pandas as pd

In [2]:
''' 
The tokenize method performs some lightweight normalization,stripping punctuation and setting the text to lower case.
'''

def tokenize(text):
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()

    for token in nltk.word_tokenize(text):
        if token in string.punctuation: continue
        yield stem.stem(token)

In [3]:
from collections import defaultdict

def vectorize(doc):
    features = defaultdict(int)
    for token in tokenize(doc):
        features[token] += 1
    return features

In [4]:
corpus =[
    "The elephant sneezed at the sight of potatoes.",
    "Bats can see via echolocation. See the bat sight sneeze!",
    "Wondering, she opened the door to the studio.",
]

In [5]:
df = pd.DataFrame(corpus)
df

Unnamed: 0,0
0,The elephant sneezed at the sight of potatoes.
1,Bats can see via echolocation. See the bat sig...
2,"Wondering, she opened the door to the studio."


In [6]:
vectors_nltk = map(vectorize,corpus)
print(list(vectors_nltk))

[defaultdict(<class 'int'>, {'the': 2, 'eleph': 1, 'sneez': 1, 'at': 1, 'sight': 1, 'of': 1, 'potato': 1}), defaultdict(<class 'int'>, {'bat': 2, 'can': 1, 'see': 2, 'via': 1, 'echoloc': 1, 'the': 1, 'sight': 1, 'sneez': 1}), defaultdict(<class 'int'>, {'wonder': 1, 'she': 1, 'open': 1, 'the': 2, 'door': 1, 'to': 1, 'studio': 1})]


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectors_sk = vectorizer.fit_transform(corpus)

In [8]:
print(vectorizer.get_feature_names())

['at', 'bat', 'bats', 'can', 'door', 'echolocation', 'elephant', 'of', 'opened', 'potatoes', 'see', 'she', 'sight', 'sneeze', 'sneezed', 'studio', 'the', 'to', 'via', 'wondering']


In [9]:
print(vectors_sk.toarray())

[[1 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 2 0 0 0]
 [0 1 1 1 0 1 0 0 0 0 2 0 1 1 0 0 1 0 1 0]
 [0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 2 1 0 1]]


In [10]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
'''
The Estimator.fit method sets the state of the estimator based on the training data,
X and y. The training data X is expected to be matrix-like—for example, a two-
dimensional NumPy array of shape (n_samples, n_features).
y is the supervised estimator which is fit with a one-dimensional Numpy array.
'''

class Estimator(BaseEstimator):

    def fit(self, X, y=None):
        '''
        Accept input data, X, and optional target data, y. Returns self.
        '''
        return self

    def predict(self, X):
        '''
        Accept input data, X and return a vector of predictions for each row.
        '''
        return yhat

'''
A Transformer is a special type of Estimator that creates a new dataset
from an old one based on rules that it has learned from the fitting process.
'''

class Transformer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        '''
        Learn how to transform data based on input data, X.
        '''
        return self
    
    def transform(self, X):
        """
        Transform X into a new dataset, Xprime and return it.
        """
        return Xprime

In [11]:
'''
Creating a custom Gensim vectorization transformer
Gensim vectorization techniques are an interesting case study because Gensim cor‐
pora can be saved and loaded from disk in such a way as to remain decoupled from
the pipeline.
'''
import os
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full

class GensimVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, path=None):
        self.path = path
        self.id2word = None
        self.load()
    
    def load(self):
        if os.path.exists(self.path):
            self.id2word = Dictionary.load(self.path)
    
    def save(self):
        self.id2word.save(self.path)
    
    def fit(self, documents, labels=None):
        self.id2word = Dictionary(documents)
        self.save()
        return self

    def transform(self, documents):
        for document in documents:
            docvec = self.id2word.doc2bow(document)
            yield sparse2full(docvec, len(self.id2word))

In [12]:
'''
Creating a custom Text normalization transformer
Text normalization reduces the number of dimensions,
decreasing sparsity. Besides the simple filtering of tokens (removing punctuation and
stopwords), there are two primary methods for text normalization: stemming and
lemmatization.

'''
import unicodedata
from sklearn.base import BaseEstimator, TransformerMixin

class TextNormalizer(BaseEstimator, TransformerMixin):

    def __init__(self, language='english'):
        self.stopwords = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = nltk.stem.WordNetLemmatizer()

    def is_punct(self, token):
        return all(
                    unicodedata.category(char).startswith('P') for char in token
                    )

    def is_stopword(self, token):
        return token.lower() in self.stopwords

    def lemmatize(self, token, pos_tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(pos_tag[0], wn.NOUN)
        
        return self.lemmatizer.lemmatize(token, tag)

    def normalize(self, document):
        return [
            self.lemmatize(token, tag).lower()
            for paragraph in document
            for sentence in paragraph
            for (token, tag) in sentence
            if not self.is_punct(token) and not self.is_stopword(token)
        ]

    def fit(self, X, y=None):

        '''
        Finally, we must add the Transformer interface, allowing us to add this class to a
        Scikit-Learn pipeline, which we’ll explore in the next section:
        '''
        return self
    
    def transform(self, documents):
         for document in documents:
            yield self.normalize(document)


In [16]:
'''
Scikit-Learn's Pipeline objects enable us to integrate a series of transformers that combine normal‐
ization, vectorization, and feature analysis into a single, well-defined mechanism.The purpose of a Pipeline is to chain together multiple estimators representing a fixed sequence of steps into a single unit. All estimators in the pipeline, except the last one, must be transformers. Pipelines are constructed by describing a list of (key, value) pairs where the key is a string that names the step and the value is the estimator object.
'''
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

model = Pipeline([
    ('normalizer', TextNormalizer()),
    ('vectorizer', GensimVectorizer),
    ('bayes', MultinomialNB()),
])

# model.named_steps['bayes']
# model.steps[2]

model.named_steps['normalizer'].fit(df)
# model.named_steps['normalizer'].predict

TextNormalizer(language=None)