In [1]:
# all import statements should go at the top of all your notebooks
# I know the curriculum has the import statements scatter everywhere
# however this is how code should be maintained in a production environment (ie. industry)

import re
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import spacy

___

# Streamlined Steps for Feature Engineering Text Data

1. Load text data
2. Clean text data
3. Tokenize text
4. Vectorize text (i.e. create a document-term matrix)
    - There is some nuance with steps 3 and 4 because you can treat them as two seperate steps or you can combine them into a single step. 
5. Data is ready to train a model !

____

## 1. Load text data

Create a function that does all the work necessary to load the data into dataframe. We want good, clean modular code. 

In [2]:
def load_text_data(subset = None, categories= None):
    """
    Loads the 20newsgroups text data set into a datafarme for specified categories
    LINK: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html
    
    Notes
    -----
    Work with a subset of the data first to make the rest of the analysis as simple as possible. 
    Once all the code works as intended, then optionally increase the size of the data by selecting 
    more categories. 
    
    Keep in mind that by using more than 2 categories in a classification task, 
    you are moving from a binary classification task to a multi-class classification task
    """
    
    if subset is None:
        subset = "train"
        
    # load data from sklearn data api for news articles
    data = fetch_20newsgroups(subset=subset, categories=categories)
    
    # move data into dataframe for ease in manipulating it
    df = pd.DataFrame({
            'content': data['data'],
            'target': data['target'],
            'target_names': [data['target_names'][i] for i in data['target']]})
    
    return df

In [3]:
# this dataset has 20 doc categories, let's use just 2 for simplicity 
categories = ['alt.atheism',
              'talk.religion.misc']

df = load_text_data(categories)

In [4]:
# great, we loaded the data into a dataframe (this step is done.)
df.head()

Unnamed: 0,content,target,target_names
0,From: mangoe@cs.umd.edu (Charley Wingate)\nSub...,0,alt.atheism
1,Subject: Re: There must be a creator! (Maybe)\...,0,alt.atheism
2,From: MANDTBACKA@FINABO.ABO.FI (Mats Andtbacka...,0,alt.atheism
3,From: royc@rbdc.wsnc.org (Roy Crabtree)\nSubje...,1,talk.religion.misc
4,"Subject: Re: ""Imaginary"" Friends - Info and Ex...",1,talk.religion.misc



---
## 2. Clean text data

Create a single function to clean the text data, all cleaning should occur in this single function. There are only 4 cleaning steps performed here, however you're free to make this as extensive a cleaning process as you wish. 

In [5]:
# practice creating regular expression here: https://regexr.com/
def clean_text_data(text):
    """
    Clean data by using regex to remove unwanted text, like emails
    """
    # remove the first and last white space at the tail ends of the text but the white spaces in between
    # the white spaces in between are used to split the words during tokenization
    text = text.strip()
    
    # remove emails from text by subtituting them with empty chars 
    re_exp_for_emails = 'From: \S+@\S+'
    replace_with = ''
    text = re.sub(re_exp_for_emails, replace_with, text)
    
    # Remove new line characters
    re_exp_for_whitespaces = '\\n'
    replace_with = ''
    text = re.sub(re_exp_for_whitespaces, replace_with, text)  
    
    # Remove non-alphanumeric characters
    re_exp_for_whitespaces = '[^0-9 a-zA-Z]+'
    replace_with = ''
    text = re.sub(re_exp_for_whitespaces, replace_with, text)  
    
    return text

In [6]:
# by have a single function for cleaning data, we only need to iterate through the rows ONCE!
# this cuts down on run time, this is how efficient code is written. 
df['content'] = df['content'].apply(lambda row: clean_text_data(row))

In [7]:
df.head()

Unnamed: 0,content,target,target_names
0,Charley WingateSubject Benediktine Metaphysic...,0,alt.atheism
1,Subject Re There must be a creator Maybe Jim H...,0,alt.atheism
2,Mats AndtbackaSubject Re An Anecdote about Is...,0,alt.atheism
3,Roy CrabtreeSubject Re A Message for you Mr P...,1,talk.religion.misc
4,Subject Re Imaginary Friends Info and Experie...,1,talk.religion.misc


----

## 3. Tokenize the text data

Create a function that tokenizes our text. You are free to apply this function onto the text from outside of a vectorizer or from within a vectorizer. 

My advise is to first apply it outside of a vectorizer in order to sanity check that you are getting the results that you expect to see. Once you're sure that the tokenizer is performing as expected, pass the function into a vectorizer so that the tokenization happens internally within the vectoriers, the code is cleaner that way. 

In [8]:
# load the spacy model to help us tokenize the text
nlp = spacy.load("en_core_web_lg")

In [9]:
def custom_tokenizer(text, nlp):
    """
    This function tokenizes our text documents. 
    
    Note
    ____
    You are free to add as many filters as you want, here you have been presented with 3 filters 
    """
    

    lemmas = []
    
    # by loading the text into our nlp model (i.e. word2vec) we can take advantage of the model's functionalities
    # specifically, it has flags for whether or not a token is a stop word, punctuation, or a part-of-speech
    doc = nlp(text)
    
    # Something goes here :P
    for token in doc: 
        # if token is not a stop word or punctuation or a pronoun, then take the lemma and save to list 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            # save lowercase leema to list 
            lemmas.append(token.lemma_.lower())
    
    return lemmas

In [10]:
df["lemmas"] = df['content'].apply(lambda row: custom_tokenizer(row))

In [11]:
# cool - these results look like what I expected to see (expect for those white spaces...)
df["lemmas"].head()

0    [ , charley, wingatesubject, benediktine, meta...
1    [subject, creator, maybe, jim, halatreplyto, h...
2    [ , mats, andtbackasubject, anecdote, islaminr...
3    [ , roy, crabtreesubject, message, mr, preside...
4    [subject, imaginary, friend,  , info, experien...
Name: lemmas, dtype: object

----

## 4. Vectorizer Text

**The output of a vectorizer is always a term-doc matrix.**

In [12]:
raw_text = df['content']

# pass in our custom_tokenizer as an argument
tfidf = TfidfVectorizer(tokenizer=custom_tokenizer)
term_doc_matrix = tfidf.fit_transform(raw_text)

In [13]:
# output is stored in a sparse matrix
term_doc_matrix

<857x28793 sparse matrix of type '<class 'numpy.float64'>'
	with 100948 stored elements in Compressed Sparse Row format>

----

## 5. Document classification

This is the step where we train a model. There are many ways to do this, we could simply pass the data into a model and call fit. However, let's be a little more thorough. 

Now that we are clear on the order of operations, we can build out the gridsearch code (and pipeline) that will train our document classifer. 

We are going to combine all the steps that we completed so far into a single cell of code -- nice and concise.

In [None]:
#### 1. Load data
categories = ['alt.atheism',
              'talk.religion.misc']
df = load_text_data(categories)

#### 2. Clean data
df['clean_text'] = df['content'].apply(lambda row: clean_text_data(row))

#### 3 & 4. Create vectorizer using custom tokenizer 
vect = TfidfVectorizer(tokenizer=custom_tokenizer)

#### 5. Train model 

# instantiate model
rfc = RandomForestClassifier()

# fill out pipeline
pipe = Pipeline([
                 #Vectorizer
                 ('vect', vect),
                 # Classifier
                 ('clf', rfc)
                ])

# create parameter dict with params for both the vectorizer and model
parameters = {
#     'vect__max_features': (500,1000),
#     'clf__n_estimators':(5, 10,),
    'clf__max_depth':(15,20)
}

# pass in the pipeline and parameters dict to gridsearch
grid_search = GridSearchCV(pipe, 
                           parameters, 
                           cv=3, 
                           n_jobs=10, # change to suit your resources (i.e. whatever cores you have available)
                           verbose=1)

# because we are using a pipeline, we can pass in raw data! 
grid_search.fit(df.clean_text.iloc[:200], df.target.iloc[:200])


### Notes

Because we are including the vectorizer in the pipeline (which is passed into the gridsearch) this means that for every unique combination of parameters that the girdsearch uses to build a model, it is also re-tokenizing the text and re-vectorizing it each time. This is a time-consuming process, especially if you have a large data set and are fitting a lot of parameters with a lot of parameter values -- so include parameters wisely!

Also, Unit 4 is introducing you into data science at scale with NLP and with Deep Learning. This is why at some point we start doing data science in the cloud, i.e. AWS EC2 instance with 32 or even 64 cores or use a GPU instance for deep learning. 

On option to shorten the gridsearch run time is to exclude the vectorizer in the gridsearch. I'm not going to include an example of this because a side-by-side example of what I'm walking about already exist in the Warm Up for the Topic Modeling lecture notebook. However, I hope that this notebook helps streamline the proceedure of document classification in your mind. So now you could refer back to that lecture notebook and, hopefully, be able to read through those warm up examples with a bit more ease. 

------

### Create NLP Feature Engineering Pipeline Class

In [165]:
class MyException(Exception):
    pass


class NLP_feat_eng_pipeline(object):
    
    def __init__(self, df = None, nlp = None, vectorizer="tfidf"):
        """
        This class serves as a feature engineering pipeline for text data.
        Providing functionality to load, clean, tokenize, and vectorize 
        text data for the use in NLP tasks, such as predictive analytics.  
        
        Parmaters
        ---------
        df: pandas dataframe
            Option to provide text data in a dataframe instead of using this class to load the data.
        """
        
        # data set already in a dataframe 
        self.df = df 
        
        # Spacy's Word2Vec word embedding model 
        self.nlp = nlp
        
        # term document matrix
        self.term_doc_matrix = None
             
        # make sure that the user selects a valid vectorizer 
        if self.vectorizer not in ["tfidf", "bow", "w2v"]:
            msg = "{} is an invalid selection for vectorizer.".format(self.vectorizer)
            raise MyException(msg)
            
        self.vectorizer=vectorizer

    def load_20newsgroups_data(self, subset = None, categories= None):
        """
        Loads the 20newsgroups text data set into a datafarme for specified categories
        LINK: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html

        Notes
        -----
        Work with a subset of the data first to make the rest of the analysis as simple as possible. 
        Once all the code works as intended, then optionally increase the size of the data by selecting 
        more categories. 

        Keep in mind that by using more than 2 categories in a classification task, 
        you are moving from a binary classification task to a multi-class classification task
        """

        if subset is None:
            subset = "train"
            
        if categories is None:
            categories = ["sci.electronics", 
                          "sci.space"]

        # load data from sklearn data api for news articles
        data = fetch_20newsgroups(subset=subset, categories=categories)

        # move data into dataframe for ease in manipulating it
        df = pd.DataFrame({
                'content': data['data'],
                'target': data['target'],
                'target_names': [data['target_names'][i] for i in data['target']]})

        self.df = df
        
        
    # practice creating regular expression here: https://regexr.com/
    def _clean_text_data(self, text):
        """
        Clean data by using regex to remove unwanted text, like emails
        
        Note
        ----
        This method is not intended to be used directly. It is intended to be call 
        within the clean_text method.
        
        Parameters
        ----------
        text: string 
            A single document 
            
        Returns
        -------
        text: string 
            A single document that has been cleaned 
        """
        # remove the first and last white space at the tail ends of the text but the white spaces in between
        # the white spaces in between are used to split the words during tokenization
        text = text.strip()

        # remove emails from text by subtituting them with empty chars 
        re_exp_for_emails = 'From: \S+@\S+'
        replace_with = ''
        text = re.sub(re_exp_for_emails, replace_with, text)

        # Remove new line characters
        re_exp_for_whitespaces = '\\n'
        replace_with = ''
        text = re.sub(re_exp_for_whitespaces, replace_with, text)  

        # Remove non-alphanumeric characters
        re_exp_for_whitespaces = '[^0-9 a-zA-Z]+'
        replace_with = ''
        text = re.sub(re_exp_for_whitespaces, replace_with, text)  

        return text
    
    def clean_text(self):
        """
        Clean data by using regex to remove unwanted text, like emails. 
        
        This method calls self._clean_text_data in a loop. Use to method for cleanng more than a single document. 
        """
        
        self.df['content'] = self.df['content'].apply(lambda row: self._clean_text_data(row))
        
        
    def tokenizer(self, text):
        """
        This function tokenizes our text documents. It filters out certain tokens (like stop words)
        and keeps lower case lemmas of words. 

        Note
        ____
        You are free to add as many filters as you want, here you have been presented with 3 filters
            - stop words
            - punctuation
            - white space
        """

        lemmas = []

        # by loading the text into our nlp model (i.e. word2vec) we can take advantage of the model's functionalities
        # specifically, it has flags for whether or not a token is a stop word, punctuation, or a part-of-speech
        doc = self.nlp(text)

        # Something goes here :P
        for token in doc: 
            # if token is not a stop word or punctuation or a pronoun, then take the lemma and save to list 
            if ((token.is_stop == False) and (token.is_punct == False)) and (token.is_space == False):
                # save lowercase leema to list 
                lemmas.append(token.lemma_.lower())

        return lemmas
    
    def get_tokens(self):
        """
        Tokenize documents and save to a new feature 
        
        This method calls self.tokenizer in a loop. Use this method for tokenizing more than a single document. 
        """
            
        self.df["tokens"] = self.df['content'].apply(lambda doc: self.tokenizer(doc))
        
        
    def vectorize(self):
        """
        Use a vectorizer to create a term-docuemnt matrix. 
        It is this term-document matrix that will be in ML model friendly data format.
        
        Note
        ----
        This method will also perform tokenization, so no need to call self.get_tokens
        if using this method. 
        
        Parameters
        ----------
        vectorizer: string 
            Valid options are ["tfidf", "bow", "w2v"]
            tfidf: TfidfVectorizer
            bow: Bag-of-Words (i.e. CountVectorizer)
            w2v: Word2Vec 
        """
                    
        if self.vectorizer == "tfidf":
            vect = TfidfVectorizer(tokenizer=self.tokenizer)
            
        elif self.vectorizer == "bow":
            vect = CountVectorizer(tokenizer=self.tokenizer)            
            
        # use a term count vectorizer 
        if self.vectorizer  in ["tfidf", "bow"]:
            self.term_doc_matrix = vect.fit_transform(self.df["content"])
            
        # use word embedding (i.e. word2vec)
        else:
            self.term_doc_matrix = self.df["content"].apply(lambda doc: self.nlp(doc).vector)
        
    def get_term_doc_matrix(self):
        """
        Convience function.
        Use this function to load, clean, vectorize a dataset, and return the doc-term matrix.
        """
        
        # load data 
        #self.load_20newsgroups_data()
        
        # clean data
        self.clean_text()
        
        # tokenize and vectorize data
        self.vectorize()
        
        return self.term_doc_matrix

----
### Deploy the Class 

In [3]:
# load the spacy model to help us tokenize the text
nlp = spacy.load("en_core_web_lg")

In [156]:
nlp_pipe = NLP_feat_eng_pipeline(nlp=nlp)

In [157]:
nlp_pipe.load_20newsgroups_data()
nlp_pipe.df.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [158]:
nlp_pipe.clean_text()
nlp_pipe.df.head()

Unnamed: 0,content,target,target_names
0,wheres my thingSubject WHAT car is thisNntpPo...,7,rec.autos
1,Guy KuoSubject SI Clock Poll Final CallSumma...,4,comp.sys.mac.hardware
2,Thomas E WillisSubject PB questionsOrganizati...,4,comp.sys.mac.hardware
3,Joe GreenSubject Re Weitek P9000 Organization...,1,comp.graphics
4,Jonathan McDowellSubject Re Shuttle Launch Qu...,14,sci.space


In [159]:
nlp_pipe.df = nlp_pipe.df.sample(n=100)

In [160]:
nlp_pipe.get_tokens()
nlp_pipe.df.head()

Unnamed: 0,content,target,target_names,tokens
8246,Subject RADAR DETECTOR Whistler XKKaOrganizati...,6,misc.forsale,"[subject, radar, detector, whistler, xkkaorgan..."
6611,Gregory LehmanSubject Looking for drawing pac...,5,comp.windows.x,"[gregory, lehmansubject, look, draw, packageso..."
3479,Subject Re A to D hardware for a PCArticleID a...,12,sci.electronics,"[subject, d, hardware, pcarticleid, almaden199..."
5581,PatSubject Plutonium based Nuclear Power plan...,14,sci.space,"[patsubject, plutonium, base, nuclear, power, ..."
11213,David BryantSubject Re GUI toolkit for the Su...,5,comp.windows.x,"[david, bryantsubject, gui, toolkit, sun, spar..."


In [161]:
nlp_pipe.vectorize(vectorizer="w2v")
dtm = nlp_pipe.term_doc_matrix

In [166]:
# output of ETL pipeline is model ready data 
dtm

8246     [-0.024148304, 0.19941829, 0.021181103, -0.059...
6611     [0.022570314, 0.087968335, -0.16035953, -0.029...
3479     [-0.012966986, 0.08954542, -0.07790943, -0.008...
5581     [-0.06252756, 0.05107255, -0.03644632, 0.02560...
11213    [-0.034638315, 0.10063207, -0.09123432, -0.010...
                               ...                        
10264    [-0.013094424, 0.110703565, -0.14540501, -0.05...
1703     [-0.016310517, 0.031511243, -0.042582467, -0.0...
7933     [0.046159443, 0.09234426, -0.16290684, 0.01605...
2360     [-0.04317045, 0.11875365, -0.1119552, -0.07151...
1162     [-0.050491374, 0.11384664, -0.05856432, -0.061...
Name: content, Length: 100, dtype: object