# Analyzing Webpage Content with NLP


*Author*: Frank Fichtenmueller <br>
*Date*: 05/09/2017<br>
<hr>
*Goal*: To describe a simple Processing Pipeline to be applied to Webpage Content to extract relevant pieces of information

In [49]:
# Imports
from urllib import request
import nltk
from bs4 import BeautifulSoup

### Accessing the text from the Page

In [120]:
def get_text(url, tokenize="words", urlencode='lxml', decoder='utf8'):
    html = request.urlopen(url).read().decode(decoder)
    raw = BeautifulSoup(html, urlencode).get_text()
    
    # Apply tokenization to the raw text
    switch = {
        'words': nltk.word_tokenize(raw),
        'sentences': nltk.sent_tokenize(raw),
        'word_sentences': [[word for word in nltk.word_tokenize(sent)] for sent in nltk.sent_tokenize(raw)],
        'raw': raw
    }
    
    return switch[tokenize]

In [121]:
get_text('http://mostly.ai/summit/#undefined', 'word_sentences')

NotImplementedError: To remove HTML markup, use BeautifulSoup's get_text() function

### Creating a Frequency based count on the Words

In [44]:
def freq_words(text, lower=True, n=10):
    """
    Calculate the most frequent words in a given text and return them as a list.
    
    Given a string representation of a text, return the n most common / uncommon 
    words in the text.
    """
    # Create the Frequency Distribution
    if lower:
        freqdist = nltk.FreqDist(word.lower() for word in text)
    else:
        freqdist = nltk.FreqDist(word for word in text)
    
    # Return the n most common words
    return [word for (word, _) in freqdist.most_common(n)]


In [70]:
# Getting the text
text = get_text('http://mostly.ai/summit/#undefined', 'words')

# Extracting the most frequent words
freq_words(text, lower=False, n=15)


["''",
 ',',
 '``',
 ':',
 '.',
 'and',
 'the',
 'of',
 ')',
 '(',
 'in',
 'to',
 '{',
 '}',
 'Learning']

### Cleaning the Text

In [105]:
def clean_text(text, language='english', stopwords=True, punctuation=True):
    """
    Remove stopwords, punctuation and html code from the text
    """
    # Clean Punctuation
    if punctuation:
        # Defining the regex
        regex = re.compile('[%s]' % re.escape(string.punctuation))

        filtered_text = []

        for sent in text:
            new_sent = []
            for token in sent:
                new_token = regex.sub(u'', token)
                if not new_token == u'':
                    new_sent.append(new_token)
            
            filtered_text.append(new_sent)
            text = filtered_text
            
    # Remove Stopwords from Text 
    if stopwords:
        from nltk.corpus import stopwords
        filtered_text = [word for word in text if word not in stopwords.words(language)]
    
    return filtered_text

In [106]:
clean_text(text, language='english', stopwords=True, punctuation=True)

[['AI',
  'Summit',
  'Vienna',
  'AI',
  'Advances',
  'Insights',
  'on',
  'the',
  'latest',
  'advances',
  'in',
  'the',
  'field',
  'and',
  'where',
  'we',
  'are',
  'heading'],
 ['Deep',
  'Learning',
  'Dive',
  'into',
  'the',
  'powerful',
  'model',
  'classes',
  'that',
  'are',
  'behind',
  'the',
  'current',
  'AI',
  'revival'],
 ['ML',
  'Use',
  'Cases',
  'Realworld',
  'applications',
  'of',
  'machine',
  'learning',
  'shared',
  'by',
  'seasoned',
  'practitioners'],
 ['Networking',
  'Informal',
  'gettogether',
  'of',
  'AI',
  'enthusiasts',
  'at',
  'one',
  'of',
  'Europe',
  's',
  'most',
  'modern',
  'campuses'],
 ['Learn',
  'from',
  'Leading',
  'AI',
  'Experts',
  'Selflearning',
  'machines',
  'are',
  'outperforming',
  'humans',
  'in',
  'decision',
  'making',
  'at',
  'an',
  'ever',
  'increasing',
  'pace'],
 ['These',
  'ongoing',
  'advancements',
  'are',
  'fueled',
  'by',
  '1',
  'more',
  'efficient',
  'algorithms',


In [116]:
cleaned_text = clean_text(text, language='english', stopwords=True, punctuation=True)

### Applying Word Stemming and Lemmatization

In [114]:
def stem_text(text, language='english', lemmatize=True, stemmer='porter'):
    """
    Creates a stemmed or lemmatized version of the input text for downstream analysis.
    
    Given a word_sentence tokenized text it returns the stemmed version of the text.
    """
    if lemmatize:
        from nltk.stem.wordnet import WordNetLemmatizer
        stem = WordNetLemmatizer()
        return [[stem.lemmatize(word) for word in sent] for sent in text]
    
    elif stemmer == 'porter':
        from nltk.stem.porter import PorterStemmer
        stem = PorterStemmer()
    elif stemmer == 'snowball':
        from nltk.stem.snowball import SnowballStemmer
        stem = SnowballStemmer(language)
    
    # Apply the chosen stemmer to the Text
    return [[stem.stem(word) for word in sent] for sent in text]

In [117]:
# Creating a stemmed Version of the Text
stem_text(cleaned_text, language='english', lemmatize=True, stemmer='porter')

[['AI',
  'Summit',
  'Vienna',
  'AI',
  'Advances',
  'Insights',
  'on',
  'the',
  'latest',
  'advance',
  'in',
  'the',
  'field',
  'and',
  'where',
  'we',
  'are',
  'heading'],
 ['Deep',
  'Learning',
  'Dive',
  'into',
  'the',
  'powerful',
  'model',
  'class',
  'that',
  'are',
  'behind',
  'the',
  'current',
  'AI',
  'revival'],
 ['ML',
  'Use',
  'Cases',
  'Realworld',
  'application',
  'of',
  'machine',
  'learning',
  'shared',
  'by',
  'seasoned',
  'practitioner'],
 ['Networking',
  'Informal',
  'gettogether',
  'of',
  'AI',
  'enthusiast',
  'at',
  'one',
  'of',
  'Europe',
  's',
  'most',
  'modern',
  'campus'],
 ['Learn',
  'from',
  'Leading',
  'AI',
  'Experts',
  'Selflearning',
  'machine',
  'are',
  'outperforming',
  'human',
  'in',
  'decision',
  'making',
  'at',
  'an',
  'ever',
  'increasing',
  'pace'],
 ['These',
  'ongoing',
  'advancement',
  'are',
  'fueled',
  'by',
  '1',
  'more',
  'efficient',
  'algorithm',
  '2',
  'fa