In [1]:
# TextBlob: An Introduction of Methods
# Installation
# To install TextBlob, open a new Terminal and enter the following:

# Terminal
# $ pip install -U textblob
# $ python3 -m textblob.download_corpora

# Getting Started
# From here on, you can follow along with the notebook, create new notes, and try out code as you like.

# import what we need
import numpy as np
import pandas as pd
from pandas import DataFrame as DF, Series

from textblob import TextBlob
from textblob import Word, WordList

# read data

# Load tweets.csv (path relative to the working directory) and keep only the
# column called 'text'; usecols restricts the frame to that column.
data = pd.read_csv('tweets.csv', usecols=['text'])

# preview the first 3 rows
data.head(3)



Unnamed: 0,text
0,@VirginAmerica What @dhepburn said.
1,@VirginAmerica plus you've added commercials t...
2,@VirginAmerica I didn't today... Must mean I n...


In [2]:
# Create a TextBlob object
# A TextBlob is built from a plain string and is the foundation of everything we will be doing: the methods and properties shown in the rest of the notebook are all applied to it.

# Let's create a blob using a tweet in our data.

# select the tweet at index 25 with an explicit label-based lookup
tweet = data.loc[25, 'text']

# wrap the tweet in a TextBlob
blob = TextBlob(tweet)

In [3]:
# TextBlob Methods: Tokenization
# Tokenization allows us to split a string (a paragraph, a page, etc.) into various "tokens" that become useful in further processing and analysis. Tokenization also occurs on the back-end of some methods.

# Let's look at some tokenization options.

# Sentences
# Using the sentences property (accessed without parentheses) we get a list of Sentence objects, one for each sentence in the string passed to TextBlob, in their original order.

# return list of Sentence objects
blob.sentences

[Sentence("@VirginAmerica status match program."),
 Sentence("I applied and it's been three weeks."),
 Sentence("Called and emailed with no response.")]

In [4]:
# Similar to TextBlob objects, we can use various methods with Sentence objects.

# get the third sentence (index 2; indexing starts at 0)
s = blob.sentences[2]
# get up to the first 10 (word, POS tag) pairs from this sentence
s.tags[:10]

[('Called', 'VBN'),
 ('and', 'CC'),
 ('emailed', 'VBN'),
 ('with', 'IN'),
 ('no', 'DT'),
 ('response', 'NN')]

In [5]:
# Words
# Instead of a list of sentences, we can get a WordList object that holds all of the individual words in our string.

# return WordList object (works like a standard list in Python)
# Note in the output that the '@' and the sentence-ending periods do not appear.
blob.words

WordList(['VirginAmerica', 'status', 'match', 'program', 'I', 'applied', 'and', 'it', "'s", 'been', 'three', 'weeks', 'Called', 'and', 'emailed', 'with', 'no', 'response'])

In [6]:
# We can access words in a WordList just like a regular Python list;
# here we slice out the items at positions 7 and 8:
blob.words[7:9]

# Notice: TextBlob doesn't do the best job of handling contractions and possessive forms. Ex: "it's" is split into "it" and "'s".

WordList(['it', "'s"])

In [7]:
# Word Counts
# We can get a dict that contains all the unique words in our string as keys, and counts for each as values.

# returns defaultdict with unique words as keys and counts as values.
# Note in the output that the keys are lower-cased and stripped of
# punctuation ('Called' is counted as 'called', "'s" as 's').
blob.word_counts

defaultdict(int,
            {'virginamerica': 1,
             'status': 1,
             'match': 1,
             'program': 1,
             'i': 1,
             'applied': 1,
             'and': 2,
             'it': 1,
             's': 1,
             'been': 1,
             'three': 1,
             'weeks': 1,
             'called': 1,
             'emailed': 1,
             'with': 1,
             'no': 1,
             'response': 1})

In [8]:
# we can get counts for individual words in two ways
# 1. use the count method on a WordList
print(blob.words.count('and'))
# 2. access a key in the word_counts dict
print(blob.word_counts['and'])

# NOTE!
# if you use word_counts['some_word'] and that word is not originally in the defaultdict, it will be added with a count of zero:

2
2


In [9]:
# example of the defaultdict behaviour noted above:
# start with a small blob and look at its word counts
b = TextBlob('a string of words')
b.word_counts

defaultdict(int, {'a': 1, 'string': 1, 'of': 1, 'words': 1})

In [10]:
# look up the count of a word that is not in the dict;
# the defaultdict returns 0 instead of raising a KeyError
b.word_counts['test']

0

In [11]:
# look at contents of dict again
# notice that 'test' is now included with a count of 0,
# added as a side effect of the lookup in the previous cell
b.word_counts

defaultdict(int, {'a': 1, 'string': 1, 'of': 1, 'words': 1, 'test': 0})

In [12]:
# Noun Phrases
# Noun phrases: a word or group of words that functions in a sentence as subject, object, or prepositional object.

# Examples of noun phrases are underlined in the sentences below. The head noun appears in bold.
# (NOTE: the underline/bold emphasis is not visible in these plain-text comments.)

# The election-year politics are annoying for many people.
# Almost every sentence contains at least one noun phrase.
# Current economic weakness may be a result of high energy prices.
# Noun phrases can be identified by the possibility of pronoun substitution, as is illustrated in the examples below.

# a. This sentence contains two noun phrases.
# b. It contains them.

# We can get a WordList containing noun phrases using the noun_phrases property on a blob.

# First, a reminder of the sentences in our blob:
blob.sentences

[Sentence("@VirginAmerica status match program."),
 Sentence("I applied and it's been three weeks."),
 Sentence("Called and emailed with no response.")]

In [13]:
# noun phrases (returned as a WordList) for the tweet at index 11,
# selected with an explicit label-based lookup
TextBlob(data.loc[11, 'text']).noun_phrases

WordList(['virginamerica', 'pretty graphics', 'minimal iconography'])

In [14]:
# TextBlob Methods: POS & Morphology
# Here we will cover all of the following:

# part-of-speech (POS) tagging: get list of tuples containing each word and its part of speech (e.g. noun)
# pluralization: get the plural form of any singular words
# singularization: get the singular form of any plural words
# lemmatization: get the stripped/unmodified version of a word (e.g. singing -> sing)

# part-of-speech (POS) tagging
# Using the tags property, we can get a list of (word, tag) tuples that pairs every word in our string with its part of speech, as determined by the algorithm.

# POS tagging (also grammatical tagging) is useful for understanding context and grammar. Many words can belong to different parts of speech, depending on the context and words around them. POS tagging attempts to disambiguate a text by determining most likely parts of speech for each word based on the content.

# return list of tuples containing words in a string and the part of speech that each belongs to
blob.tags

# The tags each have a unique meaning. For example:

# 'VBX': verb (X indicates type of verb)
# 'DT': determiner
# A comprehensive table can be found at http://www.clips.ua.ac.be/pages/mbsp-tags

[('@', 'NN'),
 ('VirginAmerica', 'NNP'),
 ('status', 'NN'),
 ('match', 'NN'),
 ('program', 'NN'),
 ('I', 'PRP'),
 ('applied', 'VBD'),
 ('and', 'CC'),
 ('it', 'PRP'),
 ("'s", 'VBZ'),
 ('been', 'VBN'),
 ('three', 'CD'),
 ('weeks', 'NNS'),
 ('Called', 'VBN'),
 ('and', 'CC'),
 ('emailed', 'VBN'),
 ('with', 'IN'),
 ('no', 'DT'),
 ('response', 'NN')]

In [15]:
# pluralization
# This is a relatively simple rule-based process that takes the singular form of a word and applies the correct pluralization to it.

# In TextBlob we can pluralize a single word (in the form of a Word obj.) or pluralize all words in a WordList.

# Word and WordList are imported together with the other imports in the first cell,
# so every dependency of the notebook is visible in one place.
# create a Word object
w = Word('company')
# return the plural of a single word
w.pluralize()

'companies'

In [16]:
# Side note: we can also create WordList objects directly from a Python list of strings
wl = WordList(['who','what','when','where','why'])
wl

WordList(['who', 'what', 'when', 'where', 'why'])

In [17]:
# singularization
# The opposite of pluralization: take a word (or words) in plural form and singularize them.

# wl is rebound to a new WordList of plural words;
# singularize() returns a WordList with the singular form of each one
wl = WordList(['agencies', 'octopi', 'words'])
wl.singularize()

WordList(['agency', 'octopus', 'word'])

In [18]:
# lemmatization
# Lemmatization takes a word that has been modified or morphed in some way using proper linguistic rules, and returns the stripped/unmodified version of it.

# The lemmatize() method has an optional parameter:

# pos – Part of speech to filter upon. If None, defaults to _wordnet.NOUN.
# options:
# 'n' for noun,
# 'v' for verb,
# 'a' for adjective,
# 'r' for adverb.
# Note: adverbs don't usually work with the standard lemmatize method.

w = Word('singing')
# for some words you have to pass the part of speech
# in this case we pass 'v' for verb (these one-letter codes are not the same as the POS tags, e.g. 'VBZ', returned by the tags property)
w.lemmatize('v')

'sing'

In [19]:
# irregular past tense verb: 'went' is the simple past of 'go'
# (the past participle would be 'gone')
w = Word('went')
w.lemmatize('v')

'go'

In [20]:
# it doesn't always work: try an adverb
# ('kindly' comes back unchanged, as the output shows)
w = Word('kindly')
w.lemmatize('r')

'kindly'

In [21]:
# Parsing & n-grams
# Parsing
# Parsing gives us the syntactic structure of a string or sentence by appending to each word tags that indicate its place in a hierarchy. See the tree in the PowerPoint slides for a visual example.

# Let's parse the sentence shown in the tree:

# return a string containing each word in the text along with its parts of speech hierarchy
# (note: b is rebound here; it previously held the word-count example blob)
b = TextBlob('John loves Mary')
b.parse()

# John/NNP/B-NP/O gives the position in the hierarchy of the text for the word "John" in our sentence, working from the word to the top of the hierarchy.

# In this case (For the word John):

# NNP indicates it is a "noun, proper singular"
# the B- in B-NP indicates the word is: inside the chunk, preceding word is part of a different chunk (i.e. the word begins a new chunk)
# the NP in B-NP indicates it is part of a noun phrase
# O is "not part of chunk", meaning we are at the end of this particular hierarchy (chunk).
# NOTE(review): this last slot may be the prepositional-noun-phrase (PNP) tag of the parser's word/POS/chunk/PNP format, in which case O means "not inside a PNP" — confirm against the tag reference.
# Details can be read on the page that gives detailed parts of speech (link posted under POS tagging).

# Parsing and syntactic structure is a complex subject, and is not covered in depth here.

'John/NNP/B-NP/O loves/VBZ/B-VP/O Mary/NNP/B-NP/O'

In [22]:
# n-grams
# n-grams are groups of n successive words. Quite often n-grams are created by shifting one word at a time through a text, but there are cases where they skip k words at a time.

# The usefulness of n-grams comes in with machine learning, where each n-gram is used as a feature for learning. These will be used more in the next workshop, but for now let's look at getting n-grams from a text using TextBlob:

# TextBlob has an ngrams method that takes an optional argument n, which is the size of the n-grams to generate. The default is 3.

# The method returns a list of WordList objects.

# return list of n-grams (default n=3)
# keep only the first 5 n-grams
blob.ngrams()[:5]

[WordList(['VirginAmerica', 'status', 'match']),
 WordList(['status', 'match', 'program']),
 WordList(['match', 'program', 'I']),
 WordList(['program', 'I', 'applied']),
 WordList(['I', 'applied', 'and'])]

In [23]:
# get another set with n = 2 (bigrams), again keeping only the first 5
blob.ngrams(n=2)[:5]

[WordList(['VirginAmerica', 'status']),
 WordList(['status', 'match']),
 WordList(['match', 'program']),
 WordList(['program', 'I']),
 WordList(['I', 'applied'])]