## Quick start tutorial
- Common text-processing operations with TextBlob

In [1]:
# import 
from textblob import TextBlob

In [2]:
# List the attributes and methods available on the TextBlob class
# (dir() on a class returns its member names, not modules).
print(dir(TextBlob))

['__add__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_cmpkey', '_compare', '_create_sentence_objects', '_strkey', 'analyzer', 'classify', 'correct', 'ends_with', 'endswith', 'find', 'format', 'index', 'join', 'json', 'lower', 'ngrams', 'noun_phrases', 'np_counts', 'np_extractor', 'parse', 'parser', 'polarity', 'pos_tagger', 'pos_tags', 'raw_sentences', 'replace', 'rfind', 'rindex', 'sentences', 'sentiment', 'sentiment_assessments', 'serialized', 'split', 'starts_with', 'startswith', 'strip', 'subjectivity', 'tags', 'title', 'to_json', 'tokenize', 'tokenizer', 'tokens', 'upper', 'word_counts', 'words']


In [3]:
# Create the TextBlob that the following cells reuse as `tb`.
text = 'I like to read about new facts, innovations and technology.'
tb = TextBlob(text)
# Bare last expression so the notebook displays the blob's repr.
tb

TextBlob("I like to read about new facts, innovations and technology.")

In [4]:
# Part-of-speech tagging: a list of (word, tag) tuples, one per word.
tb.tags

[('I', 'PRP'),
 ('like', 'VBP'),
 ('to', 'TO'),
 ('read', 'VB'),
 ('about', 'IN'),
 ('new', 'JJ'),
 ('facts', 'NNS'),
 ('innovations', 'NNS'),
 ('and', 'CC'),
 ('technology', 'NN')]

In [5]:
# Words of the text as a WordList; punctuation is left out (compare tb.tokens).
tb.words

WordList(['I', 'like', 'to', 'read', 'about', 'new', 'facts', 'innovations', 'and', 'technology'])

In [6]:
# Frequency of each word; keys are lowercased (note 'i' in the output).
tb.word_counts

defaultdict(int,
            {'i': 1,
             'like': 1,
             'to': 1,
             'read': 1,
             'about': 1,
             'new': 1,
             'facts': 1,
             'innovations': 1,
             'and': 1,
             'technology': 1})

In [7]:
# Noun phrases extracted from the text, returned as a WordList.
tb.noun_phrases

WordList(['new facts'])

In [8]:
# Frequency of each extracted noun phrase.
tb.np_counts

defaultdict(int, {'new facts': 1})

In [9]:
print(f'singualized word : {tb.words[6].singularize()}')
print(f'pluralized word : {tb.words[1].pluralize()}')

singualized word : fact
pluralized word : likes


#### Word Lemmatization and WordNet Integration

In [10]:
from textblob import Word
from textblob.wordnet import VERB

In [11]:
# Wrap a single word in Word to access word-level helpers.
w = Word('octopi')
# Reduce the word to its lemma (here the irregular plural becomes 'octopus').
w.lemmatize()

'octopus'

In [12]:
# WordNet synsets for the word held in `w`.
w.synsets 

[Synset('octopus.n.01'), Synset('octopus.n.02')]

In [13]:
# Restrict the synset lookup to a part of speech; only verb senses are returned.
Word('copy').get_synsets(pos=VERB)

[Synset('copy.v.01'),
 Synset('imitate.v.01'),
 Synset('replicate.v.02'),
 Synset('copy.v.04')]

In [14]:
Word('copy').definitions  # list of definition strings, one for each synset of the word

['a reproduction of a written record (e.g. of a legal or school record)',
 'a thing made to be similar or identical to another thing',
 'matter to be printed; exclusive of graphical materials',
 'material suitable for a journalistic account',
 'copy down as is',
 "reproduce someone's behavior or looks",
 'reproduce or make an exact copy of',
 'make a replica of']

In [15]:
# Build synsets directly from their WordNet names and compare them.
from textblob.wordnet import Synset
# NOTE: per the output, the name 'copy.n.01' resolves to a synset that prints
# as Synset('transcript.n.02'), so the score below compares that synset with
# shrimp.n.03.
cp = Synset('copy.n.01')
sh = Synset('shrimp.n.03')
print(cp)
print(sh)
# path_similarity gives a numeric similarity score between the two synsets.
print(f'similarity: {cp.path_similarity(sh)}')

Synset('transcript.n.02')
Synset('shrimp.n.03')
similarity: 0.05263157894736842


In [16]:
# Tokens of the text; unlike tb.words, punctuation marks are kept as tokens.
tb.tokens

WordList(['I', 'like', 'to', 'read', 'about', 'new', 'facts', ',', 'innovations', 'and', 'technology', '.'])

In [17]:
# parse() yields one space-separated string of slash-delimited annotations;
# split on spaces to show one token per entry.
# NOTE: the parser's tag for 'like' (IN) differs from the tb.tags result (VBP).
tb.parse().split(' ')

['I/PRP/B-NP/O',
 'like/IN/B-PP/B-PNP',
 'to/TO/I-PP/I-PNP',
 'read/VB/B-VP/O',
 'about/IN/B-PP/B-PNP',
 'new/JJ/B-NP/I-PNP',
 'facts/NNS/I-NP/I-PNP',
 ',/,/O/O',
 'innovations/NNS/B-NP/O',
 'and/CC/I-NP/O',
 'technology/NN/I-NP/O',
 '././O/O']

In [18]:
# Sentiment analysis: a Sentiment tuple with polarity and subjectivity scores.
tb.sentiment

Sentiment(polarity=0.13636363636363635, subjectivity=0.45454545454545453)

#### TextBlobs behave like Python strings

In [19]:
# Sample blob used by the string-like operations in the next cells.
zen = TextBlob('If the implementation is hard to explain, it\'s a bad idea.')

In [20]:
# Slicing works as on a str, but the result is a new TextBlob.
zen[3:15]

TextBlob("the implemen")

In [21]:
# String methods such as upper() also return a TextBlob.
zen.upper()

TextBlob("IF THE IMPLEMENTATION IS HARD TO EXPLAIN, IT'S A BAD IDEA.")

In [22]:
# Character index of the first occurrence of the substring.
zen.find('is')

22

In [23]:
# TextBlobs can be compared with each other using the usual comparison
# operators; 'cherry' < 'berries' is False, matching plain string ordering.
cherry = TextBlob('cherry')
berry = TextBlob('berries')
cherry < berry

False

In [24]:
# TextBlobs can be passed to str.format(); their text is inserted into the result.
'{0} and {1} are very tasty'.format(cherry, berry)

'cherry and berries are very tasty'

In [26]:
# Split a multi-sentence blob into Sentence objects.
wiki_machine = TextBlob('A machine is a physical system. That uses power to apply forces and control movement. The term is commonly applied to artificial devices. such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.')
# Each Sentence carries its start and end character offsets within the parent blob.
print(f'index: {[(s, s.start, s.end) for s in wiki_machine.sentences]}')

index: [(Sentence("A machine is a physical system."), 0, 31), (Sentence("That uses power to apply forces and control movement."), 32, 85), (Sentence("The term is commonly applied to artificial devices."), 86, 137), (Sentence("such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines."), 138, 255)]


#### n-grams

In [28]:
# Sliding windows of n consecutive words, each returned as a WordList.
tb.ngrams(n = 4)

[WordList(['I', 'like', 'to', 'read']),
 WordList(['like', 'to', 'read', 'about']),
 WordList(['to', 'read', 'about', 'new']),
 WordList(['read', 'about', 'new', 'facts']),
 WordList(['about', 'new', 'facts', 'innovations']),
 WordList(['new', 'facts', 'innovations', 'and']),
 WordList(['facts', 'innovations', 'and', 'technology'])]

#### Spelling correction

In [29]:
# correct() returns a new TextBlob with misspelled words replaced.
check = TextBlob('yiu haie thoes beauitfll things.')
check.correct()

TextBlob("you have those beautiful things.")

In [33]:
# spellcheck() lists candidate spellings as (word, score) pairs, best first.
# NOTE: in the output the likely intended word 'personnel' ranks only fourth,
# so the top suggestion is not always the right one.
w = Word('personell')
w.spellcheck()

[('personal', 0.65),
 ('personally', 0.2642857142857143),
 ('peroneal', 0.06428571428571428),
 ('personnel', 0.014285714285714285),
 ('personen', 0.007142857142857143)]