# Textblob

In [1]:
# pip install -U textblob
# python -m textblob.download_corpora

In [1]:
# First, the import.
from textblob import TextBlob
from textblob import Word

from textblob.wordnet import VERB

from textblob.classifiers import NaiveBayesClassifier

In [2]:
# Let’s create our first TextBlob object.
wiki = TextBlob("Python is a high-level, general-purpose programming language.")

In [3]:
wiki

TextBlob("Python is a high-level, general-purpose programming language.")

## Tokenization

In [4]:
zen = TextBlob("Beautiful is better than ugly. "
               "Explicit is better than implicit. "
               "Simple is better than complex.")

In [5]:
zen.words

WordList(['Beautiful', 'is', 'better', 'than', 'ugly', 'Explicit', 'is', 'better', 'than', 'implicit', 'Simple', 'is', 'better', 'than', 'complex'])

In [6]:
zen.words.count('is')

3

In [7]:
zen.sentences

[Sentence("Beautiful is better than ugly."),
 Sentence("Explicit is better than implicit."),
 Sentence("Simple is better than complex.")]

## Words and noun phrase counts

### Using word_counts dictionary.

In [8]:
monty = TextBlob("We are no longer the Knights who say Ni. "
                 "We are now the Knights who say Ekki ekki ekki PTANG.")

In [9]:
monty.word_counts

defaultdict(int,
            {'we': 2,
             'are': 2,
             'no': 1,
             'longer': 1,
             'the': 2,
             'knights': 2,
             'who': 2,
             'say': 2,
             'ni': 1,
             'now': 1,
             'ekki': 3,
             'ptang': 1})

In [10]:
monty.word_counts['ekki']

3

If you access the frequencies this way, the search will not be case sensitive, and words that are not found will have a frequency of 0.

The second way is to use the count() method.

In [11]:
monty.words.count('ekki', case_sensitive=True)

2

In [13]:
monty.noun_phrases.count('we')

0

## Words Inflection and Lemmatization

Each word in TextBlob.words or Sentence.words is a Word object (a subclass of str; unicode in Python 2) with useful methods, e.g. for word inflection.

In [12]:
sentence = TextBlob('Use 4 spaces per indentation level.')

In [28]:
sentence.words

WordList(['Use', '4', 'spaces', 'per', 'indentation', 'level'])

In [13]:
sentence.words[2].singularize()

'space'

In [14]:
sentence.words[-1].pluralize()

'levels'

Words can be lemmatized by calling the lemmatize method.

In [31]:
w = Word("octopi")

In [32]:
w.lemmatize()

'octopus'

In [None]:
w = Word("went")

In [33]:
w.lemmatize("v") 

'go'

## POS tagging

Part-of-speech tags can be accessed through the tags property.

In [15]:
zen = TextBlob("Beautiful is better than ugly. "
               "Explicit is better than implicit. "
               "Simple is better than complex.")

In [16]:
zen.tags

[('Beautiful', 'NNP'),
 ('is', 'VBZ'),
 ('better', 'JJR'),
 ('than', 'IN'),
 ('ugly', 'RB'),
 ('Explicit', 'NNP'),
 ('is', 'VBZ'),
 ('better', 'JJR'),
 ('than', 'IN'),
 ('implicit', 'NN'),
 ('Simple', 'NN'),
 ('is', 'VBZ'),
 ('better', 'JJR'),
 ('than', 'IN'),
 ('complex', 'JJ')]

In [17]:
# Print every token alongside its part-of-speech tag, one pair per line.
# print() joins its arguments with a single space, giving "token => TAG".
for word, pos in zen.tags:
    print(word, "=>", pos)

Beautiful => NNP
is => VBZ
better => JJR
than => IN
ugly => RB
Explicit => NNP
is => VBZ
better => JJR
than => IN
implicit => NN
Simple => NN
is => VBZ
better => JJR
than => IN
complex => JJ


## Noun Phrase Extraction

Noun phrases are accessed through the noun_phrases property.

In [18]:
# Sample passage used for noun-phrase extraction.
# Adjacent string literals are concatenated by the parser, so the words are
# separated by single spaces. Backslash continuations inside one literal would
# instead embed each continuation line's leading indentation in the text.
document = (
    'In computer science, artificial intelligence (AI), '
    'sometimes called machine intelligence, is intelligence '
    'demonstrated by machines, in contrast to the natural intelligence '
    'displayed by humans and animals. Computer science defines AI '
    'research as the study of "intelligent agents": any device that '
    'perceives its environment and takes actions that maximize its '
    'chance of successfully achieving its goals.[1] Colloquially, '
    'the term "artificial intelligence" is used to describe machines '
    'that mimic "cognitive" functions that humans associate with other '
    'human minds, such as "learning" and "problem solving".[2]'
)

In [19]:
text_blob_object = TextBlob(document)

In [20]:
# text_blob_object already wraps `document` (it is built in the previous
# cell), so there is no need to parse the text a second time here.
# Print each extracted noun phrase on its own line.
for noun_phrase in text_blob_object.noun_phrases:
    print(noun_phrase)

computer science
artificial intelligence
ai
machine intelligence
natural intelligence
computer
science defines
ai
intelligent agents
colloquially
artificial intelligence
describe machines
human minds


## Spelling Correction
Use the correct() method to attempt spelling correction.

Spelling correction is based on Peter Norvig’s “How to Write a Spelling Corrector” as implemented in the pattern library. It is about 70% accurate.

In [40]:
b = TextBlob("I havv goood speling!")
print(b.correct())

I have good spelling!


Word objects have a Word.spellcheck() method that returns a list of (word, confidence) tuples with spelling suggestions.

In [44]:
from textblob import Word
w = Word('falibility')

In [45]:
w.spellcheck()

[('fallibility', 1.0)]

## Translation and Language Detection

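One of the most powerful capabilities of the TextBlob library is translating text from one language to another. On the backend, the TextBlob translator uses the __Google Translate API__, so these calls need an internet connection. Note: `translate()` and `detect_language()` were deprecated in TextBlob 0.16.0 and removed in 0.18.0, so the cells below only run on older releases.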

In [46]:
en_blob = TextBlob(u'Simple is better than complex.')

In [47]:
en_blob.translate(to='es')

TextBlob("Simple es mejor que complejo.")

In [48]:
chinese_blob = TextBlob(u"美丽优于丑陋")
chinese_blob.translate(from_lang="zh-CN", to='en')

TextBlob("Beauty is better than ugly")

You can also attempt to detect a TextBlob’s language using TextBlob.detect_language().

In [49]:
b = TextBlob(u"بسيط هو أفضل من مجمع")
b.detect_language()

'ar'

In [50]:
b = TextBlob("tumi kemon aachon")
b.detect_language()

'bn'

In [73]:
b = TextBlob(u"क्या हाल है")
b.detect_language()

'hi'

In [62]:
# क्या हाल है 
# توهان ڪيئن آهيو 
# আপনি কেমন আছেন 
# நீங்கள் எப்படி இருக்கிறீர்கள் 
# तिमीलाई कस्तो छ 
# તમે કેમ છો 
# तू कसा आहेस 
# ന്തൊക്കെയുണ്ട് 
# آپ کیسے ہو 
# ನೀವು ಹೇಗಿದ್ದೀರಿ

## n-grams

An n-gram is a contiguous sequence of n words from a sentence. For instance, for the sentence "I love watching football", the 2-grams are (I love), (love watching) and (watching football).

N-Grams can play a crucial role in text classification.

The TextBlob.ngrams() method returns a list of tuples of n successive words.

In [74]:
blob = TextBlob("Now is better than never.")

In [75]:
blob.ngrams(n=3)

[WordList(['Now', 'is', 'better']),
 WordList(['is', 'better', 'than']),
 WordList(['better', 'than', 'never'])]

## WordNet Integration

WordNet is a database of English words that are linked together by their semantic relationships. It is like a supercharged dictionary/thesaurus with a graph structure.

TextBlob 0.7 now integrates __NLTK's WordNet__ interface, making it very simple to interact with WordNet.

### Synsets
As you know, synonyms are words that have similar meanings. A synonym set, or synset, is a group of synonyms. A synset, therefore, corresponds to an abstract concept.

In TextBlob, you can access the synsets that a word belongs to by accessing the synsets property of a Word object.

In [78]:
from textblob import Word
word = Word("plant")
word.synsets

[Synset('plant.n.01'),
 Synset('plant.n.02'),
 Synset('plant.n.03'),
 Synset('plant.n.04'),
 Synset('plant.v.01'),
 Synset('implant.v.01'),
 Synset('establish.v.02'),
 Synset('plant.v.04'),
 Synset('plant.v.05'),
 Synset('plant.v.06')]

In [79]:
word.definitions

['buildings for carrying on industrial labor',
 '(botany) a living organism lacking the power of locomotion',
 'an actor situated in the audience whose acting is rehearsed but seems spontaneous to the audience',
 'something planted secretly for discovery by another',
 'put or set (seeds, seedlings, or plants) into the ground',
 'fix or set securely or deeply',
 'set up or lay the groundwork for',
 'place into a river',
 'place something or someone in a certain position in order to secretly observe or deceive',
 'put firmly in the mind']

In [80]:
plant = word.synsets[1]

The synonyms contained within a synset are called lemmas. You can access the string versions of these synonyms by calling a Synset's lemma_names() method.

In [81]:
# lemma_names is a method on NLTK's Synset; it has to be called to get the
# list of names (bare attribute access only yields the bound method object).
plant.lemma_names()

['plant', 'flora', 'plant_life']

## Converting to Upper and Lowercase
TextBlob objects are very similar to strings. You can convert them to upper case or lower case, change their values, and concatenate them together as well. In the following script, we convert the text from the TextBlob object to upper case:

In [76]:
text = "I love to watch football, but I have never played it"
text_blob_object = TextBlob(text)

print(text_blob_object.upper())

I LOVE TO WATCH FOOTBALL, BUT I HAVE NEVER PLAYED IT


In [77]:
text = "I LOVE TO WATCH FOOTBALL, BUT I HAVE NEVER PLAYED IT"
text_blob_object = TextBlob(text)

print(text_blob_object.lower())

i love to watch football, but i have never played it


## Sentiment Analysis

The sentiment property returns a named tuple of the form Sentiment(polarity, subjectivity). 

The polarity score is a float within the range [-1.0, 1.0]. 

The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.

Polarity is a float value within the range [-1.0 to 1.0] where 

    0 indicates neutral, 
    +1 indicates a very positive sentiment and 
    -1 represents a very negative sentiment.

Subjectivity is a float value within the range [0.0 to 1.0] where 

    0.0 is very objective and 
    1.0 is very subjective. 

Subjective sentences express personal feelings, views, beliefs, opinions, allegations, desires, suspicions, and speculations, whereas objective sentences are factual.

In [21]:
testimonial1 = TextBlob("Textblob is amazingly simple to use. What great fun!")
testimonial2 = TextBlob("Earth is sphere")

In [22]:
print('Sentiment: ', testimonial1.sentiment)
print('Sentiment: ', testimonial2.sentiment)

#print('Polarity: ', testimonial1.sentiment.polarity)

Sentiment:  Sentiment(polarity=0.39166666666666666, subjectivity=0.4357142857142857)
Sentiment:  Sentiment(polarity=0.0, subjectivity=0.0)


In [85]:
word = TextBlob('Earth')
word.sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [86]:
# DLSA (Document level sentiment Analysis)

print('Sentiment: ', zen.sentiment)
print('Polarity: ', zen.sentiment.polarity)

Sentiment:  Sentiment(polarity=0.0, subjectivity=0.0)
Polarity:  0.0


In [88]:
# SLSA (Sentence level sentiment analysis)
for sentence in zen.sentences:
    print(sentence.sentiment)

Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=0.0, subjectivity=0.0)


## Building a Text Classification System

Some classifiers...

    class textblob.classifiers.BaseClassifier
    class textblob.classifiers.DecisionTreeClassifier
    class textblob.classifiers.MaxEntClassifier
    class textblob.classifiers.NLTKClassifier
    class textblob.classifiers.NaiveBayesClassifier
    class textblob.classifiers.PositiveNaiveBayesClassifier
    

In [91]:
from textblob.classifiers import NaiveBayesClassifier

In [23]:
# Training data: a list of (sentence, label) pairs, with the five positive
# examples first and the five negative examples after them.
train = [
    (sentence, label)
    for label, sentences in (
        ('pos', (
            'I love this sandwich.',
            'this is an amazing place!',
            'I feel very good about these beers.',
            'this is my best work.',
            "what an awesome view",
        )),
        ('neg', (
            'I do not like this restaurant',
            'I am tired of this stuff.',
            "I can't deal with this",
            'he is my sworn enemy!',
            'my boss is horrible.',
        )),
    )
    for sentence in sentences
]

In [24]:
# Held-out (sentence, label) pairs used to score the classifier.
# Sentences and their expected labels are listed in parallel and paired up.
test = list(zip(
    (
        'the beer was good.',
        'I do not enjoy my job',
        "I ain't feeling dandy today.",
        "I feel amazing!",
        'Gary is a friend of mine.',
        "I can't believe I'm doing this.",
    ),
    ('pos', 'neg', 'neg', 'pos', 'pos', 'neg'),
))

In [26]:
cl = NaiveBayesClassifier(train)

In [95]:
# Loading Data from Files
# You can also load data from common file formats including CSV, JSON, and TSV.

# CSV files should be formatted like so:

# I love this sandwich.,pos
# This is an amazing place!,pos
# I do not like this restaurant,neg

In [96]:
# Classifying Text
# Call the classify(text) method to use the classifier.

cl.classify("This is an worst library!")

'pos'

In [97]:
# You can get the label probability distribution with the prob_classify(text) method.

In [98]:
prob_dist = cl.prob_classify("This one's a doozy.")

In [99]:
prob_dist.max()

'pos'

In [100]:
round(prob_dist.prob("pos"), 2)

0.63

In [59]:
round(prob_dist.prob("neg"), 2)

0.37

In [101]:
# Evaluating Classifiers
# To compute the accuracy on our test set, use the accuracy(test_data) method.

cl.accuracy(test)

0.8333333333333334

In [102]:
# Use the show_informative_features() method to display a listing of the most informative features.

In [103]:
cl.show_informative_features(5)  

Most Informative Features
            contains(my) = True              neg : pos    =      1.7 : 1.0
            contains(an) = False             neg : pos    =      1.6 : 1.0
             contains(I) = True              neg : pos    =      1.4 : 1.0
             contains(I) = False             pos : neg    =      1.4 : 1.0
            contains(my) = False             pos : neg    =      1.3 : 1.0


### Updating Classifiers with New Data

In [106]:
# Extra labelled examples for updating the classifier: three positive
# sentences followed by a single negative one.
new_data = [
    (sentence, 'pos')
    for sentence in (
        'She is my best friend.',
        "I'm happy to have a new friend.",
        "Stay thirsty, my friend.",
    )
]
new_data.append(("He ain't from around here.", 'neg'))

In [107]:
cl.update(new_data)

True

In [139]:
cl.accuracy(test)

1.0