In [1]:
from textblob import TextBlob

In [2]:
# Sample sentence; the following cells tag it and extract its noun phrases.
wiki = TextBlob(
    "Python is a high-level, general-purpose programming language."
)

词性标注

In [3]:
# Part-of-speech tags as (word, tag) pairs; punctuation is absent from the output below.
wiki.tags

[('Python', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('high-level', 'JJ'),
 ('general-purpose', 'JJ'),
 ('programming', 'NN'),
 ('language', 'NN')]

名词词组分析

In [4]:
# Noun phrases found in the sentence (shown lower-cased in the output below).
wiki.noun_phrases

WordList(['python'])

情感分析

In [5]:
# Two short sentences used as input for the sentiment examples.
testimonial = TextBlob(
    "Textblob is amazingly simple to use. What great fun!"
)

In [6]:
# Overall sentiment of the blob: a Sentiment(polarity, subjectivity) result.
testimonial.sentiment

Sentiment(polarity=0.39166666666666666, subjectivity=0.4357142857142857)

polarity 取值范围为 [-1, +1]，符号表示情感的正负（负值为消极，正值为积极）<br/>
subjectivity 取值范围为 [0, 1]，表示情绪的主观程度，值越大越主观

In [7]:
# Polarity on its own; same value as the polarity field of the Sentiment result.
testimonial.polarity

0.39166666666666666

In [8]:
# Subjectivity on its own; same value as the subjectivity field of the Sentiment result.
testimonial.subjectivity

0.4357142857142857

<h5>分词 Token</h5>

In [12]:
# Adjacent string literals are joined by Python into one string, so TextBlob
# receives a single three-sentence text (there are no commas between the pieces).
zen = TextBlob(
    "Beautiful is better than ugly. "
    "Explicit is better than implicit. "
    "Simple is better than complex."
)

注意：上面相邻的字符串字面量之间不能加逗号。Python 会把相邻的字符串字面量自动拼接成一个字符串；如果加上逗号，它们就会作为多个独立参数传给 TextBlob，从而报错。

In [13]:
# Tokenize into words (the sentence-ending periods are not part of the result)
zen.words

WordList(['Beautiful', 'is', 'better', 'than', 'ugly', 'Explicit', 'is', 'better', 'than', 'implicit', 'Simple', 'is', 'better', 'than', 'complex'])

In [14]:
# Split the text into Sentence objects
zen.sentences

[Sentence("Beautiful is better than ugly."),
 Sentence("Explicit is better than implicit."),
 Sentence("Simple is better than complex.")]

In [15]:
# Sentiment is also available per sentence; print one result per line.
for zen_sentence in zen.sentences:
    print(zen_sentence.sentiment)

Sentiment(polarity=0.2166666666666667, subjectivity=0.8333333333333334)
Sentiment(polarity=0.5, subjectivity=0.5)
Sentiment(polarity=0.06666666666666667, subjectivity=0.41904761904761906)


<h5>单复数转化</h5>

In [17]:
# zen.words[1] is 'is' (see the word list above). The result 'iss' shows that
# pluralize() does not appear to check the part of speech before applying its rules.
zen.words[1].pluralize()

'iss'

In [18]:
# Sample sentence for the singular/plural conversions below.
sentence = TextBlob(
    "Use 4 spaces per indentation level."
)

In [20]:
# Word list of the sample sentence; the indexes used in the next cells refer to it.
sentence.words

WordList(['Use', '4', 'spaces', 'per', 'indentation', 'level'])

In [21]:
# Pluralize the first word ('Use')
sentence.words[0].pluralize()

'Uses'

In [22]:
# Singularize the third word ('spaces')
sentence.words[2].singularize()

'space'

In [24]:
# sentence.words[3] is 'per'; singularize() hands it back unchanged (see output).
sentence.words[3].singularize()

'per'

<h5>原型变换</h5>

In [25]:
from textblob import Word

In [32]:
# Irregular plural used as input for the lemmatization example.
w = Word("octopi")

In [34]:
# Lemmatize as a noun ('n'): the irregular plural is reduced to its singular form.
w.lemmatize('n')

'octopus'

In [36]:
w=Word('went')
# lemmatize() accepts an optional part-of-speech argument ('v' for verb, 'n' for noun)
# that restricts which word class is used for the lemma lookup.
# NOTE(review): no part of speech is passed here and the result below is the unchanged
# 'went', which suggests the lookup did not treat it as a verb. Passing 'v' should be
# what this example needs -- confirm against the TextBlob documentation.
w.lemmatize()

'went'

In [37]:
from textblob import Word

In [38]:
from textblob.wordnet import VERB

In [39]:
# Word whose WordNet synsets are inspected in the next cell.
word = Word("octopus")

一个 synset（同义词集：意义相同的词条的集合）由一个三元组标识：单词.词性.序号<br/>
例如，'dog.n.01' 表示 dog 的第一个名词义项；'chase.v.01' 表示 chase 的第一个动词义项

In [40]:
# Synsets (synonym sets) that this word belongs to.
word.synsets

[Synset('octopus.n.01'), Synset('octopus.n.02')]

In [41]:
# Restrict the synset lookup to verb senses with pos=VERB.
Word("hack").get_synsets(pos=VERB)

[Synset('chop.v.05'),
 Synset('hack.v.02'),
 Synset('hack.v.03'),
 Synset('hack.v.04'),
 Synset('hack.v.05'),
 Synset('hack.v.06'),
 Synset('hack.v.07'),
 Synset('hack.v.08')]

In [43]:
# List the definition of every sense of the word (noun and verb senses alike)
Word('hack').definitions

['one who works hard at boring tasks',
 'a politician who belongs to a small clique that controls a political party for private rather than public ends',
 'a mediocre and disdained writer',
 'a tool (as a hoe or pick or mattock) used for breaking up the surface of the soil',
 'a car driven by a person whose job is to take passengers where they want to go in exchange for money',
 'an old or over-worked horse',
 'a horse kept for hire',
 'a saddle horse used for transportation rather than sport etc.',
 'cut with a hacking tool',
 'be able to manage or manage successfully',
 'cut away',
 'kick on the arms',
 'kick on the shins',
 'fix a computer program piecemeal until it works',
 'significantly cut up a manuscript',
 'cough spasmodically']

In [48]:
# Create Synset objects directly from their 'word.pos.index' identifiers
from textblob.wordnet import Synset
octopus = Synset('octopus.n.02')
shrimp = Synset('shrimp.n.03')

# Similarity score between the two synsets
octopus.path_similarity(shrimp)

0.1111111111111111

<h4>WordList</h4>

In [49]:
# Three nouns used to show operations on a whole WordList.
animals = TextBlob("cat dog octopus")

In [51]:
# .words returns a WordList, not a plain list.
animals.words

WordList(['cat', 'dog', 'octopus'])

In [52]:
# pluralize() on a WordList pluralizes every word and returns a new WordList.
animals.words.pluralize()

WordList(['cats', 'dogs', 'octopodes'])

<h5>spelling correction</h5>

In [54]:
# correct() returns the text with its best-guess spelling fixes applied.
b = TextBlob("I havv goood speling!")
print(b.correct())

I have good spelling!


In [55]:
# spellcheck() lists candidate corrections as (word, confidence) pairs
w = Word('falibility')
w.spellcheck()

[('fallibility', 1.0)]

In [59]:
# An ambiguous misspelling: several candidates come back, each with its own confidence.
w = Word("bok")
w.spellcheck()

[('boy', 0.4033412887828162),
 ('book', 0.29832935560859186),
 ('box', 0.17899761336515513),
 ('bow', 0.1026252983293556),
 ('bob', 0.007159904534606206),
 ('bog', 0.00477326968973747),
 ('ok', 0.002386634844868735),
 ('boa', 0.002386634844868735)]

词频与词组频率统计

In [65]:
from collections import defaultdict

In [60]:
# Two sentences with repeated words, used for the frequency examples below.
monty = TextBlob(
    "We are no longer the Knights who say Ni. "
    "We are now the Knights who say Ekki ekki ekki PTANG."
)

In [71]:
# word_counts is indexed with [...]; it is not called like monty.word_counts('We').
# Use the lower-case form as the key: both capitalized 'We' occurrences are counted under 'we'.
monty.word_counts['we']

2

In [72]:
# Case-insensitive by default
monty.words.count('we')

2

In [75]:
# Case-sensitive: the text only contains 'We', so lower-case 'we' matches nothing
monty.words.count('we', case_sensitive=True)

0

In [81]:
# Frequency of a noun phrase
# count() also works on noun phrases, not only on single words.
wiki.noun_phrases.count('python')

1

In [77]:
# Noun phrases extracted from the Monty Python text (lower-cased in the output).
monty.noun_phrases

WordList(['ni', 'ekki', 'ekki ekki', 'ptang'])

In [80]:
# Number of times the phrase 'ni' appears among the extracted noun phrases.
monty.noun_phrases.count('ni')

1

<h4>Translation and Language Detection</h4>

In [82]:
# English source text for the translation example.
# The u'' prefix is redundant if this notebook runs on Python 3.
en_blob = TextBlob(u'Simple is better than complex.')

In [86]:
# Translate the English blob to Simplified Chinese (requires network access).
# Pass the bare language code: writing it as '"zh-CN"' would send the double-quote
# characters as part of the code.
en_blob.translate(to='zh-CN')

URLError: <urlopen error [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。>

In [89]:
# Chinese sample text, bound to two names because later cells use both `b` and `chinese_blob`.
chinese_blob = TextBlob(u"美丽优于丑陋")
b = chinese_blob

In [90]:
# Ask the online service to identify the blob's language (requires network access).
b.detect_language()

URLError: <urlopen error [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。>

In [88]:
 chinese_blob.translate(from_lang="zh-CN", to='en')

URLError: <urlopen error [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。>

<h4>parse</h4>

In [91]:
# Sentence used for the parser example (rebinds `b`).
b = TextBlob("And now for something completely different.")

In [92]:
# parse() returns one string: each token is followed by slash-separated annotations
# (part-of-speech tag first, then chunk labels -- see the output below).
print(b.parse())

And/CC/O/O now/RB/B-ADVP/O for/IN/B-PP/B-PNP something/NN/B-NP/I-PNP completely/RB/B-ADJP/O different/JJ/I-ADJP/O ././O/O


In [93]:
# Slicing by character index returns a new TextBlob over that range.
zen[0:19]

TextBlob("Beautiful is better")

In [94]:
# String-style methods such as upper() return a TextBlob rather than a plain str.
zen.upper()

TextBlob("BEAUTIFUL IS BETTER THAN UGLY. EXPLICIT IS BETTER THAN IMPLICIT. SIMPLE IS BETTER THAN COMPLEX.")

In [95]:
# Character index at which the substring starts.
zen.find("Simple")

65

In [99]:
# Two single-word blobs for the comparison examples.
apple_blob = TextBlob('apples')
banana_blob = TextBlob('bananas')

In [100]:
# Blobs can be ordered against each other; 'apples' sorts before 'bananas'.
apple_blob < banana_blob

True

In [102]:
# A blob compares equal to a plain string holding the same text.
apple_blob == 'apples'

True

In [108]:
# Short sentence for the n-gram example.
blob = TextBlob("Now is better than never.")

In [110]:
# n-grams of size 2: every run of two consecutive words, each as a WordList.
blob.ngrams(2)

[WordList(['Now', 'is']),
 WordList(['is', 'better']),
 WordList(['better', 'than']),
 WordList(['than', 'never'])]

In [106]:
# Show each sentence followed by its start/end character offsets within the blob
for zen_sentence in zen.sentences:
    print(zen_sentence)
    print(
        'start with index{},end with index{}'.format(
            zen_sentence.start, zen_sentence.end
        )
    )

Beautiful is better than ugly.
start with index0,end with index30
Explicit is better than implicit.
start with index31,end with index64
Simple is better than complex.
start with index65,end with index95
