In [1]:
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
# 1 词性转换

In [2]:
def wordtokenization():
    content = """Stemming is funnier than a bummer says the sushi loving computer scientist.
    She really wants to buy cars. She told me angrily. It is better for you.
    Man is walking. We are meeting tomorrow. You really don't know..!"""
    print(word_tokenize(content))  # 分词

def wordlemmatization():
    """WordNetLemmatizer()改变词性"""
    wordlemma = WordNetLemmatizer()
    print(wordlemma.lemmatize('cars'))
    print(wordlemma.lemmatize('walking',pos='v'))  # 把walking转化成为v动词
    print(wordlemma.lemmatize('meeting',pos='n'))
    print(wordlemma.lemmatize('meeting',pos='v'))
    print(wordlemma.lemmatize('better',pos='a'))
    print(wordlemma.lemmatize('is',pos='v'))
    print(wordlemma.lemmatize('funnier',pos='a'))
    print(wordlemma.lemmatize('expected',pos='v'))
    print(wordlemma.lemmatize('fantasized',pos='v'))

In [3]:
wordtokenization()
print("\n")
print("----------Word Lemmatization----------")
wordlemmatization()

['Stemming', 'is', 'funnier', 'than', 'a', 'bummer', 'says', 'the', 'sushi', 'loving', 'computer', 'scientist', '.', 'She', 'really', 'wants', 'to', 'buy', 'cars', '.', 'She', 'told', 'me', 'angrily', '.', 'It', 'is', 'better', 'for', 'you', '.', 'Man', 'is', 'walking', '.', 'We', 'are', 'meeting', 'tomorrow', '.', 'You', 'really', 'do', "n't", 'know..', '!']


----------Word Lemmatization----------
car
walk
meeting
meet
good
be
funny
expect
fantasize


In [None]:
# 2 句法结构

In [7]:
import nltk
from nltk import CFG
from nltk.tree import *
from collections import defaultdict

In [8]:
def definegrammar_pasrereult():
    Grammar = nltk.CFG.fromstring(""" 
    S -> NP VP 
    PP -> P NP 
    NP -> Det N | Det N PP | 'I' 
    VP -> V NP | VP PP 
    Det -> 'an' | 'my' 
    N -> 'elephant' | 'pajamas' 
    V -> 'shot' 
    P -> 'in' 
    """)
    sent = "I shot an elephant".split()
    parser = nltk.ChartParser(Grammar)
    trees = parser.parse(sent)
    for tree in trees:
        print(tree)

# Part 2: Draw the parse tree
def draw_parser_tree():
    dp1 = Tree('dp', [Tree('d', ['the']), Tree('np', ['dog'])])
    dp2 = Tree('dp', [Tree('d', ['the']), Tree('np', ['cat'])])
    vp = Tree('vp', [Tree('v', ['chased']), dp2])
    tree = Tree('s', [dp1, vp])
    print(tree)
    print(tree.pformat_latex_qtree())
    tree.pretty_print()


In [9]:
print("\n--------Parsing result as per defined grammar-------")
definegrammar_pasrereult()
print("\n--------Drawing Parse Tree-------")
draw_parser_tree()


--------Parsing result as per defined grammar-------
(S (NP I) (VP (V shot) (NP (Det an) (N elephant))))

--------Drawing Parse Tree-------
(s (dp (d the) (np dog)) (vp (v chased) (dp (d the) (np cat))))
\Tree [.s
        [.dp [.d the ] [.np dog ] ]
        [.vp [.v chased ] [.dp [.d the ] [.np cat ] ] ] ]
              s               
      ________|_____           
     |              vp        
     |         _____|___       
     dp       |         dp    
  ___|___     |      ___|___   
 d       np   v     d       np
 |       |    |     |       |  
the     dog chased the     cat



In [None]:
# 3 

In [13]:
import nltk
from nltk import word_tokenize
import pprint
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

##  3.1 取得语料treebank

In [14]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print( tagged_sentences[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


## 3.2 针对句子，构建词特征

In [16]:
def features(sentence, index):
    " sentence: [w1, w2, ...], index: the index of the word "
    return {
    'word': sentence[index], # 当前词
    'is_first': index == 0,
    'is_last': index == len(sentence) - 1,
    'is_capitalized': sentence[index][0].upper() == sentence[index][0],
    'is_all_caps': sentence[index].upper() == sentence[index],
    'is_all_lower': sentence[index].lower() == sentence[index],
    'prefix-1': sentence[index][0],
    'prefix-2': sentence[index][:2],
    'prefix-3': sentence[index][:3],
    'suffix-1': sentence[index][-1],
    'suffix-2': sentence[index][-2:],
    'suffix-3': sentence[index][-3:],
    'prev_word': '' if index == 0 else sentence[index - 1],
    'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
    'has_hyphen': '-' in sentence[index],
    'is_numeric': sentence[index].isdigit(),
    'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }

def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

def transform_to_dataset(tagged_sentences):
    """tagged_sentences:每个单词都有词性标注
    X是由features构建的每个词的特征
    y是每个词的词性
    """
    X, y = [], []
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])            
    return X, y

In [17]:
cutoff = int(.75 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]

In [23]:
test_sentences[0]

[('We', 'PRP'),
 ('can', 'MD'),
 ('understand', 'VB'),
 ('and', 'CC'),
 ('share', 'VB'),
 ('the', 'DT'),
 ('compassion', 'NN'),
 ('that', 'WDT'),
 ('*T*-2', '-NONE-'),
 ('makes', 'VBZ'),
 ('judges', 'NNS'),
 ('sometimes', 'RB'),
 ('wish', 'VB'),
 ('*-3', '-NONE-'),
 ('to', 'TO'),
 ('offer', 'VB'),
 ('a', 'DT'),
 ('kind', 'NN'),
 ('of', 'IN'),
 ('Solomonic', 'JJ'),
 ('aid', 'NN'),
 ('to', 'TO'),
 ('those', 'DT'),
 ('who', 'WP'),
 ('*T*-4', '-NONE-'),
 ("'ve", 'VBP'),
 ('been', 'VBN'),
 ('hurt', 'VBN'),
 ('*-1', '-NONE-'),
 ('.', '.')]

In [19]:
X, y = transform_to_dataset(training_sentences)
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])

clf.fit(X[:10000], y[:10000]) 

Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('classifier', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [None]:
# 3.3预测

In [29]:
X_test, y_test = transform_to_dataset(test_sentences)

print( "Accuracy:", clf.score(X_test, y_test))


def pos_tag(sentence):
    tagged_sentence = []
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    return zip(sentence, tags)


print(pos_tag(word_tokenize('This is my friend, John.')))

Accuracy: 0.8951068616422947
<zip object at 0x7fe81a49c988>
