In [28]:
import en_core_web_md
from spacy import displacy
from sklearn.feature_extraction import DictVectorizer

In [29]:
# Load the en_core_web_md pipeline and run it over one example sentence.
# `sentence` is the parsed document that the following cells index into.

nlp = en_core_web_md.load()
sentence = nlp(
    "The pound extended losses against both the Dollar and the Euro."
)

In [30]:
# Draw the dependency parse of the sentence inline in the notebook
displacy.render(
    sentence,
    style="dep",
    jupyter=True,
    options={"distance": 110, "collapse_punct": False},
)

In [46]:
# Build one feature dict and one tag label per token in the sentence


def token_features(doc, idx):
    """Return the feature dict describing the token at position `idx` of `doc`.

    The features are the lower-cased word, whether it is title-cased, its
    length, the text and tag of the preceding token, and the two following
    token texts joined by "_". Missing context is encoded as "NONE".
    """
    text = doc[idx].text

    # Left context: only available from the second token onwards
    if idx > 0:
        prev_word = doc[idx - 1].text
        prev_tag = doc[idx - 1].tag_
    else:
        prev_word = prev_tag = "NONE"

    # Right context: needs two more tokens after the current one
    if idx < (len(doc) - 2):
        right_bigram = doc[idx + 1].text + "_" + doc[idx + 2].text
    else:
        right_bigram = "NONE"

    return {
        "word": text.lower(),
        "is_capitalized": text.istitle(),
        "length": len(text),
        "word-1": prev_word,
        "tag-1": prev_tag,
        "right-bigram": right_bigram,
    }


data = [token_features(sentence, idx) for idx in range(len(sentence))]
labels = [token.tag_ for token in sentence]

# Label and features for each word:
for tag, feats in zip(labels, data):
    print(tag, feats)

DT {'word': 'the', 'is_capitalized': True, 'length': 3, 'word-1': 'NONE', 'tag-1': 'NONE', 'right-bigram': 'pound_extended'}
NN {'word': 'pound', 'is_capitalized': False, 'length': 5, 'word-1': 'The', 'tag-1': 'DT', 'right-bigram': 'extended_losses'}
VBD {'word': 'extended', 'is_capitalized': False, 'length': 8, 'word-1': 'pound', 'tag-1': 'NN', 'right-bigram': 'losses_against'}
NNS {'word': 'losses', 'is_capitalized': False, 'length': 6, 'word-1': 'extended', 'tag-1': 'VBD', 'right-bigram': 'against_both'}
IN {'word': 'against', 'is_capitalized': False, 'length': 7, 'word-1': 'losses', 'tag-1': 'NNS', 'right-bigram': 'both_the'}
CC {'word': 'both', 'is_capitalized': False, 'length': 4, 'word-1': 'against', 'tag-1': 'IN', 'right-bigram': 'the_Dollar'}
DT {'word': 'the', 'is_capitalized': False, 'length': 3, 'word-1': 'both', 'tag-1': 'CC', 'right-bigram': 'Dollar_and'}
NNP {'word': 'dollar', 'is_capitalized': True, 'length': 6, 'word-1': 'the', 'tag-1': 'DT', 'right-bigram': 'and_the'}

In [41]:
# Vectorize the features: fit the vectorizer on the feature dicts and encode them
vec = DictVectorizer()
encoded = vec.fit_transform(data)

# Convert the encoded matrix to a dense array so that every cell is printed
x = encoded.toarray()
print(x)

[[1. 3. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 5. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 8. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 6. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 7. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 4. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 3. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [1. 6. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0

In [43]:
# The full feature set:
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. The new method
# returns a NumPy array, so .tolist() is used to keep printing a plain list.
# The fallback keeps the cell working on older scikit-learn installs.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out().tolist()
else:
    feature_names = vec.get_feature_names()

print(feature_names)
print(len(feature_names))

['is_capitalized', 'length', 'right-bigram=Dollar_and', 'right-bigram=Euro_.', 'right-bigram=NONE', 'right-bigram=against_both', 'right-bigram=and_the', 'right-bigram=both_the', 'right-bigram=extended_losses', 'right-bigram=losses_against', 'right-bigram=pound_extended', 'right-bigram=the_Dollar', 'right-bigram=the_Euro', 'tag-1=CC', 'tag-1=DT', 'tag-1=IN', 'tag-1=NN', 'tag-1=NNP', 'tag-1=NNS', 'tag-1=NONE', 'tag-1=VBD', 'word-1=Dollar', 'word-1=Euro', 'word-1=NONE', 'word-1=The', 'word-1=against', 'word-1=and', 'word-1=both', 'word-1=extended', 'word-1=losses', 'word-1=pound', 'word-1=the', 'word=.', 'word=against', 'word=and', 'word=both', 'word=dollar', 'word=euro', 'word=extended', 'word=losses', 'word=pound', 'word=the']
42


In [45]:
# The feature set for an unknown word:
# encode a feature dict whose word and right-bigram values do not occur in the
# sentence the vectorizer was fitted on.
unknown_word = {
    "word": "hi",
    "is_capitalized": True,
    "length": 2,
    "word-1": "NONE",
    "tag-1": "NONE",
    "right-bigram": "he_says",
}
unknown_vector = vec.transform(unknown_word)
print(unknown_vector.toarray())

[[1. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
