# Spacy

https://spacy.io/

A good NLP library in Python.

Main features: 
* Non-destructive tokenization
* Named entity recognition
* Support for over 25 languages
* Statistical models for 8 languages
* Pre-trained word vectors
* Part-of-speech tagging
* Labelled dependency parsing
* Syntax-driven sentence segmentation
* Text classification
* Built-in visualizers for syntax and named entities
* Deep learning integration

There are many alternatives, including NLTK (https://www.nltk.org/book/).

## Load models

In [1]:
import spacy # conda install -c conda-forge spacy (or https://spacy.io/usage/)

# download models:
# python -m spacy.en.download
# for a bigger one: python -m spacy.en_core_web_sm.download
# german: python -m spacy.de.download (see https://spacy.io/models/de#de_core_news_sm)
# NOTE(review): the module-style commands above look like the spaCy 1.x syntax;
# on spaCy >= 2.0 the documented form is `python -m spacy download <model>`.
# Also, `en_core_web_sm` appears to be the *small* English model (the `_sm`
# suffix), not a bigger one -- confirm against https://spacy.io/models/

# Load the English pipeline once; `nlp` is reused by the later cells.
nlp = spacy.load('en')
# Bare string so the notebook echoes a confirmation once loading has finished.
'done loading'

'done loading'

In [2]:
# Run the pipeline on a short sentence, then show every token's text next to
# its part-of-speech label (`pos_`).
doc = nlp(u"This is a sentence.")
[(tok.text, tok.pos_) for tok in doc]

[('This', 'DET'),
 ('is', 'VERB'),
 ('a', 'DET'),
 ('sentence', 'NOUN'),
 ('.', 'PUNCT')]

In [3]:
# Parse a longer sentence; iterating over `doc` yields its tokens one by one.
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
for token in doc:
    print(token)
# NOTE: `token` stays bound to the last token once the loop ends; the next
# cell relies on that when it calls dir(token).

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [4]:
# many things happening under the hood:
# show every attribute and method name exposed by `token` (still bound to the
# last token from the previous cell's loop) as one spaced-out string
'   '.join(dir(token))

'__bytes__   __class__   __delattr__   __dir__   __doc__   __eq__   __format__   __ge__   __getattribute__   __gt__   __hash__   __init__   __init_subclass__   __le__   __len__   __lt__   __ne__   __new__   __pyx_vtable__   __reduce__   __reduce_ex__   __repr__   __setattr__   __sizeof__   __str__   __subclasshook__   __unicode__   ancestors   check_flag   children   cluster   conjuncts   dep   dep_   doc   ent_iob   ent_iob_   ent_type   ent_type_   has_vector   head   i   idx   is_alpha   is_ancestor_of   is_ascii   is_bracket   is_digit   is_left_punct   is_lower   is_oov   is_punct   is_quote   is_right_punct   is_space   is_stop   is_title   lang   lang_   left_edge   lefts   lemma   lemma_   lex_id   like_email   like_num   like_url   lower   lower_   n_lefts   n_rights   nbor   norm   norm_   orth   orth_   pos   pos_   prefix   prefix_   prob   rank   repvec   right_edge   rights   shape   shape_   similarity   string   subtree   suffix   suffix_   tag   tag_   text   text_with

In [5]:
# One output row per token: text, lemma, part-of-speech (`pos_`), tag (`tag_`),
# dependency label, shape, then the is_alpha / is_stop flags.
for token in doc:
    row = (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
    )
    print(*row)

Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [6]:
# Find named entities, phrases and concepts:
# each entry of doc.ents is printed as "<text> <label>".
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Apple ORG
U.K. GPE
$1 billion MONEY


In [7]:
# visualize
# NOTE(review): this import fails with the ImportError shown in the cell
# output, which suggests the installed spaCy predates the release that
# introduced `displacy` (2.0, per the spaCy docs) -- confirm and upgrade
# before relying on this cell.
from spacy import displacy
# style='dep' asks for the dependency-parse view of `doc`.
displacy.serve(doc, style='dep')

ImportError: cannot import name 'displacy'

In [8]:
# Determine semantic similarities:
# parse two short reviews, score how close they are, and print the score.
doc1, doc2 = (
    nlp(text)
    for text in (u"my fries were super gross", u"such disgusting fries")
)
similarity = doc1.similarity(doc2)
print(similarity)

0.6930155149452184


## Word vectors

In [17]:
def most_similar(word, n=20):
    """Return the spellings of the `n` vocab entries most similar to `word`.

    Args:
        word: Query entry. It must expose `.vocab` (an iterable of candidate
            entries) and `.similarity(other)`, which returns a comparable
            score where larger means more similar.
        n: Maximum number of results to return (default 20).

    Returns:
        A list of `orth_` strings ordered from most to least similar.
        Nothing is filtered out, so the query itself and differently cased
        variants of the same word can all appear in the result.
    """
    # Local import keeps this notebook cell self-contained.
    import heapq

    # nlargest only tracks the n best candidates instead of sorting the whole
    # vocabulary, yet yields the same items in the same order as
    # sorted(..., reverse=True)[:n].
    closest = heapq.nlargest(n, word.vocab, key=word.similarity)
    return [entry.orth_ for entry in closest]

# Look up the vocab entry for 'Paris' and list its nearest neighbours; the
# output shows each word in several casings because nothing is filtered.
most_similar(nlp.vocab[u'Paris'])

['Paris',
 'paris',
 'PARIS',
 'Strasbourg',
 'strasbourg',
 'STRASBOURG',
 'Brussels',
 'brussels',
 'BRUSSELS',
 'Rouen',
 'rouen',
 'ROUEN',
 'Marseilles',
 'marseilles',
 'MARSEILLES',
 'Lausanne',
 'LAUSANNE',
 'lausanne',
 'AIX-EN-PROVENCE',
 'Aix-en-Provence']

In [20]:
def most_similar(word, n=20, min_prob=-15):
    """Return the `n` vocab entries most similar to `word`, after filtering.

    Candidates are kept only when their `is_lower` flag equals that of
    `word` and their `prob` is at least `min_prob`.

    Args:
        word: Query entry. It must expose `.vocab` (an iterable of candidate
            entries), `.is_lower`, and `.similarity(other)`, which returns a
            comparable score where larger means more similar.
        n: Maximum number of results to return (default 20).
        min_prob: Lowest `prob` a candidate may have (default -15).
            NOTE(review): `prob` is presumably spaCy's log-probability, so
            this would drop very rare entries -- confirm against the
            Lexeme API docs.

    Returns:
        A list of the matching entries themselves (not strings), ordered
        from most to least similar.
    """
    # Local import keeps this notebook cell self-contained.
    import heapq

    # Generator: candidates are filtered lazily rather than copied into a list.
    candidates = (
        entry
        for entry in word.vocab
        if entry.is_lower == word.is_lower and entry.prob >= min_prob
    )
    # Same items and order as sorted(..., reverse=True)[:n], without sorting
    # every candidate.
    return heapq.nlargest(n, candidates, key=word.similarity)

# Query with the lowercase entry and show the lowercase form of each result.
[w.lower_ for w in most_similar(nlp.vocab[u'paris'])]

['paris',
 'cologne',
 'bologna',
 'rome',
 'london',
 'madrid',
 'tokyo',
 'liege',
 'sens',
 'okc',
 'france',
 'vegas',
 'montreal',
 'nyc',
 'italy',
 'tienen',
 'pouvoir',
 'toronto',
 'gent',
 'ganja']