In [1]:
# pip install -U spaCy

# -- pre-trained models for French
# python -m spacy download fr
# python -m spacy download fr_core_news_md

In [None]:
# End-to-end overview: tokens, sentences, named entities and dependencies
# for a short French text.
import spacy  # NLP library
from spacy import displacy  # built-in visualizer

# Load the small French pipeline by its package name. The bare 'fr' shortcut
# link only exists on spaCy 2.x; shortcut links were removed in spaCy 3, while
# the full package name works on both.
nlp = spacy.load('fr_core_news_sm')

doc = nlp('Demain je travaille à la maison. Je travaille en France chez Tableau. Je vais pouvoir faire du NLP. Terrible désillusion pour la championne du monde.')

for token in doc:  # tokens
    print(token.text)
for sent in doc.sents:  # sentences
    print(sent)
for ent in doc.ents:  # NER: entity text and its label
    print(ent.text, ent.label_)
for token in doc:  # dependencies: token/tag, relation, then head/tag
    print('\n{}/{}\n  {}\n    {}/{}'.format(token.text,
                                            token.tag_,
                                            token.dep_,
                                            token.head.text,
                                            token.head.tag_))

# Draw the dependency parse inline in the notebook.
displacy.render(doc, style='dep', jupyter=True, options={'distance': 99})

# Input

In [2]:
import spacy  # NLP library
from spacy import displacy  # visualizer used by the rendering cells

# Load the small French pipeline by its package name. The bare 'fr' shortcut
# link only exists on spaCy 2.x; shortcut links were removed in spaCy 3, while
# the full package name works on both.
nlp = spacy.load('fr_core_news_sm')

doc = nlp('Demain je travaille à la maison. Je vais pouvoir faire du NLP. Terrible désillusion pour la championne du monde.')
dir(doc)  # list the attributes and methods exposed by the parsed document

**SEE**:
* [spaCy's Linguistic Features Guide](https://spacy.io/usage/linguistic-features)  
* [Language Processing Pipelines](https://spacy.io/usage/processing-pipelines)
* [spaCy 101](https://spacy.io/usage/spacy-101)   

https://www.datacorner.fr/spacy-1/

# Tokens

* Text: The original word text.
* Lemma: The base form of the word.
* POS: The simple UPOS part-of-speech tag.
* Tag: The detailed part-of-speech tag.
* Dep: Syntactic dependency, i.e. the relation between tokens.
* Shape: The word shape – capitalization, punctuation, digits.
* is alpha: Does the token consist only of alphabetic characters?
* is stop: Is the token part of a stop list, i.e. the most common words of the language?

**TIPS**
* ```spacy.explain``` will show you a short description – for example, ```spacy.explain("VBZ")``` returns “verb, 3rd person singular present”.
* Dependency visualizer: https://spacy.io/usage/visualizers

In [3]:
# Show the surface form of every token in the document, one per line.
for token in doc:
    surface_form = token.text
    print(surface_form)

Demain
je
travaille
à
la
maison
.
Je
vais
pouvoir
faire
du
NLP
.
Terrible
désillusion
pour
la
championne
du
monde
.


In [47]:
# Two-line record per token, preceded by a blank line:
#   idx, text, is_punct, is_space, lemma, shape
#   (indented) pos, entity type, tag
token_template = '\n{}\t{}\t{}\t{}\t{}\t{}\n\t{}\t{}\t{}'
for token in doc:
    first_row = (token.idx, token.text, token.is_punct,
                 token.is_space, token.lemma_, token.shape_)
    second_row = (token.pos_, token.ent_type_, token.tag_)
    print(token_template.format(*(first_row + second_row)))


0	Demain	False	False	demain	Xxxxx
	ADV		ADV

7	je	False	False	je	xx
	PRON		PRON__Number=Sing|Person=1

10	travaille	False	False	travaille	xxxx
	VERB		VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin

20	en	False	False	en	xx
	ADP		ADP

23	France	False	False	France	Xxxxx
	PROPN	LOC	PROPN__Gender=Fem|Number=Sing

30	chez	False	False	chez	xxxx
	ADP		ADP

35	Tableau	False	False	tableau	Xxxxx
	NOUN	ORG	NOUN__Gender=Masc|Number=Sing

42	.	True	False	.	.
	PUNCT		PUNCT


In [5]:
# Styling passed to the dependency visualizer: compact layout, blue
# background, white text, custom font.
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
# jupyter=True is passed explicitly, like the other displacy.render calls in
# this notebook, so the figure is drawn inline instead of depending on
# version-specific notebook auto-detection (older spaCy releases otherwise
# return the raw markup string).
displacy.render(doc, style='dep', jupyter=True, options=options)

# Sentences

In [6]:
# Print each sentence span found by the pipeline on its own line.
sentences = doc.sents
for sent in sentences:
    print(sent)

Demain je travaille à la maison.
Je vais pouvoir faire du NLP.
Terrible désillusion pour la championne du monde.


In [7]:
# Inspect the attributes of a sentence span. Take the span straight from the
# document instead of reusing a loop variable left over from another cell, so
# this cell works whenever `doc` is defined.
first_sentence = next(iter(doc.sents))
dir(first_sentence)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_bulk_merge',
 '_py_tokens',
 '_realloc',
 '_vector',
 '_vector_norm',
 'cats',
 'char_span',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_disk',
 'get_extension',
 'get_lca_matrix',
 'has_extension',
 'has_vector',
 'is_nered',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'lang',
 'lang_',
 'mem',
 'merge',
 'noun_chunks',
 'noun_chunks_iterator',
 'print_tree',
 'remove_extension',
 'retokenize',
 'sentiment',
 'sents',
 'set_extension',
 'similarity',
 'tensor',
 'text',
 'text_with_ws',
 'to_array',
 'to_byte

In [8]:
# Show noun chunks: the chunk text followed by its label.
for chunk in doc.noun_chunks:
    print('{}  -->  {}'.format(chunk.text, chunk.label_))

je  -->  NP
Je  -->  NP
pour la championne du monde  -->  NP


In [None]:
# doc.noun_chunks yields spans lazily and cannot be indexed with [0];
# pull the first chunk from the iterator before inspecting it.
first_chunk = next(iter(doc.noun_chunks))
dir(first_chunk)

# NER

In [12]:
# Re-parse a shorter sentence that contains named entities; `doc` is rebound
# here, so the cells below operate on this sentence.
doc = nlp('Demain je travaille en France chez Tableau.')
entities = doc.ents
for ent in entities:
    print('{} {}'.format(ent.text, ent.label_))

France LOC
Tableau ORG


In [None]:
# List the attributes available on the first recognized entity span.
first_entity = doc.ents[0]
dir(first_entity)

# Dependencies

In [30]:
# For every token print three indented lines: "text/tag", the dependency
# relation, and the "text/tag" of the token's syntactic head.
dep_template = '\n{}/{}\n  {}\n    {}/{}'
for token in doc:
    head = token.head
    print(dep_template.format(token.text, token.tag_, token.dep_,
                              head.text, head.tag_))


Demain/ADV
  advmod
    travaille/VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin

je/PRON__Number=Sing|Person=1
  nsubj
    travaille/VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin

travaille/VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin
  ROOT
    travaille/VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin

en/ADP
  case
    France/PROPN__Gender=Fem|Number=Sing

France/PROPN__Gender=Fem|Number=Sing
  obl:mod
    travaille/VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin

chez/ADP
  case
    Tableau/NOUN__Gender=Masc|Number=Sing

Tableau/NOUN__Gender=Masc|Number=Sing
  obl:mod
    travaille/VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin

./PUNCT
  punct
    travaille/VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin


In [33]:
# Draw the dependency parse of the current `doc` inline in the notebook.
dep_options = {'distance': 99}
displacy.render(doc, style='dep', jupyter=True, options=dep_options)