# SpaCy Tutorial on https://www.machinelearningplus.com/spacy-tutorial-nlp/#docobject & https://blog.dominodatalab.com/natural-language-in-python-using-spacy/

In [1]:
import spacy
import fr_core_news_md
from spacy import displacy
nlp = fr_core_news_md.load()
#nlp = spacy.load("fr_core_web_md")
print(nlp.pipe_names)
!python -m spacy download fr_core_web_lg

['tagger', 'parser', 'ner']
[!] Skipping model package dependencies and setting `--no-deps`. You don't seem
to have the spaCy package itself installed (maybe because you've built from
source?), so installing the model dependencies would cause spaCy to be
downloaded, which probably isn't what you want. If the model package has other
dependencies, you'll have to install them manually.

[x] No compatible model found for 'fr_core_web_lg' (spaCy v2.3.2).



In [2]:
tokens = nlp("Cela est bien dit, répondit Candide, mais il faut cultiver notre jardin.")

for token in tokens:
  print(token.text ,' ',token.has_vector)

Cela   True
est   True
bien   True
dit   True
,   True
répondit   True
Candide   True
,   True
mais   True
il   True
faut   True
cultiver   True
notre   True
jardin   True
.   True


In [3]:
tokens=nlp("Cela est bien dit, répondit Candide, mais il faut cultiver notre jardin.")
for token in tokens:
  print(token.text,' ',token.vector_norm, token.lemma_, token.pos_)

Cela   44.669857 cela PRON
est   51.685936 être AUX
bien   34.909897 bien ADV
dit   49.06739 dire VERB
,   32.17065 , PUNCT
répondit   22.246864 répondre VERB
Candide   21.422577 Candide PROPN
,   32.17065 , PUNCT
mais   32.46687 mais CCONJ
il   87.90838 il PRON
faut   51.740517 falloir VERB
cultiver   22.795557 cultiver VERB
notre   33.395786 notre DET
jardin   29.51032 jardin NOUN
.   43.718 . PUNCT


In [11]:
#Compute Similarity
token_1=nlp("gentilhomme")
token_2=nlp("monseigneur")

similarity_score=token_1.similarity(token_2)
print(similarity_score)

0.4861187762850686


In [12]:
# Compute Similarity between texts 
pizza=nlp('pizza')
burger=nlp('burger')
chair=nlp('chair')

print('Pizza and burger  ',pizza.similarity(burger))
print('Pizza and chair  ',pizza.similarity(chair))

Pizza and burger   0.9999999815526787
Pizza and chair   0.25339610850610766


In [14]:
# POS tagging using spaCy
my_text='Travaillons sans raisonner, dit Martin.'
my_doc=nlp(my_text)
for token in my_doc:
  print(token.text,'---- ',token.pos_)

Travaillons ----  VERB
sans ----  ADP
raisonner ----  VERB
, ----  PUNCT
dit ----  VERB
Martin ----  PROPN
. ----  PUNCT


In [18]:
# Importing displacy
from spacy import displacy
my_text='Il faut cultiver notre jardin.'
my_doc=nlp(my_text)

# displaying tokens with their POS tags
displacy.render(my_doc,style='dep',jupyter=True)

In [23]:
# Preparing the spaCy document
text='Vous savez comment périrent Crésus, Astyage, Darius, Denys de Syracuse, Pyrrhus, Persée, Annibal, Jugurtha, Arioviste, César, Pompée, Néron, Othon, Vitellius, Domitien?'

doc=nlp(text)

# Printing the named entities
print(doc.ents)

(Crésus, Astyage, Darius, Denys de Syracuse, Pyrrhus, Persée, Annibal, Jugurtha, Arioviste, César, Pompée, Néron, Othon, Vitellius, Domitien)


In [24]:
# Printing labels of entities.
for entity in doc.ents:
  print(entity.text,'--- ',entity.label_)

Crésus ---  PER
Astyage ---  PER
Darius ---  PER
Denys de Syracuse ---  PER
Pyrrhus ---  PER
Persée ---  PER
Annibal ---  PER
Jugurtha ---  PER
Arioviste ---  PER
César ---  PER
Pompée ---  PER
Néron ---  PER
Othon ---  PER
Vitellius ---  PER
Domitien ---  PER


In [25]:
# Using displacy for visualizing NER
from spacy import displacy
displacy.render(doc,style='ent',jupyter=True)


In [3]:
import spacy

nlp = spacy.load("fr_core_news_lg")
tokens = nlp("Salons Satire Scepticisme Sensibilité Sexualité Sensualisme Siècle Silhouette Socrate Soldat Sublime Théosophie Newtonianisme")

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm)



Salons True 27.719257
Satire True 17.798481
Scepticisme True 17.683435
Sensibilité True 23.768318
Sexualité True 24.379883
Sensualisme False 0.0
Siècle True 29.93297
Silhouette True 17.037134
Socrate True 19.442907
Soldat True 22.530884
Sublime True 17.932327
Théosophie True 18.343145
Newtonianisme False 0.0


In [48]:
# Compute Similarity between texts 
sensibilité=nlp('sensibilité')
sexualité=nlp('sexualité')
salons=nlp('salons')

print('sensibilité et sexualité', sexualité.similarity(sensibilité))
print('sensibilité et salons', sensibilité.similarity(salons))

sensibilité et sexualité 0.6800283708260398
sensibilité et salons 0.10804756734687604


In [None]:
lettre=nlp('lettre')

#opening plain text version of Voltaire's Candide from URI
import requests
response = requests.get('https://raw.githubusercontent.com/MiMoText/roman-dixhuit/master/plain/files/Laclos_Liaisons.txt')
laclos_liaisons = response.text
doc = nlp(laclos_liaisons)

print('laclos_liaisons et lettre', lettre.similarity(laclos_liaisons))

In [28]:
# process a sentence using the model
doc = nlp('Salons Satire Scepticisme Sensibilité Sexualité Sensualisme Siècle Silhouette Socrate Soldate Sublime')
# It's that simple - all of the vectors and words are assigned after this point
# Get the vector for 'text':
doc[3].vector
# Get the mean vector for the entire sentence (useful for sentence classification etc.)
doc.vector

array([ 4.45409082e-02, -6.15381775e-03,  6.46736398e-02, -3.30393016e-02,
       -1.59137677e-02, -1.76747069e-02,  1.88228905e-01,  7.11481869e-02,
       -1.87705457e-02,  4.94494528e-01,  3.82729061e-02, -3.55001837e-02,
       -6.40914515e-02,  2.06593629e-02, -1.65406684e-03,  4.02992778e-03,
        1.85627292e-03,  1.16481826e-01,  5.43477312e-02, -1.19795270e-01,
       -1.29290909e-01, -3.39690894e-02, -7.31890975e-03,  3.00314538e-02,
       -3.86636355e-03,  4.93573584e-02,  4.69811819e-02,  6.17290959e-02,
       -4.05062698e-02,  4.15974557e-02, -5.41727000e-04,  7.39171803e-02,
       -1.00750372e-01,  2.42963433e-03,  1.56963635e-02, -8.17630962e-02,
       -1.29146352e-01,  4.21435460e-02, -5.13892807e-02,  9.10463631e-02,
       -2.84402613e-02,  1.04187459e-01, -4.51709069e-02, -1.06588183e-02,
        7.93296173e-02,  3.62472683e-02, -2.92837266e-02,  2.07281820e-02,
        1.04081683e-01, -7.43318200e-02, -7.59636285e-03,  7.37091824e-02,
        9.94609073e-02,  

In [62]:
#opening plain text version of Laclos_Liaisons from URI
import requests
response = requests.get('https://raw.githubusercontent.com/MiMoText/roman-dixhuit/master/plain/files/Laclos_Liaisons.txt')
laclos_liaisons = response.text
doc = nlp(laclos_liaisons)
laclos_liasons_short = (laclos_liaisons[0:150])
tokens = nlp(laclos_liasons_short)

In [67]:
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm)

 
 False 0.0
AVERTISSEMENT True 14.837525
DE True 74.96739
L' True 109.51171
ÉDITEUR True 31.915905
. True 43.718

 False 0.0
Nous True 54.0559
croyons True 21.24981
devoir True 28.550219
prévenir True 27.262228
le True 68.99325
Public True 28.366117
que True 50.804264
, True 32.17065
malgré True 26.515179
le True 68.99325
titre True 34.160946
de True 58.922653
cet True 58.825214
ouvrage True 28.421127
et True 43.582584
ce True 75.84347
qu' True 55.358067
en True 70.23479
dit True 49.06739
le True 68.99325
rédacteur True 22.528395
dans True 33.814415
sa True 96.66397
préface True 21.833696


In [82]:
from bs4 import BeautifulSoup
import requests
import traceback
 
def get_text (url):
    buf = []
  
    try:
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        
        for p in soup.find_all("p"):
            buf.append(p.get_text())
 
        return "\n".join(buf)
    except: 
        print(traceback.format_exc())
        sys.exit(-1)

In [None]:
lic = {}
lic["Laclos_Liaisons"] = nlp(get_text("https://raw.githubusercontent.com/MiMoText/roman-dixhuit/master/plain/files/Laclos_Liaisons.txt"))
lic["asl"] = nlp(get_text("https://opensource.org/licenses/Apache-2.0"))
lic["bsd"] = nlp(get_text("https://opensource.org/licenses/BSD-3-Clause"))
 
for sent in lic["bsd"].sents:
    print(">", sent)

In [84]:
pairs = [
    ["Laclos_Liaisons", "asl"], 
    ["asl", "bsd"], 
    ["bsd", "Laclos_Liaisons"]
]
 
for a, b in pairs:
    print(a, b, lic[a].similarity(lic[b]))

Laclos_Liaisons asl 0.0
asl bsd 0.9268238265800538
bsd Laclos_Liaisons 0.0


  print(a, b, lic[a].similarity(lic[b]))


In [100]:
import nltk
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import spacy
# Load the spacy model that you have installed
nlp = spacy.load('fr_core_web_md')
# process a sentence using the model
doc = nlp("This is some text that I am processing with Spacy")
# It's that simple - all of the vectors and words are assigned after this point
# Get the vector for 'text':
doc[3].vector
# Get the mean vector for the entire sentence (useful for sentence classification etc.)
doc.vector

In [None]:
lemmatizer = Lemmatizer(lookups)

In [6]:
doc = nlp('Lumières')
doc.vector

array([-1.97    , -3.128   ,  0.84188 ,  2.4594  , -3.9371  ,  0.53843 ,
       -0.6954  , -0.59731 ,  0.48815 , -2.0252  , -1.6655  ,  0.035474,
        1.5994  , -0.65497 ,  0.85439 ,  0.76914 ,  0.96057 , -0.31809 ,
       -0.030133,  0.57971 ,  1.7804  ,  0.45991 , -1.3055  , -0.78577 ,
       -1.7592  ,  0.54111 , -0.30987 , -0.16627 ,  1.1548  , -2.0847  ,
       -0.88335 ,  0.63039 ,  2.2595  , -1.0256  ,  3.2796  , -0.98301 ,
       -0.72063 , -1.395   ,  1.6252  ,  0.51389 ,  2.8148  ,  0.62128 ,
       -0.89088 ,  1.7662  , -1.2811  , -0.32423 , -2.3852  , -1.4047  ,
        1.7208  , -1.1711  ,  0.20749 ,  0.334   ,  0.32861 ,  1.2355  ,
       -1.252   , -0.526   , -1.39    ,  0.61178 ,  1.7188  ,  1.5763  ,
       -0.73748 , -1.2155  , -0.87986 , -0.62867 , -0.81942 , -0.7477  ,
        1.6957  ,  0.6963  , -1.8163  , -0.11045 , -2.497   ,  2.0221  ,
       -0.39865 , -0.02999 , -0.055437,  1.3216  ,  1.4194  ,  1.0508  ,
       -2.0811  , -2.5842  ,  1.8958  ,  2.4295  , 