# spacy

In [1]:
import spacy

In [2]:
nlp = spacy.load('en')

In [3]:
doc = nlp(u'John works at Google in London which is big city in the United kingdom.')

In [4]:
doc1 = nlp(U'Apple is looking at buying U.K. startup for $1 billion')

In [5]:
doc.to_json()

{'text': 'John works at Google in London which is big city in the United kingdom.',
 'ents': [{'start': 0, 'end': 4, 'label': 'PERSON'},
  {'start': 14, 'end': 20, 'label': 'ORG'},
  {'start': 24, 'end': 30, 'label': 'GPE'},
  {'start': 52, 'end': 70, 'label': 'GPE'}],
 'sents': [{'start': 0, 'end': 71}],
 'tokens': [{'id': 0,
   'start': 0,
   'end': 4,
   'pos': 'PROPN',
   'tag': 'NNP',
   'dep': 'nsubj',
   'head': 1},
  {'id': 1,
   'start': 5,
   'end': 10,
   'pos': 'VERB',
   'tag': 'VBZ',
   'dep': 'ROOT',
   'head': 1},
  {'id': 2,
   'start': 11,
   'end': 13,
   'pos': 'ADP',
   'tag': 'IN',
   'dep': 'prep',
   'head': 1},
  {'id': 3,
   'start': 14,
   'end': 20,
   'pos': 'PROPN',
   'tag': 'NNP',
   'dep': 'pobj',
   'head': 2},
  {'id': 4,
   'start': 21,
   'end': 23,
   'pos': 'ADP',
   'tag': 'IN',
   'dep': 'prep',
   'head': 1},
  {'id': 5,
   'start': 24,
   'end': 30,
   'pos': 'PROPN',
   'tag': 'NNP',
   'dep': 'pobj',
   'head': 4},
  {'id': 6,
   'start': 31

In [6]:
doc.ents

(John, Google, London, the United kingdom)

In [7]:
for ent in doc.ents:
    print(ent.label_,ent.text)

PERSON John
ORG Google
GPE London
GPE the United kingdom


In [8]:
for np in doc.noun_chunks:
    print(np)

John
Google
London
big city
the United kingdom


In [9]:
doc1

Apple is looking at buying U.K. startup for $1 billion

In [10]:
for np in doc1.noun_chunks:
    print(np.text, np.lemma_)

Apple Apple
U.K. startup U.K. startup


In [11]:
nlp = spacy.load('fr')

In [12]:
doc2 = nlp(u"Cette entreprise a été acheté 1 millions d'euros")

In [13]:
for np in doc2:
    print(np.text, np.pos_, np.dep_)


Cette DET det
entreprise NOUN nsubj:pass
a AUX aux:tense
été AUX aux:pass
acheté VERB ROOT
1 NUM nummod
millions NOUN obj
d' ADP case
euros NOUN nmod


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
vectorizer = CountVectorizer()

In [16]:
vectorizer

CountVectorizer()

In [90]:
url = "https://lessurligneurs.eu/tag/europe/feed/"
url1 = "https://lessurligneurs.eu/tag/education/feed/"
url2 = "https://lessurligneurs.eu/tag/criminalite/feed/"

In [117]:
import feedparser

In [118]:
f = feedparser.parse(url)
f1 = feedparser.parse(url1)
f2 = feedparser.parse(url2)

In [122]:
f.keys()

dict_keys(['bozo', 'entries', 'feed', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])

In [130]:
type(f)


feedparser.util.FeedParserDict

In [131]:
f

{'bozo': False,
 'entries': [{'title': 'Selon Jacques Attali “Le RGPD, personne ne lui obéit”',
   'title_detail': {'type': 'text/plain',
    'language': None,
    'base': 'https://lessurligneurs.eu/tag/europe/feed/',
    'value': 'Selon Jacques Attali “Le RGPD, personne ne lui obéit”'},
   'links': [{'rel': 'alternate',
     'type': 'text/html',
     'href': 'https://lessurligneurs.eu/selon-jacques-attali-le-rgpd-personne-ne-lui-obeit/'}],
   'link': 'https://lessurligneurs.eu/selon-jacques-attali-le-rgpd-personne-ne-lui-obeit/',
   'comments': 'https://lessurligneurs.eu/selon-jacques-attali-le-rgpd-personne-ne-lui-obeit/#respond',
   'authors': [{'name': 'Rachid Merimi'}],
   'author': 'Rachid Merimi',
   'author_detail': {'name': 'Rachid Merimi'},
   'published': 'Thu, 28 Jan 2021 16:26:12 +0000',
   'published_parsed': time.struct_time(tm_year=2021, tm_mon=1, tm_mday=28, tm_hour=16, tm_min=26, tm_sec=12, tm_wday=3, tm_yday=28, tm_isdst=0),
   'tags': [{'term': 'Article Court', 'sch

In [17]:
corpus = ['un jour ici et la', 'demain mais pas ici', 'la la la lere']

In [19]:
vectorizer.fit(corpus)


CountVectorizer()

In [20]:
bow = vectorizer.transform(corpus)
bow

<3x9 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [21]:
vectorizer.get_feature_names()

['demain', 'et', 'ici', 'jour', 'la', 'lere', 'mais', 'pas', 'un']

In [22]:
print(bow.toarray())

[[0 1 1 1 1 0 0 0 1]
 [1 0 1 0 0 0 1 1 0]
 [0 0 0 0 3 1 0 0 0]]


In [23]:
import pandas as pd

In [24]:
pd.DataFrame(bow.toarray(), columns = vectorizer.get_feature_names())

Unnamed: 0,demain,et,ici,jour,la,lere,mais,pas,un
0,0,1,1,1,1,0,0,0,1
1,1,0,1,0,0,0,1,1,0
2,0,0,0,0,3,1,0,0,0


In [25]:
from sklearn.feature_extraction.text import TfidfTransformer


In [26]:
vectorizer = CountVectorizer()

In [27]:
counts = vectorizer.fit_transform(corpus)

In [28]:
counts


<3x9 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [29]:
transformer = TfidfTransformer()

In [30]:
t= transformer.fit_transform(counts)

In [31]:
pd.DataFrame(t.toarray(), columns = vectorizer.get_feature_names())

Unnamed: 0,demain,et,ici,jour,la,lere,mais,pas,un
0,0.0,0.490479,0.373022,0.490479,0.373022,0.0,0.0,0.0,0.490479
1,0.528635,0.0,0.40204,0.0,0.0,0.0,0.528635,0.528635,0.0
2,0.0,0.0,0.0,0.0,0.91589,0.401429,0.0,0.0,0.0


In [32]:
dic = dict(zip(vectorizer.get_feature_names(), transformer.idf_))
pd.DataFrame.from_dict(dic, orient = 'index' )

Unnamed: 0,0
demain,1.693147
et,1.693147
ici,1.287682
jour,1.693147
la,1.287682
lere,1.693147
mais,1.693147
pas,1.693147
un,1.693147


# Word embedding

In [33]:
import gensim 

In [34]:
from gensim.models import Word2Vec

In [35]:
senteces = [["cat", "say", "meow"], ["dog", "say", "woof"]]

In [36]:
model = Word2Vec(senteces, min_count = 1)

In [37]:
model['cat']

  model['cat']


array([ 1.8583199e-03, -1.2254459e-03,  1.0737818e-03, -3.3591418e-03,
       -1.2251452e-04, -8.1389780e-06,  7.9774333e-04,  2.3004417e-03,
        3.7269401e-03, -1.7398535e-04,  3.7158267e-03,  3.9687809e-03,
       -4.4385507e-03,  2.4430919e-03, -4.2998217e-04, -1.8934465e-03,
       -3.8388958e-03, -2.5617806e-04,  2.7638718e-03, -3.3141559e-03,
       -7.6831994e-04, -3.6443237e-03,  1.4072409e-03, -2.2637506e-04,
        1.1597004e-03, -1.3777991e-03, -2.3089116e-03,  1.4992351e-04,
       -4.2650285e-03, -3.3129649e-03,  8.4178668e-04,  2.1491530e-04,
       -4.2579547e-03,  1.1581988e-03, -3.8360425e-03, -2.1280539e-03,
       -3.6555415e-03, -3.5970388e-03,  4.0683290e-03, -9.0518978e-04,
        1.9371402e-03,  2.8541903e-03,  4.1533951e-03,  7.2380836e-04,
        1.4402561e-03, -4.5270505e-03, -3.1808482e-03, -1.9528379e-03,
       -1.7143231e-03, -3.9309054e-03, -2.3246955e-03,  1.6368072e-03,
       -3.1408712e-03, -4.1803964e-03,  3.6973089e-03, -5.1952718e-04,
      

In [38]:
model.most_similar("cat")

  model.most_similar("cat")


[('meow', 0.2516365647315979),
 ('say', 0.19602951407432556),
 ('woof', 0.10126127302646637),
 ('dog', 0.003978310152888298)]

In [39]:
model.most_similar(positive=['cat'], negative= ['dog'])

  model.most_similar(positive=['cat'], negative= ['dog'])


[('meow', 0.18738606572151184),
 ('woof', 0.16422852873802185),
 ('say', 0.04688487946987152)]

In [40]:
spacy.explain("VBZ")

'verb, 3rd person singular present'

In [41]:
[(t, t.lemma_, t.pos_, t.tag_, t.dep_) for t in doc]

[(John, 'John', 'PROPN', 'NNP', 'nsubj'),
 (works, 'work', 'VERB', 'VBZ', 'ROOT'),
 (at, 'at', 'ADP', 'IN', 'prep'),
 (Google, 'Google', 'PROPN', 'NNP', 'pobj'),
 (in, 'in', 'ADP', 'IN', 'prep'),
 (London, 'London', 'PROPN', 'NNP', 'pobj'),
 (which, 'which', 'DET', 'WDT', 'nsubj'),
 (is, 'be', 'AUX', 'VBZ', 'relcl'),
 (big, 'big', 'ADJ', 'JJ', 'amod'),
 (city, 'city', 'NOUN', 'NN', 'attr'),
 (in, 'in', 'ADP', 'IN', 'prep'),
 (the, 'the', 'DET', 'DT', 'det'),
 (United, 'United', 'PROPN', 'NNP', 'compound'),
 (kingdom, 'kingdom', 'NOUN', 'NN', 'pobj'),
 (., '.', 'PUNCT', '.', 'punct')]

In [42]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion .")

# Composants du texte, accessibles en terminant leur attribut par '_' : 
for token in doc:
    print(token.i, token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop, token.like_url)

0 Apple Apple PROPN PROPN ROOT Xxxxx True False False
1 is is PROPN PROPN det xx True False False
2 looking looking VERB VERB__Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin nmod xxxx True False False
3 at at X X dep xx True False False
4 buying buying NOUN NOUN__Gender=Masc|Number=Sing nmod xxxx True False False
5 U.K. u.k. ADJ ADJ__Number=Plur ROOT X.X. False False False
6 startup startup VERB VERB__VerbForm=Inf acl xxxx True False False
7 for for ADP ADP case xxx True False False
8 $ dollar NOUN NOUN__Gender=Masc obl:mod $ False False False
9 1 1 NUM NUM__NumType=Card nummod d False False False
10 billion billion NOUN NOUN__Gender=Fem|Number=Sing nmod xxxx True False False
11 . . PUNCT PUNCT punct . False False False
