# Textacy & Spacy
Librerias de procesado de NLP

In [1]:
'''
Versiones compatibles
Despues hay que reiniciar el entorno de ejecución
'''
!pip install -U spacy==3.0.2
!pip install textacy==0.10.1

# Para siguientes ejecuciones, solo ejecutar esto y reiniciar despues el entorno
!python -m spacy download en_core_web_lg
!python -m spacy download es_core_news_lg


Collecting spacy==3.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/74/0c/1506978404c6363cae08af5e81cf1d2589fca889f27a1c70c333251a55df/spacy-3.0.2-cp37-cp37m-manylinux2014_x86_64.whl (12.7MB)
[K     |████████████████████████████████| 12.7MB 17.1MB/s 
[?25hCollecting catalogue<2.1.0,>=2.0.1
  Downloading https://files.pythonhosted.org/packages/48/5c/493a2f3bb0eac17b1d48129ecfd251f0520b6c89493e9fd0522f534a9e4a/catalogue-2.0.1-py3-none-any.whl
Collecting typer<0.4.0,>=0.3.0
  Downloading https://files.pythonhosted.org/packages/90/34/d138832f6945432c638f32137e6c79a3b682f06a63c488dcfaca6b166c64/typer-0.3.2-py3-none-any.whl
Collecting pydantic<1.8.0,>=1.7.1
[?25l  Downloading https://files.pythonhosted.org/packages/b3/0a/52ae1c659fc08f13dd7c0ae07b88e4f807ad83fb9954a59b0b0a3d1a8ab6/pydantic-1.7.3-cp37-cp37m-manylinux2014_x86_64.whl (9.1MB)
[K     |████████████████████████████████| 9.1MB 29.8MB/s 
[?25hCollecting pathy
  Downloading https://files.pythonhosted.org/packages/

# Spacy
https://spacy.io/

NOTA: Recuerda reiniciar el entorno de ejecucion despues de la instalacion

In [1]:
import spacy

# Load the large English NLP model. Tokenizer and parser
nlp = spacy.load('en_core_web_lg')

## Text basics
Veamos como trabajar cn estos primeros ejemplos con la libreria ´spacy´. Cosas que podemos hacer:
1. Tokenizar en frases
2. Tokenizar en palabras
3. Acceder a los atributos de cada token
4. Acceder a las entidades del texto
5. Visualizar las entidades del texto

In [5]:
# The text we want to examine
text = """
London is the capital and most populous city of England and 
the United Kingdom.  Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia. It was founded by the Romans, who named it Londinium.
"""

doc = nlp(text)

print(doc)

# Podemos tokenizar el doc en frases. Sents son las frases (sentences)
for num, sentence in enumerate(doc.sents):
    print(f'{num}: {sentence}')

# Tokenizar palabras
# En realidad el doc es una lista de tokens de palabras
for word in doc[:5]:
    print(word.text)

'''
Atributos del token
lemma_: raiz de la palabra
pos_: analisis morfologico universal de la palabra. https://universaldependencies.org/docs/u/pos/
tag_: analisis morfologico universal de la palabra. Algo mas desagregado
dep_: relacion sintactica de dependencia
is_stop: es una stop word
'''
for word in doc[:5]:
    print((word.text, word.lemma_, word.pos_, word.tag_, word.dep_, word.is_stop))


London is the capital and most populous city of England and 
the United Kingdom.  Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia. It was founded by the Romans, who named it Londinium.

0: 
London is the capital and most populous city of England and 
the United Kingdom.  
1: Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia.
2: It was founded by the Romans, who named it Londinium.



London
is
the
capital
('\n', '\n', 'SPACE', '_SP', '', False)
('London', 'London', 'PROPN', 'NNP', 'nsubj', False)
('is', 'be', 'AUX', 'VBZ', 'ROOT', True)
('the', 'the', 'DET', 'DT', 'det', True)
('capital', 'capital', 'NOUN', 'NN', 'attr', False)


## Syntactic analysis
Doing the school homework

In [10]:
from spacy import displacy

doc2 = nlp("London is the capital and most populous city of England and the United Kingdom")
displacy.render(doc2, jupyter=True, style="dep")

## Entities in text

In [11]:
# 'doc' now contains a parsed version of text. We can use it to do anything we want!
# For example, this will print out all the named entities that were detected:
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

London (GPE)
England (GPE)
the United Kingdom (GPE)
the River Thames (LOC)
the south east (LOC)
Great Britain (GPE)
London (GPE)
two millennia (DATE)
Romans (NORP)
Londinium (ORG)


In [12]:
# Doubts with labels
spacy.explain('GPE')

'Countries, cities, states'

In [13]:
from spacy import displacy

displacy.render(doc,style='ent',jupyter=True)

## Replacing names
Hide names for GDPR

In [14]:
# Replace a token with "REDACTED" if it is a name
def replace_name_with_placeholder(token):
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        return "[REDACTED] "
    else:
        return token.string

# Loop through all the entities in a document and check if they are names
def scrub(text):
    doc = nlp(text)
    for ent in doc.ents:
        ent.merge()
    tokens = map(replace_name_with_placeholder, doc)
    return "".join(tokens)

s = """
In 1950, Alan Turing published his famous article "Computing Machinery and Intelligence". In 1957, Noam Chomsky’s 
Syntactic Structures revolutionized Linguistics with 'universal grammar', a rule based system of syntactic structures.
"""

print(scrub(s))


In 1950, [REDACTED] published his famous article "Computing Machinery and Intelligence". In 1957, [REDACTED] ’s 
Syntactic Structures revolutionized Linguistics with 'universal grammar', a rule based system of syntactic structures.



  if sys.path[0] == '':
  if sys.path[0] == '':


## Lematize

In [16]:
for w in doc:
    print(w.text, w.lemma_, w.pos_)


 
 SPACE
London London PROPN
is be AUX
the the DET
capital capital NOUN
and and CCONJ
most most ADV
populous populous ADJ
city city NOUN
of of ADP
England England PROPN
and and CCONJ

 
 SPACE
the the DET
United United PROPN
Kingdom Kingdom PROPN
. . PUNCT
    SPACE
Standing stand VERB
on on ADP
the the DET
River River PROPN
Thames Thames PROPN
in in ADP
the the DET
south south PROPN
east east PROPN

 
 SPACE
of of ADP
the the DET
island island NOUN
of of ADP
Great Great PROPN
Britain Britain PROPN
, , PUNCT
London London PROPN
has have AUX
been be AUX
a a DET
major major ADJ
settlement settlement NOUN

 
 SPACE
for for ADP
two two NUM
millennia millennium NOUN
. . PUNCT
It -PRON- PRON
was be AUX
founded found VERB
by by ADP
the the DET
Romans Romans PROPN
, , PUNCT
who who PRON
named name VERB
it -PRON- PRON
Londinium Londinium PROPN
. . PUNCT

 
 SPACE


## Stopwords

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

# para observar las palabras dentro de la lista lo único que se #   # tiene que hacer eso imprimir
print(list(STOP_WORDS)[:30])

['go', 'n’t', 'others', 'throughout', 'meanwhile', 'therein', 'whom', 'although', 'own', 'take', 'often', 'yourselves', 'they', '’d', 'nothing', 'every', 'my', 'where', 'am', 'sometimes', 'ours', 'herein', 'same', 'whereafter', 'less', 'also', 'than', 'anyone', 'over', 'anyhow']


In [None]:
# Para filtrar stopwords
lista=[palabra for palabra in doc if palabra.is_stop==False and palabra.is_punct==False]
print(lista)

[London, capital, populous, city, England, 
, United, Kingdom,  , Standing, River, Thames, south, east, 
, island, Great, Britain, London, major, settlement, 
, millennia, founded, Romans, named, Londinium, 
]


## Most frequent words
In a Wikipedia page

In [17]:
!pip install wikipedia

Collecting wikipedia
  Downloading https://files.pythonhosted.org/packages/67/35/25e68fbc99e672127cc6fbb14b8ec1ba3dfef035bf1e4c90f78f24a80b7d/wikipedia-1.4.0.tar.gz
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-cp37-none-any.whl size=11686 sha256=f23bfc0df89fa892708a38449ff60f641cb073ede66c7854d33fec3fd113b0dc
  Stored in directory: /root/.cache/pip/wheels/87/2a/18/4e471fd96d12114d16fe4a446d00c3b38fb9efcb744bd31f4a
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [19]:
import wikipedia
donald = wikipedia.page("Donald Trump")
text = donald.content
text



In [20]:
nombres=[w.text for w in nlp(text) if w.is_stop!=True and w.is_punct!=True and w.pos_=='NOUN']
nombres[:3]

['media', 'personality', 'businessman']

In [21]:
from collections import Counter
word_freq = Counter(nombres)

nombres_comunes=word_freq.most_common(10)
print(nombres_comunes)

[('=', 173), ('campaign', 66), ('election', 50), ('administration', 43), ('president', 38), ('government', 34), ('tax', 30), ('percent', 30), ('pandemic', 26), ('office', 25)]


## Textacy London text
No we want to know things about London, from our previous text

In [22]:
import spacy
import textacy.extract

# Load the large English NLP model
nlp = spacy.load('en_core_web_lg')

# The text we want to examine
text = """London is the capital and most populous city of England and  the United Kingdom.  
Standing on the River Thames in the south east of the island of Great Britain, 
London has been a major settlement  for two millennia.  It was founded by the Romans, 
who named it Londinium.
"""

# Parse the document with spaCy
doc = nlp(text)

# Extract semi-structured statements
statements = textacy.extract.semistructured_statements(doc, "London")

# Print the results
print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")


Here are the things I know about London:
 - the capital and most populous city of England and  the United Kingdom.  

 - a major settlement  for two millennia.  


## Textacy with Wikipedia API
We don't want the most frequent words, we want sentences about London

In [23]:
import wikipedia
london = wikipedia.page("London")
text = london.content

In [24]:
doc = nlp(text)

# Extract semi-structured statements
statements = textacy.extract.semistructured_statements(doc, "London")

# Print the results
print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")

Here are the things I know about London:
 - the capital and largest city of England and the United Kingdom
 - a major settlement for two millennia, and was originally called Londinium, which was founded by the Romans
 - one of the world's most important global cities
 - an ancient name, already attested in the first century AD, usually in the Latinised form Londinium
 - beyond all comparison the largest town in England
 - still very compact
 - the world's largest city from c.1831 to 1925, with a population density of 325 people per hectare
 - the seat of the Government of the United Kingdom
 - vulnerable to flooding
 - "one of the World's Greenest Cities" with more than 40 per cent green space or open water
 - the most populous city and metropolitan area of the European Union and the second most populous in Europe
 - the 19th largest city and the 18th largest metropolitan region.



 - Christian, and has a large number of churches, particularly in the City of London
 - also home to siz

# Spanish
## Spacy  and entities

In [26]:
nlp_es = spacy.load('es_core_news_lg')

text = '''Londres (en inglés, London, pronunciado /ˈlʌndən/ ( escuchar)) es la capital y mayor ciudad de Inglaterra y del Reino Unido.2​3​ Situada a orillas del río Támesis, Londres es un importante asentamiento humano desde que fue fundada por los romanos con el nombre de Londinium hace casi dos milenios.4​ El núcleo antiguo de la urbe, la City de Londres, conserva básicamente su perímetro medieval de una milla cuadrada. Desde el siglo XIX el nombre «Londres» también hace referencia a toda la metrópolis desarrollada alrededor de este núcleo.5​ El grueso de esta conurbación forma la región de Londres y el área administrativa del Gran Londres,6​ gobernado por el alcalde y la asamblea de Londres.7​
Londres es una ciudad global, uno de los centros neurálgicos en el ámbito de las artes, el comercio, la educación, el entretenimiento, la moda, las finanzas, los medios de comunicación, la investigación, el turismo o el transporte.8​ Es el principal centro financiero del mundo9​10​11​ y una de las áreas metropolitanas con mayor PIB.12​13​ Londres es también una capital cultural mundial,14​15​16​17​ la ciudad más visitada considerando el número de visitas internacionales18​ y tiene el mayor sistema aeroportuario del mundo según el tráfico de pasajeros.19​ Asimismo, las 43 universidades de la ciudad conforman la mayor concentración de centros de estudios superiores de toda Europa.20​ En el año 2012 Londres se convirtió en la única ciudad en albergar la celebración de tres Juegos Olímpicos de Verano.21​
En esta ciudad multirracial convive gente de un gran número de culturas que hablan más de trescientos idiomas distintos.22​ La Autoridad del Gran Londres estima que en 2015 la ciudad tiene 8,63 millones de habitantes,23​ que supone el 12,5 % del total de habitantes del Reino Unido.24​ El área urbana del Gran Londres, con 10 470 00025​ habitantes, es la segunda más grande de Europa, pero su área metropolitana, con una población estimada de entre 12 y 14 millones,26​27​ es la mayor del continente. Desde 1831 a 1925 Londres, como capital del Imperio británico, fue la ciudad más poblada del mundo.'''
doc = nlp_es(text)

# Para ver las entidades que ha detectado
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")


Londres (LOC)
London (LOC)
Inglaterra (LOC)
Reino Unido.2​3​ Situada (LOC)
Támesis (LOC)
Londres (LOC)
Londinium (LOC)
City de Londres (LOC)
Londres (LOC)
El grueso de esta conurbación forma la región de Londres (MISC)
Gran Londres,6​ (LOC)
Londres.7​ (LOC)
Londres (LOC)
PIB.12​13​ Londres (MISC)
Europa.20​ (ORG)
Londres (LOC)
Juegos Olímpicos de Verano.21​ (MISC)
La Autoridad del Gran Londres (MISC)
Reino Unido.24​ El (LOC)
Gran Londres (LOC)
Europa (LOC)
Londres (LOC)
Imperio británico (LOC)


In [27]:
# Extract semi-structured statements
statements = textacy.extract.semistructured_statements(doc, "Londres")

# Print the results
print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")

Here are the things I know about London:


In [None]:
response =  urllib.request.urlopen('https://es.wikipedia.org/wiki/Londres')
html = response.read()

soup = BeautifulSoup(html,'html5lib')
text = soup.get_text(strip = True)



## Newspaper test

In [38]:
import urllib
from bs4 import BeautifulSoup
import textacy
nlp = spacy.load('en_core_web_lg')

response =  urllib.request.urlopen('https://arxiv.org/pdf/1905.12787.pdf')
html = response.read()

soup = BeautifulSoup(html,'html5lib')
text = soup.get_text(strip = True)

doc = nlp(text)

# Extract semi-structured statements
statements = textacy.extract.semistructured_statements(doc, "overfitting")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")