# Lab 7.2. Advanced Text Processing

## This lab will cover:


1. Use of spaCy


## 0. Install required packages

In [30]:
!pip install -U spacy

Collecting spacy
  Using cached spacy-3.0.5-cp38-cp38-win_amd64.whl (11.8 MB)
Collecting thinc<8.1.0,>=8.0.2
  Using cached thinc-8.0.2-cp38-cp38-win_amd64.whl (1.0 MB)


ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'c:\\programdata\\anaconda3\\lib\\site-packages\\catalogue-2.0.1.dist-info\\METADATA'





In [31]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
[!] Skipping model package dependencies and setting `--no-deps`. You don't seem
to have the spaCy package itself installed (maybe because you've built from
source?), so installing the model dependencies would cause spaCy to be
downloaded, which probably isn't what you want. If the model package has other
dependencies, you'll have to install them manually.
[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


In [32]:
!pip install spacy-langdetect





In [33]:
!pip install spacy-readability

Collecting spacy<3.0,>=2.0
  Using cached spacy-2.3.5-cp38-cp38-win_amd64.whl (9.7 MB)
Collecting thinc<7.5.0,>=7.4.1
  Using cached thinc-7.4.5-cp38-cp38-win_amd64.whl (910 kB)
Collecting catalogue<1.1.0,>=0.0.7
  Using cached catalogue-1.0.0-py2.py3-none-any.whl (7.7 kB)
Collecting srsly<1.1.0,>=1.0.2
  Using cached srsly-1.0.5-cp38-cp38-win_amd64.whl (178 kB)
Installing collected packages: srsly, catalogue, thinc, spacy
  Attempting uninstall: srsly
    Found existing installation: srsly 2.4.0


ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'c:\\programdata\\anaconda3\\lib\\site-packages\\srsly-2.4.0.dist-info\\RECORD'



## 1. Let's import spaCy and load the English language model

In [34]:
import spacy
from spacy import displacy
from spacy_readability import Readability


In [35]:
# Load the small English model downloaded above; `nlp` is the pipeline object
# that every later cell calls to process text
nlp = spacy.load("en_core_web_sm")

## 2. Let's do some text processing using spaCy

In [36]:
# Run the pipeline on one sample sentence; the result (`doc`) is what the
# following cells index and slice
doc = nlp("This is a really long sentence with many words and arguments.")

In [37]:
# .text gives back the original string the document was built from
doc.text

'This is a really long sentence with many words and arguments.'

In [38]:
# Integer indexing into the document returns individual tokens
first_token, second_token = doc[0], doc[1]

In [39]:
# Display the token at index 0
first_token

This

In [40]:
# Display the token at index 1
second_token

is

In [41]:
# Slicing selects a run of consecutive tokens: indices 5, 6 and 7 (8 is excluded)
some_tokens = doc[5:8]

In [42]:
# Display the three-token slice taken above
some_tokens

sentence with many

## 3. Let's extract some linguistic features (part-of-speech tags and dependency labels)

In [43]:
# Headline used for the tagging, parsing and entity examples
# (note the typographic apostrophe in "It’s")
text = (
    "It’s official: Apple is the first U.S. public company "
    "to reach a $1 trillion market value"
)

# Run the pipeline over the headline
AppleText = nlp(text)

In [44]:
# Token at index 4 of the headline
AppleText[4]

Apple

In [45]:
# .text is the token's plain string
AppleText[4].text

'Apple'

In [46]:
# .pos_ is the token's part-of-speech tag as a string
AppleText[4].pos_

'PROPN'

In [47]:
# .dep_ is the token's dependency label as a string
AppleText[4].dep_

'nsubj'

In [48]:
# One row per token: surface text, part-of-speech tag, dependency label
for token in AppleText:
    token_text, token_pos, token_dep = token.text, token.pos_, token.dep_
    # Left-align the three fields in fixed-width columns (12 / 10 / 10 characters)
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

It          PRON      nsubj     
’s          VERB      ccomp     
official    ADJ       dobj      
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


In [49]:
# Draw the dependency parse of the headline (style="dep")
displacy.render(AppleText, style="dep")

## 4. Named Entity Recognition

In [50]:
# List each entity the model predicted in the headline, followed by its label
for ent in AppleText.ents:
    print(f"{ent.text} {ent.label_}")

Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


In [51]:
# Highlight the predicted entities inside the headline text (style="ent")
displacy.render(AppleText, style="ent")

## 5. Let's compute text similarity

In [52]:
# Two sentences on the same topic (phone manufacturers).
# NOTE(review): the stray echoed line in the saved output looks like the tail of
# a spaCy warning; en_core_web_sm is a small model, so confirm whether it ships
# real word vectors before reading much into the score.
doc1, doc2 = (
    nlp("Apple manufactures mobile phones"),
    nlp("Samsung also manufactures mobile phones"),
)
doc1.similarity(doc2)

  doc1.similarity(doc2)


0.7463467035987559

In [53]:
# Same first sentence, now compared with an unrelated one (the names doc1/doc2 are rebound).
# NOTE(review): the stray echoed line in the saved output looks like the tail of
# a spaCy warning; en_core_web_sm is a small model, so confirm whether it ships
# real word vectors before reading much into the score.
doc1, doc2 = (
    nlp("Apple manufactures mobile phones"),
    nlp("This is a text about kangaroos"),
)
doc1.similarity(doc2)

  doc1.similarity(doc2)


0.4482861727480409

## 6. Let's do language detection

In [54]:
from spacy_langdetect import LanguageDetector

# Register the detector only once: add_pipe rejects a component name that is
# already in the pipeline, so without this guard re-running the cell fails.
if 'language_detector' not in nlp.pipe_names:
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
text = 'This is an english text.'
doc = nlp(text)
# Document-level detection: a single guess for the whole text
print(doc._.language)
# Sentence-level detection: one guess per sentence
for sent in doc.sents:
    print(sent, sent._.language)

{'language': 'en', 'score': 0.9999983547168506}
This is an english text. {'language': 'en', 'score': 0.9999968103875707}


In [55]:
# An English sentence followed by a mostly Spanish fragment
text = 'This is an english text. Also some texto en castellano '
doc = nlp(text)
# Single guess for the document as a whole
print(doc._.language)
# Separate guess for each sentence
for sent in doc.sents:
    print(sent, sent._.language)

{'language': 'en', 'score': 0.9999976138735395}
This is an english text. {'language': 'en', 'score': 0.999996085264483}
Also some texto en castellano {'language': 'es', 'score': 0.8571404671076155}


In [56]:
# A short, fully Spanish input
text = 'Hola, Cómo estás?'
doc = nlp(text)
# Single guess for the document as a whole
print(doc._.language)
# Separate guess for each sentence
for sent in doc.sents:
    print(sent, sent._.language)

{'language': 'es', 'score': 0.9999968014362172}
Hola, Cómo estás? {'language': 'es', 'score': 0.9999964568859501}


## 7. Let's compute the readability of the text

In [57]:

# Register the readability component only once so this cell can be re-run.
# Passing the name explicitly keeps the membership check independent of
# whatever default name the component would otherwise get.
if "readability" not in nlp.pipe_names:
    nlp.add_pipe(Readability(), name="readability")

doc = nlp("I am some really difficult text to read because I use obnoxiously large words.")

# Each score is exposed as a custom attribute under doc._
print(doc._.flesch_kincaid_grade_level)
print(doc._.flesch_kincaid_reading_ease)
print(doc._.dale_chall)
# NOTE(review): smog and forcast print 0 in the saved output, presumably
# because the sample is too short for those formulas; confirm against the
# spacy-readability documentation.
print(doc._.smog)
print(doc._.coleman_liau_index)
print(doc._.automated_readability_index)
print(doc._.forcast)

8.412857142857145
59.68214285714288
7.714471428571429
0
8.96571428571428
7.1014285714285705
0


In [58]:
doc = nlp("Birds are animals than can fly.")

# Print the seven readability scores exposed under doc._, one per line
for metric in (
    "flesch_kincaid_grade_level",
    "flesch_kincaid_reading_ease",
    "dale_chall",
    "smog",
    "coleman_liau_index",
    "automated_readability_index",
    "forcast",
):
    print(getattr(doc._, metric))

2.4833333333333343
87.94500000000002
0.2976
0
3.7666666666666657
1.1950000000000003
0
