# Deep Learning and Image Recognition

## NLP using spaCy

### Segmentation, Synonyms, POS tagging

- https://spacy.io/

- https://nlpforhackers.io/complete-guide-to-spacy/

`pip install spacy`

`python -m spacy download en_core_web_sm`

`python -m spacy download en_core_web_lg`

In [1]:
import spacy
from spacy import displacy
from scipy import spatial

In [2]:
# Small English pipeline by default; change the name to "en_core_web_lg"
# to load the larger model instead.
MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(MODEL_NAME)

### Entity Detection

In [3]:
def detectEntities(doc):
    """List every entity in ``doc.ents`` and then render the entity view.

    One line is printed per entity: its text, start and end character
    offsets, and label, separated by spaces. Afterwards displaCy is asked
    to render ``doc`` in 'ent' style with ``jupyter=True``.
    """
    reported_fields = ("text", "start_char", "end_char", "label_")
    for entity in doc.ents:
        print(*(getattr(entity, field) for field in reported_fields))
    displacy.render(doc, jupyter=True, style='ent')

In [4]:
def detectSentences(doc):
    """Print each item of ``doc.sents`` on its own line, prefixed by a label."""
    label = "Sentence: "
    for sentence in doc.sents:
        # print() inserts its default single-space separator after the label.
        print(label, sentence)

In [5]:
# Geopolitical entity (GPE) example: a city name at the start of a headline.
doc = nlp('San Francisco considers banning sidewalk delivery robots')
detectEntities(doc)

San Francisco 0 13 GPE


In [6]:
# Organization example: two sentences, so sentence segmentation is shown too.
doc = nlp(
    'Apple is looking at buying U.K. startup for $1 billion. '
    'Google might make their own offer.'
)
detectEntities(doc)
detectSentences(doc)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY
Google 56 62 ORG


Sentence:  Apple is looking at buying U.K. startup for $1 billion.
Sentence:  Google might make their own offer.


In [7]:
# Ambiguity example: "Amazon" is used once for the forest and once for the
# retailer, so the two mentions ideally would not receive the same label.
doc = nlp(
    'Amazon is the largest forest in Brazil. '
    'Amazon is the largest retailer in the world.'
)
detectEntities(doc)

Amazon 0 6 ORG
Brazil 32 38 GPE
Amazon 40 46 ORG


### Extract Relationships

In [8]:
def extract_currency_relations(doc):
    """Pair each MONEY token in ``doc`` with the token it relates to.

    Entities and noun chunks are first merged into single tokens; this
    retokenizes ``doc`` in place. Each token whose entity type is MONEY is
    then linked as follows:

    * if it is an ``attr`` or ``dobj``, to the first ``nsubj`` found among
      the left children of its head (skipped when there is none);
    * if it is a ``pobj`` whose head is a ``prep``, to the head of that
      preposition.

    Returns:
        A list of ``(related_token, money_token)`` tuples, in the order the
        MONEY tokens appear in ``doc``.
    """
    # Entities and noun chunks can overlap, and the retokenizer rejects
    # overlapping merges; filter_spans keeps a non-overlapping subset,
    # preferring the longest span.
    spans = spacy.util.filter_spans(list(doc.ents) + list(doc.noun_chunks))

    # Span.merge() was deprecated in spaCy 2.1 and removed in 3.0; merging
    # through the retokenizer context manager is the supported replacement.
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)

    relations = []
    for money in (token for token in doc if token.ent_type_ == 'MONEY'):
        if money.dep_ in ('attr', 'dobj'):
            subjects = [w for w in money.head.lefts if w.dep_ == 'nsubj']
            if subjects:
                relations.append((subjects[0], money))
        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
            # Skip over the preposition to the word it attaches to.
            relations.append((money.head.head, money))
    return relations

In [9]:
# Sample sentences containing money amounts, used by the relation-extraction demo.
para = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
    "Bill Gates is the richest person with $100,000,000,000",
]

In [10]:
print("Processing %d texts" % len(para))

# One tab-separated row per relation: related phrase (left-aligned, padded
# to 10 characters), the entity type of the amount, and the amount text.
row_template = '{:<10}\t{}\t{}'

for text in para:
    doc = nlp(text)
    # Render the entity view before extraction, because
    # extract_currency_relations merges tokens in `doc`.
    displacy.render(doc, jupyter=True, style='ent')
    relations = extract_currency_relations(doc)
    for r1, r2 in relations:
        print(row_template.format(r1.text, r2.ent_type_, r2.text))

Processing 3 texts


Net income	MONEY	$9.4 million
the prior year	MONEY	$2.7 million


Revenue   	MONEY	twelve billion dollars
a loss    	MONEY	1b


the richest person	MONEY	100,000,000,000


In [11]:
# Dependency parsing and visualization.
# The parsed sentence must be bound to `doc`, the name passed to the renderer;
# otherwise the plot shows whatever `doc` an earlier cell left behind.
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

### Word Vector Arithmetic

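This section relies on word vectors, so it loads the larger `en_core_web_lg` model; the small `en_core_web_sm` model does not ship with real word vectors.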

In [None]:
# From here on `nlp` refers to the large model.
nlp = spacy.load('en_core_web_lg')


def cosine_similarity(x, y):
    """Return the cosine similarity of two vectors (1 minus cosine distance)."""
    return 1 - spatial.distance.cosine(x, y)


man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
queen = nlp.vocab['queen'].vector
king = nlp.vocab['king'].vector

# Find the vocabulary entries closest to the result of "man" - "woman" + "queen".
maybe_king = man - woman + queen

# NOTE(review): on spaCy 3.x, iterating nlp.vocab may only yield lexemes that
# have already been looked up, which would shrink this search - confirm
# against the installed version.
computed_similarities = []
for word in nlp.vocab:
    # Only entries that actually have a vector can be compared.
    if word.has_vector:
        similarity = cosine_similarity(maybe_king, word.vector)
        computed_similarities.append((word, similarity))

# Highest similarity first.
computed_similarities.sort(key=lambda item: -item[1])
top_ten = computed_similarities[:10]
print([entry[0].text for entry in top_ten])

# Output recorded from an earlier run:
# ['Queen', 'QUEEN', 'queen', 'King', 'KING', 'king', 'KIng', 'KINGS', 'kings', 'Kings']

### Computing Similarity

In [None]:
# Reference document that each candidate below is scored against.
target = nlp("Cats are beautiful animals.")

doc1 = nlp("Dogs are awesome.")
doc2 = nlp("Some gorgeous creatures are felines.")
doc3 = nlp("Dolphins are swimming mammals.")

# Scores recorded from an earlier run, in print order:
#   doc1 -> 0.8901765218466683
#   doc2 -> 0.9115828449161616
#   doc3 -> 0.7822956752876101
for candidate in (doc1, doc2, doc3):
    print(target.similarity(candidate))