In [1]:
import spacy


nlp = spacy.load("en_core_web_sm")

## Lemmatization

**Exercise 1.1**

You are given the words "playing", "played", "play". Find the lemma using spaCy for all of them.


In [6]:
words = ["playing", "played", "playing"]

for word in words:
    token = nlp(word)[0]
    print(f"The lemma of the word '{word}' is {token.lemma_}")

The lemma of the word 'playing' is play
The lemma of the word 'played' is play
The lemma of the word 'playing' is play


**Exercise 1.2**

Assign the spaCy lemmatizer to a variable instead of using the whole `nlp` pipeline.

In [None]:
lemmatizer = nlp.vocab.morphology.lemmatizer


**Exercise 1.3**

Find the verb, the noun and the adjective forms of the words: "playing", "played", "surfing".

In [15]:
from spacy.lemmatizer import Lemmatizer, ADJ, NOUN, VERB

lemmatizer = nlp.vocab.morphology.lemmatizer
print(lemmatizer("playing", VERB))
print(lemmatizer("playing", NOUN))
print(lemmatizer("playing", ADJ))
print("-----------")
print(lemmatizer("played", VERB))
print(lemmatizer("played", NOUN))
print(lemmatizer("played", ADJ))
print("-----------")
print(lemmatizer("surfing", VERB))
print(lemmatizer("surfing", NOUN))
print(lemmatizer("surfing", ADJ))

['play']
['playing']
['playing']
-----------
['play']
['played']
['played']
-----------
['surf']
['surfing']
['surfing']


## Spell Checker

**Exercise 1.4**

In the following sentences there are some mispelling errors. Can you preprocess them to get rid of them? 

*N.B. there's no perfect spell-checker. Don't waste time on this. But be aware that it can be useful sometimes.*



In [59]:
sentences = ["i realy like this exerxise", 
             "tis sentences are surely writen by an italian",
             "lets fix thissssss"
            ]
from autocorrect import Speller

spell = Speller()

for sentence in sentences:
    print(spell(sentence))

i really like this exercise
tis sentences are surely writer by an italian
lets fix thissssss


## POS Tagging

**Exercise 1.5 (★☆☆)**

The training of TAs to survive the first two weeks at Strive School consists of the following sets:

- 1000 reps of "Did you google it?"
- 1000 reps of "Did you search it on Google already?"

Use spaCy to explain the difference of the word "google" in the two sentences.

In [61]:
sentences = ["Did you google it?",
             "Did you search it on Google already?"]

for sentence in sentences:
    sentence = nlp(sentence)
    for token in sentence:
        if "google" in token.text.lower():
            print(f'{token.text:{12}} {token.pos_:{10}} {token.tag_:{8}} {spacy.explain(token.tag_)}')



google       VERB       VB       verb, base form
Google       PROPN      NNP      noun, proper singular


**Exercise 1.5 (★★☆)**

Get the frequencies of the POS tags in the example sentence:

In [69]:
sentence = """This is an example sentence.
            Count the POS tags in it."""

sentence = nlp(sentence)
num_pos = sentence.count_by(spacy.attrs.POS)
for k, v in num_pos.items():
    print(sentence.vocab[k].text, v)

DET 3
AUX 1
NOUN 3
PUNCT 2
SPACE 1
PROPN 2
ADP 1
PRON 1


**Exercise 1.7 (★★★)**

(This exercises requires many steps in Pandas, no unique solution. You can hard code the name of the columns for this example if you get stuck.)

Loading 10 tweets from the twitter datasets, create a dataframe containing the frequencies of each POS per tweet (see example).

N.B. The column names must be the tags not the indices of the tags.

In [141]:
import pandas as pd

df = pd.read_csv("data/stock_data.csv")

df = df.iloc[:10]

In [142]:
df

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1
5,PGNX Over 3.04,1
6,AAP - user if so then the current downtrend wi...,-1
7,Monday's relative weakness. NYX WIN TIE TAP IC...,-1
8,GOOG - ower trend line channel test & volume s...,1
9,AAP will watch tomorrow for ONG entry.,1


In [85]:
def get_pos_and_frequency(sentence, tagger=nlp):
    sentence = tagger(sentence)
    num_pos = sentence.count_by(spacy.attrs.POS)
    named_pos = {}
    for k, v in num_pos.items():
        named_pos[sentence.vocab[k].text] = v
    return named_pos
    

In [92]:
columns = []
for i, row in df.iterrows():
    pos_dict = get_pos_and_frequency(row.Text)
    for key in pos_dict.keys():
        if key not in columns:
            columns.append(key)
            

In [93]:
columns

['NOUN',
 'ADP',
 'DET',
 'PROPN',
 'SPACE',
 'NUM',
 'CCONJ',
 'PUNCT',
 'VERB',
 'SYM',
 'ADV',
 'PRON',
 'AUX',
 'ADJ',
 'PART',
 'SCONJ']

In [132]:
import numpy as np
array = np.zeros((10,len(columns)+1))
data = pd.DataFrame(array, columns=["text", *columns])

In [133]:
data

Unnamed: 0,text,NOUN,ADP,DET,PROPN,SPACE,NUM,CCONJ,PUNCT,VERB,SYM,ADV,PRON,AUX,ADJ,PART,SCONJ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [134]:
for i, row in df.iterrows():
    pos_dict = get_pos_and_frequency(row.Text)
    for key, value in pos_dict.items():
        data.loc[i, key] = value
    data.loc[i, "text"] = row.Text


In [135]:
data

Unnamed: 0,text,NOUN,ADP,DET,PROPN,SPACE,NUM,CCONJ,PUNCT,VERB,SYM,ADV,PRON,AUX,ADJ,PART,SCONJ
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,7.0,1.0,1.0,7.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,6.0,2.0,2.0,5.0,2.0,2.0,0.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,user I'd be afraid to short AMZN - they are lo...,4.0,1.0,2.0,2.0,0.0,0.0,1.0,5.0,2.0,0.0,0.0,2.0,2.0,3.0,1.0,2.0
3,MNTA Over 12.00,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,OI Over 21.37,0.0,1.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,PGNX Over 3.04,0.0,1.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,AAP - user if so then the current downtrend wi...,7.0,1.0,2.0,0.0,0.0,0.0,0.0,5.0,2.0,0.0,4.0,0.0,0.0,3.0,0.0,1.0
7,Monday's relative weakness. NYX WIN TIE TAP IC...,3.0,0.0,0.0,10.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
8,GOOG - ower trend line channel test & volume s...,6.0,0.0,0.0,2.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,AAP will watch tomorrow for ONG entry.,2.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Named Entities Recognition

**Exercise 1.8**

In the Twitter dataset sample above, count how many entities are in each tweet.

In [152]:
def count_ents(sentence, ner=nlp):
    sentence = ner(sentence)
    return len(sentence.ents)

df["ents"] = df.Text.apply(count_ents)

In [150]:
df

Unnamed: 0,Text,Sentiment,ents
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,3
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,5
2,user I'd be afraid to short AMZN - they are lo...,1,1
3,MNTA Over 12.00,1,1
4,OI Over 21.37,1,1
5,PGNX Over 3.04,1,2
6,AAP - user if so then the current downtrend wi...,-1,1
7,Monday's relative weakness. NYX WIN TIE TAP IC...,-1,3
8,GOOG - ower trend line channel test & volume s...,1,1
9,AAP will watch tomorrow for ONG entry.,1,3


**Exercise 1.9**

In the Twitter dataset sample above, create an extra column with the name of the Organization entities in the tweet.

In [153]:
def get_ents(sentence, ner=nlp):
    sentence = ner(sentence)
    return sentence.ents

def get_orgs(sentence, ner=nlp):
    ents = get_ents(sentence, ner=ner)
    return [ent.text for ent in ents if ent.label_=="ORG"]

In [154]:
df["Organizations"] = df.Text.apply(get_orgs)

In [155]:
df

Unnamed: 0,Text,Sentiment,ents,Organizations
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,3,[]
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,5,"[AAP MOVIE, FEA/GEED]"
2,user I'd be afraid to short AMZN - they are lo...,1,1,[eBooks]
3,MNTA Over 12.00,1,1,[]
4,OI Over 21.37,1,1,[]
5,PGNX Over 3.04,1,2,[PGNX]
6,AAP - user if so then the current downtrend wi...,-1,1,[AAP]
7,Monday's relative weakness. NYX WIN TIE TAP IC...,-1,3,[NYX]
8,GOOG - ower trend line channel test & volume s...,1,1,[GOOG]
9,AAP will watch tomorrow for ONG entry.,1,3,"[AAP, ONG]"


## Matcher

**Exercise 2.0**

You have scraped many websites collecting a list of users but written in many different form: someone has an email like:

```
username: antonio
user: antonio.marsella@email.com
USER:antonio
USERNAME: antonio_
```
How would you match all of them using a spaCy matcher?`

In [160]:
from spacy.matcher import Matcher

doc = nlp("""
username: antonio
user: antonio.marsella@email.com
USER:antonio
USERNAME: antonio_
""")

pattern = [{"LOWER": {"IN":["user","username"]}}, {"ORTH": ":"}, {}]
matcher = Matcher(nlp.vocab)
matcher.add("USERNAME", [pattern])

matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(string_id, start, end, span.text)

USERNAME 1 4 username: antonio
USERNAME 5 8 user: antonio.marsella@email.com
USERNAME 9 12 USER:antonio
USERNAME 13 16 USERNAME: antonio
