# IHLT labs
## Document Structure

In [1]:
import spacy
# Load the `en_core_web_sm` English pipeline once; every later cell reuses
# this single `nlp` object to turn raw strings into parsed documents.
nlp = spacy.load("en_core_web_sm")
# Tip: uncomment the next line to browse the Doc API interactively.
# help(spacy.tokens.doc.Doc)

### Tokenizers

In [2]:
# Two short sentences, so the sentence splitter below has a boundary to find.
source = 'Men want children. They get relaxed with kids.'

#### Sentence splitting

In [3]:
# Parse the text once; the tokenizer cells that follow reuse this `doc`.
doc = nlp(source)
# One string per detected sentence.
[span.text for span in doc.sents]

['Men want children.', 'They get relaxed with kids.']

#### Tokenizer

In [4]:
# Iterating over a Doc yields its tokens; punctuation is a token of its own.
[tok.text for tok in doc]

['Men', 'want', 'children', '.', 'They', 'get', 'relaxed', 'with', 'kids', '.']

In [5]:
# Same tokens as above, grouped by the sentence they belong to.
[[tok.text for tok in span] for span in doc.sents]

[['Men', 'want', 'children', '.'],
 ['They', 'get', 'relaxed', 'with', 'kids', '.']]

### Similarities

In [6]:
# Two sentences that differ by a single token ("blue"), used to compare the
# similarity measures defined below.
s1 = nlp("The cat eats fish.")
s2 = nlp("The cat eats blue fish.")

#### Jaccard

In [7]:
def jaccard(sa, sb):
    """Return the Jaccard similarity of the token-text sets of two sequences.

    The score is the number of distinct token texts shared by both inputs
    divided by the number of distinct token texts appearing in either one.
    Comparison is on the exact ``token.text`` (case-sensitive, punctuation
    included).

    Args:
        sa: Iterable of objects exposing a ``.text`` attribute (e.g. a spaCy Doc).
        sb: Iterable of objects exposing a ``.text`` attribute.

    Returns:
        A float in ``[0.0, 1.0]``. Two inputs without any token are treated
        as identical and score ``1.0`` instead of raising ZeroDivisionError.
    """
    a = {token.text for token in sa}
    b = {token.text for token in sb}
    union = a | b
    if not union:
        # Both sets are empty: the ratio is 0/0, so fall back to the usual
        # convention that two empty sets are fully similar.
        return 1.0
    return len(a & b) / len(union)

In [8]:
# 5 token texts are shared out of 6 distinct ones overall, hence 5/6.
jaccard(s1, s2)

0.8333333333333334

#### Cosine

In [9]:
def cosine(sa, sb):
    """Return the set-based cosine similarity of two token sequences.

    Each input is reduced to the set of its distinct ``token.text`` values;
    the score is the size of the intersection divided by the geometric mean
    of the two set sizes, i.e. the cosine of the corresponding binary
    bag-of-words vectors.

    Args:
        sa: Iterable of objects exposing a ``.text`` attribute (e.g. a spaCy Doc).
        sb: Iterable of objects exposing a ``.text`` attribute.

    Returns:
        A float in ``[0.0, 1.0]``. If either input has no tokens the score
        is ``0.0`` instead of raising ZeroDivisionError.
    """
    a = {token.text for token in sa}
    b = {token.text for token in sb}
    if not a or not b:
        # An empty set has zero norm, so the cosine is undefined; report
        # "no similarity" rather than dividing by zero.
        return 0.0
    return len(a & b) / (len(a) * len(b)) ** .5

In [10]:
# 5 shared token texts over sqrt(5 * 6), the geometric mean of the set sizes.
cosine(s1, s2)

0.9128709291752769

## Morphology

In [11]:
# Irregular plurals ("Women", "children") make the lemmas below informative.
s = nlp("Women want children.")

### Part of Speech & lemmas

In [12]:
# Per token: surface form, coarse POS, fine-grained tag, and lemma.
[(tok.text, tok.pos_, tok.tag_, tok.lemma_) for tok in s]

[('Women', 'NOUN', 'NNS', 'woman'),
 ('want', 'VERB', 'VBP', 'want'),
 ('children', 'NOUN', 'NNS', 'child'),
 ('.', 'PUNCT', '.', '.')]

## Word Sequences
### NERC (Named Entity Recognition and Classification)

In [13]:
from spacy import displacy
import warnings
# NOTE(review): with no category or message filter this silences *every*
# warning for the rest of the session, including ones unrelated to displacy.
# Consider narrowing it to the specific warning being hidden.
warnings.filterwarnings("ignore")

In [14]:
sentence = "Mark Pedersen and John Smith are working at Google since 1994 for $1000 per week."
# Rebinds `doc`: from here on it refers to this sentence, not the tokenizer example.
doc = nlp(sentence)
# Each recognised entity span with its predicted label.
[(entity.text, entity.label_) for entity in doc.ents]

[('Mark Pedersen', 'PERSON'),
 ('John Smith', 'PERSON'),
 ('Google', 'ORG'),
 ('1994', 'DATE'),
 ('1000', 'MONEY')]

In [15]:
# Token-level view of the same entities: B = first token of an entity,
# I = continuation of one, O = outside any entity (empty type).
[(tok.text, tok.ent_iob_, tok.ent_type_) for tok in doc]

[('Mark', 'B', 'PERSON'),
 ('Pedersen', 'I', 'PERSON'),
 ('and', 'O', ''),
 ('John', 'B', 'PERSON'),
 ('Smith', 'I', 'PERSON'),
 ('are', 'O', ''),
 ('working', 'O', ''),
 ('at', 'O', ''),
 ('Google', 'B', 'ORG'),
 ('since', 'O', ''),
 ('1994', 'B', 'DATE'),
 ('for', 'O', ''),
 ('$', 'O', ''),
 ('1000', 'B', 'MONEY'),
 ('per', 'O', ''),
 ('week', 'O', ''),
 ('.', 'O', '')]

In [16]:
# Render the sentence with its entity spans highlighted inline.
displacy.render(doc, style='ent')

#### Merging

In [17]:
# retokenize() mutates `doc` in place, so merging by fixed token indices is
# only correct the first time the cell runs. Re-parse the sentence first so
# the cell gives the same result on every execution.
doc = nlp(sentence)
with doc.retokenize() as retokenizer:
    # Collapse every multi-word PERSON entity into a single token and use its
    # surface text as the lemma (e.g. "Mark Pedersen", "John Smith").
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            retokenizer.merge(ent, attrs={"LEMMA": ent.text})

In [18]:
# After merging, each person name is one token carrying its own lemma and entity type.
[(tok.text, tok.pos_, tok.tag_, tok.lemma_, tok.ent_type_) for tok in doc]

[('Mark Pedersen', 'PROPN', 'NNP', 'Mark Pedersen', 'PERSON'),
 ('and', 'CCONJ', 'CC', 'and', ''),
 ('John Smith', 'PROPN', 'NNP', 'John Smith', 'PERSON'),
 ('are', 'VERB', 'VBP', 'be', ''),
 ('working', 'VERB', 'VBG', 'work', ''),
 ('at', 'ADP', 'IN', 'at', ''),
 ('Google', 'PROPN', 'NNP', 'Google', 'ORG'),
 ('since', 'ADP', 'IN', 'since', ''),
 ('1994', 'NUM', 'CD', '1994', 'DATE'),
 ('for', 'ADP', 'IN', 'for', ''),
 ('$', 'SYM', '$', '$', ''),
 ('1000', 'NUM', 'CD', '1000', 'MONEY'),
 ('per', 'ADP', 'IN', 'per', ''),
 ('week', 'NOUN', 'NN', 'week', ''),
 ('.', 'PUNCT', '.', '.', '')]

### Noun Chunks

In [19]:
# Rebinds `doc` to a new sentence; the dependency rendering further down uses it too.
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
# Per noun chunk: its text, its root token, the root's dependency label,
# and the head token that the root attaches to.
[
    (noun_chunk.text, noun_chunk.root.text, noun_chunk.root.dep_, noun_chunk.root.head.text)
    for noun_chunk in doc.noun_chunks
]

[('Autonomous cars', 'cars', 'nsubj', 'shift'),
 ('insurance liability', 'liability', 'dobj', 'shift'),
 ('manufacturers', 'manufacturers', 'pobj', 'toward')]

## Dependency Parsing

In [20]:
# Draw the dependency tree of the most recently parsed `doc`.
displacy.render(doc, style='dep')