# 1. Getting Started

spaCy supports 55+ languages.

[Languages available in spaCy](https://spacy.io/usage/models/#languages)

The general syntax to import a language is: `from spacy.lang.__ import Language`

In [None]:
# Bring the English language class into scope
from spacy.lang.en import English

# Instantiate the language class to get the `nlp` object
nlp = English()

# Run `nlp` over a sample sentence
sample_text = "This is a sentence."
doc = nlp(sample_text)

# Show the text of the processed document
print(doc.text)

In [None]:
# Bring the German language class into scope
from spacy.lang.de import German

# Instantiate the language class to get the `nlp` object
nlp = German()

# Run `nlp` over a German greeting (it means "Kind regards!")
greeting = "Liebe Grüße!"
doc = nlp(greeting)

# Show the text of the processed document
print(doc.text)

In [None]:
# Import the Spanish language class
from spacy.lang.es import Spanish

# Create the nlp object
nlp = Spanish()

# Process a text (this is Spanish for: "How are you?")
doc = nlp("¿Cómo estás?")

# Print the document text
print(doc.text)

# 2. Documents, spans and tokens

When you call `nlp` on a string, spaCy first tokenizes the text and creates a document object.

In [None]:
# Import the English language class and create the nlp object
from spacy.lang.en import English

nlp = English()

# Process the text
doc = nlp("I like tree kangaroos and narwhals.")
print(doc.text)

# Select the first token by its index in the Doc
first_token = doc[0]
# Print the first token
print("First word: ", first_token.text)

# A slice of the Doc for "tree kangaroos" (token indices 2 and 3)
tree_kangaroos = doc[2:4]
print("Tree Kangaroos slice: ", tree_kangaroos.text)

# A slice of the Doc for "tree kangaroos and narwhals" (without the ".")
tree_kangaroos_and_narwhals = doc[2:6]
print("Tree Kangaroos and Narwhals: ", tree_kangaroos_and_narwhals.text)

# 3. Lexical Attributes

We can use spaCy's `Doc` and `Token` objects, and lexical attributes to find a pattern in a text.

In [None]:
# In this example we look for two consecutive tokens: a number and a percent sign

from spacy.lang.en import English

nlp = English()

# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Iterate over the tokens in the doc
for token in doc:
    # Only tokens that resemble a number (`like_num`) are candidates
    if not token.like_num:
        continue
    # The index of the next token in the `doc` is `token.i + 1`
    next_index = token.i + 1
    # A number-like token at the very end of the doc has no successor;
    # indexing past the end would raise IndexError
    if next_index >= len(doc):
        continue
    next_token = doc[next_index]
    # Check if the next token's text equals "%"
    if next_token.text == "%":
        print(f"Percentage found: {token.text}%")

# 4. Loading models
    
You can install spaCy models using this command: `python -m spacy download en_core_web_sm`

In [None]:
import spacy

# Load the "en_core_web_sm" model package
nlp = spacy.load("en_core_web_sm")

# Sample sentence to analyse
text = "It's official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

# Print the document text
print(doc.text)

# 5. Predicting linguistic annotations

We will get to try one of spaCy's pre-trained model packages and see its predictions in action.

In [None]:
# Print one row per token: its text followed by its `pos_` and `dep_`
# attributes, left-aligned in columns 12, 10 and 10 characters wide
for token in doc:
    token_text, token_pos, token_dep = token.text, token.pos_, token.dep_
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

## Additional Token Attributes
We'll see these again in upcoming lectures. For now we just want to illustrate some of the other information that spaCy assigns to tokens:

|Attribute|Description|Example value for the token `Tesla`|
|:------|:------:|:------|
|`.text`|The original word text<!-- .element: style="text-align:left;" -->|`Tesla`|
|`.lemma_`|The base form of the word|`tesla`|
|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|
|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|
|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|
|`.is_alpha`|Does the token consist of alphabetic characters?|`True`|
|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|

In [None]:
# Print the text and label of every entity in `doc.ents`
for entity in doc.ents:
    print(entity.text, entity.label_)

# 6. Predicting named entities in context

Models are statistical and not always right. Whether their predictions are correct depends on the training data and the text you're processing.

In [None]:
import spacy

# Load the model package and run it over a sample headline
nlp = spacy.load("en_core_web_sm")
text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"
doc = nlp(text)

# Print the text and label of every entity in `doc.ents`
for entity in doc.ents:
    print(entity.text, entity.label_)

# 7. Using the Matcher

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("Upcoming iPhone X release date leaked as Apple reveals pre-orders")

# Build the matcher from the pipeline's vocabulary
matcher = Matcher(nlp.vocab)

# Two consecutive tokens whose exact text is "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
# Patterns are passed as a list: this call form is accepted since spaCy 2.2.2
# and is the one spaCy 3 requires (`add(key, None, pattern)` fails there)
matcher.add("IPHONE_X_PATTERN", [pattern])

# Each match is unpacked as (match_id, start, end); start/end slice the doc
matches = matcher(doc)
print("Matches:", [doc[start:end].text for _, start, end in matches])

# 8. Writing match patterns

- Write one pattern that only matches mentions of full iOS versions: "iOS 7", "iOS 11" and "iOS 10".

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

# A token with the exact text "iOS" followed by a token made of digits,
# so a bare "iOS" without a version number is not matched
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]

# Patterns are passed as a list: this call form is accepted since spaCy 2.2.2
# and is the one spaCy 3 requires (`add(key, None, pattern)` fails there)
matcher.add("IOS_VERSION_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found: ", len(matches))

# Each match is unpacked as (match_id, start, end); start/end slice the doc
for match_id, start, end in matches:
    print("Match Found: ", doc[start:end].text)

- Write one pattern that only matches forms of "download" (tokens with the lemma "download"), followed by a token with the part-of-speech tag `PROPN` (proper noun).

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# A token whose lemma is "download" followed by a token tagged as a proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]
# Patterns are passed as a list: this call form is accepted since spaCy 2.2.2
# and is the one spaCy 3 requires (`add(key, None, pattern)` fails there)
matcher.add("DOWNLOAD_THINGS_PATTERN", [pattern])
matches = matcher(doc)

print("Total matches found: ", len(matches))

# Each match is unpacked as (match_id, start, end); start/end slice the doc
for match_id, start, end in matches:
    print("Match Found: ", doc[start:end].text)

- Write one pattern that matches adjectives (`"ADJ"`) followed by one or two `"NOUN"`s (one required noun and one optional noun).

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# An adjective, a required noun, then a second noun made optional by "OP": "?"
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]
# Patterns are passed as a list: this call form is accepted since spaCy 2.2.2
# and is the one spaCy 3 requires (`add(key, None, pattern)` fails there)
matcher.add("ADJ_NOUN_PATTERN", [pattern])
matches = matcher(doc)

print("Total matches found: ", len(matches))

# Each match is unpacked as (match_id, start, end); start/end slice the doc
for match_id, start, end in matches:
    print("Match Found: ", doc[start:end].text)