# 1. Create a Doc object from the file peterrabbit.txt

In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")
# Read the contents of the file
with open('peterrabbit.txt', 'r') as file:
    text = file.read()

# Create a Doc object
doc = nlp(text)

# 2. For every token in the third sentence, print the token text, the POS tag, the fine-grained TAG tag, and the description of the fine-grained tag

In [2]:
# Get the third sentence
third_sentence = list(doc.sents)[2] 
for token in third_sentence:
    print(f"Text: {token.text}, POS: {token.pos_}, Fine-grained TAG: {token.tag_}, Description: {token.tag_}")

Text: They, POS: PRON, Fine-grained TAG: PRP, Description: PRP
Text: lived, POS: VERB, Fine-grained TAG: VBD, Description: VBD
Text: with, POS: ADP, Fine-grained TAG: IN, Description: IN
Text: their, POS: PRON, Fine-grained TAG: PRP$, Description: PRP$
Text: Mother, POS: PROPN, Fine-grained TAG: NNP, Description: NNP
Text: in, POS: ADP, Fine-grained TAG: IN, Description: IN
Text: a, POS: DET, Fine-grained TAG: DT, Description: DT
Text: sand, POS: NOUN, Fine-grained TAG: NN, Description: NN
Text: -, POS: PUNCT, Fine-grained TAG: HYPH, Description: HYPH
Text: bank, POS: NOUN, Fine-grained TAG: NN, Description: NN
Text: ,, POS: PUNCT, Fine-grained TAG: ,, Description: ,
Text: underneath, POS: ADP, Fine-grained TAG: IN, Description: IN
Text: the, POS: DET, Fine-grained TAG: DT, Description: DT
Text: root, POS: NOUN, Fine-grained TAG: NN, Description: NN
Text: of, POS: ADP, Fine-grained TAG: IN, Description: IN
Text: a, POS: DET, Fine-grained TAG: DT, Description: DT
Text: 
, POS: SPACE, Fi

# 3. Provide a frequency list of POS tags from the entire document 

In [3]:
from collections import Counter
pos_tags = [token.pos_ for token in doc]
pos_freq = Counter(pos_tags)
print("Frequency of POS tags:")
for pos, freq in pos_freq.items():
    print(f"{pos}: {freq}")

Frequency of POS tags:
DET: 90
PROPN: 75
ADP: 124
PUNCT: 172
NUM: 8
SPACE: 99
ADV: 65
SCONJ: 20
NOUN: 173
PRON: 108
VERB: 131
ADJ: 54
CCONJ: 61
AUX: 50
PART: 28


# 4. CHALLENGE: What percentage of tokens are nouns?

In [4]:
num_nouns = sum(1 for token in doc if token.pos_ == "NOUN")
total_tokens = len(doc)
percentage_nouns = (num_nouns / total_tokens) * 100

print(f"Percentage of tokens that are nouns: {percentage_nouns:.2f}%")

Percentage of tokens that are nouns: 13.75%


# 5. Display the Dependency Parse for the third sentence

In [10]:
import spacy.displacy as displacy
displacy.render(third_sentence, style='dep', jupyter=True)

# 6. Show the first two named entities from Beatrix Potter's The Tale of Peter Rabbit

In [6]:
named_entities = list(doc.ents)
print("First two named entities:")
for entity in named_entities[:2]:
    print(f"Text: {entity.text}, Label: {entity.label_}")

First two named entities:
Text: The Tale of Peter Rabbit, Label: WORK_OF_ART
Text: Beatrix Potter, Label: PERSON


# 7. How many sentences are contained in The Tale of Peter Rabbit?

In [7]:
# Count the number of sentences
num_sentences = len(list(doc.sents))
print(f"Number of sentences: {num_sentences}")

Number of sentences: 57


# 8. CHALLENGE: How many sentences contain named entities?

In [8]:
# Count sentences that contain named entities
sentences_with_entities = sum(1 for sent in doc.sents if any(ent for ent in sent.ents))
print(f"Number of sentences containing named entities: {sentences_with_entities}")

Number of sentences containing named entities: 38


# 9. Display the named entity visualization for list_of_sents[0] from the previous problem

In [11]:
first_sentence = list(doc.sents)[0]
displacy.render(first_sentence, style='ent', jupyter=True)