<a href="https://colab.research.google.com/github/sabiretutardemir/100-Day-ML-Codes-Challenge/blob/main/NLP_Harry_Potter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load the Harry Potter book (`harry_potter.txt`). You can find this text corpus in the
datasets folder.

* Segment the text of the book into sentences. How many sentences does this book have?

* Compute the frequency of each token in the book. What are the most frequent tokens?

* Choose a sentence from the book. Analyze this chosen sentence by
  * Calculating all n-grams.
  * Finding POS tags of tokens.
  * Stemming and lemmatizing tokens.

* Check the documentation to identify the most important hyperparameters, attributes, and methods. Use them in practice.

In [1]:
# Read the whole Harry Potter corpus into a single string
book_path = '/content/harry_potter.txt'

with open(book_path, mode='r', encoding='utf-8') as file:
    book_text = file.read()

# book_text is the input for both the NLTK and the spaCy cells below;
# show the first 500 characters as a quick sanity check of the load
preview = book_text[:500]
print(preview)


CHAPTER ONE THE BOY WHO LIVED 

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. 

Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and 


In [2]:
# How many sentences does this book have? (NLTK version)
import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize relies on the Punkt sentence models. Older NLTK releases load
# them from the 'punkt' package, while NLTK 3.9+ looks for 'punkt_tab' and
# raises LookupError without it, so fetch both to work on either version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Split the raw book text into a list of sentence strings and count them
sentences = sent_tokenize(book_text)
num_sentences = len(sentences)
print(f"The book has {num_sentences} sentences.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


The book has 6394 sentences.


In [3]:
# Sentence segmentation once more, this time with spaCy
import spacy

# Load the en_core_web_sm pipeline
nlp = spacy.load("en_core_web_sm")

# Run the full book through the pipeline to get a spaCy Doc
doc = nlp(book_text)

# doc.sents yields one span per detected sentence; keep only the text
sentences = [span.text for span in doc.sents]

# Report how many sentences spaCy found
sentence_total = len(sentences)
print(f'Total number of sentences: {sentence_total}')


Total number of sentences: 6186


In [4]:

from collections import Counter
from nltk.tokenize import word_tokenize

# Lowercase the book first so that e.g. "The" and "the" are counted together,
# then split it into word-level tokens (punctuation marks are tokens too)
tokens = word_tokenize(book_text.lower())

# Tally how many times each token occurs
frequency = Counter()
frequency.update(tokens)

# Ten highest-count (token, count) pairs, most frequent first
most_common_tokens = frequency.most_common(10)

print("Most frequent tokens:", most_common_tokens)


Most frequent tokens: [(',', 5658), ('.', 5112), ('the', 3625), ("''", 2443), ('``', 2305), ('and', 1916), ('to', 1855), ('he', 1756), ('a', 1688), ('harry', 1324)]


In [6]:
# Show the current top-10 (token, count) list as this cell's output value
most_common_tokens

[(',', 5658),
 ('.', 5112),
 ('the', 3625),
 ("''", 2443),
 ('``', 2305),
 ('and', 1916),
 ('to', 1855),
 ('he', 1756),
 ('a', 1688),
 ('harry', 1324)]

In [7]:
# Process the lowercased book with spaCy so counts are case-insensitive
doc = nlp(book_text.lower())

# Keep the text of every token that is neither punctuation nor whitespace
tokens = [tok.text for tok in doc if not (tok.is_punct or tok.is_space)]

# Tally how many times each remaining token occurs
frequency = Counter()
frequency.update(tokens)

# Ten highest-count (token, count) pairs, most frequent first
most_common_tokens = frequency.most_common(10)

print("Most frequent tokens:", most_common_tokens)

Most frequent tokens: [('the', 3627), ('and', 1919), ('to', 1861), ('he', 1756), ('a', 1690), ('harry', 1325), ('of', 1266), ('was', 1261), ('it', 1185), ('you', 1027)]


In [8]:
# Show the current top-10 (token, count) list as this cell's output value
most_common_tokens

[('the', 3627),
 ('and', 1919),
 ('to', 1861),
 ('he', 1756),
 ('a', 1690),
 ('harry', 1325),
 ('of', 1266),
 ('was', 1261),
 ('it', 1185),
 ('you', 1027)]

In [9]:
import random

# Fixed seed so that every fresh "Restart & Run All" analyzes the same
# sentence; change the value to explore a different one.
SENTENCE_SEED = 42

# 1. Tokenize the text into sentences with NLTK (this replaces the spaCy
#    sentence list that was assigned to `sentences` earlier)
sentences = sent_tokenize(book_text)

# 2. Choose a pseudo-random sentence using a dedicated generator, so the
#    global state of the `random` module is left untouched
random_sentence = random.Random(SENTENCE_SEED).choice(sentences)


In [10]:
import nltk
from nltk import word_tokenize, ngrams, pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

# Resources needed below. Older NLTK releases load the tagger model from
# 'averaged_perceptron_tagger', while NLTK 3.9+ looks for
# 'averaged_perceptron_tagger_eng', so fetch both to work on either version.
for resource in ('averaged_perceptron_tagger',
                 'averaged_perceptron_tagger_eng',
                 'wordnet'):
    nltk.download(resource)


def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag (e.g. 'VBD') to the matching WordNet POS.

    Adjective (J*), verb (V*) and adverb (R*) tags are mapped explicitly;
    every other tag falls back to noun, which is also the default POS of
    WordNetLemmatizer.lemmatize.
    """
    prefix_to_pos = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)


# The sentence to analyze: the one picked at random from the book earlier
sentence = random_sentence

# 1. Tokenize the sentence
tokens = word_tokenize(sentence.lower())  # Lowercased for consistency

# 2. Calculate all n-grams (unigrams, bigrams, trigrams) as tuples of tokens
unigrams = list(ngrams(tokens, 1))
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))

print("Unigrams:", unigrams)
print("Bigrams:", bigrams)
print("Trigrams:", trigrams)

# 3. Find POS tags: a list of (token, Penn Treebank tag) pairs
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)

# 4. Stemming (PorterStemmer)
stemmer = PorterStemmer()
stems = [stemmer.stem(token) for token in tokens]
print("Stems:", stems)

# 5. Lemmatization (WordNetLemmatizer)
# Pass each token's POS to the lemmatizer. Without it every token is treated
# as a noun, so verb forms such as 'opened' would be returned unchanged.
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(token, _wordnet_pos(tag)) for token, tag in pos_tags]
print("Lemmas:", lemmas)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Unigrams: [('hermione',), ('opened',), ('her',), ('mouth',), (',',), ('perhaps',), ('to',), ('tell',), ('ron',), ('exactly',), ('how',), ('to',), ('use',), ('the',), ('curse',), ('of',), ('the',), ('bogies',), (',',), ('but',), ('harry',), ('hissed',), ('at',), ('her',), ('to',), ('be',), ('quiet',), ('and',), ('beckoned',), ('them',), ('all',), ('forward',), ('.',)]
Bigrams: [('hermione', 'opened'), ('opened', 'her'), ('her', 'mouth'), ('mouth', ','), (',', 'perhaps'), ('perhaps', 'to'), ('to', 'tell'), ('tell', 'ron'), ('ron', 'exactly'), ('exactly', 'how'), ('how', 'to'), ('to', 'use'), ('use', 'the'), ('the', 'curse'), ('curse', 'of'), ('of', 'the'), ('the', 'bogies'), ('bogies', ','), (',', 'but'), ('but', 'harry'), ('harry', 'hissed'), ('hissed', 'at'), ('at', 'her'), ('her', 'to'), ('to', 'be'), ('be', 'quiet'), ('quiet', 'and'), ('and', 'beckoned'), ('beckoned', 'them'), ('them', 'all'), ('all', 'forward'), ('forward', '.')]
Trigrams: [('hermione', 'opened', 'her'), ('opened', 

In [11]:
import spacy

# Load the en_core_web_sm pipeline
nlp = spacy.load("en_core_web_sm")

# 1. Run the lowercased sentence through spaCy
doc = nlp(sentence.lower())

# 2. Build the n-grams by hand from the token texts: unigrams are the tokens
#    themselves, bigrams/trigrams are space-joined runs of 2/3 adjacent tokens
unigrams = [tok.text for tok in doc]
bigrams = [' '.join(pair) for pair in zip(unigrams, unigrams[1:])]
trigrams = [' '.join(triple) for triple in zip(unigrams, unigrams[1:], unigrams[2:])]

print("Unigrams:", unigrams)
print("Bigrams:", bigrams)
print("Trigrams:", trigrams)

# 3. POS tags as (token text, coarse POS label) pairs
pos_tags = []
for tok in doc:
    pos_tags.append((tok.text, tok.pos_))
print("POS Tags:", pos_tags)

# 4. Stemming: spaCy has no stemmer, so the lemma is used as the closest substitute
# 5. Lemmatization with spaCy's built-in lemmatizer
lemmas = [tok.lemma_ for tok in doc]
print("Lemmas:", lemmas)


Unigrams: ['hermione', 'opened', 'her', 'mouth', ',', 'perhaps', 'to', 'tell', 'ron', 'exactly', 'how', 'to', 'use', 'the', 'curse', 'of', 'the', 'bogies', ',', 'but', 'harry', 'hissed', 'at', 'her', 'to', 'be', 'quiet', 'and', 'beckoned', 'them', 'all', 'forward', '.']
Bigrams: ['hermione opened', 'opened her', 'her mouth', 'mouth ,', ', perhaps', 'perhaps to', 'to tell', 'tell ron', 'ron exactly', 'exactly how', 'how to', 'to use', 'use the', 'the curse', 'curse of', 'of the', 'the bogies', 'bogies ,', ', but', 'but harry', 'harry hissed', 'hissed at', 'at her', 'her to', 'to be', 'be quiet', 'quiet and', 'and beckoned', 'beckoned them', 'them all', 'all forward', 'forward .']
Trigrams: ['hermione opened her', 'opened her mouth', 'her mouth ,', 'mouth , perhaps', ', perhaps to', 'perhaps to tell', 'to tell ron', 'tell ron exactly', 'ron exactly how', 'exactly how to', 'how to use', 'to use the', 'use the curse', 'the curse of', 'curse of the', 'of the bogies', 'the bogies ,', 'bogies