In [90]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\allex\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 1.Download it through python (inside the code, so you don't have to upload the file too when you send the solution for this exercise) with urlopen() from module urllib and read the entire text in one single string. If the download takes too much time at each running, download the file, but leave the former instructions in a comment (to show that you know how to access an online file)

In [91]:
from urllib.request import urlopen

def download_text(url):
    try:
        with urlopen(url) as response:
            text = response.read().decode('utf-8')
        return text
    except Exception as e:
        print("Error:", e)
        return None

url = "https://www.gutenberg.org/files/1342/1342-0.txt"

# Download the text
text = download_text(url)

if text:
    # Print the first 500 characters of the text
    print(text[:500])
else:
    print("Failed to download the text.")


﻿The Project Gutenberg eBook of Pride and prejudice, by Jane Austen

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this eBook or online at
www.gutenberg.org. If you are not located in the United States, you
will have to check the laws of the country where you are located before


# 2. Remove the header (keep only the text starting from the title)

In [92]:
def remove_header(text):
    start_of_book = "*** START OF THE PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE ***"
    start_index = text.find(start_of_book)
    if start_index != -1:
        return text[start_index + len(start_of_book):]
    else:
        print("Header pattern not found.")
        return text

text_without_header = remove_header(text)

print(text_without_header[:500])








                            [Illustration:

                             GEORGE ALLEN
                               PUBLISHER

                        156 CHARING CROSS ROAD
                                LONDON

                             RUSKIN HOUSE
                                   ]

                            [Illustration:

               _Reading Jane’s Letters._      _Chap 34._
                                   ]




                                


# 3. Print the number of sentences in the text. Print the average length (number of words) of a sentence

In [93]:
def count_sentences_and_avg_length(text):
    sentences = nltk.sent_tokenize(text) # tokenize text into senteces

    num_sentences = len(sentences) 

    total_words = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences) # tokenize each sentence into words and calculate total of words

    avg_length = total_words / num_sentences if num_sentences > 0 else 0

    return num_sentences, avg_length

num_sentences, avg_length = count_sentences_and_avg_length(text_without_header)

print("Number of sentences:", num_sentences)
print("Average length of a sentence (in terms of number of words):", avg_length)


Number of sentences: 4910
Average length of a sentence (in terms of number of words): 31.418126272912424


# 4. Find the collocations in the text (bigram and trigram). Use the nltk.collocations module You will print them only once not each time they appear.

In [94]:
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

def find_collocations(text):
    # Tokenize the text into words
    words = nltk.word_tokenize(text)

    # Initialize BigramCollocationFinder and TrigramCollocationFinder
    bigram_finder = nltk.BigramCollocationFinder.from_words(words)
    trigram_finder = nltk.TrigramCollocationFinder.from_words(words)

    # Find bigram collocations
    bigram_collocations = bigram_finder.nbest(BigramAssocMeasures.pmi, 10)

    # Find trigram collocations
    trigram_collocations = trigram_finder.nbest(TrigramAssocMeasures.pmi, 10)

    return bigram_collocations, trigram_collocations

# Call the function with the text without the header
bigram_collocations, trigram_collocations = find_collocations(text_without_header)

# Print the bigram collocations
print("Bigram Collocations:")
for bigram in bigram_collocations:
    print(' '.join(bigram))

# Print the trigram collocations
print("\nTrigram Collocations:")
for trigram in trigram_collocations:
    print(' '.join(trigram))


Bigram Collocations:
'AS-IS '
1500 West
20 %
24 Tailpiece
809 North
AGREEMENT WILL
ALLEN PUBLISHER
August 2_
BE LIABLE
CAROLINE BINGLEY.

Trigram Collocations:
809 North 1500
CHARING CROSS ROAD
CONTRACT EXCEPT THOSE
EXCEPT THOSE PROVIDED
Elizabeth._ _GEORGE SAINTSBURY._
Frontispiece iv Title-page
GEORGE ALLEN PUBLISHER
H.T Feb 94
Internal Revenue Service
J. Comyns Carr


# 5. Create a list of all the words (in lower case) from the text, without the punctuation.

In [95]:
words = nltk.word_tokenize(text_without_header)

words_without_punctuation = [word.lower() for word in words if word.isalnum()] # alpha-numeric strings

print(words_without_punctuation[:50])

['illustration', 'george', 'allen', 'publisher', '156', 'charing', 'cross', 'road', 'london', 'ruskin', 'house', 'illustration', 'jane', 's', 'pride', 'and', 'prejudice', 'by', 'jane', 'austen', 'with', 'a', 'preface', 'by', 'george', 'saintsbury', 'and', 'illustrations', 'by', 'hugh', 'thomson', 'illustration', '1894', 'ruskin', '156', 'charing', 'house', 'cross', 'road', 'london', 'george', 'allen', 'chiswick', 'press', 'charles', 'whittingham', 'and', 'tooks', 'court', 'chancery']


# 6. Print the first N most frequent words (alphanumeric strings) together with their number of appearances.

In [96]:
from collections import Counter

# Tokenize the text into words
words = nltk.word_tokenize(text_without_header)

# Remove punctuation from the words and convert to lowercase
words = [word.lower() for word in words if word.isalnum()]

# Calculate word frequencies
word_freq = Counter(words)

# Sort word frequencies by count in descending order
sorted_word_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

# Number of most frequent words to print
N = 20  # Change N to print more or fewer words

# Print the first N most frequent words and their counts
print(f"Top {N} most frequent words:")
for word, freq in sorted_word_freq[:N]:
    print(f"{word}: {freq}")


Top 20 most frequent words:
the: 4832
to: 4377
of: 3950
and: 3816
her: 2248
i: 2097
a: 2090
in: 2033
was: 1870
she: 1732
not: 1629
that: 1619
it: 1571
you: 1392
he: 1349
his: 1288
be: 1279
as: 1238
had: 1180
with: 1145


# 7. Remove stopwords and assign the result to variable lws

In [97]:
from nltk.corpus import stopwords
nltk.download('stopwords')

words = nltk.word_tokenize(text_without_header)

words = [word.lower() for word in words if word.isalnum()]

stop_words = set(stopwords.words('english'))

lws = [word for word in words if word not in stop_words]

print(lws[:20])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\allex\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['illustration', 'george', 'allen', 'publisher', '156', 'charing', 'cross', 'road', 'london', 'ruskin', 'house', 'illustration', 'jane', 'pride', 'prejudice', 'jane', 'austen', 'preface', 'george', 'saintsbury']


# 8. Apply stemming (Porter) on the list of words (lws). Print the first 200 words. Do you see any words that don't appear in the dictionary?

Stemming might produce non-words or words that are not present in a standard English dictionary. Stemming involves reducing words to their root or base form, so the stemmed words may not always correspond to actual words in the English language. Therefore, it's possible that some of the stemmed words won't appear in a dictionary.

Another better approach is using Lemmatization, which aims to return the base or dictionary from of a words using a vocabulary and morphological analysis of words to ensure that the resulting lemma is a valid word in the language

In [98]:
from nltk.stem import PorterStemmer

# Initialize Porter stemmer
porter_stemmer = PorterStemmer()

# Apply stemming to the list of words
stemmed_words = [porter_stemmer.stem(word) for word in lws]

# Print the first 200 stemmed words
print(stemmed_words[:200])

['illustr', 'georg', 'allen', 'publish', '156', 'chare', 'cross', 'road', 'london', 'ruskin', 'hous', 'illustr', 'jane', 'pride', 'prejudic', 'jane', 'austen', 'prefac', 'georg', 'saintsburi', 'illustr', 'hugh', 'thomson', 'illustr', '1894', 'ruskin', '156', 'chare', 'hous', 'cross', 'road', 'london', 'georg', 'allen', 'chiswick', 'press', 'charl', 'whittingham', 'took', 'court', 'chanceri', 'lane', 'london', 'illustr', 'comyn', 'carr', 'acknowledg', 'owe', 'friendship', 'advic', 'illustr', 'grate', 'prefac', 'illustr', 'whitman', 'somewher', 'fine', 'distinct', 'love', 'allow', 'love', 'person', 'distinct', 'appli', 'book', 'well', 'men', 'women', 'case', 'numer', 'author', 'object', 'person', 'affect', 'bring', 'curiou', 'consequ', 'much', 'differ', 'best', 'work', 'case', 'other', 'love', 'allow', 'convent', 'felt', 'right', 'proper', 'thing', 'love', 'sect', 'fairli', 'larg', 'yet', 'unusu', 'choic', 'austenian', 'janit', 'would', 'probabl', 'found', 'partisan', 'claim', 'primaci',

# 9. Print a table of three columns (of size N, where N is the maximum length for the words in the text). The columns will be separated with the character "|". The head of the table will be:
## Porter    |Lancaster |Snowball
# The table will contain only the words that give different stemming results for the three stemmers (for example, suppose that we have both "runs" and "being" inside the text. The word "runs" should not appear in the list, as all three results are "run"; however "being" should appear in the table). The stemming result for the word for each stemmer will appear in the table according to the head of the table. The table will contain the results for the first NW words from the text (the number of rows will obviously be less than NW, as not all words match the requirements). For example, NW=500. Try to print only distinct results inside the table (for example, if a word has two occurnces inside the text, and matches the requirments for appearing in the table, it should have only one corresponding row).

In [99]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
snowball_stemmer = SnowballStemmer('english')
NW = 50

distinct_stemmed_results = set()

table = f"{'Porter':<15} | {'Lancaster':<15} | {'Snowball':<15}\n"

count = 0

for word in words:
    porter_stem = porter_stemmer.stem(word)
    lancaster_stem = lancaster_stemmer.stem(word)
    snowball_stem = snowball_stemmer.stem(word)
    
    # Check if stemming results are different
    if porter_stem != lancaster_stem or porter_stem != snowball_stem:
        # Check if the stemmed result is not already in the set
        if (porter_stem, lancaster_stem, snowball_stem) not in distinct_stemmed_results:
            # Add the stemmed result to the set
            distinct_stemmed_results.add((porter_stem, lancaster_stem, snowball_stem))
            
            # Add the stemmed result to the table
            table += f"{porter_stem:<15} | {lancaster_stem:<15} | {snowball_stem:<15}\n"
            
            count += 1
            
            if count == NW:
                break

# Print the table
print(table)
print(len(distinct_stemmed_results))

Porter          | Lancaster       | Snowball       
illustr         | illust          | illustr        
allen           | al              | allen          
publish         | publ            | publish        
chare           | char            | chare          
jane            | jan             | jane           
pride           | prid            | pride          
prejudic        | prejud          | prejudic       
austen          | aust            | austen         
saintsburi      | saintsbury      | saintsburi     
chanceri        | chancery        | chanceri       
lane            | lan             | lane           
carr            | car             | carr           
all             | al              | all            
owe             | ow              | owe            
hi              | his             | his            
friendship      | friend          | friendship     
advic           | adv             | advic          
these           | thes            | these          
are         

# 10. Print a table of two columns, simillar to the one above, that will compare the results of stemming and lemmatization. The head of the table will contain the values: "Snowball" and "WordNetLemmatizer". The table must contain only words that give different results in the process of stemming and lemmatization (for example, the word "running"). The table will contain the results for the first NW words from the text (the number of rows will obviously be less than NW, as not all words match the requirements). For example, NW=500. Try to print only distinct results inside the table (for example, if a word has two occurnces inside the text, and matches the requirments for appearing in the table, it should have only one corresponding row).

In [100]:
from nltk.stem import SnowballStemmer, WordNetLemmatizer

nltk.download('wordnet')

snowball_stemmer = SnowballStemmer('english')
wordnet_lemmatizer = WordNetLemmatizer()

distinct_results = set()

table = f"{'Snowball':<15} | {'WordNetLemmatizer':<15}\n"

count = 0

for word in words:
    snowball_stem = snowball_stemmer.stem(word)
    wordnet_lemma = wordnet_lemmatizer.lemmatize(word)
    
    if snowball_stem != wordnet_lemma:
        if (snowball_stem, wordnet_lemma) not in distinct_results:
            distinct_results.add((snowball_stem, wordnet_lemma))
            
            table += f"{snowball_stem:<15} | {wordnet_lemma:<15}\n"
            
            count += 1
            
            if count == NW:
                break

# Print the table
print(table)


Snowball        | WordNetLemmatizer
illustr         | illustration   
georg           | george         
publish         | publisher      
chare           | charing        
hous            | house          
prejudic        | prejudice      
prefac          | preface        
saintsburi      | saintsbury     
charl           | charles        
took            | tooks          
chanceri        | chancery       
comyn           | comyns         
acknowledg      | acknowledgment 
advic           | advice         
grate           | gratefully     
has             | ha             
somewher        | somewhere      
distinct        | distinction    
love            | loving         
allow           | allowance      
person          | personal       
appli           | applies        
as              | a              
women           | woman          
veri            | very           
numer           | numerous       
affect          | affection      
bring           | brings         
consequ     

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\allex\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 11. Print the first N most frequent lemmas (after the removal of stopwords) together with their number of appearances.

In [101]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter

words = nltk.word_tokenize(text_without_header)

# Remove punctuation from the words and convert to lowercase
words = [word.lower() for word in words if word.isalnum()]

stop_words = set(stopwords.words('english'))

words_without_stopwords = [word for word in words if word not in stop_words]

wordnet_lemmatizer = WordNetLemmatizer()

lemmas = [wordnet_lemmatizer.lemmatize(word) for word in words_without_stopwords]

lemma_freq = Counter(lemmas)

sorted_lemma_freq = sorted(lemma_freq.items(), key=lambda x: x[1], reverse=True)

N = 20

print(f"Top {N} most frequent lemmas (after removing stopwords):")
for lemma, freq in sorted_lemma_freq[:N]:
    print(f"{lemma}: {freq}")


Top 20 most frequent lemmas (after removing stopwords):
elizabeth: 643
could: 530
would: 482
darcy: 424
said: 406
bennet: 346
much: 333
must: 321
miss: 315
bingley: 307
jane: 302
one: 293
sister: 288
lady: 279
know: 248
though: 238
never: 228
think: 222
time: 219
may: 216


# 12. Change all the numbers from lws into words. Print the number of changes, and also the portion of list that contains first N changes (for example N=10)

In [102]:
import sys
import inflect

p = inflect.engine()

changes_count = 0

N = 10
first_N_changes = []

min_index = sys.maxsize
max_index = 0

for i, word in enumerate(lws):
    if word.isdigit(): 
        # Convert number to word representation
        word_in_words = p.number_to_words(word)
        # Replace number with word representation
        lws[i] = word_in_words
        
        changes_count += 1
        if len(first_N_changes) < N:
            if i < min_index:
                min_index = i
            if i > max_index:
                max_index = i
            
            first_N_changes.append((word, word_in_words))

print("Number of changes:", changes_count)

print(f"\nFirst {N} changes:")
for number, word in first_N_changes:
    print(f"{number} -> {word}")

print("\nFirst 50 elements of lws after changes:")
print(lws[min_index:max_index][:50])

Number of changes: 161

First 10 changes:
156 -> one hundred and fifty-six
1894 -> one thousand, eight hundred and ninety-four
156 -> one hundred and fifty-six
1796 -> one thousand, seven hundred and ninety-six
1813 -> one thousand, eight hundred and thirteen
1 -> one
2 -> two
5 -> five
6 -> six
9 -> nine

First 50 elements of lws after changes:
['one hundred and fifty-six', 'charing', 'cross', 'road', 'london', 'ruskin', 'house', 'illustration', 'jane', 'pride', 'prejudice', 'jane', 'austen', 'preface', 'george', 'saintsbury', 'illustrations', 'hugh', 'thomson', 'illustration', 'one thousand, eight hundred and ninety-four', 'ruskin', 'one hundred and fifty-six', 'charing', 'house', 'cross', 'road', 'london', 'george', 'allen', 'chiswick', 'press', 'charles', 'whittingham', 'tooks', 'court', 'chancery', 'lane', 'london', 'illustration', 'comyns', 'carr', 'acknowledgment', 'owe', 'friendship', 'advice', 'illustrations', 'gratefully', 'preface', 'illustration']


# 13. Create a function that receives an integer N and a word W as parameter (it can also receive the list of words from the text). We want to print the concordance data for that word. This means printing the window of text (words on consecutive positions) of length N, that has the givend word W in the middle. For example, for the text ""I have two dogs and a cat. Do you have pets too? My cat likes to chase mice. My dogs like to chase my cat." and a window of length 3, the concordance data for the word "cat" would be ["dogs", "cat", "pets"] and ["pets","cat", "likes"] (we consider the text without stopwords and punctuation). However, as you can see, the window of text may contain words from different sentences. Create a second function that prints windows of texts that contain words only from the phrase containing word W. We want to print concordance data for all the inflexions of word W.

In [103]:
import nltk
from nltk.tokenize import word_tokenize

def concordance_data_all_sentences(text, N, W):
    words = [word.lower() for word in word_tokenize(text) if word.isalnum()]
    for i, word in enumerate(words):
        if word == W:
            start_index = max(0, i - N)
            end_index = min(len(words), i + N + 1)
            window = words[start_index:end_index]
            print(window)

def concordance_data_same_sentence(text, N, W):
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        words = [word.lower() for word in word_tokenize(sentence) if word.isalnum()]
        for i, word in enumerate(words):
            if word == W:
                start_index = max(0, i - N)
                end_index = min(len(words), i + N + 1)
                window = words[start_index:end_index]
                print(window)

# Example usage:
text = "I have two dogs and a cat. Do you have pets too? My cat likes to chase mice. My dogs like to chase my cat."
N = 1
W = "cat"

print("Concordance Data (All Sentences):")
concordance_data_all_sentences(text, N, W)

print("\nConcordance Data (Same Sentence):")
concordance_data_same_sentence(text, N, W)


Concordance Data (All Sentences):
['a', 'cat', 'do']
['my', 'cat', 'likes']
['my', 'cat']

Concordance Data (Same Sentence):
['a', 'cat']
['my', 'cat', 'likes']
['my', 'cat']
