In [1]:
%%capture
!pip install boilerpipe3

In [4]:
import json
import time
from boilerpipe.extract import Extractor

In [3]:
def get_content(url):
    view_logs = True
    
    if view_logs:
        print("Trying to fetch content from: " + str(url), flush = True)
    
    try:
        extractor = Extractor(extractor = 'ArticleExtractor', url = url)
        # Try out 'KeepEverythingExtractor'
        
        text = extractor.getText()
        # Try out extractor.getHTML()
        
        text = str(text)
        
        text = text.replace("\n", " ")
        
        mapping = [url, text]

        # Uncomment to Test on Small Datasets
        if view_logs:
            print("Successful Retrieval", flush = True)
        return mapping

    except Exception as E:
        if view_logs:
            print("Retrieval Failed", flush = True)

        mapping = [url, "!FAILURE!"]
        return mapping

In [4]:
url = "https://www.indiatoday.in/india/story/new-ac-3-tier-economy-class-coach-fare-lower-than-regular-railways-1846614-2021-08-29"

begin = time.time()
content = get_content(url)

end = time.time()
duration = int(end - begin)
print("Time Taken: " + str(int(duration / 60)) + " minutes, " + str(int(duration % 60)) + " seconds", flush = True)

Trying to fetch content from: https://www.indiatoday.in/india/story/new-ac-3-tier-economy-class-coach-fare-lower-than-regular-railways-1846614-2021-08-29
Successful Retrieval
Time Taken: 0 minutes, 1 seconds


In [5]:
view_now = False

if view_now:
    print(content)

In [6]:
with open('content.txt', "w+") as f:
    f.write(content[1])

In [5]:
%%capture
# nltk ~ covering the basics

!pip install nltk

import nltk
nltk.download('punkt')

In [13]:
# Processing the Extracted Text
from_above = False

if from_above:
    text = content[1]
else:
    text = text = '“Mr. Ocean lived in the U.S. … not anymore?”, asked Noah. Marvin replied that Mr. Ocean moved back to England. Noah was somewhat surprised.'
    
display = True

if display:
    print('Raw Text: ')
    print(text)

Raw Text: 
“Mr. Ocean lived in the U.S. … not anymore?”, asked Noah. Marvin replied that Mr. Ocean moved back to England. Noah was somewhat surprised.


In [14]:
# Extracting sentences

from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(text)

print('Number of sentences: ' + str(len(sentences)))
print()

if display:
    for index, sentence in enumerate(sentences):
        print(str(index + 1) + '. ' + sentence)

Number of sentences: 5

1. “Mr.
2. Ocean lived in the U.S. … not anymore?”, asked Noah.
3. Marvin replied that Mr.
4. Ocean moved back to England.
5. Noah was somewhat surprised.


In [17]:
type(sentences)

if not from_above:
    sentences = ['“Mr. Ocean lived in the U.S. … not anymore?”, asked Noah.', 'Marvin replied that Mr. Ocean moved back to England.', 'Noah was somewhat surprised.']

for index, sent in enumerate(sentences):
    sentences[index] = sent.lower()
    
print(sentences)

['“mr. ocean lived in the u.s. … not anymore?”, asked noah.', 'marvin replied that mr. ocean moved back to england.', 'noah was somewhat surprised.']


In [22]:
# Tokenization of sentences into words and punctuations

import random
from nltk.tokenize import word_tokenize

tokenized = []

for sentence in sentences:
    tokens = word_tokenize(sentence)
    tokenized.append(tokens)
    
if display:
    choice = int(random.random() * len(tokenized))
    print('Sentence ' + str(choice + 1) + ':')
    print(tokenized[choice])
    
# NER Named Entity Recognition, Phrase identification ~ tokenize first, do not stem

Sentence 3:
['noah', 'was', 'somewhat', 'surprised', '.']


In [23]:
# Removing punctuations and stopwords
import string

nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

In [24]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [29]:
def presence_alnum(token):
    for char in token:
        if char.isalnum():
            return True
    return False

words = []

for unit in tokenized:
    stream = [token for token in unit if
              token not in string.punctuation
              and presence_alnum(token)
              and token not in stop_words]
    
    words.append(stream)
    
if display:
    choice = int(random.random() * len(tokenized))
    print('Sentence ' + str(choice + 1) + ':')
    print(words[choice])

Sentence 3:
['noah', 'somewhat', 'surprised']


In [32]:
# Stemming
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

porter = PorterStemmer()
snowstem = SnowballStemmer(language = 'english')

porter_stemmed = []
snowball_stemmed = []

for unit in words:
    porter_stream = [porter.stem(token.lower()) for token in unit]
    snowball_stream = [snowstem.stem(token.lower()) for token in unit]
    
    porter_stemmed.append(porter_stream)
    snowball_stemmed.append(snowball_stream)
    
see_ids = []

# Visalizing stemmed content
for index in range(len(porter_stemmed)):
    if porter_stemmed[index] != snowball_stemmed[index]:
        see_ids.append(index)
        
        print('Raw Word Tokens | Sentence ' + str(index + 1) + ':')
        print(words[index])
        print()
        
        print('Using Porter Stemmer | Sentence ' + str(index + 1) + ':')
        print(porter_stemmed[index])
        print()

        print('Using Snowball Stemmer | Sentence ' + str(index + 1) + ' :')
        print(snowball_stemmed[index])
        print()

[['mr', 'ocean', 'live', 'u.s.', 'anymor', 'ask', 'noah'],
 ['marvin', 'repli', 'mr.', 'ocean', 'move', 'back', 'england'],
 ['noah', 'somewhat', 'surpris']]

In [14]:
# Lemmatization
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

In [15]:
lemmas = []

for unit in words:
    lemma_stream = [lemmatizer.lemmatize(token.lower()) for token in unit]
    lemmas.append(lemma_stream)
    
for index in see_ids:
    print('Raw Word Tokens | Sentence ' + str(index + 1) + ':')
    print(words[index])
    print()
    
    print('Using WordNet Lemmatizer | Sentence ' + str(index + 1) + ':')
    print(lemmas[index])
    print()

Raw Word Tokens | Sentence 9:
['These', 'passenger-friendly', 'coaches', '83', 'berths', 'fare', 'lower', 'compared', 'regular', '3AC', 'coaches', 'said']

Using WordNet Lemmatizer | Sentence 9:
['these', 'passenger-friendly', 'coach', '83', 'berth', 'fare', 'lower', 'compared', 'regular', '3ac', 'coach', 'said']

Raw Word Tokens | Sentence 13:
['According', 'officials', 'booking', 'tickets', 'passes', 'issued', 'Members', 'Parliament', 'rail', 'travel', 'coupons', 'RTCs', 'issued', 'MLA/MLCs', 'fully', 'reimbursable', 'warrants/vouchers', 'shall', 'permissible', 'per', 'existing', 'provision', '3AC', 'coaches', 'mail/express', 'trains']

Using WordNet Lemmatizer | Sentence 13:
['according', 'official', 'booking', 'ticket', 'pass', 'issued', 'member', 'parliament', 'rail', 'travel', 'coupon', 'rtcs', 'issued', 'mla/mlcs', 'fully', 'reimbursable', 'warrants/vouchers', 'shall', 'permissible', 'per', 'existing', 'provision', '3ac', 'coach', 'mail/express', 'train']

Raw Word Tokens | Sent

In [16]:
# That's all