# Text Acquisition & Ingestion Assignment

In [1]:
#!pip install feedparser

In [2]:
import json
import requests
import feedparser
from bs4 import BeautifulSoup
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

import os
import nltk
# nltk.download('punkt')

In [3]:
def corpus_stats(corpus):
    print("Corpus Statistics")
    print("Number of documents: " + str(len(corpus.fileids())))
    print("Number of paragraphs: " + str(len(corpus.paras())))
    print("Number of sentences: " + str(len(corpus.sents())))
    print("Number of words: " + str(len(corpus.words())))
    print("Vocabulary: " + str(len(set(w.lower() for w in corpus.words()))))
    print("Avg chars per word: " + str(round(len(corpus.raw())/len(corpus.words()),1)))
    print("Avg words per sentence: " + str(round(len(corpus.words())/len(corpus.sents()),1)))

### Iterate through the list of article URLs below, scraping the text from each one and saving it to a text file. 

In [4]:
articles = ['http://lite.cnn.io/en/article/h_eac18760a7a7f9a1bf33616f1c4a336d',
            'http://lite.cnn.io/en/article/h_de3f82f17d289680dd2b47c6413ebe7c',
            'http://lite.cnn.io/en/article/h_72f4dc9d6f35458a89af014b62e625ad',
            'http://lite.cnn.io/en/article/h_aa21fe6bf176071cb49e09d422c3adf0',
            'http://lite.cnn.io/en/article/h_8ad34a532921c9076cdc9d7390d2f1bc',
            'http://lite.cnn.io/en/article/h_84422c79110d9989177cfaf1c5f45fe7',
            'http://lite.cnn.io/en/article/h_d010d9580abac3a44c6181ec6fb63d58',
            'http://lite.cnn.io/en/article/h_fb11f4e9d7c5323e75b337d9e9e5e368',
            'http://lite.cnn.io/en/article/h_7b27f0b131067f8ece6238ac559670ab',
            'http://lite.cnn.io/en/article/h_8cae7f735fa9573d470f802063ceffe2',
            'http://lite.cnn.io/en/article/h_72c3668280e82576fcc2602b0fa70c14',
            'http://lite.cnn.io/en/article/h_d20658fb0e20212051cda0e0a7248c8a',
            'http://lite.cnn.io/en/article/h_56611c43d7928120d2ae21666ccc7417',
            'http://lite.cnn.io/en/article/h_bda0394e3c5ee7054ee65c022bca7695']

### Ingest the text files generated via web scraping into a corpus and print the corpus statistics.

In [5]:
folder = 'cnn_lite'
if folder not in os.listdir():
    os.mkdir(folder)

In [6]:
for i, art in enumerate(articles):
    afile = open(folder + "/articles_text" + str(i) + ".p", "wb" )
    response = requests.get(art)
    content = response.text
    
    soup = BeautifulSoup(content, 'lxml')
    title = soup.find('div', class_='afe4286c').h2.text
    afile.write(title.encode("UTF-8"))
    
    text = soup.find('div', class_='afe4286c').find_all('p')
    text_list = [tag.get_text() for tag in text]
    
    for line in text_list:
         afile.write(line.encode("UTF-8"))
    
    afile.close() 

In [7]:
PATH = folder + '/'
DOC_PATTERN = r'articles_text.*\.p'
corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)
corpus_stats(corpus)

Corpus Statistics
Number of documents: 14
Number of paragraphs: 14
Number of sentences: 427
Number of words: 13668
Vocabulary: 2927
Avg chars per word: 5.0
Avg words per sentence: 32.0


### Parse the O'Reilly Radar RSS feed below, extract the text from each post, and save it to a text file.

The content of each post contains HTML tags. Strip those out using the same approach you used for web scraping so that only text is saved to the files.

In [8]:
feed = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [9]:
parsed = feedparser.parse(feed)
posts = parsed.entries

In [10]:
folder = 'oreilly_radar'
if folder not in os.listdir():
    os.mkdir(folder)

In [11]:
for i, post in enumerate(posts):   
    content = posts[i]['content']
    content = content[0]['value']
    soup = BeautifulSoup(content, 'lxml')

    text = soup.find('body').find_all(['p', 'li', 'h3', 'a'])
    text_list = [tag.get_text() for tag in text]

    afile = open(folder + "/articles_rss" + str(i) + ".p", "wb" )

    for line in text_list:
        afile.write(line.encode("UTF-8"))
    
    afile.close() 

### Ingest the text files generated via RSS parsing into a corpus and print the corpus statistics.

In [12]:
PATH = folder + '/'
DOC_PATTERN = r'articles_rss.*\.p'
corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)
corpus_stats(corpus)

Corpus Statistics
Number of documents: 60
Number of paragraphs: 61
Number of sentences: 1810
Number of words: 59115
Vocabulary: 6899
Avg chars per word: 5.2
Avg words per sentence: 32.7


### Make an API call to the Hacker News API to retrieve their Ask, Show, and Job category items. 

- URL: https://hacker-news.firebaseio.com/v0/askstories.json

In [13]:
url = 'https://hacker-news.firebaseio.com/v0/askstories.json'
response = requests.get(url)
items = json.loads(response.content)

### Once you have retrieved the item IDs from the URL above, retrieve each item by adding the item ID to the URL below, extract the item's text property, and save the text from each item to disk as its own document.

- URL: https://hacker-news.firebaseio.com/v0/item/ITEM_ID_HERE.json

The content of some items may contain HTML tags. Strip those out using the same approach you used for web scraping so that only text is saved to the files.

In [14]:
folder = 'hacker_news'
if folder not in os.listdir():
    os.mkdir(folder)

In [15]:
for i, item in enumerate(items):
    url = 'https://hacker-news.firebaseio.com/v0/item/' + str(item) + '.json'
    response = requests.get(url)
    content = response.text
    
    obj = eval(content)
        
    if ('text' in obj.keys()):
        text = obj['text']
        afile = open(folder + "/articles_api" + str(i) + ".p", "wb" )
        afile.write(text.encode("UTF-8"))
        afile.close() 

### Ingest the text files generated via API into a corpus and print the corpus statistics.

In [16]:
PATH = folder + '/'
DOC_PATTERN = r'articles_api.*\.p'
corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)
corpus_stats(corpus)

Corpus Statistics
Number of documents: 48
Number of paragraphs: 48
Number of sentences: 178
Number of words: 5808
Vocabulary: 1458
Avg chars per word: 4.3
Avg words per sentence: 32.6
