In [12]:
import io
import json
import csv
import string
import datetime as dt
from pipeline import Pipeline
from stop_words import stop_words
from pipeline import build_csv

**The pipeline object arranges the tasks as a directed acyclic graph (DAG) and executes them in dependency order**

In [13]:
# Single Pipeline instance for this notebook; every task function in the cells
# below is decorated with `@pipeline.task(...)` on this object.
pipeline = Pipeline()

**This task will read all the top stories of 2014 stored in the JSON file into a dictionary object**
<br>
The stories key contains a list of stories where each story is a dictionary

In [14]:
@pipeline.task()
def file_to_json():
    """Load the Hacker News stories from the JSON data file.

    Returns:
        The value stored under the top-level ``'stories'`` key of
        ``Data/hn_stories_2014.json``.

    Raises:
        FileNotFoundError: if the data file does not exist.
        KeyError: if the parsed document has no ``'stories'`` key.
    """
    # Pin the encoding: without it open() falls back to the platform's locale
    # encoding, which can mis-decode non-ASCII characters in story titles.
    with open('Data/hn_stories_2014.json', 'r', encoding='utf-8') as json_file:
        document = json.load(json_file)
    return document['stories']


**Out of all the stories we will filter out the stories with following criteria:**
* Number of comments less than or equal to 1
* Number of points less than or equal to 50
* Title starts with "Ask HN" (stories posted as questions to the community)

In [15]:
@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    """Lazily keep the popular stories that are not "Ask HN" posts.

    A story is kept when it has more than 1 comment, more than 50 points,
    and a title that does not start with 'Ask HN'.

    Args:
        stories: iterable of story dicts exposing the 'num_comments',
            'points' and 'title' keys.

    Returns:
        A generator over the stories that pass all three checks.
    """
    def is_wanted(story):
        # Checks run in this order and stop at the first one that fails.
        if not (story['num_comments'] > 1):
            return False
        if not (story['points'] > 50):
            return False
        return not story['title'].startswith('Ask HN')

    return (story for story in stories if is_wanted(story))

**The following fields are selected from each story and stored in a CSV file**
* *objectID* : Unique Id of a story
* *created_at* : Time of creation of the story
* *url* : URL of the story
* *points* : Number of points each story received
* *title* : Title of the story

In [16]:
@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories):
    """Project each story onto five columns and hand the rows to ``build_csv``.

    The columns are objectID, created_at, url, points and title;
    ``created_at`` is parsed from its "%Y-%m-%dT%H:%M:%SZ" string into a
    ``datetime`` before being passed on.

    Args:
        stories: iterable of story dicts containing the five keys above.

    Returns:
        Whatever ``build_csv`` returns when given the rows, the header and a
        fresh in-memory ``io.StringIO`` as its target file.
    """
    columns = ['objectID', 'created_at', 'url', 'points', 'title']
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    def to_row(story):
        # One tuple per story, in the same order as `columns`.
        return (
            story['objectID'],
            dt.datetime.strptime(story['created_at'], timestamp_format),
            story['url'],
            story['points'],
            story['title'],
        )

    # Rows stay lazy; they are only produced as build_csv consumes them.
    rows = (to_row(story) for story in stories)
    return build_csv(rows, header=columns, file=io.StringIO())

**The below task returns a generator expression of all the titles of the stories stored in CSV file**

In [17]:
@pipeline.task(depends_on=json_to_csv)
def extract_titles(file):
    """Lazily yield the 'title' column from a comma-separated file.

    The first row is consumed as the header and used to locate the column.

    Args:
        file: iterable of CSV lines (e.g. an open text file object).

    Returns:
        A generator over the 'title' field of every remaining row.

    Raises:
        StopIteration: if the file has no rows at all.
        ValueError: if the header has no 'title' column.
    """
    rows = csv.reader(file, delimiter=',')
    # Consuming the header here leaves only data rows for the generator below.
    title_col = next(rows).index('title')
    return (row[title_col] for row in rows)

**The task below returns the titles as a generator expression after lowercasing them and removing punctuation**

In [18]:

@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    """Lazily lowercase each title and strip ASCII punctuation from it.

    Args:
        titles: iterable of title strings.

    Returns:
        A generator over the cleaned titles, in the same order as the input.
    """
    # Build the translation table once rather than once per title; it deletes
    # every character listed in string.punctuation.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return (
        title.lower().translate(strip_punctuation)
        for title in titles
    )


**The below task returns a dictionary of word frequencies for the words present in the title**
<br>
Stop words are excluded from the counts

In [19]:
@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(titles):
    """Count how often each non-stop-word occurs across all titles.

    Args:
        titles: iterable of title strings; each is split on whitespace.

    Returns:
        A dict mapping each word not found in ``stop_words`` to its number
        of occurrences, keyed in order of first appearance.
    """
    word_freq = {}
    for title in titles:
        # str.split() with no arguments never yields empty strings, so no
        # separate length check is needed.
        for word in title.split():
            if word not in stop_words:
                word_freq[word] = word_freq.get(word, 0) + 1
    return word_freq

**The below task returns the top 100 words with highest frequency**

In [20]:
@pipeline.task(depends_on=build_keyword_dictionary)
def sort_words(word_freq):
    """Return the 100 most frequent words with their counts.

    Args:
        word_freq: dict mapping a word to its occurrence count.

    Returns:
        A list of at most 100 ``(word, count)`` tuples ordered by count,
        highest first; words with equal counts keep their dict order.
    """
    ranked = list(word_freq.items())
    # The sort is stable, so ties stay in insertion order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:100]

**The *run* method will run these tasks in the order in which they are registered**

In [21]:
# Run the pipeline; `output` is indexed by task function in the next cell
# (output[sort_words]).
output = pipeline.run()

In [22]:
# Print the entry of `output` keyed by the sort_words task, whose function
# returns up to 100 (word, count) pairs ordered by descending count.
print(output[sort_words])

[('new', 185), ('google', 167), ('bitcoin', 101), ('open', 92), ('programming', 90), ('web', 88), ('data', 85), ('video', 79), ('python', 76), ('code', 72), ('facebook', 71), ('released', 71), ('using', 70), ('2013', 65), ('javascript', 65), ('free', 64), ('source', 64), ('game', 63), ('internet', 62), ('microsoft', 59), ('c', 59), ('linux', 58), ('app', 57), ('pdf', 55), ('work', 54), ('language', 54), ('software', 52), ('2014', 52), ('startup', 51), ('apple', 50), ('use', 50), ('make', 50), ('time', 48), ('yc', 48), ('security', 48), ('nsa', 45), ('github', 45), ('windows', 44), ('1', 41), ('world', 41), ('way', 41), ('like', 41), ('project', 40), ('computer', 40), ('heartbleed', 40), ('git', 37), ('users', 37), ('dont', 37), ('design', 37), ('ios', 37), ('developer', 36), ('os', 36), ('twitter', 36), ('ceo', 36), ('vs', 36), ('life', 36), ('big', 35), ('day', 35), ('android', 34), ('online', 34), ('years', 33), ('simple', 33), ('court', 33), ('guide', 32), ('learning', 32), ('mt', 3

**Observation: There were many stories related to bitcoin, google, data as can be seen from the above list**