#### So we have json file filled with 2014 hacker news post 
#### It contains stories as single keys 
#### we will be dealing with following keys 
created_at: A timestamp of the story's creation time.

created_at_i: A unix epoch timestamp.

url: The URL of the story link.

objectID: The ID of the story.

author: The story's author (username on HN).

points: The number of upvotes the story had.

title: The headline of the post.

num_comments: The number of a comments a post has.

#### Using this data-set 
#### we will be runining sequences basic of NLP task using our Pipeline Class.


In [1]:
from datetime import datetime
import json
import io
import string
import csv
from pipeline import build_csv
from pipeline import Pipeline
from stop_words import stop_words

In [2]:
pipeline=Pipeline()

#### loading of json file into python 
#### create pipeline.task() with no args 
#### call the the function file_to_json() loads the file and return stories 

In [3]:
@pipeline.task()
def file_to_json():
    with open('hn_stories_2014.json', 'r') as f:
        data = json.load(f)
        stories = data['stories']
    return stories

#### we need to filter out most popular stories every year 

@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    def is_popular(story): #is_popular will return boolean value
        
        return (story["points"]>50 and story["num_comments"]>1 
                and not story["title"].startswith("ASK HN"))
        
    return (story for story in stories if is_popular(story))

#### now dict to csv conversion is necessary
#### The purpose of translating the dictionaries to a CSV is that 
#### we want to have a consistent data format when running the later 
#### summarizations. By keeping consistent data formats, 
#### each of your pipeline tasks will be adaptable with future task 
#### requirements.

@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories):
    lines=[]
    for story in stories:
        lines.append(
            (story['objectID'], datetime.strptime(story['created_at'], "%Y-%m-%dT%H:%M:%SZ"), story['url'], story['points'], story['title'])
        )
    return build_csv(lines,header=['objectID','created_at','url',
                                   'points','title'], file=io.StringIO())

##### Once we have extracted the titles of each popular post, 
#### we can then run the next word frequency task.
@pipeline.task(depends_on=json_to_csv)
def extract_titles(csv_file):
    reader = csv.reader(csv_file)
    header = next(reader)
    idx = header.index('title')
    
    return (line[idx] for line in reader)

@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    for title in titles:
        title = title.lower()
        title = ''.join(c for c in title if c not in string.punctuation)
        yield title
        
@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(titles):
    word_freq={}
    for title in titles:
        for word in title.split(" "):
            if word not in stop_words:
                if word not in word_freq:
                    word_freq[word]=1
                word_freq[word]+=1
    return word_freq

@pipeline.task(depends_on=build_keyword_dictionary)
def top_words(keyword_dictionary):
    top_values=sorted(keyword_dictionary.items() ,key=lambda x:x[1],reverse=True)
    return top_values[:100]
                

In [5]:
# ran = pipeline.run()
print(ran[top_values])

NameError: name 'top_values' is not defined