<h1 style='text-align:center'>Analysing Hacker News Data <h1>

In [50]:
from pipeline import Pipeline
pipeline = Pipeline()

> ## File to JSON

In [51]:
import json
@pipeline.task()
def file_to_json():
    file = open('hn_stories_2014.json', mode='r')
    json_file = json.loads(file.read())
    return json_file['stories']

In [4]:
output = pipeline.run()

## First 5 stories from 2014 files

In [9]:
output[list(output.keys())[0]][:5]

[{'story_text': '',
  'created_at': '2014-05-29T08:25:40Z',
  'story_title': None,
  'story_id': None,
  'comment_text': None,
  'created_at_i': 1401351940,
  'url': 'https://duckduckgo.com/settings',
  'parent_id': None,
  'objectID': '7815290',
  'author': 'TuxLyn',
  'points': 1,
  'title': 'DuckDuckGo Settings',
  '_tags': ['story', 'author_TuxLyn', 'story_7815290'],
  'num_comments': 0,
  '_highlightResult': {'story_text': {'matchedWords': [],
    'value': '',
    'matchLevel': 'none'},
   'author': {'matchedWords': [], 'value': 'TuxLyn', 'matchLevel': 'none'},
   'url': {'matchedWords': [],
    'value': 'https://duckduckgo.com/settings',
    'matchLevel': 'none'},
   'title': {'matchedWords': [],
    'value': 'DuckDuckGo Settings',
    'matchLevel': 'none'}},
  'story_url': None},
 {'story_text': '',
  'created_at': '2014-05-29T08:23:46Z',
  'story_title': None,
  'story_id': None,
  'comment_text': None,
  'created_at_i': 1401351826,
  'url': 'http://bits.blogs.nytimes.com/2014/

> ## Filter Stories

In [52]:
@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    filtered_stories = (story for story in stories if story['points'] > 50 \
                       and story['num_comments'] > 1 and \
                        not str.lower(story['title']).startswith('ask hn'))
    return filtered_stories

In [29]:
output = pipeline.run()

## First 2 filtered(popular) stories from 2014 files

In [34]:
for x in range(2):
    print('STORY {}:'.format(x), next(output[list(output.keys())[1]]))

STORY 0: {'story_text': '', 'created_at': '2014-05-28T20:50:28Z', 'story_title': None, 'story_id': None, 'comment_text': None, 'created_at_i': 1401310228, 'url': 'http://action.sumofus.org/a/Facebook-app-taps-phones/', 'parent_id': None, 'objectID': '7812812', 'author': 'makmanalp', 'points': 54, 'title': 'Facebook: Do not release your new app feature that listens to users', '_tags': ['story', 'author_makmanalp', 'story_7812812'], 'num_comments': 23, '_highlightResult': {'story_text': {'matchedWords': [], 'value': '', 'matchLevel': 'none'}, 'author': {'matchedWords': [], 'value': 'makmanalp', 'matchLevel': 'none'}, 'url': {'matchedWords': [], 'value': 'http://action.sumofus.org/a/Facebook-app-taps-phones/', 'matchLevel': 'none'}, 'title': {'matchedWords': [], 'value': 'Facebook: Do not release your new app feature that listens to users', 'matchLevel': 'none'}}, 'story_url': None}
STORY 1: {'story_text': '', 'created_at': '2014-05-28T20:32:33Z', 'story_title': None, 'story_id': None, 'c

> ## Filter stories to CSV

In [53]:
from pipeline import build_csv
import io, datetime

In [54]:
@pipeline.task(depends_on=filter_stories)
def json_to_csv(filtered_stories):
    lines=[]
    for line in filtered_stories:
        lines.append((line['objectID'],\
                      datetime.datetime.strptime(
                          line['created_at']\
                          ,'%Y-%m-%dT%H:%M:%SZ'),
                      line['url'],\
                      line['points'],\
                      line['title']\
                     ))
        
    file = build_csv(lines,
                     header=['objectID', 'created_at', 'url'\
                             , 'points', 'title'],
                     file=io.StringIO()
                    )
    return file

In [6]:
output = pipeline.run()

## First 2 lines from the filtered popular file

In [7]:
for x in range(3):
    print('STORY {}:'.format(x), next(output[list(output.keys())[2]]))

STORY 0: objectID,created_at,url,points,title

STORY 1: 7814725,2014-05-29 04:27:42,http://krebsonsecurity.com/2014/05/true-goodbye-using-truecrypt-is-not-secure/,60,True Goodbye: ‘Using TrueCrypt Is Not Secure’

STORY 2: 7814608,2014-05-29 03:51:01,http://projects.aljazeera.com/2014/portrait-of-down-syndrome/index.html,161,For Hire: Dedicated Young Man With Down Syndrome



> ## Extract Titles

In [55]:
import csv
@pipeline.task(depends_on=json_to_csv)
def extract_titles(csv_file):
    reader = csv.reader(csv_file)
    header = next(reader)
    idx = header.index('title')

    for line in reader:
        yield line[idx]

In [12]:
output = pipeline.run()

## First 3 popular titles from the file

In [13]:
for x in range(3):
    print('STORY {}:'.format(x), next(output[list(output.keys())[3]]))

STORY 0: True Goodbye: ‘Using TrueCrypt Is Not Secure’
STORY 1: For Hire: Dedicated Young Man With Down Syndrome
STORY 2: Absolute Zero


> ## Clean Titles

In [56]:
import string
@pipeline.task(depends_on=extract_titles)
def clean_titles(title):
    def _clean(title):
        title = str.lower(title)
        return ''.join(t for t in title if t not in string.punctuation)
    return(_clean(t) for t in title)
    

In [44]:
output = pipeline.run()

In [45]:
for x in range(3):
    print('STORY {}:'.format(x), next(output[list(output.keys())[4]]))

STORY 0: true goodbye ‘using truecrypt is not secure’
STORY 1: for hire dedicated young man with down syndrome
STORY 2: absolute zero


In [11]:
list(output.keys())

[<function __main__.file_to_json()>,
 <function __main__.filter_stories(stories)>,
 <function __main__.json_to_csv(filtered_stories)>,
 <function __main__.extract_titles(csv_file)>,
 <function __main__.clean_titles(title)>,
 <function __main__.build_keyword_dictionary(title)>]

In [47]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [53]:
for c in string.punctuation[:5]:
    print(c,':',ord(c))

! : 33
" : 34
# : 35
$ : 36
% : 37


> ## Build Keyword Dictionary

In [57]:
import stop_words
@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(title):
    keyword_dict={}
    for t in title:
        words = t.split(' ')
        for word in words:
            if word not in stop_words.stop_words and word != '':
                if word not in keyword_dict:
                    keyword_dict[word] = 0
                keyword_dict[word] += 1
    return keyword_dict

In [10]:
output = pipeline.run()

In [12]:
for x in range(3):
    print(output[list(output.keys())[5]])



> ## top 100

In [58]:
@pipeline.task(depends_on=build_keyword_dictionary)
def top_100(keyword_dict):
    t100=[]
    for key in sorted(keyword_dict, key=keyword_dict.get, reverse=True)[:100]:
        t100.append((key, keyword_dict[key]))
        
    return t100

In [59]:
output = pipeline.run()

In [60]:
top_100_tuples = output[list(output.keys())[6]]

In [61]:
top_100_tuples

[('new', 185),
 ('google', 167),
 ('bitcoin', 101),
 ('open', 92),
 ('programming', 90),
 ('web', 88),
 ('data', 85),
 ('video', 79),
 ('python', 75),
 ('code', 72),
 ('facebook', 71),
 ('released', 71),
 ('using', 70),
 ('2013', 65),
 ('javascript', 65),
 ('free', 64),
 ('source', 64),
 ('game', 63),
 ('internet', 62),
 ('microsoft', 59),
 ('c', 59),
 ('linux', 58),
 ('app', 57),
 ('pdf', 55),
 ('work', 54),
 ('language', 54),
 ('software', 52),
 ('2014', 52),
 ('startup', 51),
 ('apple', 50),
 ('use', 50),
 ('make', 50),
 ('time', 48),
 ('yc', 48),
 ('security', 48),
 ('nsa', 45),
 ('github', 45),
 ('windows', 44),
 ('world', 41),
 ('way', 41),
 ('like', 41),
 ('1', 40),
 ('project', 40),
 ('computer', 40),
 ('heartbleed', 40),
 ('git', 37),
 ('users', 37),
 ('dont', 37),
 ('design', 37),
 ('ios', 37),
 ('developer', 36),
 ('os', 36),
 ('twitter', 36),
 ('ceo', 36),
 ('vs', 36),
 ('life', 36),
 ('big', 35),
 ('day', 35),
 ('android', 34),
 ('online', 34),
 ('years', 33),
 ('simple', 