<h1>HACKER NEWS POST ANALYSIS</h1>

This project focuses mainly on using regular expressions to analyse the titles and URLs of Hacker News posts.

In [2]:
import pandas as pd
import re

In [3]:
# Load the Hacker News posts from a CSV file expected in the working directory.
hn = pd.read_csv('hacker_news.csv')

In [4]:
# Preview the first rows to see which columns were loaded.
hn.head()

Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
0,12224879,Interactive Dynamic Video,http://www.interactivedynamicvideo.com/,386,52,ne0phyte,8/4/2016 11:52
1,11964716,Florida DJs May Face Felony for April Fools' W...,http://www.thewire.com/entertainment/2013/04/f...,2,1,vezycash,6/23/2016 22:20
2,11919867,Technology ventures: From Idea to Enterprise,https://www.amazon.com/Technology-Ventures-Ent...,3,1,hswarna,6/17/2016 0:01
3,10301696,Note by Note: The Making of Steinway L1037 (2007),http://www.nytimes.com/2007/11/07/movies/07ste...,8,2,walterbell,9/30/2015 4:12
4,10482257,Title II kills investment? Comcast and other I...,http://arstechnica.com/business/2015/10/comcas...,53,22,Deinos,10/31/2015 9:48


<h2>Regular Expression Basics</h2>

In [6]:
# Count the titles that mention Python (capitalised or not) with plain `re`.
titles = hn["title"].tolist()
pattern = '[Pp]ython'

# Every title in which the pattern is found contributes 1 to the total.
python_mentions = sum(1 for title in titles if re.search(pattern, title))

print(python_mentions)

160


In [7]:
# Same count as the loop version, vectorised with the pandas string accessor.
titles = hn['title']
pattern = '[Pp]ython'

# Boolean mask of matching titles; summing it counts the True values.
mentions_python = titles.str.contains(pattern)
python_mentions = mentions_python.sum()
print(python_mentions)

160


In [8]:
# Select (rather than count) the titles that mention Ruby.
pattern = '[Rr]uby'
titles = hn['title']

# The boolean mask is used to index the Series it was computed from.
mentions_ruby = titles.str.contains(pattern)
ruby_titles = titles[mentions_ruby]
print(ruby_titles)

190                     Ruby on Google AppEngine Goes Beta
484           Related: Pure Ruby Relational Algebra Engine
1388     Show HN: HTTPalooza  Ruby's greatest HTTP clie...
1949     Rewriting a Ruby C Extension in Rust: How a Na...
2022     Show HN: CrashBreak  Reproduce exceptions as f...
2163                   Ruby 2.3 Is Only 4% Faster than 2.2
2306     Websocket Shootout: Clojure, C++, Elixir, Go, ...
2620                       Why Startups Use Ruby on Rails?
2645     Ask HN: Should I continue working a Ruby gem f...
3290     Ruby on Rails and the importance of being stup...
3749     Telegram.org Bot Platform Webhooks Server, for...
3874     Warp Directory (wd) unix command line tool for...
4026     OS X 10.11 Ruby / Rails users can install ther...
4163     Charles Nutter of JRuby Banned by Rubinius for...
4602     Quiz: Ruby or Rails? Matz and DHH were not abl...
5832     Show HN: An experimental Python to C#/Go/Ruby/...
6180     Shrine  A new solution for handling file uploa.

In [9]:
# Match "email" and "e-mail". The optional hyphen needs no parentheses:
# a capture group makes `str.contains` emit a "pattern has match groups"
# UserWarning without changing which titles match.
email_bool = titles.str.contains('e-?mail')

# Summing the boolean mask counts the matching titles.
email_count = email_bool.sum()

# The same mask selects the matching titles themselves.
email_titles = titles[email_bool]

print(email_count)

86


  """Entry point for launching an IPython kernel.


In [10]:
# Bracketed single-word tags such as "[pdf]".
# Written as a raw string so `\[` and `\w` reach the regex engine unchanged
# (in a plain string they are invalid escape sequences), and without a capture
# group so `str.contains` does not warn about match groups. The set of
# matching titles is the same as with the grouped pattern.
pattern = r'\[\w+\]'

# Boolean mask: True where the title contains a tag.
tag_titles = titles.str.contains(pattern)

tag_count = tag_titles.sum()
print(tag_count)

444


  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
# First attempt at matching Java but not JavaScript: "Java"/"java" followed by
# one character that is not "s"/"S". Because a following character is
# required, a title that ends in "Java" is not matched.
not_javascript = titles.str.contains(r'[Jj]ava[^sS]')
java_titles = titles[not_javascript]

In [21]:
# Word boundaries on both sides match "Java"/"java" only as a whole word.
pattern = r'\b[Jj]ava\b'

whole_word_java = titles.str.contains(pattern)
java_titles = titles[whole_word_java]

In [22]:
# Count bracketed tags anchored at either end of a title.
pattern_beginning = r'^\[\w+\]'  # tag at the very start of the title
pattern_ending = r'\[\w+\]$'  # tag at the very end of the title

starts_with_tag = titles.str.contains(pattern_beginning)
ends_with_tag = titles.str.contains(pattern_ending)

beginning_count = starts_with_tag.sum()
ending_count = ends_with_tag.sum()

In [23]:
# "email" at a word start, with an optional hyphen or whitespace after the
# "e" and an optional plural "s"; matched case-insensitively.
pattern = r'\be[-\s]?mails?'

mentions_email = titles.str.contains(pattern, flags=re.I)
email_mentions = mentions_email.sum()

print(email_mentions)

143


<h2>Advanced Regular Expressions</h2>

In [24]:
# Re-bind `titles` to the title column for this section.
titles = hn['title']

# Count titles containing "sql" in any letter case.
mentions_sql = titles.str.contains(r'sql', flags=re.I)
sql_counts = mentions_sql.sum()

In [25]:
# Keep only posts whose title names a SQL flavor, i.e. "SQL" (any case)
# preceded by at least one word character. `.copy()` gives an independent
# frame so adding a column below does not touch `hn`.
hn_sql = hn[hn['title'].str.contains(r"\w+SQL", flags=re.I)].copy()

# Extract the flavor from the filtered frame itself rather than from `hn`,
# so the assignment does not depend on index alignment. `expand=False` returns
# a Series (not a one-column DataFrame), which is then lower-cased so that
# spellings differing only in case fall into the same group.
hn_sql['flavor'] = (
    hn_sql['title']
    .str.extract(r'(\w+sql)', flags=re.I, expand=False)
    .str.lower()
)

# Average number of comments per SQL flavor.
sql_pivot = hn_sql.pivot_table(index='flavor', values='num_comments', aggfunc='mean')

In [27]:
def first_10_matches(pattern):
    """
    Search the story titles for `pattern` and give back
    at most the first 10 titles in which it is found.

    Reads the module-level `titles` Series.
    """
    is_match = titles.str.contains(pattern)
    return titles[is_match].head(10)

# Standalone "C" or "c" (word boundary on both sides) followed by one more
# character that is neither "." nor "+".
pattern = r"\b[Cc]\b[^.+]"

first_ten = first_10_matches(pattern)

In [28]:
# Standalone "C"/"c" that is not preceded by "Series" plus a whitespace
# character (negative lookbehind) and not followed by "." or "+" (negative
# lookahead). Lookarounds consume no characters, so a "C" at the very end of
# a title can still match.
pattern = r'(?<!Series\s)\b[Cc]\b(?![.+])'

mentions_c = titles.str.contains(pattern)
c_mentions = mentions_c.sum()

In [29]:
# A word, one whitespace character, then the same word again. The capture
# group is required by the `\1` backreference, so it cannot be dropped even
# though `str.contains` warns about patterns that contain match groups.
pattern = r'\b(\w+)\s\1\b'

has_repeat = titles.str.contains(pattern)
repeated_words = titles[has_repeat]

  This is separate from the ipykernel package so we can avoid doing imports until


In [30]:
# Spellings of "email" used to check the normalisation pattern.
email_variations = pd.Series(['email', 'Email', 'e Mail',
                        'e mail', 'E-mail', 'e-mail',
                        'eMail', 'E-Mail', 'EMAIL'])

# Rewrite every spelling to "email", ignoring case.
# `regex=True` is passed explicitly: the default of `Series.str.replace`
# became literal matching in pandas 2.0, under which this pattern would be
# searched for as plain text and nothing would be replaced.
email_uniform = email_variations.str.replace(
    r'e[-\s]?mail', 'email', flags=re.I, regex=True
)

# Normalise every spelling of "email" in the titles, ignoring case.
# `regex=True` is passed explicitly: the default of `Series.str.replace`
# became literal matching in pandas 2.0, under which this pattern would be
# searched for as plain text and nothing would be replaced.
titles_clean = titles.str.replace(r'e[-\s]?mail', 'email', flags=re.I, regex=True)

In [32]:
# Hand-picked URLs covering both protocols, mixed-case schemes, hyphenated
# hosts, and URLs with and without a path or query string.
sample_urls = [
    'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429',
    'http://www.interactivedynamicvideo.com/',
    'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0',
    'http://evonomics.com/advertising-cannot-maintain-internet-heres-solution/',
    'HTTPS://github.com/keppel/pinn',
    'Http://phys.org/news/2015-09-scale-solar-youve.html',
    'https://iot.seeed.cc',
    'http://www.bfilipek.com/2016/04/custom-deleters-for-c-smart-pointers.html',
    'http://beta.crowdfireapp.com/?beta=agnipath',
    'https://www.valid.ly?param',
    'http://css-cursor.techstream.org',
]
test_urls = pd.Series(sample_urls)

# Capture the run of word characters, dots and hyphens that follows the
# protocol; the scheme is matched case-insensitively.
domain_pattern = r'https?://([\w.-]{0,})/?'
test_urls_clean = test_urls.str.extract(domain_pattern, flags=re.I)

# Apply the same domain pattern to the `url` column of the full dataset:
# the single capture group holds the characters (word characters, dots,
# hyphens) that follow "http://" or "https://", matched case-insensitively.
domains = hn['url'].str.extract(r'https?://([\w.-]{0,})/?',flags=re.I)

In [33]:
# Three capture groups split a URL into protocol, domain and the remainder
# after an optional "/" (the path). The pattern is shared by both extractions.
url_parts_pattern = r'(https?)://([\w.-]+)/?(.*)'

# Check the pattern on the sample URLs first, then apply it to the dataset.
test_url_parts = test_urls.str.extract(url_parts_pattern, flags=re.I)
url_parts = hn['url'].str.extract(url_parts_pattern, flags=re.I)

In [34]:
# Same three-part URL split, written with named groups (`protocol`, `domain`,
# `path`). With `str.extract`, group names are used as the column names of
# the result.
pattern = r"(?P<protocol>https?)://(?P<domain>[\w\.\-]+)/?(?P<path>.*)"

# Case-insensitive so schemes such as "HTTPS" or "Http" are matched as well.
url_parts = hn['url'].str.extract(pattern,flags=re.I)