# Hacker News posts

The aim of this project is to compare two types of posts on the popular tech website Hacker News, Ask HN and Show HN, and to determine whether the time of posting influences the number of comments a post receives.

- Do Ask HN or Show HN receive more comments on average?
- Do posts created at a certain time receive more comments on average?

In [1]:
from csv import reader
import datetime as dt

In [2]:
# Read the whole CSV into memory as a list of rows (each row a list of
# strings). The `with` block closes the file handle once the rows have been
# materialised; newline='' lets the csv module handle line endings itself,
# as its documentation recommends.
with open('datasets/hacker_news_2016.csv', newline='') as opened_hn:
    read_hn = reader(opened_hn)
    hn = list(read_hn)

In [5]:
hn[:5]

[['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'],
 ['12579008',
  'You have two days to comment if you want stem cells to be classified as your own',
  'http://www.regulations.gov/document?D=FDA-2015-D-3719-0018',
  '1',
  '0',
  'altstar',
  '9/26/2016 3:26'],
 ['12579005',
  'SQLAR  the SQLite Archiver',
  'https://www.sqlite.org/sqlar/doc/trunk/README.md',
  '1',
  '0',
  'blacksqr',
  '9/26/2016 3:24'],
 ['12578997',
  'What if we just printed a flatscreen television on the side of our boxes?',
  'https://medium.com/vanmoof/our-secrets-out-f21c1f03fdc8#.ietxmez43',
  '1',
  '0',
  'pavel_lishin',
  '9/26/2016 3:19'],
 ['12578989',
  'algorithmic music',
  'http://cacm.acm.org/magazines/2011/7/109891-algorithmic-composition/fulltext',
  '1',
  '0',
  'poindontcare',
  '9/26/2016 3:16']]

In [6]:
# Keep the column names (the first CSV row) separately from the data rows.
header_hn = hn[0]

In [7]:
header_hn

['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']

In [8]:
# Drop the header row so that `hn` holds only post records.
# NOTE: this is not idempotent - running the cell again removes one more row.
hn = hn[1:]

In [9]:
hn[:5]

[['12579008',
  'You have two days to comment if you want stem cells to be classified as your own',
  'http://www.regulations.gov/document?D=FDA-2015-D-3719-0018',
  '1',
  '0',
  'altstar',
  '9/26/2016 3:26'],
 ['12579005',
  'SQLAR  the SQLite Archiver',
  'https://www.sqlite.org/sqlar/doc/trunk/README.md',
  '1',
  '0',
  'blacksqr',
  '9/26/2016 3:24'],
 ['12578997',
  'What if we just printed a flatscreen television on the side of our boxes?',
  'https://medium.com/vanmoof/our-secrets-out-f21c1f03fdc8#.ietxmez43',
  '1',
  '0',
  'pavel_lishin',
  '9/26/2016 3:19'],
 ['12578989',
  'algorithmic music',
  'http://cacm.acm.org/magazines/2011/7/109891-algorithmic-composition/fulltext',
  '1',
  '0',
  'poindontcare',
  '9/26/2016 3:16'],
 ['12578979',
  'How the Data Vault Enables the Next-Gen Data Warehouse and Data Lake',
  'https://www.talend.com/blog/2016/05/12/talend-and-Â\x93the-data-vaultÂ\x94',
  '1',
  '0',
  'markgainor1',
  '9/26/2016 3:14']]

In [10]:
# Split the posts into three buckets according to how the title starts
# (compared case-insensitively): "Ask HN", "Show HN", or anything else.
ask_posts, show_posts, other_posts = [], [], []

for row in hn:
    lowered = row[1].lower()  # column 1 is the post title

    if lowered.startswith('ask hn'):
        bucket = ask_posts
    elif lowered.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts

    bucket.append(row)
        

In [12]:
# Report how many posts landed in each category.
for label, posts in (
    ('Ask HN posts: ', ask_posts),
    ('Show HN posts: ', show_posts),
    ('Other posts: ', other_posts),
):
    print(label, len(posts))


Ask HN posts:  9139
Show HN posts:  10158
Other posts:  273822


In [13]:
# Total number of comments across all Ask HN posts. Column 4 is
# `num_comments`, which the CSV stores as text, hence the int() conversion.
# Computing it in one expression keeps the result correct if the cell is
# re-run (a separate accumulator would keep growing).
total_ask_comments = sum(int(row[4]) for row in ask_posts)

In [15]:
total_ask_comments

94986

In [16]:
# Mean number of comments per Ask HN post.
# Raises ZeroDivisionError if `ask_posts` is empty.
avg_ask_comments = total_ask_comments / len(ask_posts)

In [17]:
avg_ask_comments

10.393478498741656

In [18]:
# Total number of comments across all Show HN posts. Column 4 is
# `num_comments`, which the CSV stores as text, hence the int() conversion.
# Computing it in one expression keeps the result correct if the cell is
# re-run (a separate accumulator would keep growing).
total_show_comments = sum(int(row[4]) for row in show_posts)

In [20]:
total_show_comments

49633

In [21]:
# Mean number of comments per Show HN post.
# Raises ZeroDivisionError if `show_posts` is empty.
avg_show_comments = total_show_comments / len(show_posts)

In [22]:
avg_show_comments

4.886099625910612

Ask HN posts clearly attract more discussion than Show HN posts: on average they receive more than twice as many comments (about 10.4 versus 4.9 per post).

Since ask posts are more likely to receive comments, we'll focus our remaining analysis just on these posts.

Now we'll determine if ask posts created at a certain time are more likely to attract comments.

In [23]:
# Pair each Ask HN post's creation timestamp (column 6, kept as a string)
# with its comment count (column 4, converted to int).
# A comprehension avoids a temporary variable named `list`, which would
# shadow the built-in and break any later `list(...)` call.
result_list = [[row[6], int(row[4])] for row in ask_posts]
    

In [25]:
result_list

[['9/26/2016 2:53', 7],
 ['9/26/2016 1:17', 3],
 ['9/25/2016 22:57', 0],
 ['9/25/2016 22:48', 3],
 ['9/25/2016 21:50', 2],
 ['9/25/2016 19:30', 1],
 ['9/25/2016 19:22', 22],
 ['9/25/2016 17:55', 3],
 ['9/25/2016 15:48', 0],
 ['9/25/2016 15:35', 13],
 ['9/25/2016 15:28', 0],
 ['9/25/2016 14:43', 0],
 ['9/25/2016 14:17', 3],
 ['9/25/2016 13:08', 2],
 ['9/25/2016 11:27', 2],
 ['9/25/2016 10:51', 0],
 ['9/25/2016 10:47', 6],
 ['9/25/2016 9:04', 97],
 ['9/25/2016 7:09', 4],
 ['9/25/2016 3:00', 1],
 ['9/24/2016 23:04', 0],
 ['9/24/2016 22:02', 7],
 ['9/24/2016 21:18', 2],
 ['9/24/2016 20:58', 0],
 ['9/24/2016 19:57', 1],
 ['9/24/2016 19:02', 0],
 ['9/24/2016 17:55', 0],
 ['9/24/2016 17:27', 1],
 ['9/24/2016 16:50', 0],
 ['9/24/2016 16:03', 5],
 ['9/24/2016 15:29', 66],
 ['9/24/2016 14:03', 1],
 ['9/24/2016 10:10', 11],
 ['9/24/2016 8:46', 7],
 ['9/24/2016 8:39', 1],
 ['9/24/2016 8:38', 1],
 ['9/24/2016 8:28', 1],
 ['9/24/2016 3:36', 3],
 ['9/24/2016 0:21', 2],
 ['9/23/2016 23:38', 6],
 ['9/2

In [26]:
# Tally, for each hour of the day, how many Ask HN posts were created
# (counts_by_hour) and how many comments those posts received in total
# (comments_by_hour). Keys are integer hours (0-23).
counts_by_hour = {}
comments_by_hour = {}

for created_at, num_comments in result_list:
    # Timestamps look like '9/26/2016 2:53'. The field after the colon is
    # the minute, so it is parsed with %M (%S would read it as seconds).
    hour = dt.datetime.strptime(created_at, '%m/%d/%Y %H:%M').hour

    # .get(hour, 0) covers the first time an hour is seen.
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + num_comments

In [28]:
counts_by_hour

{2: 269,
 1: 282,
 22: 383,
 21: 518,
 19: 552,
 17: 587,
 15: 646,
 14: 513,
 13: 444,
 11: 312,
 10: 282,
 9: 222,
 7: 226,
 3: 271,
 23: 343,
 20: 510,
 16: 579,
 8: 257,
 0: 301,
 18: 614,
 12: 342,
 4: 243,
 6: 234,
 5: 209}

In [29]:
comments_by_hour

{2: 2996,
 1: 2089,
 22: 3372,
 21: 4500,
 19: 3954,
 17: 5547,
 15: 18525,
 14: 4972,
 13: 7245,
 11: 2797,
 10: 3013,
 9: 1477,
 7: 1585,
 3: 2154,
 23: 2297,
 20: 4462,
 16: 4466,
 8: 2362,
 0: 2277,
 18: 4877,
 12: 4234,
 4: 2360,
 6: 1587,
 5: 1838}

In [30]:
# One [hour, mean comments per post] pair for every hour that has posts,
# with the mean rounded to two decimal places.
avg_comments_by_hour = [
    [hr, round(comments_by_hour[hr] / counts_by_hour[hr], 2)]
    for hr in comments_by_hour
]

In [32]:
sorted(avg_comments_by_hour)

[[0, 7.56],
 [1, 7.41],
 [2, 11.14],
 [3, 7.95],
 [4, 9.71],
 [5, 8.79],
 [6, 6.78],
 [7, 7.01],
 [8, 9.19],
 [9, 6.65],
 [10, 10.68],
 [11, 8.96],
 [12, 12.38],
 [13, 16.32],
 [14, 9.69],
 [15, 28.68],
 [16, 7.71],
 [17, 9.45],
 [18, 7.94],
 [19, 7.16],
 [20, 8.75],
 [21, 8.69],
 [22, 8.8],
 [23, 6.7]]

In [33]:
# Flip each [hour, average] pair to [average, hour] so that sorting orders
# the rows by average first. Unpacking in a comprehension avoids a
# temporary variable named `list`, which would shadow the built-in.
swap_avg_by_hour = [[avg, hour] for hour, avg in avg_comments_by_hour]

# Highest average first; ties fall back to the larger hour.
sorted_swap = sorted(swap_avg_by_hour, reverse=True)

In [36]:
sorted_swap

[[28.68, 15],
 [16.32, 13],
 [12.38, 12],
 [11.14, 2],
 [10.68, 10],
 [9.71, 4],
 [9.69, 14],
 [9.45, 17],
 [9.19, 8],
 [8.96, 11],
 [8.8, 22],
 [8.79, 5],
 [8.75, 20],
 [8.69, 21],
 [7.95, 3],
 [7.94, 18],
 [7.71, 16],
 [7.56, 0],
 [7.41, 1],
 [7.16, 19],
 [7.01, 7],
 [6.78, 6],
 [6.7, 23],
 [6.65, 9]]

In [39]:
# Print the five hours with the highest average comment count, formatting
# each integer hour as HH:MM.
print('Top 5 Hours for Ask Posts Comments:')

line_template = "{}: {} average comments per post"
for avg, hour in sorted_swap[:5]:
    hour_label = dt.datetime.strptime(str(hour), "%H").strftime("%H:%M")
    print(line_template.format(hour_label, avg))

Top 5 Hours for Ask Posts Comments:
15:00: 28.68 average comments per post
13:00: 16.32 average comments per post
12:00: 12.38 average comments per post
02:00: 11.14 average comments per post
10:00: 10.68 average comments per post
