In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree top-down and echo the full path of every file found.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploring Hacker News Posts

For this analysis, I will compare two different kinds of posts on [Hacker News](https://news.ycombinator.com), a popular site where technology-related stories (or 'posts') are voted and commented upon. The two types of posts I'll explore begin with either *Ask HN* or *Show HN*.

Quick context here - users submit *Ask HN* posts to ask the Hacker News community a specific question, such as "Which is the best online course for Python?". Similarly, users submit *Show HN* posts to show the Hacker News community a project, a product, or just generally something interesting.

I'll specifically compare these two types of posts to determine the following:
* Do Ask HN or Show HN receive more comments on average?
* Do posts created at a certain time receive more comments on average?

Like any data science project, we begin by reading the dataset

In [None]:
from csv import reader
# Load the whole CSV into memory as a list of rows (each row is a list of strings).
# The `with` block closes the file handle as soon as reading is done, and
# newline='' is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly.
# NOTE(review): encoding='utf-8' assumes the Kaggle CSV is UTF-8 - confirm.
with open('/kaggle/input/hacker-news-posts/HN_posts_year_to_Sep_26_2016.csv',
          encoding='utf-8', newline='') as open_file:
    read_file = reader(open_file)
    hn = list(read_file) # creating a list of lists
headers = hn[0] # stored the col headers here for easy reference
hn = hn[1:] # removed the col headers for easier analysis

# Let's have a look at the first 5 rows of the data

print(hn[:5])

In [None]:
# Print the column names (the first CSV row, stored in `headers`) so that the
# positional indexes used on the data rows can be matched to a column name.

print(headers)

# Isolating the required datapoints from the entire dataset

Now that we have the data, we can begin by isolating the *Show HN* and *Ask HN* posts.

**Quick context - the titles of *Show HN* and *Ask HN* posts always start with 'Show HN' or 'Ask HN'**

In [None]:
# Partition the posts into three buckets by title prefix (case-insensitive):
# titles starting with "ask hn", titles starting with "show hn", and the rest.

ask_posts = []
show_posts = []
other_posts = []

for post in hn:
    lowered_title = post[1].lower()  # column 1 holds the post title
    if lowered_title.startswith('ask hn'):
        bucket = ask_posts
    elif lowered_title.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(post)

print('The number of ask hn posts are ', len(ask_posts))
print('The number of show hn posts are ',len(show_posts))
print('The number of other posts are ', len(other_posts))

# Calculating the total and average number of comments on *Show HN* and *Ask HN* posts

In [None]:
#Now let us have a look at the comments across the ask hn and show hn posts

#Creating a function to calculate the total and average number of comments

def totandavg(templist, comments_idx=4):
    """Return the total and mean number of comments across the given posts.

    Args:
        templist: Iterable of post rows. Each row must be indexable; the
            comment count is read from position ``comments_idx`` and
            converted with ``int``.
        comments_idx: Position of the comment-count column in each row.
            Defaults to 4, the index this function originally hard-coded.

    Returns:
        A ``(total, mean)`` tuple. For an empty input ``(0, 0.0)`` is
        returned instead of failing with ``ZeroDivisionError``.

    Raises:
        ValueError: If a comment count cannot be converted to ``int``.
        IndexError: If a row has no element at ``comments_idx``.
    """
    temp_total = 0
    temp_count = 0
    # Count rows manually (rather than len()) so any iterable is accepted.
    for row in templist:
        temp_total += int(row[comments_idx])
        temp_count += 1
    if temp_count == 0:
        # No posts: there is nothing to average, so report a mean of zero.
        return 0, 0.0
    return temp_total, temp_total / temp_count

# Compute the total and mean comment counts for each post type and report the
# means rounded to two decimals. (Fixes the "numnber" typo in the second message.)
total_ask_comments, avg_ask_comments = totandavg(ask_posts)
print('The average number of Ask HN comments are ',round(avg_ask_comments, 2))

total_show_comments, avg_show_comments = totandavg(show_posts)
print('The average number of Show HN comments are ', round(avg_show_comments, 2))



# Key takeaway - Show HN vs. Ask HN
From the above analysis it is clear that *Ask HN* posts receive a significantly higher number of comments on average (~2.2x higher) than *Show HN* posts.
Further, since *Ask HN* posts attract more comments, **we will focus the remainder of our analysis on this type of post**.

# Further analysis of Ask HN posts

Next, we'll determine if ask posts created at a certain time are more likely to attract comments. We'll use the following steps to perform this analysis:

* Calculate the amount of ask posts created in each hour of the day, along with the number of comments received
* Calculate the average number of comments ask posts receive by hour created

# Finding the Amount of Ask Posts and Comments by Hour Created

I'll determine if we can maximize the number of comments an ask post receives by creating it at a certain time. First, I'll find the number of ask posts created during each hour of the day, along with the number of comments those posts received. Then, I'll calculate the average number of comments received by ask posts created at each hour of the day.


In [None]:
import datetime as dt

# Pair each ask post's creation timestamp (column 6) with its comment count
# (column 4, converted to int).
results_list = [[post[6], int(post[4])] for post in ask_posts]

counts_by_hour = {}    # hour string ('%H') -> number of ask posts created in that hour
comments_by_hour = {}  # hour string ('%H') -> total comments on those posts

for created_at, n_comments in results_list:
    # Parse the timestamp and keep only the zero-padded hour as the dict key.
    hour = dt.datetime.strptime(created_at, '%m/%d/%Y %H:%M').strftime('%H')
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + n_comments

print(counts_by_hour) # printing to check the output
print('\n',comments_by_hour) # printing to check the output

# Calculating the Average Number of Comments for Ask HN Posts by Hour

In [None]:
# Mean comments per ask post for every hour seen, stored as [hour, mean] pairs
# in the same order as the keys of counts_by_hour.
avg_by_hour = [
    [hour, comments_by_hour[hour] / post_count]
    for hour, post_count in counts_by_hour.items()
]
print(avg_by_hour)

# Sorting the data and creating a list of lists

Although we now have the results we need, this format makes it hard to identify the hours with the highest values. Let's finish by sorting the list of lists and printing the five highest values in a format that's easier to read

In [None]:
# Put the average first in each pair so that a plain descending sort orders the
# rows by average comments (ties fall back to the hour string).
swap_avg_by_hour = [[mean_comments, hour] for hour, mean_comments in avg_by_hour]

sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(reverse=True)

print('Top 5 Hours for Ask Posts Comments')
line_template = '{time}: {avg:.2f} average comments per post'
for mean_comments, hour in sorted_swap[:5]:
    # Re-parse the '%H' hour string so it can be displayed as HH:MM.
    hour_label = dt.datetime.strptime(hour, '%H').strftime('%H:%M')
    print(line_template.format(time=hour_label, avg=mean_comments))

The hour that receives the most comments per post on average is 15:00, with an average of ~30 comments per post. That is roughly a 1.8x increase over the hour with the second-highest average number of comments.

According to the data set [documentation](https://www.kaggle.com/hacker-news/hacker-news-posts), the timezone used is Eastern Time in the US. So, we could also write 15:00 as 3:00 pm EST.

# Conclusion

* In this project, we analyzed ask posts and show posts to determine which type of post and time receive the most comments on average
* Based on our analysis, to maximize the number of comments a post receives, we'd recommend that the post be categorized as an ask post and created between 15:00 and 16:00 (3:00 pm EST - 4:00 pm EST)
* However, it should be noted that the data set we analyzed excluded posts without any comments
* Given that, it's more accurate to say that, of the posts that received comments, ask posts received more comments on average, and ask posts created between 15:00 and 16:00 (3:00 pm EST - 4:00 pm EST) received the most comments on average