In [3]:
import csv

# Read the entire CSV into memory as a list of rows (each row is a list of strings)
filename = 'HN_posts.csv'  # Point this at wherever your copy of the dataset lives
with open(filename, newline='', encoding='utf-8') as csv_file:
    # newline='' lets the csv module do its own newline handling inside quoted fields
    data = [row for row in csv.reader(csv_file)]

# Peek at the top of the file (header row included) to confirm the column layout
print(data[:5])


[['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'], ['12579008', 'You have two days to comment if you want stem cells to be classified as your own', 'http://www.regulations.gov/document?D=FDA-2015-D-3719-0018', '1', '0', 'altstar', '9/26/2016 3:26'], ['12579005', 'SQLAR  the SQLite Archiver', 'https://www.sqlite.org/sqlar/doc/trunk/README.md', '1', '0', 'blacksqr', '9/26/2016 3:24'], ['12578997', 'What if we just printed a flatscreen television on the side of our boxes?', 'https://medium.com/vanmoof/our-secrets-out-f21c1f03fdc8#.ietxmez43', '1', '0', 'pavel_lishin', '9/26/2016 3:19'], ['12578989', 'algorithmic music', 'http://cacm.acm.org/magazines/2011/7/109891-algorithmic-composition/fulltext', '1', '0', 'poindontcare', '9/26/2016 3:16']]


In [4]:
# Split the header row off the data.
#
# `data = data[1:]` is self-referential: without a guard, every re-run of this
# cell would throw away one more real post and overwrite `headers` with that
# post. The guard makes the cell idempotent by stripping the first row only
# while it is still the header, which starts with the column name 'id'
# (post ids themselves are numeric strings, so they never match).
# If the first row is not the header, `headers` and `data` are left untouched.
if data and data[0][:1] == ['id']:
    headers = data[0]  # Column names
    data = data[1:]    # Every remaining row is one post

# Check the first few rows after removing the header
print(data[:5])  # Display the first 5 rows of the data


[['12579008', 'You have two days to comment if you want stem cells to be classified as your own', 'http://www.regulations.gov/document?D=FDA-2015-D-3719-0018', '1', '0', 'altstar', '9/26/2016 3:26'], ['12579005', 'SQLAR  the SQLite Archiver', 'https://www.sqlite.org/sqlar/doc/trunk/README.md', '1', '0', 'blacksqr', '9/26/2016 3:24'], ['12578997', 'What if we just printed a flatscreen television on the side of our boxes?', 'https://medium.com/vanmoof/our-secrets-out-f21c1f03fdc8#.ietxmez43', '1', '0', 'pavel_lishin', '9/26/2016 3:19'], ['12578989', 'algorithmic music', 'http://cacm.acm.org/magazines/2011/7/109891-algorithmic-composition/fulltext', '1', '0', 'poindontcare', '9/26/2016 3:16'], ['12578979', 'How the Data Vault Enables the Next-Gen Data Warehouse and Data Lake', 'https://www.talend.com/blog/2016/05/12/talend-and-Â\x93the-data-vaultÂ\x94', '1', '0', 'markgainor1', '9/26/2016 3:14']]


In [5]:
# Sort posts into 'Ask HN' and 'Show HN' buckets based on how the title begins
ask_hn_posts, show_hn_posts = [], []

# Prefixes are tested in this order; matching is case-insensitive
buckets = (
    ('ask hn', ask_hn_posts),
    ('show hn', show_hn_posts),
)

for record in data:
    lowered_title = record[1].lower()  # column 1 holds the post title
    for prefix, bucket in buckets:
        if lowered_title.startswith(prefix):
            bucket.append(record)
            break  # a post lands in at most one bucket

# Show a small sample from each bucket
print("Ask HN Posts:")
print(ask_hn_posts[:5])

print("\nShow HN Posts:")
print(show_hn_posts[:5])


Ask HN Posts:
[['12578908', 'Ask HN: What TLD do you use for local development?', '', '4', '7', 'Sevrene', '9/26/2016 2:53'], ['12578522', 'Ask HN: How do you pass on your work when you die?', '', '6', '3', 'PascLeRasc', '9/26/2016 1:17'], ['12577908', 'Ask HN: How a DNS problem can be limited to a geographic region?', '', '1', '0', 'kuon', '9/25/2016 22:57'], ['12577870', 'Ask HN: Why join a fund when you can be an angel?', '', '1', '3', 'anthony_james', '9/25/2016 22:48'], ['12577647', 'Ask HN: Someone uses stock trading as passive income?', '', '5', '2', '00taffe', '9/25/2016 21:50']]

Show HN Posts:
[['12578335', 'Show HN: Finding puns computationally', 'http://puns.samueltaylor.org/', '2', '0', 'saamm', '9/26/2016 0:36'], ['12578182', 'Show HN: A simple library for complicated animations', 'https://christinecha.github.io/choreographer-js/', '1', '0', 'christinecha', '9/26/2016 0:01'], ['12578098', 'Show HN: WebGL visualization of DNA sequences', 'http://grondilu.github.io/dna.html

In [6]:
def _comment_stats(posts):
    """Return ``(total, mean)`` of the comment counts for a list of post rows.

    Each row must hold its comment count as an integer-like string at index 4
    (the 'num_comments' column). For an empty list the total is 0 and the mean
    is ``float('nan')``, so the cell reports "nan" instead of crashing with
    ZeroDivisionError when a category has no posts.
    """
    total = sum(int(post[4]) for post in posts)
    mean = total / len(posts) if posts else float('nan')
    return total, mean


# One helper call per category replaces the two copy-pasted summing loops
total_ask_comments, avg_ask_comments = _comment_stats(ask_hn_posts)
total_show_comments, avg_show_comments = _comment_stats(show_hn_posts)

# Print results
print(f"Average comments on 'Ask HN' posts: {avg_ask_comments:.2f}")
print(f"Average comments on 'Show HN' posts: {avg_show_comments:.2f}")


Average comments on 'Ask HN' posts: 10.39
Average comments on 'Show HN' posts: 4.89


In [7]:
from datetime import datetime as dt

# Per-hour tallies for Ask HN posts, keyed by the zero-padded hour string
counts_by_hour = {}    # hour -> number of posts created during that hour
comments_by_hour = {}  # hour -> comments summed over those posts

for post in ask_hn_posts:
    # Column 6 is the creation timestamp, e.g. '9/26/2016 3:26'
    posted_at = dt.strptime(post[6], "%m/%d/%Y %H:%M")
    hour = posted_at.strftime("%H")

    # Make sure both tallies have an entry before accumulating into them
    counts_by_hour.setdefault(hour, 0)
    comments_by_hour.setdefault(hour, 0)

    counts_by_hour[hour] += 1
    comments_by_hour[hour] += int(post[4])  # column 4 is the comment count

# One [hour, mean comments per post] pair for every hour that had any posts
avg_comments_by_hour = [
    [hour, comments_by_hour[hour] / counts_by_hour[hour]]
    for hour in counts_by_hour
]

# Rank the hours from highest to lowest mean comment count
sorted_avg_comments = list(avg_comments_by_hour)
sorted_avg_comments.sort(key=lambda pair: pair[1], reverse=True)

# Report the five best hours
print("Top 5 Hours for 'Ask HN' Posts (by average comments):")
for hour, avg_comments in sorted_avg_comments[:5]:
    print(f"{hour}:00 - {avg_comments:.2f} average comments")


Top 5 Hours for 'Ask HN' Posts (by average comments):
15:00 - 28.68 average comments
13:00 - 16.32 average comments
12:00 - 12.38 average comments
02:00 - 11.14 average comments
10:00 - 10.68 average comments


In [8]:
# Build the report line by line, then emit it in one go; the hour label is
# left-aligned in a 6-character column so the dashes line up.
report_lines = ["\nTop 5 Hours for 'Ask HN' Posts (by average comments):"]
for hour, avg_comments in sorted_avg_comments[:5]:
    formatted_hour = f"{hour}:00"  # e.g. '15' -> '15:00'
    report_lines.append(f"{formatted_hour:<6} - {avg_comments:.2f} average comments")
print("\n".join(report_lines))



Top 5 Hours for 'Ask HN' Posts (by average comments):
15:00  - 28.68 average comments
13:00  - 16.32 average comments
12:00  - 12.38 average comments
02:00  - 11.14 average comments
10:00  - 10.68 average comments
