<a href="https://colab.research.google.com/github/marinathomas/SentimentAnalysisHN/blob/master/HN_SentimentAnalysis_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Load credentials to access BigQuery
2. Read the story ids for 2017 from the 'full' table.
3. For each story, get the associated 'main' (top-level) comments. Replies to comments are not considered for now.
4. Analyze the comments and give the story a score based on the sentiment of the comments.

Step 1 - Load credentials

In [75]:
from google.cloud import bigquery
import pandas as pd
import os

# Mount Google Drive (Colab only) so that the shared-drive folder used below for
# the credentials file and the result CSVs is reachable under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Shared-drive folder that holds the service-account key file and receives the result CSVs.
root_path = "/content/gdrive/Shared drives/HackerNews:SentimentAnalysis/"
# root_path already ends with "/", so join() simply appends the key file name.
credential_path = os.path.join(root_path, "hackernews-bigquery-261019-0f8cc2295b63.json")
# Export the key location through the environment for the BigQuery client created later.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

Step 2 - Install the sentiment analysis library
https://github.com/cjhutto/vaderSentiment

In [28]:
!pip install vaderSentiment



In [0]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single shared analyser instance; analyse_comments() calls its polarity_scores()
# once per comment.
analyser = SentimentIntensityAnalyzer()

Step 3 - Load BigQuery client and HackerNews dataset
1. Load the BigQuery client
2. Get a reference to HackerNews dataset
3. Load the data set

In [0]:
# BigQuery client shared by every query function in this notebook.
client = bigquery.Client()
# Build the reference directly: Client.dataset() is deprecated in
# google-cloud-bigquery in favour of the DatasetReference constructor.
hn_dataset_ref = bigquery.DatasetReference('bigquery-public-data', 'hacker_news')
# Fetch the dataset metadata for the public Hacker News dataset.
hn_dset = client.get_dataset(hn_dataset_ref)

Step 4 - Look for 'Show HN' stories from 2017 with more than 5 comments

In [0]:
def get_stories():
    """Fetch the 2017 "Show HN" stories that have more than 5 descendants.

    Runs a query against `bigquery-public-data.hacker_news.full` using the
    module-level `client`, keeping non-deleted rows of type 'story' whose
    title matches "Show HN"/"show HN", ordered by id.

    Returns:
        pandas.DataFrame with the columns id, title, url, score and
        descendants. The frame is empty (but still has those columns) when
        the query matches nothing.
    """
    query = """
    SELECT table_full.id, table_full.title, table_full.url, table_full.score, table_full.descendants
    FROM `bigquery-public-data.hacker_news.full` as table_full
    WHERE  table_full.type = 'story' and REGEXP_CONTAINS(title, r"(S|s)how HN") and (deleted IS NULL or deleted IS FALSE) and  EXTRACT(YEAR FROM timestamp)=2017
    and table_full.descendants > 5
    --and (table_full.descendants = 0 or table_full.descendants <5)
    ORDER BY id asc
    --limit 50
    """

    query_job = client.query(query)
    rows = list(query_job.result(timeout=30))

    # Without this guard rows[0] below raises IndexError on an empty result.
    if not rows:
        return pd.DataFrame(columns=["id", "title", "url", "score", "descendants"])

    # Transform the rows into a pandas dataframe, taking column names from the first row.
    stories = pd.DataFrame(data=[list(x.values()) for x in rows], columns=list(rows[0].keys()))

    return stories

Let's check the data

In [0]:
import csv

# Fetch the stories once and keep a CSV snapshot (index, title, id, url,
# number of comments) next to the other results on the shared drive.
stories = get_stories()
csv_file = root_path + "sentiment_analysis_results/stories_with_more_than_5_comments_2017.csv"
# newline='' is what the csv module expects of files it writes to (it emits its
# own line terminators); utf-8 keeps non-ASCII titles writable on any platform.
with open(csv_file, mode='w', newline='', encoding='utf-8') as stories_file:
  stories_writer = csv.writer(stories_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
  for index, row in stories.iterrows():
      stories_writer.writerow([index, row['title'], row['id'], row['url'], row['descendants']])



Let's bring up the comments for the above stories

Step 5 - For each story, bring up the associated comments

In [0]:
def get_comments(parent_id):
    """Fetch the non-deleted comments whose direct parent is `parent_id`.

    Only rows with `parent = parent_id` are selected, so replies to comments
    are not included. The id is passed as a BigQuery query parameter rather
    than being formatted into the SQL text.

    Args:
        parent_id: id of the parent item (a story id in this notebook); sent
            to BigQuery as an INT64 parameter.

    Returns:
        pandas.DataFrame with the columns id and text, ordered by comment id.
        The frame is empty (but still has those columns) when the parent has
        no matching comments.
    """
    # Ordering by id (not by parent, which is constant here) makes the row order deterministic.
    query = """
    select  table_full.id, table_full.text
    from `bigquery-public-data.hacker_news.full`  as table_full
    where type = 'comment'  and (deleted IS NULL or deleted IS FALSE) and parent = @parent
    order by table_full.id ;
    """

    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = [
        bigquery.ScalarQueryParameter("parent", "INT64", parent_id)
    ]
    query_job = client.query(query, location="US", job_config=job_config)

    rows = list(query_job.result(timeout=30))

    # Without this guard rows[0] below raises IndexError for a story that has
    # no direct, non-deleted comments.
    if not rows:
        return pd.DataFrame(columns=["id", "text"])

    # Transform the rows into a pandas dataframe, taking column names from the first row.
    comments = pd.DataFrame(data=[list(x.values()) for x in rows], columns=list(rows[0].keys()))

    return comments

Step 6 - Analyse comments

In [94]:
# Spot-check the pipeline on one known story before running it over everything.
# NOTE: analyse_comments() is defined in a later cell of this notebook; on a
# fresh kernel that cell has to be run before this one.
SAMPLE_STORY_ID = 14497295
parent_id = SAMPLE_STORY_ID
comments = get_comments(parent_id)
scores = analyse_comments(comments, parent_id)
for score in scores:
  print(str(score))

{'neg': 0.0, 'neu': 0.926, 'pos': 0.074, 'compound': 0.6249}
{'neg': 0.0, 'neu': 0.798, 'pos': 0.202, 'compound': 0.7476}
{'neg': 0.0, 'neu': 0.794, 'pos': 0.206, 'compound': 0.7146}
{'neg': 0.039, 'neu': 0.876, 'pos': 0.085, 'compound': 0.2263}
{'neg': 0.0, 'neu': 0.854, 'pos': 0.146, 'compound': 0.4404}
{'neg': 0.0, 'neu': 0.667, 'pos': 0.333, 'compound': 0.4588}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 0.825, 'pos': 0.175, 'compound': 0.8764}
{'neg': 0.0, 'neu': 0.816, 'pos': 0.184, 'compound': 0.8313}
{'neg': 0.0, 'neu': 0.677, 'pos': 0.323, 'compound': 0.9408}
{'neg': 0.091, 'neu': 0.75, 'pos': 0.159, 'compound': 0.8823}
{'neg': 0.032, 'neu': 0.717, 'pos': 0.251, 'compound': 0.9459}
{'neg': 0.42, 'neu': 0.58, 'pos': 0.0, 'compound': -0.4404}
{'neg': 0.072, 'neu': 0.691, 'pos': 0.237, 'compound': 0.9753}
{'neg': 0.0, 'neu': 0.94, 'pos': 0.06, 'compound': 0.2457}
{'neg': 0.0, 'neu': 0.741, 'pos': 0.259, 'compound': 0.4215}
{'neg': 0.0, 'neu': 0.718, 

In [0]:
def analyse_comments(comments, parent_id):
  """Score every comment and save the per-comment results to a CSV file.

  Each value of the 'text' column is converted with str() and passed to
  analyser.polarity_scores(). One CSV row (index, text, score) is written per
  comment to sentiment_analysis_results/comments_for_<parent_id>.csv under
  root_path; an existing file for the same parent is overwritten.

  Args:
    comments: DataFrame with a 'text' column, as returned by get_comments().
    parent_id: id of the story the comments belong to; only used to build the
      output file name.

  Returns:
    List with one polarity_scores() result per comment, in row order.
  """
  scores = []
  csv_file = root_path + "sentiment_analysis_results/comments_for_"+str(parent_id)+".csv"

  # newline='' is what the csv module expects of files it writes to, so that
  # newlines embedded in comment text round-trip unchanged; utf-8 keeps
  # non-ASCII comment text writable regardless of the platform default.
  with open(csv_file, mode='w', newline='', encoding='utf-8') as comments_file:
    comments_writer = csv.writer(comments_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for index, row in comments.iterrows():
      sentence = row['text']
      score = analyser.polarity_scores(str(sentence))
      comments_writer.writerow([index, sentence, str(score)])
      scores.append(score)
  return scores


TODO: Analyze replies to comments (nested comments) - to be taken up later.

In [0]:
def score_story(scores):
  """Collapse per-comment sentiment scores into one integer for the story.

  Every entry contributes +1 when its 'compound' value is at least 0.05,
  -1 when it is at most -0.05, and 0 otherwise.

  Args:
    scores: iterable of mappings that each have a 'compound' key.

  Returns:
    The sum of the contributions; 0 for an empty iterable.
  """
  compounds = (entry['compound'] for entry in scores)
  return sum(
      1 if value >= 0.05 else (-1 if value <= -0.05 else 0)
      for value in compounds
  )


In [0]:
# Dedicated name for the scores file: the generic global `csv_file` is also
# assigned by the stories-export cell, so reading it at call time could send
# the scores to the wrong file when cells are run out of order.
SCORES_CSV_FILE = root_path + "sentiment_analysis_results/scores_stories_with_comments_2017.csv"
# Kept so that anything still reading the old global name sees the same value.
csv_file = SCORES_CSV_FILE

def analyse_hacker_news():
  """Score every story returned by get_stories() and record the results.

  For each story the direct comments are fetched with get_comments(), scored
  with analyse_comments() and reduced to a single value with score_story().
  One row (index, title, id, url, score, descendants, sentiment score) per
  story is appended to SCORES_CSV_FILE and a summary line is printed.

  The file is opened in append mode, so running this again adds rows to the
  existing file rather than replacing it.

  Returns:
    None.
  """
  stories = get_stories()
  # Open once for the whole run instead of once per story; newline='' is what
  # the csv module expects of files it writes to.
  with open(SCORES_CSV_FILE, mode='a', newline='', encoding='utf-8') as scores_file:
    scores_writer = csv.writer(scores_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for index, row in stories.iterrows():
      parent_id = row['id']
      comments = get_comments(parent_id)
      scores = analyse_comments(comments, parent_id)
      story_point = score_story(scores)
      scores_writer.writerow([index, row['title'], parent_id, row['url'], row['score'],row['descendants'],story_point])
      # Flush per story so finished rows reach the file even if a later story fails.
      scores_file.flush()
      print("{} Story {} with id {} and noOf_Comments {} url {} with original score O_Score {} scored {}".format(index, row['title'], parent_id, row['descendants'], row['url'], row['score'], story_point))


In [99]:
# Run the whole pipeline: prints one summary line per story and appends one row
# per story to the scores CSV.
analyse_hacker_news()

0 Story Show HN: Blink my keyboard lights when you visit this page with id 13293894 and noOf_Comments 22 url http://lelandbatey.com/posts/2016/12/Making-lights-blink-for-each-HTTP-request/ with original score O_Score 87 scored 11
1 Story Show HN: Shadowsocks-rust – A fast SOCKSv5 proxy in Rust with id 13294375 and noOf_Comments 6 url https://github.com/loggerhead/shadowsocks-rust with original score O_Score 68 scored 4
2 Story Show HN: Math Worksheets for Kids with id 13295049 and noOf_Comments 8 url http://worksheets.guru/ with original score O_Score 18 scored 2
3 Story Show HN: Automated blind control via an Amazon Echo Dot and Raspberry Pi with id 13297192 and noOf_Comments 30 url https://jwahawis.github.io/automated-blinds with original score O_Score 101 scored 7
4 Story Show HN: Mastodon, a federated microblogging network with id 13303346 and noOf_Comments 61 url https://mastodon.social with original score O_Score 172 scored 7
5 Story Show HN: Castor, a live dashboard for your pro