<a href="https://colab.research.google.com/github/marinathomas/SentimentAnalysisHN/blob/master/HN_SentimentAnalysis_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Load credentials to access BigQuery
2. Read the story ids for 2017 from the 'full' table.
3. For each story, get the associated 'main' comments. We are not considering response to the comments for now.
4. Analyze the comments and give the story a score based off the sentiment of the comments.

Step 1 - Load credentials

In [26]:
from google.cloud import bigquery
import pandas as pd
import os

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
root_path = "/content/gdrive/Shared drives/HackerNews:SentimentAnalysis/"
credential_path = root_path+"hackernews-bigquery-261019-0f8cc2295b63.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

Step 2 - Install the sentiment analysis library
https://github.com/cjhutto/vaderSentiment

In [28]:
!pip install vaderSentiment



In [0]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

Step 3 - Load BigQuery client and HackerNews dataset
1. Load the BigQuery client
2. Get a reference to HackerNews dataset
3. Load the data set

In [0]:
client = bigquery.Client()
hn_dataset_ref = client.dataset('hacker_news', project='bigquery-public-data')
hn_dset = client.get_dataset(hn_dataset_ref)

Step 4 - Look for stories of 2017

In [0]:
def get_stories():
    query = """
    SELECT table_full.id, table_full.title, table_full.url, table_full.score, table_full.descendants
    FROM `bigquery-public-data.hacker_news.full` as table_full
    WHERE  table_full.type = 'story' and REGEXP_CONTAINS(title, r"(S|s)how HN") and (deleted IS NULL or deleted IS FALSE) and  EXTRACT(YEAR FROM timestamp)=2017
    and table_full.descendants > 5
    --and (table_full.descendants = 0 or table_full.descendants <5)
    ORDER BY id asc
    --limit 50
    """

    query_job = client.query(query)
    iterator = query_job.result(timeout=30)
    rows = list(iterator)

    # Transform the rows into a nice pandas dataframe
    stories = pd.DataFrame(data=[list(x.values()) for x in rows], columns=list(rows[0].keys()))
    #stories.head(10)

    return stories

Lets check the data

In [0]:
import csv

stories = get_stories()
csv_file = root_path + "stories_with_more_than_5_comments_2017.csv"
with open(csv_file, mode='w') as stories_file:
  stories_writer = csv.writer(stories_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
  for index,row in stories.iterrows():
      title, parent_id, score, noOf_comments, url = row['title'], row['id'], row['score'],row['descendants'], row['url']
      stories_writer.writerow([index, title, parent_id, url, noOf_comments])
      #print("----------------------------------------")
      #print('{} Title: {} \t  ID: {} \t url: {} \t descendants:{}'.format(index, title, parent_id, url, noOf_comments))



Let's bring up the comments for the above stories

Step 5 - For each story, bring up the associated comment

In [0]:
def get_comments(parent_id):
    query = """
    select  table_full.id, table_full.text
    from `bigquery-public-data.hacker_news.full`  as table_full
    where type = 'comment'  and (deleted IS NULL or deleted IS FALSE) and parent = @parent
    order by parent ;
    """

    query_params = [
        bigquery.ScalarQueryParameter("parent", "INT64", parent_id)
    ] 

    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = query_params
    query_job = client.query(query,location="US",job_config=job_config,)  

    iterator = query_job.result(timeout=30)
    rows = list(iterator)

    comments = pd.DataFrame()
    # Transform the rows into a nice pandas dataframe
    if rows:
      comments_temp = pd.DataFrame(data=[list(x.values()) for x in rows], columns=list(rows[0].keys()))
      comments.append(comments_temp)
      #comments.head(20)

    return comments

Step 6 - Analyse comments

In [0]:
for index, row in stories.iterrows():
  parent_id = 14497295 #row['id']
  comments = get_comments(parent_id)
  analyse_comments(comments, parent_id)
  break;

In [0]:
def analyse_comments(comments, parent_id):
  scores = []
  csv_file = root_path + "sentiment_analysis_results/comments_for_"+str(parent_id)+".csv"

  with open(csv_file, mode='w') as comments_file:
    comments_writer = csv.writer(comments_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for index,row in comments.iterrows():
        sentence = row['text']
        score = analyser.polarity_scores(str(sentence))
        print("{}\t Comment: {} \t SCORE: {}".format(index, sentence, str(score)))
        comments_writer.writerow([index, sentence, str(score)])
        scores.append(score)
  return scores


TODO: Analyze comments of comments ==> Will take up later

In [0]:
def score_story(scores):
  story_score = 0
  for row in scores:
    compound_score = row['compound']
    if compound_score >= 0.05:
      story_score += 1
    elif compound_score <= -0.05:
      story_score -= 1
  return story_score


In [0]:
csv_file = root_path + "sentiment_analysis_results/scores_stories_with_comments_2017.csv"

def analyse_hacker_news():
  stories = get_stories()
  scored_stories = []
  for index, row in stories.iterrows():
    parent_id = row['id']
    comments = get_comments(parent_id)
    story_point=0
    if not comments.empty:
      scores = analyse_comments(comments, parent_id)
      story_point = score_story(scores)
    with open(csv_file, mode='a') as scores_file:
      scores_writer = csv.writer(scores_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
      scores_writer.writerow([index, row['title'], parent_id, row['url'], row['score'],row['descendants'],story_point])
    print("{} Story {} with id {} and noOf_Comments {} url {} with original score O_Score {} scored {}".format(index, row['title'], parent_id, row['descendants'], row['url'], row['score'], story_point))
    break;


In [58]:
analyse_hacker_news()

0 Story Show HN: Blink my keyboard lights when you visit this page with id 13293894 and noOf_Comments 22 url http://lelandbatey.com/posts/2016/12/Making-lights-blink-for-each-HTTP-request/ with original score O_Score 87 scored 0
