<a href="https://colab.research.google.com/github/marinathomas/SentimentAnalysisHN/blob/master/HN_SentimentAnalysis_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Load credentials to access BigQuery
2. Read the 2017 "Show HN" stories (id, title, url) from the 'full' table.
3. For each story, get the associated top-level ('main') comments. Replies to comments are not considered for now.
4. Analyze the comments and give the story a score based off the sentiment of the comments.

Step 1 - Load credentials

In [44]:
from google.cloud import bigquery
import pandas as pd
import os

from google.colab import drive
# Mount Google Drive at /content/gdrive; root_path (set in a later cell) points inside this mount.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Folder on the mounted shared drive; the scores CSV written later is placed here.
root_path = "/content/gdrive/Shared drives/HackerNews:SentimentAnalysis/"
# Service-account key file handed to the Google client libraries via the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
# NOTE(review): this path is relative, so it is resolved against the kernel's
# working directory rather than root_path. If the key file actually lives in the
# shared drive, this should be joined with root_path - confirm where it is stored.
credential_path = "hackernews-bigquery-261019-0f8cc2295b63.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

Step 2 - Install the sentiment analysis library
https://github.com/cjhutto/vaderSentiment

In [3]:
!pip install vaderSentiment

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/86/9e/c53e1fc61aac5ee490a6ac5e21b1ac04e55a7c2aba647bb8411c9aadf24e/vaderSentiment-3.2.1-py2.py3-none-any.whl (125kB)
[K     |██▋                             | 10kB 19.5MB/s eta 0:00:01[K     |█████▏                          | 20kB 3.3MB/s eta 0:00:01[K     |███████▉                        | 30kB 4.7MB/s eta 0:00:01[K     |██████████▍                     | 40kB 3.1MB/s eta 0:00:01[K     |█████████████                   | 51kB 3.9MB/s eta 0:00:01[K     |███████████████▋                | 61kB 4.6MB/s eta 0:00:01[K     |██████████████████▎             | 71kB 5.3MB/s eta 0:00:01[K     |████████████████████▉           | 81kB 5.9MB/s eta 0:00:01[K     |███████████████████████▍        | 92kB 6.6MB/s eta 0:00:01[K     |██████████████████████████      | 102kB 5.2MB/s eta 0:00:01[K     |████████████████████████████▋   | 112kB 5.2MB/s eta 0:00:01[K     |███████████████████████████████▎| 12

In [0]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single shared analyser instance; analyse_comments() calls its polarity_scores() for every comment.
analyser = SentimentIntensityAnalyzer()

Step 3 - Load BigQuery client and HackerNews dataset
1. Load the BigQuery client
2. Get a reference to HackerNews dataset
3. Load the data set

In [46]:
# BigQuery client used by get_stories() and get_comments(). It is created with no
# explicit credentials, so it relies on the environment (GOOGLE_APPLICATION_CREDENTIALS
# is set in an earlier cell, which must have been run first).
client = bigquery.Client()
# Reference to the public Hacker News dataset, then fetch the dataset object itself.
# NOTE(review): hn_dset is not referenced by any of the cells visible in this notebook;
# the queries below name the table directly.
hn_dataset_ref = client.dataset('hacker_news', project='bigquery-public-data')
hn_dset = client.get_dataset(hn_dataset_ref)

DefaultCredentialsError: ignored

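Step 4 - Fetch the 2017 "Show HN" stories, highest score first (the `LIMIT 3` in the query is currently commented out, so all matching stories are returned)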

In [0]:
def get_stories():
    """Fetch the 2017 "Show HN" stories from the public Hacker News dataset.

    Runs a query through the module-level BigQuery ``client`` against
    ``bigquery-public-data.hacker_news.full``, keeping non-deleted rows of type
    'story' whose title contains "Show HN" or "show HN" and whose timestamp
    falls in 2017, ordered by score in descending order. The call waits up to
    30 seconds for the query result.

    Returns:
        pandas.DataFrame: one row per story with the columns ``id``, ``title``
        and ``url``. When the query matches nothing, an empty frame with those
        same three columns is returned instead of raising.
    """
    query = """
    SELECT table_full.id, table_full.title, table_full.url
    FROM `bigquery-public-data.hacker_news.full` as table_full
    WHERE  table_full.type = 'story' and REGEXP_CONTAINS(title, r"(S|s)how HN") and (deleted IS NULL or deleted IS FALSE) and  EXTRACT(YEAR FROM timestamp)=2017
    ORDER BY SCORE desc
    --LIMIT 3
    """

    query_job = client.query(query)
    rows = list(query_job.result(timeout=30))

    # With no rows there is no first row to read the column names from, so fall
    # back to the columns named in the SELECT list above.
    if not rows:
        return pd.DataFrame(columns=["id", "title", "url"])

    # Transform the rows into a pandas dataframe, taking column names from the first row.
    return pd.DataFrame(
        data=[list(row.values()) for row in rows],
        columns=list(rows[0].keys()),
    )

Let's load the stories

In [0]:
import csv

# Load the stories once at notebook level; later cells read this `stories` frame.
stories = get_stories()
# The lines below are a disabled export of the stories to stories_2017.csv under root_path.
# NOTE(review): if this is re-enabled, the print's format string has five {} placeholders
# (including "Descendants") but only four arguments are passed, so it would raise.
#csv_file = root_path + "stories_2017.csv"
#with open(csv_file, mode='w') as stories_file:
  #stories_writer = csv.writer(stories_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
  #for index,row in stories.iterrows():
      #title, parent_id, url = row['title'], row['id'], row['url']
      #stories_writer.writerow([index, title, parent_id, url])
      #print("----------------------------------------")
      #print('{} Title: {} \t  Descendants: {} \t  ID: {} \t url: {}'.format(index, title, parent_id, url))



Let's bring up the comments for the above stories

Step 5 - For each story, fetch its top-level comments

In [0]:
def get_comments(parent_id):
    """Fetch the direct (top-level) comments of one Hacker News item.

    Runs a parameterised query through the module-level BigQuery ``client``
    against ``bigquery-public-data.hacker_news.full``, selecting non-deleted
    rows of type 'comment' whose ``parent`` equals ``parent_id``. Replies to
    those comments are not included. The call waits up to 30 seconds for the
    query result.

    Args:
        parent_id: id of the parent item; bound to the query as an INT64
            parameter named ``parent``.

    Returns:
        pandas.DataFrame: one row per comment with the columns ``id`` and
        ``text``. When the item has no matching comments, an empty frame with
        those same two columns is returned instead of raising.
    """
    query = """
    select  table_full.id, table_full.text
    from `bigquery-public-data.hacker_news.full`  as table_full
    where type = 'comment'  and (deleted IS NULL or deleted IS FALSE) and parent = @parent
    order by parent ;
    """

    # Bind the id as a query parameter rather than formatting it into the SQL text.
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("parent", "INT64", parent_id),
        ]
    )
    query_job = client.query(query, location="US", job_config=job_config)

    rows = list(query_job.result(timeout=30))

    # A story without comments yields no rows, and then there is no first row to
    # read the column names from; fall back to the columns named in the SELECT list.
    if not rows:
        return pd.DataFrame(columns=["id", "text"])

    # Transform the rows into a pandas dataframe, taking column names from the first row.
    return pd.DataFrame(
        data=[list(row.values()) for row in rows],
        columns=list(rows[0].keys()),
    )

Step 6 - Analyse comments

In [0]:
# Only the first story is needed here: pull its comments so the next cells have
# a sample `comments` frame to work with. Nothing is assigned when `stories` is empty.
first_story = next(stories.iterrows(), None)
if first_story is not None:
  index, row = first_story
  parent_id = row['id']
  comments = get_comments(parent_id)

In [0]:
def analyse_comments(comments):
    """Score every comment in a frame with the module-level ``analyser``.

    Args:
        comments: DataFrame with a ``text`` column; each value is converted
            with ``str()`` before being scored.

    Returns:
        list: the result of ``analyser.polarity_scores`` for each row, in row order.
    """
    return [
        analyser.polarity_scores(str(record['text']))
        for _, record in comments.iterrows()
    ]


In [14]:
analyse_comments(comments)

[{'compound': 0.4376, 'neg': 0.0, 'neu': 0.776, 'pos': 0.224},
 {'compound': 0.3164, 'neg': 0.358, 'neu': 0.124, 'pos': 0.518},
 {'compound': 0.8624, 'neg': 0.0, 'neu': 0.803, 'pos': 0.197},
 {'compound': 0.3703, 'neg': 0.034, 'neu': 0.912, 'pos': 0.054},
 {'compound': 0.34, 'neg': 0.0, 'neu': 0.87, 'pos': 0.13},
 {'compound': 0.5093, 'neg': 0.0, 'neu': 0.891, 'pos': 0.109},
 {'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0},
 {'compound': 0.636, 'neg': 0.106, 'neu': 0.597, 'pos': 0.298},
 {'compound': 0.34, 'neg': 0.0, 'neu': 0.906, 'pos': 0.094},
 {'compound': 0.5562, 'neg': 0.0, 'neu': 0.754, 'pos': 0.246},
 {'compound': 0.7501, 'neg': 0.0, 'neu': 0.319, 'pos': 0.681},
 {'compound': 0.858, 'neg': 0.062, 'neu': 0.661, 'pos': 0.277},
 {'compound': 0.7263, 'neg': 0.0, 'neu': 0.596, 'pos': 0.404},
 {'compound': 0.8196, 'neg': 0.0, 'neu': 0.716, 'pos': 0.284},
 {'compound': 0.8268, 'neg': 0.0, 'neu': 0.365, 'pos': 0.635},
 {'compound': 0.9511, 'neg': 0.0, 'neu': 0.744, 'pos': 0.256},

TODO: Analyze comments of comments ==> Will take up later

In [0]:
def score_story(scores):
  """Collapse per-comment sentiment scores into one integer for the story.

  Each entry contributes +1 when its 'compound' value is at least 0.05, -1 when
  it is at most -0.05, and 0 otherwise; the contributions are summed.
  (The 0.05 cut-offs are presumably the ones suggested for VADER compound
  scores - confirm against the library README.)

  Args:
    scores: iterable of mappings that each carry a 'compound' value.

  Returns:
    int: number of positive comments minus number of negative comments.
  """
  def vote(compound):
    if compound >= 0.05:
      return 1
    return -1 if compound <= -0.05 else 0

  return sum(vote(entry['compound']) for entry in scores)


In [0]:
from time import sleep
# Destination for the per-story scores written by analyse_hacker_news(); placed under root_path.
csv_file = root_path + "scores_2017.csv"

def analyse_hacker_news():
  """Score every story by the sentiment of its comments and save the results.

  Loads the stories with get_stories(), and for each one fetches its comments
  (get_comments), scores them (analyse_comments) and reduces them to a single
  story score (score_story). One CSV row per story is written to the
  module-level ``csv_file`` path - index, title, id, url, score, with no header
  row - and the same information is printed as progress output. The file is
  overwritten on every run.

  Returns:
    None
  """
  stories = get_stories()
  # newline='' is how the csv module expects its output file to be opened, and an
  # explicit UTF-8 encoding keeps non-ASCII titles from depending on the
  # platform's default encoding.
  with open(csv_file, mode='w', newline='', encoding='utf-8') as scores_file:
    scores_writer = csv.writer(scores_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for index, row in stories.iterrows():
      parent_id = row['id']
      comments = get_comments(parent_id)
      scores = analyse_comments(comments)
      story_point = score_story(scores)
      scores_writer.writerow([index, row['title'], parent_id, row['url'], story_point])
      # Pause one second between stories before issuing the next query.
      sleep(1)
      print("{} Story {} with id {} and url {} scored {}".format(index, row['title'], parent_id, row['url'], story_point))


In [41]:
analyse_hacker_news()

Forbidden: ignored