In [None]:
pip install newspaper3k

In [None]:
import numpy as np
import pandas as pd
from time import sleep
from newspaper import Article
from google.cloud import bigquery
from functools import reduce

In [None]:
# The client configures and sends every BigQuery API request made below.
# Leaving the constructor arguments empty will initiate Kaggle's public dataset BigQuery integration.
client = bigquery.Client()

In [None]:
# Construct a reference to the "hacker_news" dataset.
# Client.dataset() is deprecated in google-cloud-bigquery; building the
# DatasetReference directly yields the same reference.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "hacker_news")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# Basic SQL queries against the HN dataset

We start our analysis by investigating the top domains that HN users use as sources.

Steps:
1. Extract domains from the stories' URLs using a regular expression.
2. Exclude stories without URLs.
3. Include stories published between '2018-01-01' and '2021-04-01' containing selected keywords in their titles or texts.
4. `COUNT` the stories per domain, keep the top 50 domains for each keyword, and store the counts in the column `c`.

In [None]:
# Keyword lists, one per umbrella topic. Each entry is later used verbatim
# as a search pattern against story titles and texts.

# Environment, Sustainability and Resilience
umb0 = [
    'climate crisis',
    'global warming',
    'right to repair',
    'climate change',
    'sustainable',
    'sustainability',
]

# Decentralising Power and Building Alternatives
umb1 = [
    'open-source',
    'blockchain',
    'competition',
    'decentralization',
    'decentralisation',
    'decentralized',
    'decentralised',
]

# Public Space and Sociality
umb2 = [
    'public space',
    'smart city',
    'offline',
]

# Privacy, Identity, and Data Governance
umb3 = [
    'privacy',
    'data privacy',
    'data governance',
    'personal data',
    'digital id',
    'e-id',
    'encryption',
    'anonymity',
]

# Trustworthy Information Flows, Cybersecurity and Democracy
umb4 = [
    'fake news',
    'filter bubble',
    'democracy',
    'democratically',
    'cybersecurity',
    'censorship',
]

# Access, Inclusion and Justice
umb5 = [
    'discrimination',
    'justice',
    'ethical',
    'ethics',
    'inclusive',
    'freedom',
    'human rights',
    'open internet',
    'equality',
]

In [None]:
# Gather the umbrella-topic keyword lists in topic order.
# The lists are named explicitly instead of being looked up with eval() on
# generated variable names, so no string is executed as code.
umbs = [umb0, umb1, umb2, umb3, umb4, umb5]
n_topics = len(umbs)

In [None]:
# Run one SQL query per keyword against the HN table and collect, for each
# keyword, the 50 most frequent story domains together with their counts.
# Spaces and hyphens are replaced so the keyword can serve as a column alias.
repls = (' ', '_'), ('-', '_')

dfs=[]

for topic_keywords in umbs:
    for keyword in topic_keywords:
        # Derive the alias of the domain column from the keyword.
        alias = str(keyword)
        for old, new in repls:
            alias = alias.replace(old, new)
        print(alias)
        pattern = str(keyword)
        sql = """
        #standardSQL
        SELECT REGEXP_EXTRACT(url, '//([^/]*)/?') {}, COUNT(*) c
        FROM `bigquery-public-data.hacker_news.full`
        WHERE url!='' AND (REGEXP_CONTAINS(text, r"{}") OR REGEXP_CONTAINS(title, r"{}")) AND timestamp BETWEEN '2018-01-01' AND '2021-04-01' AND type='story' 
        GROUP BY {} ORDER BY c DESC LIMIT 50""".format(alias, pattern, pattern, alias)
        # Submit the query and keep its result as a DataFrame.
        job = client.query(sql)
        dfs.append(job.to_dataframe())

In [None]:
# Give the first column of every result frame the common name 'keyws'
# (each query aliased it differently), so the frames can be combined.
for result in dfs:
    first_column = result.columns[0]
    result.rename(columns={first_column: 'keyws'}, inplace=True)

In [None]:
# Stack the per-keyword domain-count frames into a single frame.
dfs_c=pd.concat(dfs)

In [None]:
# Add up each domain's counts across all keyword queries (the 'keyws' column
# holds the extracted domains), then order by total, largest first.
domain_totals = dfs_c.groupby('keyws').sum()
top_domains = domain_totals.sort_values('c', ascending=False)

In [None]:
# Show the 50 domains with the highest summed counts.
top_domains.head(50)

In [None]:
# Run one SQL query per keyword against the HN table and collect the full
# rows of every matching story, one DataFrame per keyword.
repls = (' ', '_'), ('-', '_')

dfs=[]

for topic_keywords in umbs:
    for keyword in topic_keywords:
        # The underscore form of the keyword is only printed as a progress label here.
        label = str(keyword)
        for old, new in repls:
            label = label.replace(old, new)
        print(label)
        pattern = str(keyword)
        sql = """
        #standardSQL
        SELECT *
        FROM `bigquery-public-data.hacker_news.full`
        WHERE url!='' AND (REGEXP_CONTAINS(text, r"{}") OR REGEXP_CONTAINS(title, r"{}")) AND timestamp BETWEEN '2018-01-01' AND '2021-04-01' AND type='story' 
        """.format(pattern, pattern)
        # Submit the query and keep its result as a DataFrame.
        job = client.query(sql)
        dfs.append(job.to_dataframe())

In [None]:
# Number of result frames collected (one was appended per keyword).
len(dfs)

In [None]:
# Print the position of every result frame next to its row count.
for position, frame in enumerate(dfs):
    print(position, len(frame))

In [None]:
# Column order of every scraped-article frame.
_ARTICLE_COLUMNS = ['title', 'link', 'date', 'authors', 'text']


def _scrape_article(url):
    """Download and parse the article at ``url`` and return its fields as a dict.

    The dict has the keys 'title', 'link', 'date', 'authors' and 'text'.
    If downloading or parsing fails, every field except 'link' is NaN.
    """
    record = {'title': np.nan, 'link': url, 'date': np.nan, 'authors': np.nan, 'text': np.nan}
    try:
        article = Article(url)
        article.download()
        article.parse()
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt can still stop a long-running scrape.
        return record
    record['title'] = article.title
    record['date'] = article.publish_date
    record['authors'] = article.authors
    record['text'] = article.text
    return record


# One frame of scraped articles per keyword result frame, in the same order as dfs.
l_dfs=[]

for n, df in enumerate(dfs):
    print('keyword: '+str(n))
    # Scrape each distinct URL only once per keyword.
    df=df.drop_duplicates(subset=['url'])
    records=[]

    for no, url in enumerate(df['url'].tolist()):
        if no%100==0:
            # Report progress and pause for a second every 100 articles.
            print(no)
            sleep(1)
        records.append(_scrape_article(url))

    # Passing the columns explicitly keeps the schema even when no URL was scraped.
    l_dfs.append(pd.DataFrame(records, columns=_ARTICLE_COLUMNS))

In [None]:
# Flatten the per-topic keyword lists into one list, preserving topic order.
kl = []
for topic_keywords in umbs:
    kl.extend(topic_keywords)

In [None]:
# Label every scraped frame with the keyword found at the same position in kl.
for position in range(len(l_dfs)):
    l_dfs[position]['keyw'] = kl[position]

In [None]:
# Combine the per-keyword article frames into one frame.
# Each frame keeps its own row labels, so index values repeat across keywords.
dfc=pd.concat(l_dfs)

In [None]:
# dfc=pd.read_csv('../input/hackernews-umbrella-topics/hn.csv')

In [None]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Rows with missing text cannot be tokenized, so drop them first.
dfc=dfc.dropna(subset=['text'])

# Lowercase every document and split it into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
docs=[tokenizer.tokenize(document.lower()) for document in dfc['text'].tolist()]

In [None]:
# Store each row's token list; docs was built from dfc['text'] in row order.
dfc['token']=docs

In [None]:
# Preview the first rows of the tokenized frame.
dfc.head()

In [None]:
# dfc=dfc[dfc['keyw'].isin(umb0)]

In [None]:
# Write the tokenized articles to the Kaggle working directory as a semicolon-separated file.
dfc.to_csv('/kaggle/working/hn_token.csv',sep=';')

In [None]:
# Display the final frame.
dfc