# News Topic Modeling

This notebook derives topics from news article text.

The model used here is the Universal Sentence Encoder Lite from TensorFlow Hub:

https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder_lite

In [None]:
# # change gcloud config for running locally
# !gcloud config configurations activate news-site

In [1]:
from google.cloud import bigquery

import pandas
import re
import numpy
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import tensorflow_hub as hub
import sentencepiece as spm
from sklearn.cluster import KMeans

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# # use this approach to create local credentials
# token = !gcloud auth print-access-token
# import google.oauth2.credentials
# credentials = google.oauth2.credentials.Credentials(token[0])
# client = bigquery.Client(credentials=credentials)

In [3]:
# use credentials with client when running locally
# (no credentials are passed here, so the client presumably falls back to the
# environment's default credentials -- confirm for your setup)
client = bigquery.Client()

# Select the articles published within the last 7 days that have a title,
# a description and content, and join those three fields into one `text` column.
sql = """
    SELECT 
        article_id,
        publishedAt,
        CONCAT(title, '. ', description, '. ', content) AS text
    FROM `news-site-280319.news.articles`
    WHERE
      title IS NOT NULL
      AND description IS NOT NULL
      AND content IS NOT NULL
      AND DATE(publishedAt) >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
"""

# run the query and materialize the result as a pandas DataFrame
df = client.query(sql).to_dataframe()

# summarize column dtypes and non-null counts of the query result
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1244 entries, 0 to 1243
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   article_id   1244 non-null   object             
 1   publishedAt  1244 non-null   datetime64[ns, UTC]
 2   text         1244 non-null   object             
dtypes: datetime64[ns, UTC](1), object(2)
memory usage: 29.3+ KB


In [4]:
# Work on a copy of the query result: later cells add columns to news_df and
# rename/drop columns in place, and with a plain alias those changes would also
# modify `df`, leaving no untouched version of the raw data to go back to.
news_df = df.copy()
news_df.head()

Unnamed: 0,article_id,publishedAt,text
0,ca2e0821-9dca-4d7a-a61a-3b7f1bc30bdf,2021-02-21 22:16:12+00:00,Paris St Germain 0-2 Monaco: Pochettino loses ...
1,ca1fa44f-3a3a-4163-970d-6502b32837a0,2021-02-21 21:22:20+00:00,Ross County 1-0 Celtic: John Hughes' side off ...
2,d34bef50-cb17-4eef-a663-d8f0bdfa6557,2021-02-21 20:56:23+00:00,Manchester United 3-1 Newcastle United: Marcus...
3,8cebfabb-b80f-4c8d-95cd-455fb7296b7a,2021-02-21 21:06:00+00:00,2021/02/21 21:00 GMT. The latest five minute n...
4,a718a782-93dc-462a-ad35-9f539b2f0d35,2021-02-21 22:12:00+00:00,Iran and the IAEA come to a temporary agreemen...


In [5]:
# Load version 2 of the Universal Sentence Encoder Lite from TF Hub.
# NOTE(review): hub.Module looks like the TF1-style API, which would explain the
# tensorflow.compat.v1 / disable_v2_behavior() setup in the imports cell -- confirm.
module = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-lite/2")

In [6]:
# Sparse int64 placeholder (both dimensions left open) that receives the token
# ids at run time; its three components are wired to the module's inputs.
input_placeholder = tf.sparse_placeholder(dtype=tf.int64, shape=[None, None])
encodings = module(
    inputs={
        "values": input_placeholder.values,
        "indices": input_placeholder.indices,
        "dense_shape": input_placeholder.dense_shape,
    }
)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [7]:
# Evaluate the module's "spm_path" signature to get the location of the
# SentencePiece model file.
with tf.Session() as sess:
    spm_path = sess.run(module(signature="spm_path"))

# Read the serialized model through tf.io.gfile and load it into a
# SentencePiece processor, used later to turn text into token ids.
sp = spm.SentencePieceProcessor()
with tf.io.gfile.GFile(spm_path, mode="rb") as f:
    sp.LoadFromSerializedProto(f.read())
# spm_path comes back as bytes, so the message shows it with a b'...' prefix
print("SentencePiece model loaded at {}.".format(spm_path))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


SentencePiece model loaded at b'/var/folders/99/_fcjbs5d08ndbt_1__blsphm0000gn/T/tfhub_modules/539544f0a997d91c327c23285ea00c37588d92cc/assets/universal_encoder_8k_spm.model'.


In [8]:
def process_to_IDs_in_sparse_format(sp, sentences):
    """Tokenize sentences and return the ids in a tf.SparseTensor-like layout.

    Args:
        sp: object exposing ``EncodeAsIds(text)`` that returns a list of ids
            for one sentence (here the sentence piece processor).
        sentences: iterable of strings to encode.

    Returns:
        A ``(values, indices, dense_shape)`` tuple where ``values`` is the flat
        list of all ids in row order, ``indices`` holds the ``[row, col]``
        position of each value, and ``dense_shape`` is
        ``(number of sentences, length of the longest id sequence)``.
        An empty ``sentences`` yields ``([], [], (0, 0))``.
    """
    ids = [sp.EncodeAsIds(x) for x in sentences]
    # default=0 keeps an empty batch from raising ValueError inside max()
    max_len = max((len(x) for x in ids), default=0)
    dense_shape = (len(ids), max_len)
    values = [item for sublist in ids for item in sublist]
    indices = [[row, col]
               for row, sublist in enumerate(ids)
               for col in range(len(sublist))]
    return (values, indices, dense_shape)

In [9]:
# pass the text field from the news data into the messages
# (one string per article: title, description and content joined by the SQL query)
messages = news_df['text']
# tokenize every article with the SentencePiece processor `sp`; the three results
# are the pieces fed to the sparse input placeholder when embeddings are generated
values, indices, dense_shape = process_to_IDs_in_sparse_format(sp, messages)

In [10]:
# generate embeddings
# A new session is opened, variables and lookup tables are initialized, and the
# `encodings` tensor is evaluated once for the whole batch of articles.
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    message_embeddings = session.run(
        encodings,
        # supply the three components of the sparse input placeholder
        feed_dict={input_placeholder.values: values,
                   input_placeholder.indices: indices,
                   input_placeholder.dense_shape: dense_shape})

In [15]:
# convert message_embeddings to list for use with sklearn kmeans
# (tolist() yields one list of floats per article, stored row by row in the frame)
news_df['embedding'] = message_embeddings.tolist()

In [16]:
# preview the first rows, now including the embedding column
news_df.head()

Unnamed: 0,article_id,publishedAt,text,embedding
0,ca2e0821-9dca-4d7a-a61a-3b7f1bc30bdf,2021-02-21 22:16:12+00:00,Paris St Germain 0-2 Monaco: Pochettino loses ...,"[-0.028592605143785477, 0.04850605130195618, -..."
1,ca1fa44f-3a3a-4163-970d-6502b32837a0,2021-02-21 21:22:20+00:00,Ross County 1-0 Celtic: John Hughes' side off ...,"[-0.023736482486128807, 0.03493289276957512, -..."
2,d34bef50-cb17-4eef-a663-d8f0bdfa6557,2021-02-21 20:56:23+00:00,Manchester United 3-1 Newcastle United: Marcus...,"[0.016477283090353012, 0.00411869864910841, -0..."
3,8cebfabb-b80f-4c8d-95cd-455fb7296b7a,2021-02-21 21:06:00+00:00,2021/02/21 21:00 GMT. The latest five minute n...,"[-0.06045500934123993, -0.059935059398412704, ..."
4,a718a782-93dc-462a-ad35-9f539b2f0d35,2021-02-21 22:12:00+00:00,Iran and the IAEA come to a temporary agreemen...,"[0.04008898884057999, -0.012774740345776081, -..."


In [18]:
# stack the per-article embedding lists into one 2-D array (one row per article)
# so it can be passed to the kmeans model
X = numpy.vstack(news_df['embedding'].to_list())
X.shape

In [20]:
# Cluster the embeddings into 20 groups with k-means: k-means++ seeding,
# at most 100 iterations, and a fixed seed so reruns give the same assignment.
kmeans = KMeans(
    n_clusters=20,
    init='k-means++',
    max_iter=100,
    random_state=0,
)
# fit the model and get the cluster index assigned to every row of X
y_kmeans = kmeans.fit_predict(X)

In [21]:
# add cluster predictions to dataframe
# (y_kmeans comes from fit_predict on X, which was stacked from news_df['embedding'],
# so the labels line up with the rows of news_df)
news_df['cluster'] = y_kmeans

In [22]:
# add distance to nearest cluster
# transform() reuses the model that was already fitted when the cluster labels
# were predicted; fit_transform() would run k-means a second time, and its
# distances would only match the stored labels because random_state is fixed.
news_df['distance'] = numpy.min(kmeans.transform(X), axis=1)

In [26]:
# show the complete article text instead of a truncated column preview
pandas.set_option('display.max_colwidth', None)
# the five articles of cluster 1 with the smallest distance value
(
    news_df
    .loc[news_df['cluster'] == 1, ['text', 'cluster', 'distance']]
    .sort_values(by='distance')
    .head(5)
)

Unnamed: 0,text,cluster,distance
524,"Aretha Franklin's son puts upcoming 'Genius' series on blast, says family does not support it. Five weeks before the eight-part series starring Cynthia Erivo as Aretha, Kecalf Franklin claims a deal was ""pushed through without our consent."". Months after her death, a set of three handwritten wills were discovered in Franklin's home\r\nTime People\r\nAretha Franklins youngest son has slammed the forthcoming National Geographic series about th… [+3412 chars]",1,0.667728
1157,"HBO Max’s Bold, Witty and Wrenching AIDS Drama It’s a Sin Is the First Must-See Show of 2021. Russell T. Davies' bold, witty, wrenching HBO Max drama follows five young friends through the first decade of the AIDS crisis in London.. It’s one of the oldest stories of modern times: after growing up in a place that makes them feel like a freak or an abomination or a space alien, a young person lights out for the big city, in search… [+6306 chars]",1,0.686509
1187,"'It's a Sin' Review: HBO Max AIDS Drama is Revolutionary But Rushed. ""It's a Sin"" is the new show from ""Queer as Folk"" creator Russell T. Davies, streaming now on HBO Max.. The AIDS crisis of the 1980s and 1990s is a topic that has been covered in many excellent pieces of art in the U.S., from Angels in America and When We Rise to must-watch documentaries like How to Su… [+6440 chars]",1,0.686666
895,"The trials and triumph of the Fisk Jubilee Singers, one of America's great musical institutions. The creation, rise and endurance of the Fisk Jubilee Singers — many of them former slaves — is a true American triumph.. As the National Museum of African American Music opens its doors, journalists from the USA TODAY Network explore the stories, places and people who helped make music what it is today in our expansive… [+6589 chars]",1,0.716526
893,"'Nomadland': The True Story Behind the Hulu Movie. ""Nomadland"" is streaming now on Hulu, meaning viewers can finally see the Oscar-tipped movie for themselves and learn the true stories of some of the urban nomads in the film.. Nomadland has spent the last few months racking up awards, with the central performance of Frances McDormand as a woman who goes out on the road after her husband died. Apart from McDormand, however,… [+5634 chars]",1,0.719286


In [27]:
# the five articles of cluster 2 with the smallest distance value
(
    news_df
    .loc[news_df['cluster'] == 2, ['text', 'cluster', 'distance']]
    .sort_values(by='distance')
    .head(5)
)

Unnamed: 0,text,cluster,distance
394,"Amid GOP Infighting Over Trump, Republicans Agree on Opposing Biden's Agenda. President Joe Biden wanted to govern with Congress in a bipartisan manner, but that looks increasingly unlikely.. As tensions continue to simmer within the Republican Party over the role of former President Donald Trump going forward, GOP lawmakers appear to be largely aligned behind a unified agenda of opposing… [+6850 chars]",2,0.4883
575,"Live updates: Biden seeks to return attention to combating the coronavirus with televised town hall. The president plans to travel to Milwaukee on Tuesday to talk to a socially distanced audience about the pandemic and its economic fallout. It comes as he is pushing Congress to pass a $1.9 trillion relief bill.. Sen. Richard Burr (N.C.) became the latest Republican to face pushback for his guilty vote in Trumps impeachment trial, as the central committee of the North Carolina Republican Party unanimously vot… [+1602 chars]",2,0.49847
1164,"Mitch McConnell's Condemnation of Donald Trump Firmly at Odds with GOP Consensus. The Senate minority leader has faced the wrath of the former president in recent days, after stating the president held responsibility of the violence at the Capitol on January 6.. Senate Minority Leader Mitch McConnell's suggestion former President Donald Trump is ""practically and morally responsible"" for provoking the violence of January 6 stands firmly at odds with the opini… [+3182 chars]",2,0.509849
659,"North Carolina GOP censures Sen. Burr for impeachment vote. ""My party’ s leadership has chosen loyalty to one man over the core principles of the Republican Party and the founders of our great nation,” Burr said in response.. RALEIGH, N.C. The North Carolina Republican Party unanimously approved a resolution Monday to censure Sen. Richard Burr over his vote to convict former President Donald Trump during his second impeac… [+5172 chars]",2,0.535489
316,"GOP Rep. Tom Reed 'definitely looking' at challenging NY Gov. Cuomo amid nursing home controversy. Republican Rep. Tom Reed of New York says that he’s ""definitely looking"" at running next year against Gov. Andrew Cuomo when the embattled three-term Democratic governor bids for re-election.. Republican Rep. Tom Reed of New York says that hes ""definitely looking"" at running next year against Gov. Andrew Cuomo when the embattled three-term Democratic governor bids for re-election.\r\n""People… [+3674 chars]",2,0.53737


In [28]:
# the five articles of cluster 3 with the smallest distance value
(
    news_df
    .loc[news_df['cluster'] == 3, ['text', 'cluster', 'distance']]
    .sort_values(by='distance')
    .head(5)
)

Unnamed: 0,text,cluster,distance
1021,"Diablo 2 Resurrected remasters Blizzard’s classic RPG for console and PC. To close out this year’s BlizzCon opening ceremony, Blizzard announced the long-awaited Diablo 2 Resurrection, a remaster of the ARPG classic. This new version of Diablo 2 will launch later this year on both console and PC.. The long-rumored Diablo 2 makeover is confirmed Diablo 2 Resurrected will be a faithful remaster of Diablo 2 and its expansion, Lord of Destruction. Blizzard announced Resurrected in the closing mome… [+990 chars]",3,0.554347
753,"The Legend of Zelda: Skyward Sword coming to Nintendo Switch. The Legend of Zelda: Skyward Sword is coming to the Nintendo Switch as part of the 35th anniversary collection in 2020, as announced via the Nintendo Direct Feb. 17.. Nintendo is bringing The Legend of Zelda: Skyward Sword, Links high-flying 2011 Wii adventure, to the Switch later this year. The Switch port of Skyward Sword will offer a major visual upgrade over t… [+1135 chars]",3,0.562915
1010,"Blizzard Arcade Collection bundles three of the studio’s more obscure classics. During the opening ceremony for BlizzCon Online, the World of Warcraft developer announced today that it will release a new video game anthology called Blizzard Arcade Collection, a compilation of three games from the 1990s.. Available now\r\nAs part of BlizzCon Onlines opening ceremony, Blizzard Entertainment announced a new video game anthology that includes some of the earlier titles created by the World of Warcraft deve… [+876 chars]",3,0.587458
965,"Diablo II is getting remastered for console and PC. Diablo 2 is getting a new remastered edition, called Diablo II Resurrected, which was announced at Blizzcon 2021. It’s coming to PC, Switch, PS4, PS4, and Xbox Series X and S, with cross-progression between PC and console.. Diablo II is getting a big overhaul in a new remaster. After a few leaks, Blizzard officially announced Diablo II Resurrected at its online-only edition of Blizzcon 2021. The new version of the game … [+361 chars]",3,0.598774
959,"Blizzard Arcade Collection of Enhanced Retro Classics Announced. Classic retro games from Blizzard's past are being enhanced and re-released in the Blizzard Arcade Collection.. Blizzard has announced the Blizzard Arcade Collection, a set of three enhanced editions of some of Blizzards earliest, pre-Warcraft games. The collection is available today on PC and consoles.The Bli… [+2886 chars]",3,0.600433


In [29]:
# placeholder keywords column, filled with NaN for every row
news_df['keywords'] = numpy.nan

# map the clustering columns onto the names used by the legacy schema
# NOTE(review): 'distance' is the minimum of the k-means distances (smaller means
# closer), which is presumably not a percentage -- confirm that consumers of
# topic_perc_contrib expect this.
news_df.rename(
    columns={
        'cluster': 'dominant_topic',
        'distance': 'topic_perc_contrib',
    },
    inplace=True,
)

# the embedding vectors are not part of the output table
news_df.drop(columns=['embedding'], inplace=True)

In [30]:
# evaluate data to be written to bigquery
# reset the pandas display options (max_colwidth was set to None earlier for
# the cluster inspection) before previewing the frame
pandas.reset_option('display')

news_df.head()

Unnamed: 0,article_id,publishedAt,text,dominant_topic,topic_perc_contrib,keywords
0,ca2e0821-9dca-4d7a-a61a-3b7f1bc30bdf,2021-02-21 22:16:12+00:00,Paris St Germain 0-2 Monaco: Pochettino loses ...,10,0.526822,
1,ca1fa44f-3a3a-4163-970d-6502b32837a0,2021-02-21 21:22:20+00:00,Ross County 1-0 Celtic: John Hughes' side off ...,10,0.45898,
2,d34bef50-cb17-4eef-a663-d8f0bdfa6557,2021-02-21 20:56:23+00:00,Manchester United 3-1 Newcastle United: Marcus...,10,0.512203,
3,8cebfabb-b80f-4c8d-95cd-455fb7296b7a,2021-02-21 21:06:00+00:00,2021/02/21 21:00 GMT. The latest five minute n...,0,0.048618,
4,a718a782-93dc-462a-ad35-9f539b2f0d35,2021-02-21 22:12:00+00:00,Iran and the IAEA come to a temporary agreemen...,4,0.597285,


In [31]:
# BigQuery load-job settings.
# The schema lists every output column with an explicit BigQuery type, which
# helps with columns whose pandas dtype (e.g. "object") is ambiguous.
# WRITE_TRUNCATE replaces the destination table's contents with the loaded
# rows instead of appending to them.
job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField(column_name, sql_type)
        for column_name, sql_type in (
            ("article_id", bigquery.enums.SqlTypeNames.STRING),
            ("publishedAt", bigquery.enums.SqlTypeNames.TIMESTAMP),
            ("text", bigquery.enums.SqlTypeNames.STRING),
            ("dominant_topic", bigquery.enums.SqlTypeNames.INT64),
            ("topic_perc_contrib", bigquery.enums.SqlTypeNames.FLOAT64),
            ("keywords", bigquery.enums.SqlTypeNames.STRING),
        )
    ],
    write_disposition="WRITE_TRUNCATE",
)

In [32]:
# destination table for the topic assignments
table_id = "news-site-280319.topics.article_topics"

# start the load job (this makes the API request)
job = client.load_table_from_dataframe(news_df, table_id, job_config=job_config)

# block until the load job has completed
job.result()

<google.cloud.bigquery.job.LoadJob at 0x156cbabe0>

In [33]:
# check job status
# result() waits for the load job to complete and returns the job object,
# whose state is then displayed
job.result().state

'DONE'