# News Topic Modeling

This notebook derives topics from recent news articles by embedding each article's text and clustering the embeddings with k-means.

The embedding model is the Universal Sentence Encoder Lite from TensorFlow Hub:

https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder_lite

In [None]:
# # change gcloud config for running locally
# !gcloud config configurations activate news-site

In [2]:
from google.cloud import bigquery

import pandas
import re
import numpy
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import tensorflow_hub as hub
import sentencepiece as spm
from sklearn.cluster import KMeans

Instructions for updating:
non-resource variables are not supported in the long term


In [3]:
# # use this approach to create local credentials
# token = !gcloud auth print-access-token
# import google.oauth2.credentials
# credentials = google.oauth2.credentials.Credentials(token[0])
# client = bigquery.Client(credentials=credentials)

In [5]:
# Default client: credentials are inferred from the environment.
# When running locally, build the client with explicit credentials instead:
#   client = bigquery.Client(credentials=credentials)
client = bigquery.Client()

# Articles published in the last 7 days that have a title, a description and
# content; the three fields are concatenated into a single `text` column.
sql = """
    SELECT 
        article_id,
        publishedAt,
        CONCAT(title, '. ', description, '. ', content) AS text
    FROM `news-site-280319.news.articles`
    WHERE
      title IS NOT NULL
      AND description IS NOT NULL
      AND content IS NOT NULL
      AND DATE(publishedAt) >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
"""

# Run the query and pull the result set into a pandas DataFrame.
query_job = client.query(sql)
df = query_job.to_dataframe()

# Summarise column dtypes and non-null counts of the fetched frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   article_id   200 non-null    object             
 1   publishedAt  200 non-null    datetime64[ns, UTC]
 2   text         200 non-null    object             
dtypes: datetime64[ns, UTC](1), object(2)
memory usage: 4.8+ KB


In [6]:
# Work on a copy so the raw query result in `df` is not modified by the
# columns that are added, renamed or dropped further down in the notebook.
news_df = df.copy()
news_df.head()

Unnamed: 0,article_id,publishedAt,text
0,c55b6133-5054-4344-985f-f8d6200b5924,2022-06-18 21:20:32+00:00,Canadian Grand Prix: Max Verstappen beats Fern...
1,95374ef9-8b76-483d-97bd-1f1027624c0c,2022-06-18 20:00:00+00:00,U.S. Open leaders greeted by tough scoring con...
2,b2ef6f94-a502-46c1-b063-2cc5280f8f43,2022-06-18 20:47:00+00:00,ECB won't solve profound debt issues: Rehn - R...
3,de62c664-c660-46d3-8fb1-613d6e74859f,2022-06-18 21:15:00+00:00,N.Korea reports 19310 new fever cases amid COV...
4,e396de49-ab4d-45c5-8c40-508a4fd31b50,2022-06-18 18:58:27+00:00,Premiership final: Leicester Tigers title a dr...


In [7]:
# TF Hub handle of the Universal Sentence Encoder Lite module (version 2).
USE_LITE_MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder-lite/2"
module = hub.Module(USE_LITE_MODULE_URL)

In [8]:
# Sparse int64 placeholder that is later fed with the SentencePiece token ids.
input_placeholder = tf.sparse_placeholder(tf.int64, shape=[None, None])

# The module receives the sparse tensor as its three separate components.
sparse_inputs = {
    "values": input_placeholder.values,
    "indices": input_placeholder.indices,
    "dense_shape": input_placeholder.dense_shape,
}
encodings = module(inputs=sparse_inputs)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [9]:
# Ask the hub module for the location of its bundled SentencePiece model.
with tf.Session() as sess:
    spm_path = sess.run(module(signature="spm_path"))

# Load the SentencePiece model from that file.
sp = spm.SentencePieceProcessor()
with tf.io.gfile.GFile(spm_path, mode="rb") as f:
    sp.LoadFromSerializedProto(f.read())

# The path comes back from sess.run as bytes; decode it so the message shows
# the plain path instead of a b'...' literal.
spm_path_text = spm_path.decode("utf-8") if isinstance(spm_path, bytes) else spm_path
print("SentencePiece model loaded at {}.".format(spm_path_text))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


SentencePiece model loaded at b'/var/folders/99/_fcjbs5d08ndbt_1__blsphm0000gn/T/tfhub_modules/539544f0a997d91c327c23285ea00c37588d92cc/assets/universal_encoder_8k_spm.model'.


In [10]:
def process_to_IDs_in_sparse_format(sp, sentences):
    """Encode sentences with a SentencePiece processor into sparse components.

    Args:
        sp: Object exposing ``EncodeAsIds(text)``, which returns the sequence
            of token ids for one sentence.
        sentences: Iterable of strings to encode.

    Returns:
        A ``(values, indices, dense_shape)`` tuple laid out like the
        components of a ``tf.SparseTensor``:

        * ``values``: all token ids, flattened sentence by sentence.
        * ``indices``: one ``[row, col]`` pair per entry of ``values``, where
          ``row`` is the sentence position and ``col`` the token position.
        * ``dense_shape``: ``(number of sentences, longest sentence length)``;
          ``(0, 0)`` when ``sentences`` is empty.
    """
    ids = [sp.EncodeAsIds(sentence) for sentence in sentences]
    # default=0 keeps an empty batch from raising inside max().
    max_len = max((len(tokens) for tokens in ids), default=0)
    dense_shape = (len(ids), max_len)
    values = [token for tokens in ids for token in tokens]
    indices = [
        [row, col]
        for row, tokens in enumerate(ids)
        for col in range(len(tokens))
    ]
    return (values, indices, dense_shape)

In [11]:
# Tokenise the concatenated article text into the three sparse components
# that are fed to the encoder placeholder.
messages = news_df['text']
sparse_ids = process_to_IDs_in_sparse_format(sp, messages)
values, indices, dense_shape = sparse_ids

In [12]:
# Run the encoder once over the whole batch of tokenised articles.
feed = {
    input_placeholder.values: values,
    input_placeholder.indices: indices,
    input_placeholder.dense_shape: dense_shape,
}
with tf.Session() as session:
    # Initialise variables and lookup tables before evaluating the encoder.
    init_ops = [tf.global_variables_initializer(), tf.tables_initializer()]
    session.run(init_ops)
    message_embeddings = session.run(encodings, feed_dict=feed)

In [13]:
# store each embedding as a plain Python list so that one article's vector
# fits in a single dataframe cell
news_df['embedding'] = message_embeddings.tolist()

In [14]:
# preview the frame with the new embedding column
news_df.head()

Unnamed: 0,article_id,publishedAt,text,embedding
0,c55b6133-5054-4344-985f-f8d6200b5924,2022-06-18 21:20:32+00:00,Canadian Grand Prix: Max Verstappen beats Fern...,"[-0.06367803364992142, 0.048824556171894073, 0..."
1,95374ef9-8b76-483d-97bd-1f1027624c0c,2022-06-18 20:00:00+00:00,U.S. Open leaders greeted by tough scoring con...,"[0.02407379075884819, 0.020062070339918137, -0..."
2,b2ef6f94-a502-46c1-b063-2cc5280f8f43,2022-06-18 20:47:00+00:00,ECB won't solve profound debt issues: Rehn - R...,"[0.018625274300575256, 0.058462224900722504, 0..."
3,de62c664-c660-46d3-8fb1-613d6e74859f,2022-06-18 21:15:00+00:00,N.Korea reports 19310 new fever cases amid COV...,"[0.06288798153400421, -0.022045912221074104, -..."
4,e396de49-ab4d-45c5-8c40-508a4fd31b50,2022-06-18 18:58:27+00:00,Premiership final: Leicester Tigers title a dr...,"[-0.011827737092971802, 0.046899907290935516, ..."


In [15]:
# Stack the per-article embedding lists into one 2-D array (one row per
# article) as input for the k-means model.
embedding_rows = news_df['embedding'].tolist()
X = numpy.vstack(embedding_rows)
X.shape

(200, 512)

In [16]:
# Cluster the article embeddings with k-means.
N_CLUSTERS = 20
# k-means cannot fit more clusters than there are samples, so cap the count
# for runs in which the query returns only a few articles.
n_clusters = min(N_CLUSTERS, len(X))
kmeans = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    n_init=10,  # pinned: the library default differs between scikit-learn releases
    max_iter=100,
    random_state=0,  # fixed seed so cluster assignments are reproducible
)
# Fit the model and get one cluster label per row of X.
y_kmeans = kmeans.fit_predict(X)

In [17]:
# add cluster predictions to dataframe (one label per row, in the same row
# order as the embeddings that were stacked into X)
news_df['cluster'] = y_kmeans

In [18]:
# Distance from each article to its nearest cluster centre. Use transform()
# on the already-fitted model: fit_transform() would run the clustering a
# second time just to obtain the distances.
news_df['distance'] = kmeans.transform(X).min(axis=1)

In [19]:
# Show the full article text, then list the five cluster-1 articles that lie
# closest to their cluster centre.
pandas.set_option('display.max_colwidth', None)
in_cluster = news_df['cluster'] == 1
news_df.loc[in_cluster, ['text', 'cluster', 'distance']].sort_values(by=['distance']).head(5)

Unnamed: 0,text,cluster,distance
65,"Russia Has Already 'Strategically Lost' Ukraine War: UK Defense Chief. Sir Tony Radakin said Friday that ""Russia will never take control of Ukraine,"" calling it a ""dreadful mistake."". Russian President Vladimir Putin has already ""strategically lost"" the war against Ukraine, British Defense Chief Sir Tony Radakin said Friday.\r\nThe Kremlin ordered the invasion in late February with … [+2883 chars]",1,0.397631
165,"Russia Has Already 'Strategically Lost' Ukraine War: UK Defense Chief. Sir Tony Radakin said Friday that ""Russia will never take control of Ukraine,"" calling it a ""dreadful mistake."". Russian President Vladimir Putin has already ""strategically lost"" the war against Ukraine, British Defense Chief Sir Tony Radakin said Friday.\r\nThe Kremlin ordered the invasion in late February with … [+2883 chars]",1,0.397631
44,"Ukraine War: Russia Expert Warns of Putin Pressuring Belarus to Add Troops. Although not directly involved in the invasion, Belarus has allowed Russian forces to operate on its land, acting as an entry point into Ukraine.. Belarusian troops could be on the verge of deploying into Ukraine, said one expert on Russia on Saturday.\r\nThe Belarusian military is scheduled to hold mobilization training exercises this month and … [+2712 chars]",1,0.417517
142,"Ukraine War: Russia Expert Warns of Putin Pressuring Belarus to Add Troops. Although not directly involved in the invasion, Belarus has allowed Russian forces to operate on its land, acting as an entry point into Ukraine.. Belarusian troops could be on the verge of deploying into Ukraine, said one expert on Russia on Saturday.\r\nThe Belarusian military is scheduled to hold mobilization training exercises this month and … [+2712 chars]",1,0.417517
87,"Ukraine war: Zelensky visits front-line cities of Mykolaiv and Odesa. Ukraine's president makes a rare trip outside the capital to visit bomb-damaged Mykolaiv and Odesa.. The president of Ukraine has made a rare trip outside of the capital Kyiv to visit the southern cities of Mykolaiv and Odesa.\r\nVolodymyr Zelensky inspected damaged buildings and met soldiers, officia… [+2586 chars]",1,0.486934


In [20]:
# Five cluster-2 articles with the smallest distance to their cluster centre.
cluster_2_rows = news_df[news_df['cluster'] == 2]
cluster_2_rows[['text', 'cluster', 'distance']].sort_values(by=['distance']).head(5)

Unnamed: 0,text,cluster,distance
2,"ECB won't solve profound debt issues: Rehn - Reuters.com. The European Central Bank should limit the rise in borrowing costs of more indebted euro zone members but will not solve their debt issues or let budget concerns dictate monetary policy, ECB policymaker Olli Rehn said on Saturday.. FRANKFURT, June 18 (Reuters) - The European Central Bank should limit the rise in borrowing costs of more indebted euro zone members but will not solve their debt issues or let budget concerns dictat… [+2204 chars]",2,0.469238
101,"ECB won't solve profound debt issues: Rehn - Reuters.com. The European Central Bank should limit the rise in borrowing costs of more indebted euro zone members but will not solve their debt issues or let budget concerns dictate monetary policy, ECB policymaker Olli Rehn said on Saturday.. FRANKFURT, June 18 (Reuters) - The European Central Bank should limit the rise in borrowing costs of more indebted euro zone members but will not solve their debt issues or let budget concerns dictat… [+2204 chars]",2,0.469238
75,"Waller Backs 75 Basis-Point July Hike, Says Fed Is 'All In'. Federal Reserve Governor Christopher Waller said he would support another 75-basis-point rate increase at the central bank’s July meeting should economic data come in as he expects.. Sign up for the New Economy Daily newsletter, follow us @economics and subscribe to our podcast.\r\nFederal Reserve Governor Christopher Waller said he would support another 75-basis-point rate increas… [+2796 chars]",2,0.48936
174,"Waller Backs 75 Basis-Point July Hike, Says Fed Is 'All In'. Federal Reserve Governor Christopher Waller said he would support another 75-basis-point rate increase at the central bank’s July meeting should economic data come in as he expects.. Sign up for the New Economy Daily newsletter, follow us @economics and subscribe to our podcast.\r\nFederal Reserve Governor Christopher Waller said he would support another 75-basis-point rate increas… [+2796 chars]",2,0.48936
72,"ECB's Rehn Underscores Commitment to Contain Bond-Market Panic. The European Central Bank intends to ensure that its monetary policy is transmitted equally across the euro zone’s 19 member-states by preventing undue turbulence on government bond markets, according to Governing Council member Olli Rehn.. Sign up for the New Economy Daily newsletter, follow us @economics and subscribe to our podcast.\r\nThe European Central Bank intends to ensure that its monetary policy is transmitted equally across th… [+2561 chars]",2,0.490171


In [21]:
# Five cluster-3 articles with the smallest distance to their cluster centre.
cluster_3_rows = news_df[news_df['cluster'] == 3]
cluster_3_rows[['text', 'cluster', 'distance']].sort_values(by=['distance']).head(5)

Unnamed: 0,text,cluster,distance
141,Texans’ Davis Mills places No. 35 in Chris Simms’ QB rankings. Houston Texans quarterback Davis Mills ranked No. 35 in the latest Chris Simms top-40 QB list.. The Houston Texans are placing it all in the hands of Davis Mills in 2022.\r\nThe second-year quarterback has had no competition either from the teams free agent signings or the teams draft class. Kyle… [+2024 chars],3,0.446484
43,Texans’ Davis Mills places No. 35 in Chris Simms’ QB rankings. Houston Texans quarterback Davis Mills ranked No. 35 in the latest Chris Simms top-40 QB list.. The Houston Texans are placing it all in the hands of Davis Mills in 2022.\r\nThe second-year quarterback has had no competition either from the teams free agent signings or the teams draft class. Kyle… [+2024 chars],3,0.446484
57,"Dolphins grab two impact players in Draft Wire’s early 2023 mock draft. A look at the top talent in next year's draft.. With the Miami Dolphins on break between OTAs and training camp, its important to look to the future.\r\nThe Miami Dolphins, due to the trade with the San Francisco 49ers ahead of the 2021 NFL draft, h… [+1710 chars]",3,0.531848
157,"Dolphins grab two impact players in Draft Wire’s early 2023 mock draft. A look at the top talent in next year's draft.. With the Miami Dolphins on break between OTAs and training camp, its important to look to the future.\r\nThe Miami Dolphins, due to the trade with the San Francisco 49ers ahead of the 2021 NFL draft, h… [+1710 chars]",3,0.531848
55,"3 teams the Sooners need to play in the Brent Venables era. The Oklahoma Sooners have a few schools they need to go up against in the Brent Venables era.. College football is home to some of the best storylines in American sports. When Lincoln Riley left Oklahoma for USC and Brent Venables replaced him, that was yet another juicy bit of drama in a spor… [+4690 chars]",3,0.541064


In [22]:
# Shape the frame to the legacy topics schema in one non-mutating chain, so
# the cell can be re-run without failing on an already-dropped column.
news_df = (
    news_df
    # placeholder column: no keywords are derived in this notebook
    .assign(keywords=numpy.nan)
    # legacy names: cluster -> dominant_topic, distance -> topic_perc_contrib
    .rename(columns={'cluster': 'dominant_topic', 'distance': 'topic_perc_contrib'})
    # drop the embedding column; errors='ignore' tolerates it being gone already
    .drop(columns=['embedding'], errors='ignore')
)

In [23]:
# evaluate data to be written to bigquery
# reset pandas display options to their defaults (this also undoes the
# max_colwidth override set for the cluster inspection cells)
pandas.reset_option('display')

news_df.head()

Unnamed: 0,article_id,publishedAt,text,dominant_topic,topic_perc_contrib,keywords
0,c55b6133-5054-4344-985f-f8d6200b5924,2022-06-18 21:20:32+00:00,Canadian Grand Prix: Max Verstappen beats Fern...,4,0.74736,
1,95374ef9-8b76-483d-97bd-1f1027624c0c,2022-06-18 20:00:00+00:00,U.S. Open leaders greeted by tough scoring con...,13,0.655995,
2,b2ef6f94-a502-46c1-b063-2cc5280f8f43,2022-06-18 20:47:00+00:00,ECB won't solve profound debt issues: Rehn - R...,2,0.469238,
3,de62c664-c660-46d3-8fb1-613d6e74859f,2022-06-18 21:15:00+00:00,N.Korea reports 19310 new fever cases amid COV...,12,0.626386,
4,e396de49-ab4d-45c5-8c40-508a4fd31b50,2022-06-18 18:58:27+00:00,Premiership final: Leicester Tigers title a dr...,13,0.603693,


In [24]:
# Configure the BigQuery load job.
# Explicit column types for the destination table: pandas "object" columns
# such as "text" are ambiguous, so the types are spelled out here.
sql_types = bigquery.enums.SqlTypeNames
column_types = [
    ("article_id", sql_types.STRING),
    ("publishedAt", sql_types.TIMESTAMP),
    ("text", sql_types.STRING),
    ("dominant_topic", sql_types.INT64),
    ("topic_perc_contrib", sql_types.FLOAT64),
    ("keywords", sql_types.STRING),
]
job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField(column, sql_type)
        for column, sql_type in column_types
    ],
    # WRITE_TRUNCATE replaces the table with the loaded data instead of
    # appending the rows to it.
    write_disposition="WRITE_TRUNCATE",
)

In [25]:
# Upload the topic assignments to the destination table (this issues the API
# request), then block until the load job has completed.
table_id = "news-site-280319.topics.article_topics"
job = client.load_table_from_dataframe(news_df, table_id, job_config=job_config)
job.result()

<google.cloud.bigquery.job.LoadJob at 0x15382ca30>

In [26]:
# check job status; the job was already awaited in the previous cell
job.result().state

'DONE'