# News Topic Modeling

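This notebook derives topics from news article text: each article is embedded as a sentence vector, and the vectors are grouped into topics with k-means clustering.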

The embedding model used here is the Universal Sentence Encoder Lite from TensorFlow Hub.

https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder_lite

In [1]:
# # change gcloud config for running locally
# !gcloud config configurations activate news-site

In [2]:
from google.cloud import bigquery

import pandas
import re
import numpy
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import tensorflow_hub as hub
import sentencepiece as spm
from sklearn.cluster import KMeans

Instructions for updating:
non-resource variables are not supported in the long term


In [3]:
# # use this approach to create local credentials
# token = !gcloud auth print-access-token
# import google.oauth2.credentials
# credentials = google.oauth2.credentials.Credentials(token[0])
# client = bigquery.Client(credentials=credentials)

In [4]:
# No explicit credentials are passed here; when running locally, build the
# client with the credentials object from the commented-out cell above
# (bigquery.Client(credentials=credentials)).
client = bigquery.Client()

# Articles from the last 7 days that have a title, description and content;
# the three fields are concatenated into a single `text` column.
sql = """
    SELECT 
        article_id,
        publishedAt,
        CONCAT(title, '. ', description, '. ', content) AS text
    FROM `news-site-280319.news.articles`
    WHERE
      title IS NOT NULL
      AND description IS NOT NULL
      AND content IS NOT NULL
      AND DATE(publishedAt) >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
"""

# Submit the query, then materialize its result as a pandas DataFrame.
query_job = client.query(sql)
df = query_job.to_dataframe()

# Summary of column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   article_id   999 non-null    object             
 1   publishedAt  999 non-null    datetime64[ns, UTC]
 2   text         999 non-null    object             
dtypes: datetime64[ns, UTC](1), object(2)
memory usage: 23.5+ KB


In [5]:
# Work on a copy: later cells add, rename and drop columns on news_df, and a
# plain alias would apply those changes to the raw query result `df` as well.
news_df = df.copy()
news_df.head()

Unnamed: 0,article_id,publishedAt,text
0,c2b2b5d8-9b26-4c9f-8f43-4521889be456,2021-08-24 10:47:42+00:00,South African Unemployment Rate Rises to Highe...
1,b7ffdf3f-fd51-4fdf-8426-90f21e7a2267,2021-08-24 10:53:28+00:00,Afghan basketball star welcomed by new team in...
2,aea7857a-6e9a-43a6-96a8-5a52de44e6de,2021-08-24 11:24:21+00:00,How are women in Afghanistan reacting to Talib...
3,d9c15bd2-d69e-4ddb-bf38-e8cf686389f1,2021-08-24 11:21:35+00:00,Afghan women footballers evacuated by Australi...
4,444a5117-e1fe-40ea-b297-5c9929531dd1,2021-08-24 11:34:53+00:00,Housing inventories continue rebound with July...


In [6]:
# Load the Universal Sentence Encoder Lite (version 2) module from TensorFlow Hub.
module = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-lite/2")

In [7]:
# Sparse int64 placeholder that receives the SentencePiece token ids.
input_placeholder = tf.sparse_placeholder(tf.int64, shape=[None, None])

# The module is called with the three components of the sparse tensor
# passed as separate named inputs.
sparse_components = {
    "values": input_placeholder.values,
    "indices": input_placeholder.indices,
    "dense_shape": input_placeholder.dense_shape,
}
encodings = module(inputs=sparse_components)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [8]:
# Ask the hub module for the path of its SentencePiece model file.
with tf.Session() as sess:
    spm_path_op = module(signature="spm_path")
    spm_path = sess.run(spm_path_op)

# Read the serialized model from that path and load it into the processor.
sp = spm.SentencePieceProcessor()
with tf.io.gfile.GFile(spm_path, mode="rb") as f:
    serialized_model = f.read()
sp.LoadFromSerializedProto(serialized_model)
print("SentencePiece model loaded at {}.".format(spm_path))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


SentencePiece model loaded at b'/var/folders/99/_fcjbs5d08ndbt_1__blsphm0000gn/T/tfhub_modules/539544f0a997d91c327c23285ea00c37588d92cc/assets/universal_encoder_8k_spm.model'.


In [9]:
def process_to_IDs_in_sparse_format(sp, sentences):
    """Encode sentences with a sentence piece processor into a sparse layout.

    The result uses a tf.SparseTensor-like format: (values, indices, dense_shape).

    Args:
        sp: Object exposing ``EncodeAsIds(text)``, which returns a list of
            token ids for one sentence (e.g. a loaded SentencePieceProcessor).
        sentences: Iterable of strings to encode.

    Returns:
        A ``(values, indices, dense_shape)`` tuple where ``values`` is the flat
        list of token ids in row-major order, ``indices`` holds the matching
        ``[row, col]`` position of every value, and ``dense_shape`` is
        ``(number of sentences, longest encoded length)``. Empty input yields
        ``([], [], (0, 0))``.
    """
    ids = [sp.EncodeAsIds(sentence) for sentence in sentences]
    # default=0 keeps an empty input from raising
    # "max() arg is an empty sequence".
    max_len = max((len(row_ids) for row_ids in ids), default=0)
    dense_shape = (len(ids), max_len)
    values = [token for row_ids in ids for token in row_ids]
    indices = [
        [row, col]
        for row, row_ids in enumerate(ids)
        for col in range(len(row_ids))
    ]
    return (values, indices, dense_shape)

In [10]:
# Encode the article text column into the sparse
# (values, indices, dense_shape) triple expected by the encoder inputs.
messages = news_df.loc[:, 'text']
sparse_inputs = process_to_IDs_in_sparse_format(sp, messages)
values, indices, dense_shape = sparse_inputs

In [11]:
# Generate embeddings: feed the sparse token ids through the encoder graph.
feed = {
    input_placeholder.values: values,
    input_placeholder.indices: indices,
    input_placeholder.dense_shape: dense_shape,
}
with tf.Session() as session:
    # Variables and lookup tables are initialized before the encoder runs.
    init_ops = [tf.global_variables_initializer(), tf.tables_initializer()]
    session.run(init_ops)
    message_embeddings = session.run(encodings, feed_dict=feed)

In [12]:
# Convert message_embeddings to nested lists so that each row of the frame
# holds its own embedding in a single column; the column is stacked back into
# a 2-D array before it is passed to sklearn k-means.
news_df['embedding'] = message_embeddings.tolist()

In [13]:
# Preview the frame now that the embedding column has been added.
news_df.head()

Unnamed: 0,article_id,publishedAt,text,embedding
0,c2b2b5d8-9b26-4c9f-8f43-4521889be456,2021-08-24 10:47:42+00:00,South African Unemployment Rate Rises to Highe...,"[-0.015288193710148335, -0.056386541575193405,..."
1,b7ffdf3f-fd51-4fdf-8426-90f21e7a2267,2021-08-24 10:53:28+00:00,Afghan basketball star welcomed by new team in...,"[0.018410377204418182, 0.049869049340486526, -..."
2,aea7857a-6e9a-43a6-96a8-5a52de44e6de,2021-08-24 11:24:21+00:00,How are women in Afghanistan reacting to Talib...,"[0.022476550191640854, 0.021421032026410103, 0..."
3,d9c15bd2-d69e-4ddb-bf38-e8cf686389f1,2021-08-24 11:21:35+00:00,Afghan women footballers evacuated by Australi...,"[0.037078578025102615, 0.05098992958664894, -0..."
4,444a5117-e1fe-40ea-b297-5c9929531dd1,2021-08-24 11:34:53+00:00,Housing inventories continue rebound with July...,"[-0.03148503229022026, 0.03165409713983536, -0..."


In [14]:
# Stack the per-article embedding lists into one 2-D array
# (one row per article) for the k-means model.
embedding_rows = news_df['embedding'].tolist()
X = numpy.vstack(embedding_rows)
X.shape

(999, 512)

In [15]:
# Fit k-means on the embeddings and keep the cluster label of every article.
# random_state is fixed so repeated runs on the same data give the same clusters.
kmeans = KMeans(
    n_clusters=20,
    init='k-means++',
    max_iter=100,
    random_state=0,
)
y_kmeans = kmeans.fit_predict(X)

In [16]:
# Add cluster predictions to the dataframe; y_kmeans was computed from X,
# which was stacked from news_df['embedding'], so rows line up by position.
news_df['cluster'] = y_kmeans

In [17]:
# Add the distance to the nearest cluster center.
# transform() reuses the model fitted above; fit_transform() would run the
# whole clustering a second time just to obtain the distances.
news_df['distance'] = numpy.min(kmeans.transform(X), axis=1)

In [18]:
# Show the full article text instead of a truncated preview.
pandas.set_option('display.max_colwidth', None)

# The five articles of cluster 1 with the smallest distance value.
in_cluster = news_df['cluster'] == 1
news_df.loc[in_cluster, ['text', 'cluster', 'distance']].sort_values(by='distance').head(5)

Unnamed: 0,text,cluster,distance
613,"9 takeaways from the Bears’ 18th training camp practice. From a healthy offensive line to Justin Fields getting some reps with starting receivers, here's what we learned from Monday's practice.. Following their preseason loss to the Buffalo Bills, the Chicago Bears returned to the practice field Monday at Halas Hall for a non-padded practice to kick off their final week of preseason.\r\nThere … [+8325 chars]",1,0.439975
344,"Falcons QB AJ McCarron out for season with ACL injury. With rookie Feleipe Franks as the lone quarterback behind Matt Ryan, the Falcons are in the market for a third QB.. The Atlanta Falcons were limited to just one quarterback for the majority of Saturdays preseason game against the Dolphins after AJ McCarron went down with an apparent knee injury.\r\nMcCarron was in o… [+691 chars]",1,0.459337
900,"Packers preseason highlights: Oren Burks flashes potential against Texans. The Packers got an encouraging performance from LB Oren Burks to open the preseason.. Green Bay Packers inside linebacker Oren Burks is off to a strong start in his fourth preseason. Burks jumped off the screen against the Houston Texans, totaling a team-high seven tackles, two tackle… [+2626 chars]",1,0.462438
328,"Eagles Draft news: Carson Wentz returns to Colts’ practice as he trends towards playing Week 1. Eagles Draft news: Carson Wentz returns to Colts' practice as he trends towards playing Week 1. While the Indianapolis Colts deal with the pressure to make a Super Bowl run, the Eagles are banking on quarterback Carson Wentz playing well and deep into the playoffs.\r\nAfter some fears that Wentz … [+882 chars]",1,0.463118
985,"Packers trade Josh Jackson to Giants for Isaac Yiadom in cornerback swap. The Packers are parting ways with Josh Jackson, sending the fourth-year cornerback to the Giants for cornerback Isaac Yiadom.. GREEN BAY - The Green Bay Packers are parting ways with Josh Jackson, sending the fourth-year cornerback to the New York Giants for cornerback Isaac Yiadom, a person with knowledge of the deal confir… [+3587 chars]",1,0.470674


In [19]:
# The five articles of cluster 2 with the smallest distance value.
in_cluster = news_df['cluster'] == 2
news_df.loc[in_cluster, ['text', 'cluster', 'distance']].sort_values(by='distance').head(5)

Unnamed: 0,text,cluster,distance
36,"Two Men Arrested After Woman's Body Found in Trunk of Car. David Manuel Mestas and Richard Anthony Sanchez were arrested on Sunday afternoon while allegedly fleeing to Las Vegas, Nevada, from South Dakota.. Two men have been arrested in Utah after a woman was found dead in the trunk of a car parked at a gas station with a gunshot wound to the head.\r\nDavid Manuel Mestas and Richard Anthony Sanchez, who a… [+2902 chars]",2,0.543041
147,"Texas woman accused of setting deadly fire was previously charged with another murder. A Texas woman accused of setting a man on fire and is charged with another murder remains at large, police said.. A Texas woman accused of setting a man on fire and killing him remains at large, police said, and it's not the first time she's been accused of a deadly crime. \r\nEmma Presler, 19, is charged with mur… [+1329 chars]",2,0.562615
953,"Georgia Man Arrested After Threatening to Shoot Judge During Livestream. The man made his threats to county officials and law enforcement in a 24-minute video live-streamed to social media.. A Georgia man has been arrested for threatening the life of a judge during a video live-streamed to social media. Micquel Deandre Gay, 36, was taken into police custody on Monday after threatening to… [+2786 chars]",2,0.573833
661,"California Police Officer Accidentally Kills Bystander After Attempting to Shoot at Suspect. ""The suspect with the felony warrant was ultimately arrested at the scene. Neither the officers nor the suspect were injured,"" the Santa Barbara County Sheriff's Office said in the press release.. A police officer in California accidentally killed a bystander while attempting to shoot at a suspect over the past weekend.\r\nAccording to a press release from the Santa Barbara County Sheriff's Offi… [+2784 chars]",2,0.576131
482,Man arrested over Birmingham Gay Village attack. Two men needed hospital treatment after the attack which began with abuse from inside a car.. image captionRob and Patrick faced homophobic abuse and were hit with bottles during a night out in Birmingham\r\nA man has been arrested in connection with a violent homophobic attack in Birmingham.\r\n… [+1226 chars],2,0.583191


In [20]:
# The five articles of cluster 3 with the smallest distance value.
in_cluster = news_df['cluster'] == 3
news_df.loc[in_cluster, ['text', 'cluster', 'distance']].sort_values(by='distance').head(5)

Unnamed: 0,text,cluster,distance
391,"The Last Bus: Timothy Spall film gives teen busker big break. Caitlin Agnew was just 16 when two of her songs were picked to feature on the soundtrack for The Last Bus.. image sourceCaitlin Agnew\r\nimage captionCaitlin Agnew sings and plays guitar, ukulele and keyboard\r\nA Scottish teenage busker is celebrating after two of her songs were selected for the soundtrack of… [+3075 chars]",3,0.674235
405,"Henri Cuts Short Manilow Set At New York Virus Recovery Concert. New York (AP) -- This time, Barry Manilow didn’t make it through the rain.. New York (AP) -- This time, Barry Manilow didnt make it through the rain.\r\nUnlike the Grammy-winning recording artists 1980 hit, I Made It Through Rain, the superstar-laden Homecoming Concert in New … [+4936 chars]",3,0.684552
938,"Madonna celebrates 63rd birthday in Italy with boyfriend and kids. Madonna was glammed out in a multi-colored dress and heels as she ate ice cream and mingled with friends.. Madonna celebrated her 63rd birthday with her boyfriend Ahlamalik Williams and her kids in Italy this week.\r\n""Let the birthday games begin,"" the musician captioned a photo album on social media. \r\nMa… [+2004 chars]",3,0.700668
73,"Every Taylor Swift Era Featured In Her TikTok Debut. Taylor Swift has been rerecording her old albums with all new versions, and used TikTok to make an important announcement.. Taylor Swift has arrived on TikTok, and has already amassed more than one million followers in just a few hours. Her debut video has shown her in various Swift eras, harking back to her music of old.… [+3095 chars]",3,0.70962
477,"Henri cuts short Manilow set at NYC virus recovery concert. NEW YORK (AP) — This time, Barry Manilow didn’t make it through the rain.. Mayor Bill de Blasio later tweeted, While its disappointing that tonights concert had to end early, the safety of everyone in attendance had to come first.\r\nThe highly promoted mega-concert featured … [+1248 chars]",3,0.724578


In [21]:
# Build the frame that is written to BigQuery as a new object instead of
# mutating in place, so this cell can be re-run safely: an in-place drop raises
# KeyError on the second run because 'embedding' has already been removed.
news_df = (
    news_df
    # temporary placeholder column for keywords (all NaN)
    .assign(keywords=numpy.nan)
    # rename to fit the legacy schema; NOTE: topic_perc_contrib holds the
    # k-means distance value here, not a percentage
    .rename(columns={'cluster': 'dominant_topic', 'distance': 'topic_perc_contrib'})
    # the embedding column is not part of the output table
    .drop(columns=['embedding'], errors='ignore')
)

In [22]:
# Evaluate the data to be written to BigQuery.
# Reset the pandas options matching 'display' first, which undoes the
# display.max_colwidth override set for the cluster inspection cells.
pandas.reset_option('display')

news_df.head()

Unnamed: 0,article_id,publishedAt,text,dominant_topic,topic_perc_contrib,keywords
0,c2b2b5d8-9b26-4c9f-8f43-4521889be456,2021-08-24 10:47:42+00:00,South African Unemployment Rate Rises to Highe...,15,0.75393,
1,b7ffdf3f-fd51-4fdf-8426-90f21e7a2267,2021-08-24 10:53:28+00:00,Afghan basketball star welcomed by new team in...,18,0.635603,
2,aea7857a-6e9a-43a6-96a8-5a52de44e6de,2021-08-24 11:24:21+00:00,How are women in Afghanistan reacting to Talib...,18,0.713864,
3,d9c15bd2-d69e-4ddb-bf38-e8cf686389f1,2021-08-24 11:21:35+00:00,Afghan women footballers evacuated by Australi...,18,0.683186,
4,444a5117-e1fe-40ea-b297-5c9929531dd1,2021-08-24 11:34:53+00:00,Housing inventories continue rebound with July...,15,0.780097,


In [23]:
# Configure the BigQuery load job.
sql_types = bigquery.enums.SqlTypeNames

# Explicit column types for the output table. Several of these columns use the
# pandas dtype "object", whose BigQuery type would otherwise be ambiguous.
column_types = [
    ("article_id", sql_types.STRING),
    ("publishedAt", sql_types.TIMESTAMP),
    ("text", sql_types.STRING),
    ("dominant_topic", sql_types.INT64),
    ("topic_perc_contrib", sql_types.FLOAT64),
    ("keywords", sql_types.STRING),
]

job_config = bigquery.LoadJobConfig(
    schema=[bigquery.SchemaField(name, sql_type) for name, sql_type in column_types],
    # BigQuery appends loaded rows to an existing table by default; with the
    # WRITE_TRUNCATE disposition it replaces the table with the loaded data.
    write_disposition="WRITE_TRUNCATE",
)

In [24]:
# Destination table for the topic assignments.
table_id = "news-site-280319.topics.article_topics"

# Start the load job (this makes an API request).
job = client.load_table_from_dataframe(news_df, table_id, job_config=job_config)

# Wait for the job to complete.
job.result()

<google.cloud.bigquery.job.LoadJob at 0x155370820>

In [25]:
# Check job status: result() waits for the job to complete, then its state is read.
job.result().state

'DONE'