# News Topic Modeling

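This notebook derives topics from recent news articles: it loads article text from BigQuery, embeds it with the Universal Sentence Encoder, clusters the embeddings with k-means, and writes each article's cluster assignment back to BigQuery.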


In [1]:
# Activate the `news-site` gcloud configuration so the later gcloud call in this notebook uses it.
!gcloud config configurations activate news-site

Activated [news-site].


In [2]:
from google.cloud import bigquery

import pandas
import re
import numpy
import matplotlib.pyplot as plt
from pprint import pprint

In [3]:
# Ask the gcloud CLI for an access token; the `!` capture yields the command's output lines as a list.
token = !gcloud auth print-access-token
import google.oauth2.credentials
# Wrap the first output line (the token) in a Credentials object for the BigQuery client.
# NOTE(review): presumably no refresh token is attached, so this stops working once the token expires — confirm.
credentials = google.oauth2.credentials.Credentials(token[0])

In [4]:
# BigQuery client authenticated with the credentials built from the gcloud access token.
client = bigquery.Client(credentials=credentials)
# Select articles published in the last 14 days that have a title, description and content,
# and join those three fields into a single `text` column.
sql = """
    SELECT 
        article_id,
        publishedAt,
        CONCAT(title, '. ', description, '. ', content) AS text
    FROM `news-site-280319.news.articles`
    WHERE
      title IS NOT NULL
      AND description IS NOT NULL
      AND content IS NOT NULL
      AND DATE(publishedAt) >= DATE_SUB(CURRENT_DATE(), INTERVAL 14 DAY)
"""

# Run the query and materialise the full result as a pandas DataFrame.
df = client.query(sql).to_dataframe()

# Summarise row count, dtypes and null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2570 entries, 0 to 2569
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   article_id   2570 non-null   object             
 1   publishedAt  2570 non-null   datetime64[ns, UTC]
 2   text         2570 non-null   object             
dtypes: datetime64[ns, UTC](1), object(2)
memory usage: 60.4+ KB


In [5]:
# Preview the first rows of the raw query result.
df.head()

Unnamed: 0,article_id,publishedAt,text
0,f8993822-3164-4478-bacb-643a476fd8f3,2021-02-14 22:00:11+00:00,Here's the biggest news you missed this weeken...
1,e5546dc0-fc08-4fa0-ae0e-0e67e6390e2a,2021-02-14 21:11:11+00:00,"Celtics embarrassed by Wiz on the road, drop 2..."
2,f37f2f79-0125-4338-852a-43e7e01ca1f9,2021-02-14 21:15:04+00:00,John Wall ready for first return to Washington...
3,1d9bfeb9-484f-4800-b8ef-87b28c3ca836,2021-02-14 21:24:13+00:00,Victor Oladipo remains sidelined for Rockets w...
4,b71ca1c4-2d88-49ca-90db-04ab6dc22398,2021-02-14 21:37:07+00:00,Kevin Durant says he speaks with Klay Thompson...


In [6]:
import tensorflow as tf
import tensorflow_hub as hub

In [7]:
# Load version 4 of the Universal Sentence Encoder from TensorFlow Hub; `embed` is called on the article text below.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [8]:
from sklearn.cluster import KMeans

In [9]:
# Work on a copy so the raw query result in `df` is not modified by the column
# additions, renames and drops applied to `news_df` below. With a plain alias,
# re-running the notebook from this cell would start from an already-renamed
# frame and the later rename would produce duplicate column names.
news_df = df.copy()

In [10]:
# Encode every article's text with the loaded encoder and store the result as a new column.
# NOTE(review): all rows are passed in a single call — confirm this fits in memory for larger date ranges.
news_df['tf_embedding'] = embed(news_df['text'])

In [11]:
# Inspect the new column: each row displays as a tuple of scalar tensors rather than a
# NumPy vector, which is why it is converted before clustering.
news_df['tf_embedding']

0       (tf.Tensor(-0.0123543525, shape=(), dtype=floa...
1       (tf.Tensor(-0.009412782, shape=(), dtype=float...
2       (tf.Tensor(0.04508125, shape=(), dtype=float32...
3       (tf.Tensor(-0.051791146, shape=(), dtype=float...
4       (tf.Tensor(0.028435577, shape=(), dtype=float3...
                              ...                        
2565    (tf.Tensor(-0.039459262, shape=(), dtype=float...
2566    (tf.Tensor(-0.028848702, shape=(), dtype=float...
2567    (tf.Tensor(-0.059504002, shape=(), dtype=float...
2568    (tf.Tensor(-0.022636443, shape=(), dtype=float...
2569    (tf.Tensor(0.008352046, shape=(), dtype=float3...
Name: tf_embedding, Length: 2570, dtype: object

In [12]:
def tensor_to_array(tuple_tensors):
    """Convert an iterable of tensors into one NumPy array.

    Each item is converted through its ``.numpy()`` method and the converted
    values are packed, in input order, with ``numpy.asarray``.

    Args:
        tuple_tensors: iterable whose items expose a ``.numpy()`` method.

    Returns:
        numpy.ndarray holding the converted items.
    """
    return numpy.asarray([item.numpy() for item in tuple_tensors])

# Convert each row's tensors into a plain NumPy vector, stored in a new `embedding` column.
news_df['embedding'] = news_df['tf_embedding'].apply(tensor_to_array)

In [13]:
# Preview the frame to compare the raw tensor column with the converted NumPy column.
news_df.head()

Unnamed: 0,article_id,publishedAt,text,tf_embedding,embedding
0,f8993822-3164-4478-bacb-643a476fd8f3,2021-02-14 22:00:11+00:00,Here's the biggest news you missed this weeken...,"(tf.Tensor(-0.0123543525, shape=(), dtype=floa...","[-0.0123543525, -0.05595495, -0.054358713, -0...."
1,e5546dc0-fc08-4fa0-ae0e-0e67e6390e2a,2021-02-14 21:11:11+00:00,"Celtics embarrassed by Wiz on the road, drop 2...","(tf.Tensor(-0.009412782, shape=(), dtype=float...","[-0.009412782, -0.043367088, 0.052012645, -0.0..."
2,f37f2f79-0125-4338-852a-43e7e01ca1f9,2021-02-14 21:15:04+00:00,John Wall ready for first return to Washington...,"(tf.Tensor(0.04508125, shape=(), dtype=float32...","[0.04508125, -0.061285354, 0.065952845, 0.0424..."
3,1d9bfeb9-484f-4800-b8ef-87b28c3ca836,2021-02-14 21:24:13+00:00,Victor Oladipo remains sidelined for Rockets w...,"(tf.Tensor(-0.051791146, shape=(), dtype=float...","[-0.051791146, -0.06263638, 0.063439004, -0.00..."
4,b71ca1c4-2d88-49ca-90db-04ab6dc22398,2021-02-14 21:37:07+00:00,Kevin Durant says he speaks with Klay Thompson...,"(tf.Tensor(0.028435577, shape=(), dtype=float3...","[0.028435577, 0.025807155, 0.064303845, -0.049..."


In [14]:
# Stack the per-article embedding vectors row-wise into one 2-D feature matrix for clustering.
X = numpy.vstack(news_df['embedding'].tolist())

In [15]:
# Sanity check: expect one row per article and one column per embedding dimension.
X.shape

(2570, 512)

In [16]:
# Cluster the article embeddings into 20 groups with k-means (k-means++ seeding,
# fixed random_state so assignments are repeatable) and keep each row's cluster label.
kmeans = KMeans(
    n_clusters=20,
    init='k-means++',
    max_iter=100,
    random_state=0,
)
y_kmeans = kmeans.fit_predict(X)

In [17]:
# add cluster predictions to dataframe; y_kmeans is in the same row order as X,
# which was built from news_df['embedding'], so positional assignment lines up
news_df['cluster'] = y_kmeans

In [18]:
# add each article's distance to its nearest cluster centroid.
# transform() reuses the model already fitted above; fit_transform() would re-run
# the whole k-means fit just to obtain the distances, and the distances should
# come from the same fitted model that produced the stored cluster labels.
news_df['distance'] = numpy.min(kmeans.transform(X), axis=1)

In [19]:
# Review cluster 0: show untruncated article text and list the five members
# with the smallest distance to the cluster centroid.
pandas.set_option('display.max_colwidth', None)
(
    news_df
    .loc[news_df['cluster'] == 0, ['text', 'cluster', 'distance']]
    .sort_values(by='distance')
    .head(5)
)

Unnamed: 0,text,cluster,distance
50,"Everton 0-2 Fulham: Scott Parker welcomes Cottagers win. Fulham manager Scott Parker praises a ""superb, brilliant"" performance from his side after a 2-0 Premier League win over Everton at Goodison Park.. Fulham manager Scott Parker praises a ""superb, brilliant"" performance from his side after a 2-0 Premier League win over Everton at Goodison Park.\r\nMATCH REPORT: Everton 0-2 Fulham\r\nWatch highlights f… [+128 chars]",0,0.690412
1256,Everton 1-3 Manchester City: Twelve league wins in a row for Pep Guardiola's side. Manchester City open up a 10-point lead at the top of the Premier League with victory at Everton.. Manchester City conceded from open play in the Premier League for the first time since 3 January\r\nManchester City's relentless pursuit of a third Premier League title in four seasons continued after … [+5133 chars],0,0.696691
370,"Manchester City v Tottenham Hotspur. Preview followed by live coverage of Saturday's Premier League game between Manchester City and Tottenham Hotspur.. Manchester City have only conceded one goal in Ruben Dias' nine most recent starts\r\nTEAM NEWS\r\nManchester City are hopeful Rodrigo will be fit, despite the Spaniard limping off during the FA Cup win … [+3383 chars]",0,0.711231
1886,FA Cup Highlights: Man Utd edge past West Ham in extra time. Watch highlights as Scott McTominay's well-taken extra-time strike is enough to send Manchester United into the FA Cup quarter-finals at the expense of West Ham.. Watch highlights as Scott McTominay's well-taken extra-time strike is enough to send Manchester United into the FA Cup quarter-finals at the expense of West Ham.\r\nMATCH REPORT: Man Utd 1-0 West Ham\r\n… [+26 chars],0,0.720839
1049,Wolverhampton Wanderers 1-0 Leeds United: Illan Meslier's own goal gives hosts victory. Illan Meslier's unfortunate own goal following Adama Traore's sensational strike gives Wolves victory against Leeds in an entertaining encounter.. Wolves have now won back-to-back league games for the first time since October\r\nIllan Meslier's unfortunate own goal following Adama Traore's sensational strike gave Wolverhampton Wanderers victory a… [+5377 chars],0,0.725705


In [20]:
# Review cluster 1: list the five members with the smallest distance to the cluster centroid.
(
    news_df
    .loc[news_df['cluster'] == 1, ['text', 'cluster', 'distance']]
    .sort_values(by='distance')
    .head(5)
)

Unnamed: 0,text,cluster,distance
1764,"Things to Know: In pandemic, Americans choose dying at home. Here’s what’s happening Sunday with the coronavirus pandemic in the U.S.:. California Gov. Gavin Newsom is facing a potential recall election, in part stemming from the coronavirus pandemic. California voters weary of restrictions that have cut them off from jobs, classroom… [+1213 chars]",1,0.801828
1201,"U.S. COVID Cases Are Down, but the Virus Isn't in Retreat. The outbreak is moving in the right direction, but experts told Newsweek they fear progress could be jeopardized by complacency and new variants.. COVID cases, deaths and hospitalizations in the U.S. have plummeted in recent weeks. However, experts have told Newsweek it may be too soon to view the pandemic as in retreat, particularly as the thr… [+6799 chars]",1,0.802352
1731,"Gov. DeSantis: Florida lifts people up by keeping businesses, schools open amid coronavirus pandemic. Florida Gov. Ron DeSantis told “Fox & Friends Weekend” on Sunday that keeping businesses and schools open amid the coronavirus pandemic has helped “lift people up.”. FloridaGov. Ron DeSantis told ""Fox &amp; Friends Weekend"" on Sunday that keeping businesses and schools open amid the coronavirus pandemic has helped ""lift people up."" \r\n""As other states have tried t… [+3515 chars]",1,0.806449
47,"CDC chief warns it's too soon in U.S. to lift COVID-19 mask mandates - Reuters. The head of the U.S. Centers for Disease Control and Prevention said on Sunday it is ""absolutely"" too soon to lift mask mandates, citing daily COVID-19 case numbers that despite recent declines remain more than double the levels seen last summer.. NEW YORK (Reuters) - The head of the U.S. Centers for Disease Control and Prevention said on Sunday it is absolutely too soon to lift mask mandates, citing daily COVID-19 case numbers that despite re… [+2474 chars]",1,0.819685
2389,"Health Care Workers Risk Burn Out. How Will They Get the Care They Need? | Opinion. After an already superhuman effort over the last year, working through successive surges in caseloads and the end not yet in sight, three quarters of frontline health care workers report feeling overwhelmed.. It's been a long plague year. COVID-19 hospitalization rates are declining, but the stress level of health care workers remains sky-high. We're in the pandemic's third wave, and only a few weeks away… [+5737 chars]",1,0.832454


In [21]:
# Review cluster 2: list the five members with the smallest distance to the cluster centroid.
(
    news_df
    .loc[news_df['cluster'] == 2, ['text', 'cluster', 'distance']]
    .sort_values(by='distance')
    .head(5)
)

Unnamed: 0,text,cluster,distance
2347,"Myanmar protesters rally against military coup for fifth day after protester shot. Protesters took to the streets in cities around Myanmar on Wednesday for a fifth day, opposing resistance to a military takeover.. Protesters took to the streets in cities across Myanmar for a fifth consecutive day Wednesday, to oppose a military takeover that ousted the countrys democratically elected leader last week.\r\nThe pro… [+2557 chars]",2,0.647353
1644,"At least one killed in protest in Myanmar: emergency service. A man was killed in Myanmar on Saturday when police fired to disperse protesting opponents of a Feb. 1 military coup in the second city of Mandalay, media and an ambulance service said.. By Reuters Staff\r\nPolice advance on the street during protests against the military coup, in Mandalay, Myanmar, February 20, 2021. REUTERS/Stringer NO RESALES. NO ARCHIVES.\r\n(Reuters) - A man was kil… [+546 chars]",2,0.6555
1799,In Pictures: Defiance as thousands rally in Myanmar. People pour onto the streets across Myanmar for a second day to protest last week's military coup.. Thousands of anti-coup protesters in Myanmar hit the streets on Sunday as an internet blackout failed to stifle growing outrage at the militarys overthrow of elected leader Aung San Suu Kyi.\r\nThe ral… [+701 chars],2,0.657086
1488,"A ‘war zone’: Witnesses describe violence at Myanmar protests. Security forces in Mandalay are using increasingly violent and lethal methods to snuff out anti-coup protests.. Yangon, Myanmar Saturday was the worst day of bloodshed yet during mass protests in Myanmar against the February 1 military coup, which overthrew the elected government headed by Aung San Suu Kyi and… [+5836 chars]",2,0.65728
2435,"Myanmar leader urges civil servants to return to work - Reuters. Myanmar's junta leader on Thursday urged government employees to return to work after absences that he blamed on harassment by ""unscrupulous persons"", in his first public remarks on the protests against him and a growing civil disobedience campaign.. By Reuters Staff\r\nFILE PHOTO: Myanmar Commander in Chief Senior General Min Aung Hlaing salutes as he attends an event marking the anniversary of Martyrs' Day at the Martyrs' Mausoleum in Yangon July… [+559 chars]",2,0.659474


In [22]:
# add a placeholder keywords column (all NaN); no keywords are computed in the cells shown here
news_df['keywords'] = numpy.nan

# rename cluster -> dominant_topic and distance -> topic_perc_contrib to fit legacy schema.
# NOTE: topic_perc_contrib therefore holds a centroid distance (smaller = closer), not a percentage.
news_df.rename(columns={'cluster': 'dominant_topic', 'distance': 'topic_perc_contrib'}, inplace=True)

# drop embedding columns
# NOTE(review): these are in-place edits, so re-running this cell on its own fails
# because the dropped columns no longer exist.
news_df.drop(['tf_embedding', 'embedding'], axis=1, inplace=True)

In [23]:
# restore pandas display options to their defaults (undoes the max_colwidth override set for the cluster review)
pandas.reset_option('display')

# preview the frame with the renamed output columns
news_df.head()

Unnamed: 0,article_id,publishedAt,text,dominant_topic,topic_perc_contrib,keywords
0,f8993822-3164-4478-bacb-643a476fd8f3,2021-02-14 22:00:11+00:00,Here's the biggest news you missed this weeken...,11,0.72224,
1,e5546dc0-fc08-4fa0-ae0e-0e67e6390e2a,2021-02-14 21:11:11+00:00,"Celtics embarrassed by Wiz on the road, drop 2...",15,0.774252,
2,f37f2f79-0125-4338-852a-43e7e01ca1f9,2021-02-14 21:15:04+00:00,John Wall ready for first return to Washington...,15,0.828915,
3,1d9bfeb9-484f-4800-b8ef-87b28c3ca836,2021-02-14 21:24:13+00:00,Victor Oladipo remains sidelined for Rockets w...,15,0.779041,
4,b71ca1c4-2d88-49ca-90db-04ab6dc22398,2021-02-14 21:37:07+00:00,Kevin Durant says he speaks with Klay Thompson...,3,0.875718,


In [24]:
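# NOTE: disabled parquet round-trip, kept for reference only. `output_df` is not
# defined in the cells shown here, so these lines need updating before they can be used.
# output_df.to_parquet('df.parquet.gzip', allow_truncated_timestamps=True)

# bq_df = pandas.read_parquet('df.parquet.gzip')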

In [25]:
# Configure the BigQuery load job.
# The schema pins the BigQuery type of every output column; this matters most for
# columns whose pandas dtype is ambiguous (e.g. "text" is stored as dtype "object").
# WRITE_TRUNCATE replaces the destination table's contents with the loaded rows
# instead of appending, which is BigQuery's default.
job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField(column_name, sql_type)
        for column_name, sql_type in [
            ("article_id", bigquery.enums.SqlTypeNames.STRING),
            ("publishedAt", bigquery.enums.SqlTypeNames.TIMESTAMP),
            ("text", bigquery.enums.SqlTypeNames.STRING),
            ("dominant_topic", bigquery.enums.SqlTypeNames.INT64),
            ("topic_perc_contrib", bigquery.enums.SqlTypeNames.FLOAT64),
            ("keywords", bigquery.enums.SqlTypeNames.STRING),
        ]
    ],
    write_disposition="WRITE_TRUNCATE",
)

In [26]:
# Destination table for the per-article topic assignments.
table_id = "news-site-280319.topics.article_topics"

# Start the load of news_df into the destination table (this makes the API request).
job = client.load_table_from_dataframe(news_df, table_id, job_config=job_config)

# Block until the load job has completed.
job.result()

<google.cloud.bigquery.job.LoadJob at 0x193dc6160>

In [27]:
# check job status; the job was already waited on by the earlier job.result() call,
# so this reads the state of the finished job
job.result().state

'DONE'