# News Topic Modeling

This notebook is used to derive topics from news text


In [1]:
!gcloud config configurations activate news-site

Activated [news-site].


In [2]:
from google.cloud import bigquery

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_lg

import pandas
import re
from tqdm import tqdm_notebook as tqdm
import numpy
from pprint import pprint

In [3]:
client = bigquery.Client()
sql = """
    SELECT 
        article_id,
        publishedAt,
        CONCAT(title, '. ', description, '. ', content) AS text
    FROM `news-site-280319.news.articles`
    WHERE
      title IS NOT NULL
      AND description IS NOT NULL
      AND content IS NOT NULL
      AND DATE(publishedAt) >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
"""

df = client.query(sql).to_dataframe()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 622 entries, 0 to 621
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   article_id   622 non-null    object             
 1   publishedAt  622 non-null    datetime64[ns, UTC]
 2   text         622 non-null    object             
dtypes: datetime64[ns, UTC](1), object(2)
memory usage: 14.7+ KB


In [4]:
df.head()

Unnamed: 0,article_id,publishedAt,text
0,b7d51d16-2b40-411c-b709-479a0011887a,2020-07-14 13:33:01+00:00,JPMorgan's Q2 earnings beat forecasts as inves...
1,0e2771ce-63f3-49a5-9832-5db61973e6ef,2020-07-14 13:33:01+00:00,JPMorgan's Q2 earnings beat forecasts as inves...
2,bd7035a3-018d-446e-8f9d-106f9e81745e,2020-07-13 12:52:38.035442+00:00,Andrzej Duda wins re-election as Polish presid...
3,db614f80-87d0-42d9-adf4-22908c21abdc,2020-07-13 12:02:55+00:00,Loud Rumors Tease PlayStation 5 Price & Launch...
4,71d551f3-c592-44f5-bf7b-eba1a8acdadc,2020-07-13 11:28:00+00:00,"No Rahul Gandhi Meet, Says Sachin Pilot, Congr..."


In [5]:
news_text = df['text']

news_text.head()

0    JPMorgan's Q2 earnings beat forecasts as inves...
1    JPMorgan's Q2 earnings beat forecasts as inves...
2    Andrzej Duda wins re-election as Polish presid...
3    Loud Rumors Tease PlayStation 5 Price & Launch...
4    No Rahul Gandhi Meet, Says Sachin Pilot, Congr...
Name: text, dtype: object

In [6]:
# replace all new line returns with spaces
news_text = news_text.str.replace('\r\n', ' ', regex=True)

In [7]:
# convert to lowercase
news_text = news_text.str.lower()

In [8]:
news_text[3]

'loud rumors tease playstation 5 price & launch date reveal today. rumors suggest sony plans to reveal the ps5 price, release date, and launch pre-orders later today at 12:30 pm pt/9:30 pm cest.. <ul><li>sony will reportedly reveal the ps5 price and release date later on today.</li><li>according to analyst roberto serrano, sony will share the news at around 12:30 pm pt/9:30 pm cest.</li><li>b… [+2511 chars]'

In [9]:
nlp = spacy.load("en")

In [10]:
def lemmatizer(doc):
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)

In [11]:
# update stop words list
custom_stop_list = ["char", "   ", "  ", "|"]
nlp.Defaults.stop_words.update(custom_stop_list)

for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

In [12]:
# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [13]:
doc_list = []
# Iterates through each article in the corpus.
for doc in news_text:
    # Passes that article through the pipeline and adds to a new list.
    pr = nlp(doc)
    doc_list.append(pr)

In [14]:
# Creates mapping of word IDs to words.
words = corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

In [15]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=50, 
                                           random_state=0,
                                           update_every=1,
                                           passes=5,
                                           alpha='auto',
                                           per_word_topics=True)

In [16]:
pprint(lda_model.print_topics(num_words=10))

[(21,
  '0.031*"uk" + 0.013*"announce" + 0.013*"misconduct" + 0.013*"blanchet" + '
  '0.013*"sexual" + 0.013*"allegation" + 0.013*"huawei" + 0.013*"g" + '
  '0.013*"5" + 0.011*"françois"'),
 (30,
  '0.030*"mask" + 0.027*"wear" + 0.019*"podcast" + 0.016*"michelle" + '
  '0.016*"obama" + 0.013*"new" + 0.010*"july" + 0.010*"school" + '
  '0.010*"jackson" + 0.010*"ronny"'),
 (28,
  '0.014*"fauci" + 0.010*"g" + 0.010*"minister" + 0.010*"5" + 0.010*"house" + '
  '0.010*"white" + 0.010*"thousand" + 0.009*"anthony" + 0.009*"vote" + '
  '0.009*"dr"'),
 (43,
  '0.012*"player" + 0.009*"nfl" + 0.009*"twitter" + 0.009*"lionel" + '
  '0.008*"messi" + 0.008*"la" + 0.008*"liga" + 0.007*"reveal" + '
  '0.007*"account" + 0.007*"target"'),
 (33,
  '0.017*"insider" + 0.012*"crossfit" + 0.012*"ceo" + 0.012*"business" + '
  '0.010*"man" + 0.008*"community" + 0.008*"glassman" + 0.008*"allege" + '
  '0.008*"audio" + 0.008*"project"'),
 (26,
  '0.022*"travolta" + 0.016*"john" + 0.015*"die" + 0.015*"preston" + 

In [17]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pandas.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(
                        pandas.Series(
                            [int(topic_num), round(prop_topic,4),topic_keywords]
                        ), ignore_index=True
                    )
            else:
                break
    sent_topics_df.columns = ['dominant_topic', 'perc_contribution', 'topic_keywords']

    # Add original text to the end of the output
    contents = pandas.Series(texts)
    sent_topics_df = pandas.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=news_text)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['document_no', 'dominant_topic', 'topic_perc_contrib', 'keywords', 'text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,document_no,dominant_topic,topic_perc_contrib,keywords,text
0,0,32.0,0.9817,"earning, investment, old, >, revenue, jpmorgan...",jpmorgan's q2 earnings beat forecasts as inves...
1,1,32.0,0.9817,"earning, investment, old, >, revenue, jpmorgan...",jpmorgan's q2 earnings beat forecasts as inves...
2,2,35.0,0.9779,"google, fire, fauci, new, announce, ai, facebo...",andrzej duda wins re-election as polish presid...
3,3,20.0,0.9855,"account, >, news, <, coronavirus, elon, pre, a...",loud rumors tease playstation 5 price & launch...
4,4,10.0,0.9844,"berkshire, week, 5, buffett, warren, buy, $, b...","no rahul gandhi meet, says sachin pilot, congr..."
5,5,42.0,0.9676,"new, coronavirus, face, york, pilot, sachin, p...","decades later, these remakes haven’t fixed the..."
6,6,32.0,0.9353,"earning, investment, old, >, revenue, jpmorgan...",china imposes sanctions on us lawmakers in ret...
7,7,2.0,0.979,"cnn, find, dr, happen, trump, messenger, chat,...",white house cites this interview in attempt to...
8,8,8.0,0.6682,"house, white, trump, fauci, lego, anthony, new...",white house takes aim at anthony fauci over co...
9,9,3.0,0.9779,">, 5, g, look, ban, huawei, <, new, kit, light",khloe is a kylie jenner clone - but she's not ...


In [18]:
# Group top 5 sentences under each topic
sent_topics_sorteddf = pandas.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('dominant_topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf = pandas.concat([sent_topics_sorteddf, 
                                             grp.sort_values(['perc_contribution'], 
                                             ascending=[0]).head(1)],
                                             axis=0
                                             )

# Reset Index    
sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf.columns = ['topic_Num', 'perc_Contribution', 'keywords', 'text']

# Show
sent_topics_sorteddf.head(20)

Unnamed: 0,topic_Num,perc_Contribution,keywords,text
0,0.0,0.9809,"payment, court, year, pandemic, rugby, play, r...",kerala gold smuggling case: nia court remands ...
1,1.0,0.9847,"amazon, 2020, shopping, lake, john, lewis, fin...",amazon's smart shopping cart knows what you're...
2,2.0,0.9858,"cnn, find, dr, happen, trump, messenger, chat,...",analog devices to acquire rival chipmaker maxi...
3,3.0,0.9847,">, 5, g, look, ban, huawei, <, new, kit, light",russia rejects coronavirus vaccine hacking all...
4,4.0,0.9865,"year, use, reveal, technology, federal, recogn...","with ashampoo 3d cad 7, there’s no design or c..."
5,5.0,0.9844,"datum, season, mandela, pope, state, assam, le...","obit | zindzi mandela, remembered for her 1985..."
6,6.0,0.9862,"trump, judge, jail, u.s, president, force, cas...",meadows signals imminent indictments in durham...
7,7.0,0.9835,"late, coronavirus, statue, charge, dublin, kel...",'ellen degeneres show' execs held 'low morale'...
8,8.0,0.9855,"house, white, trump, fauci, lego, anthony, new...",trump says he has 'very good relationship' wit...
9,9.0,0.9844,"coronavirus, country, claim, wear, mask, billi...",'you have to be here' - african researchers in...


In [19]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['dominant_topic'].value_counts()

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['dominant_topic', 'topic_keywords']].groupby(
                        ['dominant_topic', 'topic_keywords']
                    )['topic_keywords'].count().reset_index(name='num_documents')

topic_num_keywords['perc_documents'] = ((topic_num_keywords['num_documents'])/(topic_num_keywords['num_documents'].sum()))*100

# show top topics
topic_num_keywords.head(50).sort_values(by='perc_documents', ascending=False)

Unnamed: 0,dominant_topic,topic_keywords,num_documents,perc_documents
14,14.0,"ginsburg, cancer, ruth, bader, court, justice,...",23,3.697749
37,37.0,"cbs, cbsn, copyright, interactive, inc, ©, wat...",22,3.536977
42,42.0,"new, coronavirus, face, york, pilot, sachin, p...",19,3.054662
36,36.0,"netflix, star, stock, warn, year, >, federal, ...",18,2.893891
47,47.0,"watch, 2020, start, year, dog, news, free, ubi...",18,2.893891
23,23.0,"company, year, covid-19, new, australian, coro...",17,2.733119
20,20.0,"account, >, news, <, coronavirus, elon, pre, a...",16,2.572347
24,24.0,"monday, announce, business, roger, insider, tr...",15,2.411576
29,29.0,"year, new, mueller, book, investigation, lotto...",15,2.411576
8,8.0,"house, white, trump, fauci, lego, anthony, new...",15,2.411576


In [20]:
news_text

0      jpmorgan's q2 earnings beat forecasts as inves...
1      jpmorgan's q2 earnings beat forecasts as inves...
2      andrzej duda wins re-election as polish presid...
3      loud rumors tease playstation 5 price & launch...
4      no rahul gandhi meet, says sachin pilot, congr...
                             ...                        
617    when you hear these names, which k-pop idol do...
618    what's your favorite bts song for every letter...
619    teens shared ways that millennials are out of ...
620    if you were a whimsical tumblr girl in 2012, y...
621    which deadly animal are you most like?. discov...
Name: text, Length: 622, dtype: object

In [21]:
df_dominant_topic

Unnamed: 0,document_no,dominant_topic,topic_perc_contrib,keywords,text
0,0,32.0,0.9817,"earning, investment, old, >, revenue, jpmorgan...",jpmorgan's q2 earnings beat forecasts as inves...
1,1,32.0,0.9817,"earning, investment, old, >, revenue, jpmorgan...",jpmorgan's q2 earnings beat forecasts as inves...
2,2,35.0,0.9779,"google, fire, fauci, new, announce, ai, facebo...",andrzej duda wins re-election as polish presid...
3,3,20.0,0.9855,"account, >, news, <, coronavirus, elon, pre, a...",loud rumors tease playstation 5 price & launch...
4,4,10.0,0.9844,"berkshire, week, 5, buffett, warren, buy, $, b...","no rahul gandhi meet, says sachin pilot, congr..."
...,...,...,...,...,...
617,617,35.0,0.9395,"google, fire, fauci, new, announce, ai, facebo...","when you hear these names, which k-pop idol do..."
618,618,23.0,0.9396,"company, year, covid-19, new, australian, coro...",what's your favorite bts song for every letter...
619,619,38.0,0.9649,"police, government, president, coronavirus, vi...",teens shared ways that millennials are out of ...
620,620,24.0,0.9436,"monday, announce, business, roger, insider, tr...","if you were a whimsical tumblr girl in 2012, y..."


In [22]:
output_df = pandas.concat([df[['article_id', 'publishedAt']], news_text, df_dominant_topic], axis=1)

output_df.columns = ['article_id', 'publishedAt', 'text', 'drop', 'dominant_topic', 'topic_perc_contrib', 'keywords', 'drop']

output_df.drop(output_df['drop'], axis=1, inplace=True)

output_df['dominant_topic'] = output_df['dominant_topic'].astype(int)

output_df

Unnamed: 0,article_id,publishedAt,text,dominant_topic,topic_perc_contrib,keywords
0,b7d51d16-2b40-411c-b709-479a0011887a,2020-07-14 13:33:01+00:00,jpmorgan's q2 earnings beat forecasts as inves...,32,0.9817,"earning, investment, old, >, revenue, jpmorgan..."
1,0e2771ce-63f3-49a5-9832-5db61973e6ef,2020-07-14 13:33:01+00:00,jpmorgan's q2 earnings beat forecasts as inves...,32,0.9817,"earning, investment, old, >, revenue, jpmorgan..."
2,bd7035a3-018d-446e-8f9d-106f9e81745e,2020-07-13 12:52:38.035442+00:00,andrzej duda wins re-election as polish presid...,35,0.9779,"google, fire, fauci, new, announce, ai, facebo..."
3,db614f80-87d0-42d9-adf4-22908c21abdc,2020-07-13 12:02:55+00:00,loud rumors tease playstation 5 price & launch...,20,0.9855,"account, >, news, <, coronavirus, elon, pre, a..."
4,71d551f3-c592-44f5-bf7b-eba1a8acdadc,2020-07-13 11:28:00+00:00,"no rahul gandhi meet, says sachin pilot, congr...",10,0.9844,"berkshire, week, 5, buffett, warren, buy, $, b..."
...,...,...,...,...,...,...
617,4cd3ff5b-289d-41f9-9a6a-d2864256a4db,2020-07-19 17:07:27.965117+00:00,"when you hear these names, which k-pop idol do...",35,0.9395,"google, fire, fauci, new, announce, ai, facebo..."
618,087fdf71-52e7-4bfa-a815-7f3a7d0a6879,2020-07-19 16:07:23.328105+00:00,what's your favorite bts song for every letter...,23,0.9396,"company, year, covid-19, new, australian, coro..."
619,ecece0fa-234f-4c3e-a51e-2170210d5491,2020-07-19 17:22:24.134988+00:00,teens shared ways that millennials are out of ...,38,0.9649,"police, government, president, coronavirus, vi..."
620,2de8a1a4-8195-4b00-8884-394ff60db9cd,2020-07-19 15:22:27.208455+00:00,"if you were a whimsical tumblr girl in 2012, y...",24,0.9436,"monday, announce, business, roger, insider, tr..."


In [23]:
output_df.to_parquet('df.parquet.gzip', allow_truncated_timestamps=True)

bq_df = pandas.read_parquet('df.parquet.gzip')

In [24]:
# configure BigQuery job
job_config = bigquery.LoadJobConfig(
    # Specify a (partial) schema. All columns are always written to the
    # table. The schema is used to assist in data type definitions.
    schema=[
        # Specify the type of columns whose type cannot be auto-detected. For
        # example the "title" column uses pandas dtype "object", so its
        # data type is ambiguous.
        bigquery.SchemaField("article_id", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("publishedAt", bigquery.enums.SqlTypeNames.TIMESTAMP),
        bigquery.SchemaField("text", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("dominant_topic", bigquery.enums.SqlTypeNames.INT64),
        bigquery.SchemaField("topic_perc_contrib", bigquery.enums.SqlTypeNames.FLOAT64),
        bigquery.SchemaField("keywords", bigquery.enums.SqlTypeNames.STRING),
    ],
    # Optionally, set the write disposition. BigQuery appends loaded rows
    # to an existing table by default, but with WRITE_TRUNCATE write
    # disposition it replaces the table with the loaded data.
    write_disposition="WRITE_TRUNCATE",
)

In [25]:
table_id = "news-site-280319.topics.article_topics"
job = client.load_table_from_dataframe(
    bq_df,
    table_id, 
    job_config=job_config
)  # Make an API request.
job.result()  # Wait for the job to complete.

<google.cloud.bigquery.job.LoadJob at 0x129e906a0>

In [26]:
# check job status
job.result().state

'DONE'