# News Topic Modeling

This notebook is used to derive topics from news text


In [1]:
!gcloud config list

[core]
account = choi.steve@gmail.com
disable_usage_reporting = False
project = news-site-280319

Your active configuration is: [news-site]


In [2]:
from google.cloud import bigquery

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_lg

import pandas
import re
from tqdm import tqdm_notebook as tqdm
import numpy
from pprint import pprint

In [3]:
client = bigquery.Client()
sql = """
    SELECT 
        article_id,
        publishedAt,
        CONCAT(title, '. ', description, '. ', content) AS text
    FROM `news-site-280319.news.articles`
    WHERE
      title IS NOT NULL
      AND description IS NOT NULL
      AND content IS NOT NULL
"""

df = client.query(sql).to_dataframe()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456 entries, 0 to 455
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   article_id   456 non-null    object             
 1   publishedAt  456 non-null    datetime64[ns, UTC]
 2   text         456 non-null    object             
dtypes: datetime64[ns, UTC](1), object(2)
memory usage: 10.8+ KB


In [4]:
df.head()

Unnamed: 0,article_id,publishedAt,text
0,9f54d198-91f6-469c-9cbf-c4cf2ff1032c,2020-07-01 00:37:25.913584+00:00,Biden: Trump's entire presidency has been a gi...
1,a4809d5f-32f6-48f5-a100-2aa2cd43223b,2020-07-01 00:32:13+00:00,Premier warns entirety of Victoria could go ba...
2,bf3bacbb-4ca6-4518-b73f-0c72f6ded8f4,2020-07-01 00:52:19.822680+00:00,Wall Street stocks seal best quarter since 199...
3,ae5b2f71-73ca-4395-ab9b-a70157fe417e,2020-07-01 00:22:21.011005+00:00,Energy Source is back: Is Texas about to sink ...
4,24ef003d-5f80-49fe-9bc6-0765ed17046f,2020-07-01 00:08:09.012992+00:00,"Your Life at Home. Parenting, technology and w..."


In [5]:
news_text = df['text']

news_text.head()

0    Biden: Trump's entire presidency has been a gi...
1    Premier warns entirety of Victoria could go ba...
2    Wall Street stocks seal best quarter since 199...
3    Energy Source is back: Is Texas about to sink ...
4    Your Life at Home. Parenting, technology and w...
Name: text, dtype: object

In [6]:
# replace all new line returns with spaces
news_text = news_text.str.replace('\r\n', ' ', regex=True)

In [7]:
# convert to lowercase
news_text = news_text.str.lower()

In [8]:
news_text[3]

'energy source is back: is texas about to sink the oil market?. the latest energy news: bernard looney’s plan to ‘reinvent bp’, texas’s oil-price threat, supreme court extends penneast’s pipeline saga. welcome to energy source, the ft’s relaunched newsletter about the world’s most important business. twice a week, starting today, energy source will deliver essential news, smart analysis, and inside… [+9434 chars]'

In [9]:
nlp = spacy.load("en")

In [10]:
def lemmatizer(doc):
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)

In [11]:
# update stop words list
custom_stop_list = ["char", "   ", "  ", "|"]
nlp.Defaults.stop_words.update(custom_stop_list)

for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

In [12]:
# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [13]:
doc_list = []
# Iterates through each article in the corpus.
for doc in news_text:
    # Passes that article through the pipeline and adds to a new list.
    pr = nlp(doc)
    doc_list.append(pr)

In [14]:
# Creates mapping of word IDs to words.
words = corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

In [15]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=50, 
                                           random_state=0,
                                           update_every=5,
                                           passes=100,
                                           alpha='auto',
                                           per_word_topics=True)

In [16]:
pprint(lda_model.print_topics(num_words=10))

[(21,
  '0.020*"course" + 0.013*"cast" + 0.013*"x" + 0.013*"stewart" + '
  '0.013*"patrick" + 0.013*"man" + 0.013*"jackman" + 0.013*"hugh" + '
  '0.013*"daughter" + 0.007*"raise"'),
 (4,
  '0.022*"bubba" + 0.018*"north" + 0.018*"find" + 0.014*"partnership" + '
  '0.014*"rope" + 0.014*"carolina" + 0.014*"lose" + 0.014*"apocalypse" + '
  '0.014*"owner" + 0.009*"noose"'),
 (31,
  '0.019*"new" + 0.015*"tiny" + 0.015*"draw" + 0.015*"chance" + 0.015*"health" '
  '+ 0.015*"report" + 0.011*"protester" + 0.011*"ontario" + 0.011*"kentucky" + '
  '0.011*"west"'),
 (9,
  '0.024*"intelligence" + 0.020*"trump" + 0.016*"donald" + 0.012*"include" + '
  '0.012*"daily" + 0.012*"president" + 0.012*"russian" + 0.012*"account" + '
  '0.012*"boogaloo" + 0.008*"taliban"'),
 (26,
  '0.021*"delhi" + 0.021*"face" + 0.014*"earthquake" + 0.010*"near" + '
  '0.010*"team" + 0.010*"snyder" + 0.010*"4.7" + 0.010*"investment" + '
  '0.010*"hutchinson" + 0.010*"change"'),
 (37,
  '0.016*"raji" + 0.012*"virus" + 0.012*"

In [17]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pandas.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(
                        pandas.Series(
                            [int(topic_num), round(prop_topic,4),topic_keywords]
                        ), ignore_index=True
                    )
            else:
                break
    sent_topics_df.columns = ['dominant_topic', 'perc_contribution', 'topic_keywords']

    # Add original text to the end of the output
    contents = pandas.Series(texts)
    sent_topics_df = pandas.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=news_text)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['document_no', 'dominant_topic', 'topic_perc_contrib', 'keywords', 'text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,document_no,dominant_topic,topic_perc_contrib,keywords,text
0,0,29.0,0.3724,"intelligence, bounty, troop, afghanistan, repo...",biden: trump's entire presidency has been a gi...
1,1,23.0,0.9892,"coronavirus, case, state, vaccine, covid-19, a...",premier warns entirety of victoria could go ba...
2,2,20.0,0.989,"coronavirus, pandemic, year, new, update, chin...",wall street stocks seal best quarter since 199...
3,3,17.0,0.9901,"gun, real, energy, sergio, ramos, madrid, pert...",energy source is back: is texas about to sink ...
4,4,49.0,0.9787,"coronavirus, live, backup, news, world, surge,...","your life at home. parenting, technology and w..."
5,5,28.0,0.9906,"state, flag, mississippi, confederate, bill, e...","with a pen stroke, mississippi drops confedera..."
6,6,25.0,0.9884,"chinese, movie, watch, tv, dna, india, trading...","uphold rights of international investors, says..."
7,7,25.0,0.9805,"chinese, movie, watch, tv, dna, india, trading...",panetta: russian bounties close to 'act of war...
8,8,40.0,0.4943,"trump, cbs, u.s, russia, house, reserve, white...","u.s. laws allow ""horrific"" child marriages, su..."
9,9,40.0,0.6938,"trump, cbs, u.s, russia, house, reserve, white...",white house attacks leaks as lawmakers demand ...


In [18]:
# Group top 5 sentences under each topic
sent_topics_sorteddf = pandas.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('dominant_topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf = pandas.concat([sent_topics_sorteddf, 
                                             grp.sort_values(['perc_contribution'], 
                                             ascending=[0]).head(1)],
                                             axis=0
                                             )

# Reset Index    
sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf.columns = ['topic_Num', 'perc_Contribution', 'keywords', 'text']

# Show
sent_topics_sorteddf.head(20)

Unnamed: 0,topic_Num,perc_Contribution,keywords,text
0,0.0,0.9938,"taxi, $, >, new, tesla, national, commission, ...",$500 off razer blade 15 intel core i7 gtx 1660...
1,1.0,0.9907,"roberts, amp, load, china, julia, justice, lib...",john roberts sides with liberals on supreme co...
2,2.0,0.9905,"federal, week, look, summer, student, governme...",linus torvalds: 'i do no coding any more' - s...
3,3.0,0.9917,"telescope, sibling, pick, work, popular, templ...",dr. bonnie henry says she's concerned about co...
4,4.0,0.9912,"bubba, north, find, partnership, rope, carolin...",north carolina speedway loses partnerships aft...
5,5.0,0.9916,"prime, minister, new, student, city, president...","new york city gunman shoots man dead, wounds w..."
6,6.0,0.9916,"state, maxwell, coronavirus, tuesday, ghislain...","cendana capital, which has been backing seed f..."
7,7.0,0.9892,"inslee, trump, jay, washington, governor, shah...",inslee faults trump for failing to push masks ...
8,8.0,0.9906,"life, john, black, matter, collins, reiner, ca...","police investigating suspicious death, maintai..."
9,9.0,0.9911,"intelligence, trump, donald, include, daily, p...",intelligence on russian bounty plot was includ...


In [19]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['dominant_topic'].value_counts()

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['dominant_topic', 'topic_keywords']].groupby(
                        ['dominant_topic', 'topic_keywords']
                    )['topic_keywords'].count().reset_index(name='num_documents')

topic_num_keywords['perc_documents'] = ((topic_num_keywords['num_documents'])/(topic_num_keywords['num_documents'].sum()))*100

# show top topics
topic_num_keywords.head(50).sort_values(by='perc_documents', ascending=False)

Unnamed: 0,dominant_topic,topic_keywords,num_documents,perc_documents
40,40.0,"trump, cbs, u.s, russia, house, reserve, white...",20,4.385965
20,20.0,"coronavirus, pandemic, year, new, update, chin...",16,3.508772
46,46.0,"case, state, coronavirus, covid-19, new, publi...",16,3.508772
38,38.0,"nation, 2020, face, transcript, follow, air, i...",16,3.508772
30,30.0,"trump, video, power, white, president, support...",14,3.070175
5,5.0,"prime, minister, new, student, city, president...",14,3.070175
45,45.0,"coronavirus, ubisoft, change, covid-19, americ...",14,3.070175
8,8.0,"life, john, black, matter, collins, reiner, ca...",14,3.070175
44,44.0,"newsletter, quizzes, trump, senator, president...",14,3.070175
13,13.0,"$, business, people, small, pilgrim, man, ign,...",13,2.850877


In [20]:
news_text

0      biden: trump's entire presidency has been a gi...
1      premier warns entirety of victoria could go ba...
2      wall street stocks seal best quarter since 199...
3      energy source is back: is texas about to sink ...
4      your life at home. parenting, technology and w...
                             ...                        
451    here are the hardest "grease" questions about ...
452    here's everything you need to know about that ...
453    your taste in tattoos will reveal what movie o...
454    what gemstone will be in your engagement ring ...
455    coronavirus australia live updates: melbourne ...
Name: text, Length: 456, dtype: object

In [21]:
df_dominant_topic

Unnamed: 0,document_no,dominant_topic,topic_perc_contrib,keywords,text
0,0,29.0,0.3724,"intelligence, bounty, troop, afghanistan, repo...",biden: trump's entire presidency has been a gi...
1,1,23.0,0.9892,"coronavirus, case, state, vaccine, covid-19, a...",premier warns entirety of victoria could go ba...
2,2,20.0,0.9890,"coronavirus, pandemic, year, new, update, chin...",wall street stocks seal best quarter since 199...
3,3,17.0,0.9901,"gun, real, energy, sergio, ramos, madrid, pert...",energy source is back: is texas about to sink ...
4,4,49.0,0.9787,"coronavirus, live, backup, news, world, surge,...","your life at home. parenting, technology and w..."
...,...,...,...,...,...
451,451,46.0,0.9634,"case, state, coronavirus, covid-19, new, publi...","here are the hardest ""grease"" questions about ..."
452,452,10.0,0.9684,"coronavirus, pandemic, month, rule, covid-19, ...",here's everything you need to know about that ...
453,453,25.0,0.9722,"chinese, movie, watch, tv, dna, india, trading...",your taste in tattoos will reveal what movie o...
454,454,13.0,0.5081,"$, business, people, small, pilgrim, man, ign,...",what gemstone will be in your engagement ring ...


In [22]:
output_df = pandas.concat([df[['article_id', 'publishedAt']], news_text, df_dominant_topic], axis=1)

output_df.columns = ['article_id', 'publishedAt', 'text', 'drop', 'dominant_topic', 'topic_perc_contrib', 'keywords', 'drop']

output_df.drop(output_df['drop'], axis=1, inplace=True)

output_df['dominant_topic'] = output_df['dominant_topic'].astype(int)

output_df

Unnamed: 0,article_id,publishedAt,text,dominant_topic,topic_perc_contrib,keywords
0,9f54d198-91f6-469c-9cbf-c4cf2ff1032c,2020-07-01 00:37:25.913584+00:00,biden: trump's entire presidency has been a gi...,29,0.3724,"intelligence, bounty, troop, afghanistan, repo..."
1,a4809d5f-32f6-48f5-a100-2aa2cd43223b,2020-07-01 00:32:13+00:00,premier warns entirety of victoria could go ba...,23,0.9892,"coronavirus, case, state, vaccine, covid-19, a..."
2,bf3bacbb-4ca6-4518-b73f-0c72f6ded8f4,2020-07-01 00:52:19.822680+00:00,wall street stocks seal best quarter since 199...,20,0.9890,"coronavirus, pandemic, year, new, update, chin..."
3,ae5b2f71-73ca-4395-ab9b-a70157fe417e,2020-07-01 00:22:21.011005+00:00,energy source is back: is texas about to sink ...,17,0.9901,"gun, real, energy, sergio, ramos, madrid, pert..."
4,24ef003d-5f80-49fe-9bc6-0765ed17046f,2020-07-01 00:08:09.012992+00:00,"your life at home. parenting, technology and w...",49,0.9787,"coronavirus, live, backup, news, world, surge,..."
...,...,...,...,...,...,...
451,ecb79e27-e4f6-4b64-8510-caafea74950c,2020-06-30 23:52:26.565947+00:00,"here are the hardest ""grease"" questions about ...",46,0.9634,"case, state, coronavirus, covid-19, new, publi..."
452,7123aed3-b55e-4ba7-8fd4-2826039a0b09,2020-06-30 00:52:27.296074+00:00,here's everything you need to know about that ...,10,0.9684,"coronavirus, pandemic, month, rule, covid-19, ..."
453,ae355183-9fb4-4e94-a79e-e843e4079dc3,2020-06-30 00:52:25.687100+00:00,your taste in tattoos will reveal what movie o...,25,0.9722,"chinese, movie, watch, tv, dna, india, trading..."
454,ac9bbb3c-06ff-4558-9281-55ea1aaeb60f,2020-06-30 00:22:26.341299+00:00,what gemstone will be in your engagement ring ...,13,0.5081,"$, business, people, small, pilgrim, man, ign,..."


In [23]:
output_df.to_parquet('df.parquet.gzip', allow_truncated_timestamps=True)

bq_df = pandas.read_parquet('df.parquet.gzip')

In [24]:
# configure BigQuery job
job_config = bigquery.LoadJobConfig(
    # Specify a (partial) schema. All columns are always written to the
    # table. The schema is used to assist in data type definitions.
    schema=[
        # Specify the type of columns whose type cannot be auto-detected. For
        # example the "title" column uses pandas dtype "object", so its
        # data type is ambiguous.
        bigquery.SchemaField("article_id", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("publishedAt", bigquery.enums.SqlTypeNames.TIMESTAMP),
        bigquery.SchemaField("text", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("dominant_topic", bigquery.enums.SqlTypeNames.INT64),
        bigquery.SchemaField("topic_perc_contrib", bigquery.enums.SqlTypeNames.FLOAT64),
        bigquery.SchemaField("keywords", bigquery.enums.SqlTypeNames.STRING),
    ],
    # Optionally, set the write disposition. BigQuery appends loaded rows
    # to an existing table by default, but with WRITE_TRUNCATE write
    # disposition it replaces the table with the loaded data.
    write_disposition="WRITE_TRUNCATE",
)

In [25]:
table_id = "news-site-280319.topics.article_topics"
job = client.load_table_from_dataframe(
    bq_df,
    table_id, 
    job_config=job_config
)  # Make an API request.
job.result()  # Wait for the job to complete.

<google.cloud.bigquery.job.LoadJob at 0x127168c70>

In [26]:
# check job status
job.result().state

'DONE'