# News Topic Modeling

This notebook is used to derive topics from news text


In [1]:
!gcloud config configurations activate news-site

Activated [news-site].


Updates are available for some Cloud SDK components.  To install them,
please run:
  $ gcloud components update



In [2]:
from google.cloud import bigquery

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_md

import pandas
import re
from tqdm import tqdm_notebook as tqdm
import numpy
from pprint import pprint

In [3]:
client = bigquery.Client()
sql = """
    SELECT 
        article_id,
        publishedAt,
        CONCAT(title, '. ', description, '. ', content) AS text
    FROM `news-site-280319.news.articles`
    WHERE
      title IS NOT NULL
      AND description IS NOT NULL
      AND content IS NOT NULL
      AND DATE(publishedAt) >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
"""

df = client.query(sql).to_dataframe()

df.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
RangeIndex: 889 entries, 0 to 888
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   article_id   889 non-null    object             
 1   publishedAt  889 non-null    datetime64[ns, UTC]
 2   text         889 non-null    object             
dtypes: datetime64[ns, UTC](1), object(2)
memory usage: 21.0+ KB


In [4]:
df.head()

Unnamed: 0,article_id,publishedAt,text
0,3a9144f9-f74d-4362-86a6-ae75cc47930c,2020-09-26 11:48:25+00:00,Police break up parties at Edinburgh student h...
1,7e68d7c5-b7e8-46bc-a25e-ccd5b39c921e,2020-09-26 10:06:00+00:00,2020/09/26 10:00 GMT. The latest five minute n...
2,8313acac-487a-4437-86f0-2882fedaf907,2020-09-26 11:06:00+00:00,2020/09/26 11:00 GMT. The latest five minute n...
3,c65d1095-ebc1-41a4-871e-e8575c2c8859,2020-09-26 10:44:53+00:00,Fleetwood Town v AFC Wimbledon. Live coverage ...
4,0923ad9f-e2de-43d3-a864-89fcb4dd60f8,2020-09-26 10:43:27+00:00,Queens Park Rangers v Middlesbrough. Live cove...


In [5]:
news_text = df['text']

news_text.head()

0    Police break up parties at Edinburgh student h...
1    2020/09/26 10:00 GMT. The latest five minute n...
2    2020/09/26 11:00 GMT. The latest five minute n...
3    Fleetwood Town v AFC Wimbledon. Live coverage ...
4    Queens Park Rangers v Middlesbrough. Live cove...
Name: text, dtype: object

In [6]:
# replace all new line returns with spaces
news_text = news_text.str.replace('\r\n', ' ', regex=True)

In [7]:
# convert to lowercase
news_text = news_text.str.lower()

In [8]:
nlp = spacy.load("en_core_web_md")

In [9]:
def lemmatizer(doc):
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)

In [10]:
# update stop words list
custom_stop_list = ["char", "   ", "  ", "|", "reuters"]
nlp.Defaults.stop_words.update(custom_stop_list)

for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

In [11]:
# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [12]:
doc_list = []
# Iterates through each article in the corpus.
for doc in news_text:
    # Passes that article through the pipeline and adds to a new list.
    pr = nlp(doc)
    doc_list.append(pr)

In [13]:
# Creates mapping of word IDs to words.
words = corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

In [14]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=20, 
                                           random_state=0,
                                           update_every=1,
                                           passes=100,
                                           alpha='auto',
                                           per_word_topics=True)

In [15]:
pprint(lda_model.print_topics(num_words=10))

[(0,
  &#39;0.049*&quot;trump&quot; + 0.032*&quot;president&quot; + 0.021*&quot;covid-19&quot; + 0.015*&quot;house&quot; + &#39;
  &#39;0.014*&quot;white&quot; + 0.012*&quot;walter&quot; + 0.012*&quot;reed&quot; + 0.011*&quot;military&quot; + &#39;
  &#39;0.011*&quot;coronavirus&quot; + 0.010*&quot;donald&quot;&#39;),
 (1,
  &#39;0.025*&quot;reuters&quot; + 0.016*&quot;6&quot; + 0.010*&quot;google&quot; + 0.009*&quot;16&quot; + 0.008*&quot;staff&quot; + &#39;
  &#39;0.007*&quot;4&quot; + 0.007*&quot;victory&quot; + 0.007*&quot;&gt;&quot; + 0.007*&quot;&lt;&quot; + 0.007*&quot;7&quot;&#39;),
 (2,
  &#39;0.062*&quot;trump&quot; + 0.039*&quot;president&quot; + 0.020*&quot;donald&quot; + 0.015*&quot;biden&quot; + &#39;
  &#39;0.015*&quot;u.s&quot; + 0.015*&quot;presidential&quot; + 0.014*&quot;coronavirus&quot; + 0.014*&quot;debate&quot; + &#39;
  &#39;0.012*&quot;test&quot; + 0.010*&quot;chinese&quot;&#39;),
 (3,
  &#39;0.009*&quot;night&quot; + 0.009*&quot;launch&quot; + 0.008*&quot;davi

In [16]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pandas.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(
                        pandas.Series(
                            [int(topic_num), round(prop_topic,4),topic_keywords]
                        ), ignore_index=True
                    )
            else:
                break
    sent_topics_df.columns = ['dominant_topic', 'perc_contribution', 'topic_keywords']

    # Add original text to the end of the output
    contents = pandas.Series(texts)
    sent_topics_df = pandas.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=news_text)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['document_no', 'dominant_topic', 'topic_perc_contrib', 'keywords', 'text']

# Show
df_dominant_topic.head(5)

Unnamed: 0,document_no,dominant_topic,topic_perc_contrib,keywords,text
0,0,0.0,0.9917,"trump, president, covid-19, house, white, walt...",police break up parties at edinburgh student h...
1,1,12.0,0.9848,"reuters, everton, league, score, 4, season, br...",2020/09/26 10:00 gmt. the latest five minute n...
2,2,12.0,0.9848,"reuters, everton, league, score, 4, season, br...",2020/09/26 11:00 gmt. the latest five minute n...
3,3,12.0,0.6141,"reuters, everton, league, score, 4, season, br...",fleetwood town v afc wimbledon. live coverage ...
4,4,1.0,0.6045,"reuters, 6, google, 16, staff, 4, victory, >, ...",queens park rangers v middlesbrough. live cove...


In [17]:
# # Group top 5 sentences under each topic
# sent_topics_sorteddf = pandas.DataFrame()

# sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('dominant_topic')

# for i, grp in sent_topics_outdf_grpd:
#     sent_topics_sorteddf = pandas.concat([sent_topics_sorteddf, 
#                                              grp.sort_values(['perc_contribution'], 
#                                              ascending=[0]).head(1)],
#                                              axis=0
#                                              )

# # Reset Index    
# sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# # Format
# sent_topics_sorteddf.columns = ['topic_Num', 'perc_Contribution', 'keywords', 'text']

# # Show
# sent_topics_sorteddf.head(5)

In [18]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['dominant_topic'].value_counts()

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['dominant_topic', 'topic_keywords']].groupby(
                        ['dominant_topic', 'topic_keywords']
                    )['topic_keywords'].count().reset_index(name='num_documents')

topic_num_keywords['perc_documents'] = ((topic_num_keywords['num_documents'])/(topic_num_keywords['num_documents'].sum()))*100

# show top topics
topic_num_keywords.head(5).sort_values(by='perc_documents', ascending=False)

Unnamed: 0,dominant_topic,topic_keywords,num_documents,perc_documents
0,0.0,"trump, president, covid-19, house, white, walt...",58,6.524184
1,1.0,"reuters, 6, google, 16, staff, 4, victory, >, ...",53,5.961755
4,4.0,"reuters, coronavirus, new, pixel, 5, record, g...",53,5.961755
2,2.0,"trump, president, donald, biden, u.s, presiden...",48,5.399325
3,3.0,"night, launch, david, image, packers, game, ru...",30,3.374578


In [19]:
output_df = pandas.concat([df[['article_id', 'publishedAt']], news_text, df_dominant_topic], axis=1)

output_df.columns = ['article_id', 'publishedAt', 'text', 'drop', 'dominant_topic', 'topic_perc_contrib', 'keywords', 'drop']

output_df.drop(output_df['drop'], axis=1, inplace=True)

output_df['dominant_topic'] = output_df['dominant_topic'].astype(int)

output_df

Unnamed: 0,article_id,publishedAt,text,dominant_topic,topic_perc_contrib,keywords
0,3a9144f9-f74d-4362-86a6-ae75cc47930c,2020-09-26 11:48:25+00:00,police break up parties at edinburgh student h...,0,0.9917,"trump, president, covid-19, house, white, walt..."
1,7e68d7c5-b7e8-46bc-a25e-ccd5b39c921e,2020-09-26 10:06:00+00:00,2020/09/26 10:00 gmt. the latest five minute n...,12,0.9848,"reuters, everton, league, score, 4, season, br..."
2,8313acac-487a-4437-86f0-2882fedaf907,2020-09-26 11:06:00+00:00,2020/09/26 11:00 gmt. the latest five minute n...,12,0.9848,"reuters, everton, league, score, 4, season, br..."
3,c65d1095-ebc1-41a4-871e-e8575c2c8859,2020-09-26 10:44:53+00:00,fleetwood town v afc wimbledon. live coverage ...,12,0.6141,"reuters, everton, league, score, 4, season, br..."
4,0923ad9f-e2de-43d3-a864-89fcb4dd60f8,2020-09-26 10:43:27+00:00,queens park rangers v middlesbrough. live cove...,1,0.6045,"reuters, 6, google, 16, staff, 4, victory, >, ..."
...,...,...,...,...,...,...
884,9c88b86d-2ebe-49be-9dc0-1518038adb44,2020-10-01 11:00:00+00:00,focus-meatpackers in the americas accelerate a...,0,0.9947,"trump, president, covid-19, house, white, walt..."
885,b7b9928f-909b-4edb-9921-dbe4c7e2ce4d,2020-10-01 11:17:55+00:00,ask the captain: how does ice on the wing affe...,3,0.9932,"night, launch, david, image, packers, game, ru..."
886,87f165b7-4e6e-4cd1-a365-ebd4d5ac7dee,2020-10-01 10:57:30+00:00,us jobless claims likely remained high as layo...,6,0.9941,"reuters, man, police, attack, home, video, sus..."
887,aea97082-cab0-4e9c-9bd6-4b09f76f75a7,2020-10-01 10:59:02+00:00,coronavirus updates: massive airline layoffs c...,4,0.9949,"reuters, coronavirus, new, pixel, 5, record, g..."


In [21]:
# configure BigQuery job
job_config = bigquery.LoadJobConfig(
    # Specify a (partial) schema. All columns are always written to the
    # table. The schema is used to assist in data type definitions.
    schema=[
        # Specify the type of columns whose type cannot be auto-detected. For
        # example the "title" column uses pandas dtype "object", so its
        # data type is ambiguous.
        bigquery.SchemaField("article_id", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("publishedAt", bigquery.enums.SqlTypeNames.TIMESTAMP),
        bigquery.SchemaField("text", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("dominant_topic", bigquery.enums.SqlTypeNames.INT64),
        bigquery.SchemaField("topic_perc_contrib", bigquery.enums.SqlTypeNames.FLOAT64),
        bigquery.SchemaField("keywords", bigquery.enums.SqlTypeNames.STRING),
    ],
    # Optionally, set the write disposition. BigQuery appends loaded rows
    # to an existing table by default, but with WRITE_TRUNCATE write
    # disposition it replaces the table with the loaded data.
    write_disposition="WRITE_TRUNCATE",
)

In [22]:
table_id = "news-site-280319.topics.article_topics"
job = client.load_table_from_dataframe(
    output_df,
    table_id, 
    job_config=job_config
)  # Make an API request.
job.result()  # Wait for the job to complete.

&lt;google.cloud.bigquery.job.LoadJob at 0x12ca69ee0&gt;

In [23]:
# check job status
job.result().state

&#39;DONE&#39;