# News Topic Modeling

This notebook is used to derive topics from news text


In [1]:
!gcloud config configurations activate news-site

Activated [news-site].


In [2]:
from google.cloud import bigquery

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_md

import pandas
import re
import numpy
from pprint import pprint

In [3]:
client = bigquery.Client()
sql = """
    SELECT 
        article_id,
        publishedAt,
        CONCAT(title, '. ', description, '. ', content) AS text
    FROM `news-site-280319.news.articles`
    WHERE
      title IS NOT NULL
      AND description IS NOT NULL
      AND content IS NOT NULL
      AND DATE(publishedAt) >= DATE_SUB(CURRENT_DATE(), INTERVAL 14 DAY)
"""

df = client.query(sql).to_dataframe()

df.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
RangeIndex: 2378 entries, 0 to 2377
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   article_id   2378 non-null   object             
 1   publishedAt  2378 non-null   datetime64[ns, UTC]
 2   text         2378 non-null   object             
dtypes: datetime64[ns, UTC](1), object(2)
memory usage: 55.9+ KB


In [23]:
news_text = df['text']
news_text.head(5)

0    2020/09/26 10:00 GMT. The latest five minute n...
1    2020/09/26 11:00 GMT. The latest five minute n...
2    Fleetwood Town v AFC Wimbledon. Live coverage ...
3    Queens Park Rangers v Middlesbrough. Live cove...
4    Millwall v Brentford. Live coverage of Saturda...
Name: text, dtype: object

In [6]:
# replace all new line returns with spaces
news_text = news_text.str.replace('\r\n', ' ', regex=True)

In [7]:
# convert to lowercase
news_text = news_text.str.lower()

In [8]:
nlp = spacy.load("en_core_web_md")

In [9]:
def lemmatizer(doc):
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)

In [10]:
# update stop words list
custom_stop_list = ["char", "   ", "  ", "|", "reuters"]
nlp.Defaults.stop_words.update(custom_stop_list)

for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

In [11]:
# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [12]:
doc_list = []
# Iterates through each article in the corpus.
for doc in news_text:
    # Passes that article through the pipeline and adds to a new list.
    pr = nlp(doc)
    doc_list.append(pr)

In [13]:
# Creates mapping of word IDs to words.
words = corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

In [14]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=20, 
                                           random_state=0,
                                           update_every=1,
                                           passes=100,
                                           alpha='auto',
                                           per_word_topics=True)

In [15]:
pprint(lda_model.print_topics(num_words=10))

[(0,
  &#39;0.014*&quot;&gt;&quot; + 0.010*&quot;&lt;&quot; + 0.010*&quot;talk&quot; + 0.010*&quot;capacity&quot; + 0.009*&quot;antibody&quot; &#39;
  &#39;+ 0.008*&quot;september&quot; + 0.007*&quot;month&quot; + 0.007*&quot;chelsea&quot; + 0.007*&quot;continue&quot; + &#39;
  &#39;0.007*&quot;defender&quot;&#39;),
 (1,
  &#39;0.016*&quot;court&quot; + 0.013*&quot;supreme&quot; + 0.012*&quot;sound&quot; + 0.012*&quot;death&quot; + &#39;
  &#39;0.011*&quot;party&quot; + 0.010*&quot;ginsburg&quot; + 0.009*&quot;justice&quot; + 0.008*&quot;ruth&quot; + &#39;
  &#39;0.008*&quot;bader&quot; + 0.008*&quot;disaster&quot;&#39;),
 (2,
  &#39;0.022*&quot;united&quot; + 0.011*&quot;gas&quot; + 0.010*&quot;states&quot; + 0.010*&quot;friday&quot; + &#39;
  &#39;0.010*&quot;report&quot; + 0.009*&quot;cut&quot; + 0.008*&quot;stake&quot; + 0.008*&quot;france&quot; + &#39;
  &#39;0.008*&quot;police&quot; + 0.008*&quot;people&quot;&#39;),
 (3,
  &#39;0.011*&quot;win&quot; + 0.010*&quot;deal&quot; + 0.0

In [16]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pandas.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(
                        pandas.Series(
                            [int(topic_num), round(prop_topic,4),topic_keywords]
                        ), ignore_index=True
                    )
            else:
                break
    sent_topics_df.columns = ['dominant_topic', 'perc_contribution', 'topic_keywords']

    # Add original text to the end of the output
    contents = pandas.Series(texts)
    sent_topics_df = pandas.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=news_text)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['document_no', 'dominant_topic', 'topic_perc_contrib', 'keywords', 'text']

# Show
df_dominant_topic.head(5)

Unnamed: 0,document_no,dominant_topic,topic_perc_contrib,keywords,text
0,0,11.0,0.9344,"world, police, late, service, bbc, protest, ne...",2020/09/26 10:00 gmt. the latest five minute n...
1,1,11.0,0.9344,"world, police, late, service, bbc, protest, ne...",2020/09/26 11:00 gmt. the latest five minute n...
2,2,13.0,0.8615,"game, season, new, saturday, touchdown, nfl, v...",fleetwood town v afc wimbledon. live coverage ...
3,3,13.0,0.7713,"game, season, new, saturday, touchdown, nfl, v...",queens park rangers v middlesbrough. live cove...
4,4,13.0,0.538,"game, season, new, saturday, touchdown, nfl, v...",millwall v brentford. live coverage of saturda...


In [17]:
# # Group top 5 sentences under each topic
# sent_topics_sorteddf = pandas.DataFrame()

# sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('dominant_topic')

# for i, grp in sent_topics_outdf_grpd:
#     sent_topics_sorteddf = pandas.concat([sent_topics_sorteddf, 
#                                              grp.sort_values(['perc_contribution'], 
#                                              ascending=[0]).head(1)],
#                                              axis=0
#                                              )

# # Reset Index    
# sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# # Format
# sent_topics_sorteddf.columns = ['topic_Num', 'perc_Contribution', 'keywords', 'text']

# # Show
# sent_topics_sorteddf.head(5)

In [18]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['dominant_topic'].value_counts()

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['dominant_topic', 'topic_keywords']].groupby(
                        ['dominant_topic', 'topic_keywords']
                    )['topic_keywords'].count().reset_index(name='num_documents')

topic_num_keywords['perc_documents'] = ((topic_num_keywords['num_documents'])/(topic_num_keywords['num_documents'].sum()))*100

# show top topics
topic_num_keywords.head(5).sort_values(by='perc_documents', ascending=False)

Unnamed: 0,dominant_topic,topic_keywords,num_documents,perc_documents
4,4.0,"u.s, year, future, company, oil, million, chin...",132,5.550883
3,3.0,"win, deal, big, new, 5, score, everton, kentuc...",111,4.667788
1,1.0,"court, supreme, sound, death, party, ginsburg,...",94,3.952902
2,2.0,"united, gas, states, friday, report, cut, stak...",84,3.53238
0,0.0,">, <, talk, capacity, antibody, september, mon...",75,3.153911


In [19]:
output_df = pandas.concat([df[['article_id', 'publishedAt']], news_text, df_dominant_topic], axis=1)

output_df.columns = ['article_id', 'publishedAt', 'text', 'drop', 'dominant_topic', 'topic_perc_contrib', 'keywords', 'drop']

output_df.drop(output_df['drop'], axis=1, inplace=True)

output_df['dominant_topic'] = output_df['dominant_topic'].astype(int)

output_df

Unnamed: 0,article_id,publishedAt,text,dominant_topic,topic_perc_contrib,keywords
0,7e68d7c5-b7e8-46bc-a25e-ccd5b39c921e,2020-09-26 10:06:00+00:00,2020/09/26 10:00 gmt. the latest five minute n...,11,0.9344,"world, police, late, service, bbc, protest, ne..."
1,8313acac-487a-4437-86f0-2882fedaf907,2020-09-26 11:06:00+00:00,2020/09/26 11:00 gmt. the latest five minute n...,11,0.9344,"world, police, late, service, bbc, protest, ne..."
2,c65d1095-ebc1-41a4-871e-e8575c2c8859,2020-09-26 10:44:53+00:00,fleetwood town v afc wimbledon. live coverage ...,13,0.8615,"game, season, new, saturday, touchdown, nfl, v..."
3,0923ad9f-e2de-43d3-a864-89fcb4dd60f8,2020-09-26 10:43:27+00:00,queens park rangers v middlesbrough. live cove...,13,0.7713,"game, season, new, saturday, touchdown, nfl, v..."
4,246ddb7a-27ec-40a4-afdc-bd45bdd6ea3c,2020-09-26 10:43:25+00:00,millwall v brentford. live coverage of saturda...,13,0.5380,"game, season, new, saturday, touchdown, nfl, v..."
...,...,...,...,...,...,...
2373,7a27e564-540e-4464-a5df-421d3d3da6c4,2020-10-03 21:02:33+00:00,no. 3 florida keeps rolling with defeat of sou...,8,0.7854,"trump, biden, election, president, joe, state,..."
2374,9cca6cab-f844-484c-9372-4c9274a7e22e,2020-10-03 20:36:42+00:00,jennifer aniston loves these sweaty betty legg...,19,0.4395,"live, ufc, result, airline, las, vegas, 9, wei..."
2375,af4b3347-d937-40bc-9d10-02c73ed15dc2,2020-10-03 20:08:42+00:00,colliding crises shake already chaotic campaig...,1,0.3522,"court, supreme, sound, death, party, ginsburg,..."
2376,82b7aeda-d162-44a2-9444-2f27df8b181b,2020-10-03 20:43:01+00:00,"was trump ever on oxygen? health, security exp...",16,0.5241,"trump, president, house, covid-19, donald, whi..."


In [20]:
# configure BigQuery job
job_config = bigquery.LoadJobConfig(
    # Specify a (partial) schema. All columns are always written to the
    # table. The schema is used to assist in data type definitions.
    schema=[
        # Specify the type of columns whose type cannot be auto-detected. For
        # example the "title" column uses pandas dtype "object", so its
        # data type is ambiguous.
        bigquery.SchemaField("article_id", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("publishedAt", bigquery.enums.SqlTypeNames.TIMESTAMP),
        bigquery.SchemaField("text", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("dominant_topic", bigquery.enums.SqlTypeNames.INT64),
        bigquery.SchemaField("topic_perc_contrib", bigquery.enums.SqlTypeNames.FLOAT64),
        bigquery.SchemaField("keywords", bigquery.enums.SqlTypeNames.STRING),
    ],
    # Optionally, set the write disposition. BigQuery appends loaded rows
    # to an existing table by default, but with WRITE_TRUNCATE write
    # disposition it replaces the table with the loaded data.
    write_disposition="WRITE_TRUNCATE",
)

In [21]:
table_id = "news-site-280319.topics.article_topics"
job = client.load_table_from_dataframe(
    output_df,
    table_id, 
    job_config=job_config
)  # Make an API request.
job.result()  # Wait for the job to complete.

&lt;google.cloud.bigquery.job.LoadJob at 0x1328abd60&gt;

In [22]:
# check job status
job.result().state

&#39;DONE&#39;