# News Topic Modeling

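This notebook derives topics from news article text: it loads the last seven days of articles from BigQuery, cleans and lemmatises the text with spaCy, fits a gensim LDA topic model, and writes each article's dominant topic back to BigQuery.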


In [1]:
# Activate the `news-site` gcloud CLI configuration for this session.
!gcloud config configurations activate news-site

Activated [news-site].


In [2]:
from google.cloud import bigquery

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_md

import pandas
import re
from tqdm import tqdm_notebook as tqdm
import numpy
from pprint import pprint

In [3]:
# BigQuery client shared by the read below and the load job at the end of the notebook.
client = bigquery.Client()

# Pull the last 7 days of articles, joining title, description and content
# into a single `text` column. Rows missing any of the three parts are
# excluded so that CONCAT never sees a NULL.
sql = """
    SELECT 
        article_id,
        publishedAt,
        CONCAT(title, '. ', description, '. ', content) AS text
    FROM `news-site-280319.news.articles`
    WHERE
      title IS NOT NULL
      AND description IS NOT NULL
      AND content IS NOT NULL
      AND DATE(publishedAt) >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
"""

query_job = client.query(sql)
df = query_job.to_dataframe()

# Row count, dtypes and null counts of the raw result.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   article_id   889 non-null    object             
 1   publishedAt  889 non-null    datetime64[ns, UTC]
 2   text         889 non-null    object             
dtypes: datetime64[ns, UTC](1), object(2)
memory usage: 21.0+ KB


In [4]:
# Preview the first rows of the raw query result.
df.head()

Unnamed: 0,article_id,publishedAt,text
0,a23e6a41-e4b7-4454-ab61-5f5f976a6676,2020-09-28 11:48:01+00:00,Turkey ‘indicts six more Saudis’ over Jamal Kh...
1,9eca8dbd-da7f-4b78-b341-ff068addf30c,2020-09-30 11:50:05+00:00,Earthquake swarm in California as 4 tremors sh...
2,2bd0895f-c264-444b-ac7b-be77d3b78187,2020-09-28 11:06:00+00:00,2020/09/28 11:00 GMT. The latest five minute n...
3,1bcd7cb9-c8b5-40e9-a7c7-fb96a6659710,2020-09-28 11:14:27+00:00,French Open 2020: Petra Kvitova beats Oceane D...
4,cd7c854a-fe31-46de-8874-0f71425a8b9a,2020-09-28 11:28:37+00:00,Ben Curran: Northamptonshire opener signs new ...


In [5]:
# Work on the combined title/description/content column only;
# `df` keeps the article ids and timestamps for the final join.
news_text = df['text']

news_text.head()

0    Turkey ‘indicts six more Saudis’ over Jamal Kh...
1    Earthquake swarm in California as 4 tremors sh...
2    2020/09/28 11:00 GMT. The latest five minute n...
3    French Open 2020: Petra Kvitova beats Oceane D...
4    Ben Curran: Northamptonshire opener signs new ...
Name: text, dtype: object

In [6]:
# Replace every line break with a single space. The alternation lists CRLF
# first so a Windows line ending becomes one space, while lone "\r" or "\n"
# characters (which the previous '\r\n' pattern left untouched) are handled too.
news_text = news_text.str.replace(r'\r\n|\r|\n', ' ', regex=True)

In [7]:
# Convert all article text to lowercase before tokenisation.
news_text = news_text.str.lower()

In [8]:
# Spot-check one cleaned article (row label 3) to see what noise remains.
news_text[3]

"french open 2020: petra kvitova beats oceane dodin to move into second round. seventh seed petra kvitova moves into the second round of the french open by beating france's oceane dodin on day two in paris.. a maximum of 1,000 fans were able to watch petra kvitova beat oceane dodin on court philippe chatrier <table><tr><th>french open 2020</th></tr> <tr><td>dates: 27 september - 11 october. venue: rola… [+815 chars]"

In [9]:
# Load the `en_core_web_md` spaCy pipeline; the custom lemmatising and
# stop-word components defined below are attached to this object.
nlp = spacy.load("en_core_web_md")

In [10]:
def lemmatizer(doc):
    """Pipeline component: replace each token by its lemma.

    Tokens whose lemma is the placeholder '-PRON-' are dropped. The remaining
    lemmas are joined with single spaces and re-tokenised with
    `nlp.make_doc`, so the return value is a fresh Doc built from the
    lemmatised text.
    """
    lemmas = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            continue
        lemmas.append(token.lemma_)
    return nlp.make_doc(u' '.join(lemmas))

In [11]:
# Extend the pipeline's stop-word set with corpus-specific noise tokens
# ("char" presumably comes from the "[+N chars]" truncation marker seen in
# the article content; the whitespace runs and "|" are formatting residue).
custom_stop_list = ["char", "   ", "  ", "|", "reuters"]
nlp.Defaults.stop_words.update(custom_stop_list)

# Flag every stop word on its vocabulary entry so that `token.is_stop` is
# True for it. Iterating the union covers the custom entries even if
# `nlp.Defaults.stop_words` is not the same set object as the imported
# STOP_WORDS.
for word in set(STOP_WORDS) | set(nlp.Defaults.stop_words):
    nlp.vocab[word].is_stop = True

def remove_stopwords(doc):
    """Pipeline component: drop stop words and punctuation.

    Returns a plain list of token strings (not a Doc), which is the input
    format the gensim dictionary and bag-of-words steps consume.
    """
    kept = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        kept.append(token.text)
    return kept

In [12]:
# The add_pipe function appends our functions to the default pipeline.
# `lemmatizer` is inserted directly after the 'ner' component, and
# `remove_stopwords` is registered last; it returns a list of token strings
# rather than a Doc, so nothing else can run after it.
# NOTE(review): re-running this cell against the same `nlp` object presumably
# fails because both component names are already registered — confirm, and
# restart the kernel before re-running.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [13]:
# Run every article through the full pipeline. Because the last component is
# `remove_stopwords`, each entry is expected to be a list of token strings.
doc_list = [nlp(article) for article in news_text]

In [14]:
# Vocabulary: maps each distinct token to an integer id.
words = corpora.Dictionary(doc_list)

# Bag-of-words representation of every document, in the same order as `doc_list`.
corpus = list(map(words.doc2bow, doc_list))

In [15]:
# LDA settings gathered in one place so they are easy to tune.
# `random_state` is fixed to keep the fit repeatable. `per_word_topics=True`
# is relied on by `format_topics_sentences`, which reads element 0 of each
# per-document result.
lda_params = dict(
    corpus=corpus,
    id2word=words,
    num_topics=20,
    random_state=0,
    update_every=1,
    passes=100,
    alpha='auto',
    per_word_topics=True,
)

lda_model = gensim.models.ldamodel.LdaModel(**lda_params)

In [16]:
# Show the ten highest-weighted words of each topic.
pprint(lda_model.print_topics(num_words=10))

[(0,
  '0.014*"6" + 0.010*"french" + 0.009*"new" + 0.009*"covid-19" + 0.008*"7" + '
  '0.008*"work" + 0.007*"2" + 0.006*"seed" + 0.006*"end" + 0.006*"season"'),
 (1,
  '0.010*"game" + 0.010*"google" + 0.007*"fight" + 0.007*"2020" + '
  '0.007*"bundesliga" + 0.007*"eu" + 0.006*"score" + 0.006*"fitbit" + '
  '0.006*"4" + 0.006*"braut"'),
 (2,
  '0.018*"trump" + 0.016*"ap" + 0.013*"madrid" + 0.011*"white" + '
  '0.010*"thursday" + 0.009*"house" + 0.009*"friday" + 0.008*"covid-19" + '
  '0.008*"president" + 0.007*"bethesda"'),
 (3,
  '0.011*"european" + 0.009*"new" + 0.008*"police" + 0.008*"sto

In [17]:
def format_topics_sentences(ldamodel, corpus, texts):
    """Build a per-document table of dominant topic, its weight and its keywords.

    Parameters
    ----------
    ldamodel
        Topic model. ``ldamodel[corpus]`` must yield one result per document
        whose element 0 is a sequence of ``(topic_id, probability)`` pairs,
        and ``ldamodel.show_topic(topic_id)`` must return ``(word, weight)``
        pairs.
    corpus
        Bag-of-words corpus passed to ``ldamodel[...]``.
    texts
        The original documents, in the same order as ``corpus``. Any
        list-like accepted by ``pandas.Series``; its index is ignored and
        rows are matched to ``corpus`` by position.

    Returns
    -------
    pandas.DataFrame
        Columns ``dominant_topic`` (int topic id), ``perc_contribution``
        (probability rounded to 4 decimals), ``topic_keywords``
        (comma-separated words of that topic), followed by the ``texts``
        column. A document with no topic assignment gets missing values in
        the three topic columns, so rows stay aligned with ``texts``.
    """
    columns = ['dominant_topic', 'perc_contribution', 'topic_keywords']
    keywords_by_topic = {}  # topic id -> keyword string; avoids repeated show_topic calls
    records = []

    for row in ldamodel[corpus]:
        topic_dist = row[0]
        if not topic_dist:
            # Keep a placeholder so later rows do not shift against `texts`.
            records.append((None, None, None))
            continue

        # First pair with the highest probability == head of a stable
        # descending sort, which is what the row-by-row version selected.
        topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append(
            (topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num])
        )

    # Build the frame once instead of appending row by row (quadratic, and
    # DataFrame.append no longer exists in pandas 2.x).
    sent_topics_df = pandas.DataFrame(records, columns=columns)

    # Add original text to the end of the output, aligned by position.
    contents = pandas.Series(texts).reset_index(drop=True)
    return pandas.concat([sent_topics_df, contents], axis=1)


# One row per article: dominant topic, its weight, topic keywords, and the text.
df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, news_text)

# Expose the row position as `document_no` and give the columns display names.
df_dominant_topic = df_topic_sents_keywords.reset_index().set_axis(
    ['document_no', 'dominant_topic', 'topic_perc_contrib', 'keywords', 'text'],
    axis=1,
)

# Show
df_dominant_topic.head(10)

Unnamed: 0,document_no,dominant_topic,topic_perc_contrib,keywords,text
0,0,19.0,0.5884,"space, year, french, open, paris, million, toi...",turkey ‘indicts six more saudis’ over jamal kh...
1,1,1.0,0.9943,"game, google, fight, 2020, bundesliga, eu, sco...",earthquake swarm in california as 4 tremors sh...
2,2,13.0,0.9234,"staff, saturday, minute, bbc, world, news, lat...",2020/09/28 11:00 gmt. the latest five minute n...
3,3,12.0,0.568,"french, open, canadian, reach, kvitova, 3, >, ...",french open 2020: petra kvitova beats oceane d...
4,4,13.0,0.9935,"staff, saturday, minute, bbc, world, news, lat...",ben curran: northamptonshire opener signs new ...
5,5,5.0,0.9942,"company, u.s, uk, hit, pgl, dollar, million, c...",japan's little-known nikkei 500 hits record hi...
6,6,3.0,0.9946,"european, new, police, story, $, open, union, ...",exclusive: eu chair germany proposes rule of l...
7,7,10.0,0.4382,"everton, brighton, 2, league, premier, 4, jame...",coronavirus: rugby players association calls f...
8,8,13.0,0.994,"staff, saturday, minute, bbc, world, news, lat...",saudi arabia to host first ladies european tou...
9,9,9.0,0.5718,"prime, johnson, start, saturday, time, ministe...",the morning after: amazon prime day is happeni...


In [18]:
# Most representative document per topic: for each dominant topic keep the
# single document with the highest perc_contribution.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('dominant_topic')

# Collect the top row of every group and concatenate once, rather than
# growing a frame inside the loop.
sent_topics_sorteddf = pandas.concat(
    [
        grp.sort_values('perc_contribution', ascending=False).head(1)
        for _, grp in sent_topics_outdf_grpd
    ],
    axis=0,
).reset_index(drop=True)

# Format
sent_topics_sorteddf.columns = ['topic_Num', 'perc_Contribution', 'keywords', 'text']

# Show
sent_topics_sorteddf.head(20)

Unnamed: 0,topic_Num,perc_Contribution,keywords,text
0,0.0,0.9955,"6, french, new, covid-19, 7, work, 2, seed, en...",teenager sinner stuns 11th seed goffin on fren...
1,1.0,0.9958,"game, google, fight, 2020, bundesliga, eu, sco...",exclusive: google set to win eu approval for f...
2,2.0,0.9953,"trump, ap, madrid, white, thursday, house, fri...",swiss voters split on $6.5 billion purchase of...
3,3.0,0.9958,"european, new, police, story, $, open, union, ...",mubadala to invest $2 bln in u.s. private equi...
4,4.0,0.9952,"new, coronavirus, senate, trump, republicans, ...",asia's capital raising rush delivers record fe...
5,5.0,0.9954,"company, u.s, uk, hit, pgl, dollar, million, c...",fortune 100 companies commit $3.3 billion to f...
6,6.0,0.995,"debate, presidential, biden, year, candidate, ...",euro zone inflation will remain negative this ...
7,7.0,0.9947,"armenia, karabakh, nagorno, azerbaijan, ufc, w...",pensioners' paradise lost: covid sows fear amo...
8,8.0,0.9951,"new, india, exchange, york, $, year, mexico, c...",lebanon to allow hard-hit students abroad to g...
9,9.0,0.9953,"prime, johnson, start, saturday, time, ministe...","the 145th preakness stakes live stream, post p..."


In [19]:
# How many documents have each topic as their dominant topic.
topic_counts = df_topic_sents_keywords['dominant_topic'].value_counts()

# One row per (topic, keywords) pair with its document count.
topic_num_keywords = (
    df_topic_sents_keywords[['dominant_topic', 'topic_keywords']]
    .groupby(['dominant_topic', 'topic_keywords'])
    .size()
    .reset_index(name='num_documents')
)

# Share of all documents, in percent.
total_documents = topic_num_keywords['num_documents'].sum()
topic_num_keywords['perc_documents'] = (topic_num_keywords['num_documents'] / total_documents) * 100

# Largest topics first.
topic_num_keywords.head(50).sort_values(by='perc_documents', ascending=False)

Unnamed: 0,dominant_topic,topic_keywords,num_documents,perc_documents
15,15.0,"court, u.s, supreme, trump, saturday, senate, ...",77,8.661417
17,17.0,"trump, president, covid-19, donald, test, posi...",75,8.436445
13,13.0,"staff, saturday, minute, bbc, world, news, lat...",63,7.086614
11,11.0,"coronavirus, case, u.s, new, covid-19, pandemi...",54,6.074241
10,10.0,"everton, brighton, 2, league, premier, 4, jame...",53,5.961755
9,9.0,"prime, johnson, start, saturday, time, ministe...",44,4.949381
12,12.0,"french, open, canadian, reach, kvitova, 3, >, ...",44,4.949381
14,14.0,"week, walmart, staff, britain, uk, election, i...",43,4.836895
19,19.0,"space, year, french, open, paris, million, toi...",43,4.836895
3,3.0,"european, new, police, story, $, open, union, ...",42,4.724409


In [20]:
# Sanity check: cleaned text series that will be joined into the output below.
news_text

0      turkey ‘indicts six more saudis’ over jamal kh...
1      earthquake swarm in california as 4 tremors sh...
2      2020/09/28 11:00 gmt. the latest five minute n...
3      french open 2020: petra kvitova beats oceane d...
4      ben curran: northamptonshire opener signs new ...
                             ...                        
884    meatpackers in the americas accelerate automat...
885    focus-meatpackers in the americas accelerate a...
886    ask the captain: how does ice on the wing affe...
887    us jobless claims likely remained high as layo...
888    coronavirus updates: massive airline layoffs c...
Name: text, Length: 889, dtype: object

In [21]:
# Sanity check: per-document topic assignments that will be joined into the output below.
df_dominant_topic

Unnamed: 0,document_no,dominant_topic,topic_perc_contrib,keywords,text
0,0,19.0,0.5884,"space, year, french, open, paris, million, toi...",turkey ‘indicts six more saudis’ over jamal kh...
1,1,1.0,0.9943,"game, google, fight, 2020, bundesliga, eu, sco...",earthquake swarm in california as 4 tremors sh...
2,2,13.0,0.9234,"staff, saturday, minute, bbc, world, news, lat...",2020/09/28 11:00 gmt. the latest five minute n...
3,3,12.0,0.5680,"french, open, canadian, reach, kvitova, 3, >, ...",french open 2020: petra kvitova beats oceane d...
4,4,13.0,0.9935,"staff, saturday, minute, bbc, world, news, lat...",ben curran: northamptonshire opener signs new ...
...,...,...,...,...,...
884,884,18.0,0.8310,"atlanta, year, home, coronavirus, pandemic, ph...",meatpackers in the americas accelerate automat...
885,885,18.0,0.9943,"atlanta, year, home, coronavirus, pandemic, ph...",focus-meatpackers in the americas accelerate a...
886,886,7.0,0.9931,"armenia, karabakh, nagorno, azerbaijan, ufc, w...",ask the captain: how does ice on the wing affe...
887,887,11.0,0.5511,"coronavirus, case, u.s, new, covid-19, pandemi...",us jobless claims likely remained high as layo...


In [22]:
# Assemble the table written to BigQuery: article identity, cleaned text and
# the topic assignment. Only the needed topic columns are selected, so no
# duplicate-named throwaway columns have to be created and dropped afterwards.
output_df = pandas.concat(
    [
        df[['article_id', 'publishedAt']],
        news_text.rename('text'),
        df_dominant_topic[['dominant_topic', 'topic_perc_contrib', 'keywords']],
    ],
    axis=1,
)

# The pieces are joined on their index; a mismatch would silently add rows
# filled with missing values, so fail loudly instead.
assert len(output_df) == len(df), "topic rows are not aligned with the source articles"

# Topic ids are loaded into an INT64 column.
output_df['dominant_topic'] = output_df['dominant_topic'].astype(int)

output_df

Unnamed: 0,article_id,publishedAt,text,dominant_topic,topic_perc_contrib,keywords
0,a23e6a41-e4b7-4454-ab61-5f5f976a6676,2020-09-28 11:48:01+00:00,turkey ‘indicts six more saudis’ over jamal kh...,19,0.5884,"space, year, french, open, paris, million, toi..."
1,9eca8dbd-da7f-4b78-b341-ff068addf30c,2020-09-30 11:50:05+00:00,earthquake swarm in california as 4 tremors sh...,1,0.9943,"game, google, fight, 2020, bundesliga, eu, sco..."
2,2bd0895f-c264-444b-ac7b-be77d3b78187,2020-09-28 11:06:00+00:00,2020/09/28 11:00 gmt. the latest five minute n...,13,0.9234,"staff, saturday, minute, bbc, world, news, lat..."
3,1bcd7cb9-c8b5-40e9-a7c7-fb96a6659710,2020-09-28 11:14:27+00:00,french open 2020: petra kvitova beats oceane d...,12,0.5680,"french, open, canadian, reach, kvitova, 3, >, ..."
4,cd7c854a-fe31-46de-8874-0f71425a8b9a,2020-09-28 11:28:37+00:00,ben curran: northamptonshire opener signs new ...,13,0.9935,"staff, saturday, minute, bbc, world, news, lat..."
...,...,...,...,...,...,...
884,6a050c53-c560-4030-ab36-faaf8e18b836,2020-10-01 11:14:54+00:00,meatpackers in the americas accelerate automat...,18,0.8310,"atlanta, year, home, coronavirus, pandemic, ph..."
885,9c88b86d-2ebe-49be-9dc0-1518038adb44,2020-10-01 11:00:00+00:00,focus-meatpackers in the americas accelerate a...,18,0.9943,"atlanta, year, home, coronavirus, pandemic, ph..."
886,b7b9928f-909b-4edb-9921-dbe4c7e2ce4d,2020-10-01 11:17:55+00:00,ask the captain: how does ice on the wing affe...,7,0.9931,"armenia, karabakh, nagorno, azerbaijan, ufc, w..."
887,87f165b7-4e6e-4cd1-a365-ebd4d5ac7dee,2020-10-01 10:57:30+00:00,us jobless claims likely remained high as layo...,11,0.5511,"coronavirus, case, u.s, new, covid-19, pandemi..."


In [23]:
# output_df.to_parquet('df.parquet.gzip', allow_truncated_timestamps=True)

# bq_df = pandas.read_parquet('df.parquet.gzip')

In [24]:
# Configure the BigQuery load job.
# An explicit schema is given for every output column; pandas "object"
# columns such as article_id and text have no unambiguous BigQuery type,
# so they cannot be auto-detected reliably.
sql_types = bigquery.enums.SqlTypeNames
job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField(column_name, column_type)
        for column_name, column_type in (
            ("article_id", sql_types.STRING),
            ("publishedAt", sql_types.TIMESTAMP),
            ("text", sql_types.STRING),
            ("dominant_topic", sql_types.INT64),
            ("topic_perc_contrib", sql_types.FLOAT64),
            ("keywords", sql_types.STRING),
        )
    ],
    # BigQuery appends to an existing table by default; WRITE_TRUNCATE
    # replaces the table contents with the loaded rows instead.
    write_disposition="WRITE_TRUNCATE",
)

In [25]:
# Destination table for the per-article topic assignments.
table_id = "news-site-280319.topics.article_topics"

# Start the load job (API request) ...
job = client.load_table_from_dataframe(
    dataframe=output_df,
    destination=table_id,
    job_config=job_config,
)

# ... and block until it has finished.
job.result()

<google.cloud.bigquery.job.LoadJob at 0x1324f6370>

In [26]:
# Check the job status; `result()` waits for completion (see the cell above),
# so the state shown here is the final one.
job.result().state

'DONE'