# MSCA 32018 Natural Language Processing and Cognitive Computing
## Final Project - Filtering

Shijia Huang

-----

In [1]:
# Import basic libraries
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Import NLP libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from pprint import pprint
import string
from rake_nltk import Rake
import math
from textblob import TextBlob

import spacy
from spacy import displacy
from spacy.util import minibatch, compounding
spacy.prefer_gpu()
print(spacy.__version__)

import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim as gensimvis
#import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

3.5.2


In [3]:
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

In [4]:
import multiprocessing as mp

num_processors = mp.cpu_count()
print(f'Available CPUs: {num_processors}')

Available CPUs: 12


### Read Cleaned New Articles

In [5]:
%%time

# GCP version
path = "gs://nlp-final-project-data/data/"
df_news = pd.read_parquet(path + 'news_cleaned.parquet', engine='pyarrow')
df_news.shape

CPU times: user 2.41 s, sys: 1.87 s, total: 4.28 s
Wall time: 4.23 s


(200043, 4)

In [6]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200043 entries, 0 to 200042
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             200043 non-null  int64 
 1   date           200043 non-null  object
 2   cleaned title  200043 non-null  object
 3   cleaned text   200043 non-null  object
dtypes: int64(1), object(3)
memory usage: 6.1+ MB


In [7]:
df_news.head()

Unnamed: 0,id,date,cleaned title,cleaned text
0,0,2021-03-18,Artificial intelligence improves parking efficiency in Chinese cities,"Artificial intelligence improves parking efficiency in Chinese cities Photo taken on July 1, 2019, shows a sign for electronic toll collection ETC newly set up at a roadside parking space on Yangzhuang road, Shijingshan district, Beijing. Some urban areas of the city started to use ETC system for roadside parking spaces since July 1, 2019. Peoples Daily OnlineLi Wenming Thanks to the application of an artificial intelligence AIempowered roadside electronic toll collection ETC system, Chinas ..."
1,1,2020-02-27,Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot,"Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot admin Latest posts by admin see all Mansplaining in conferences: How can we get him to forestall February 27, 2020 Coronavirus Could Explode in the U.S. Overnight Like it Did in Italy February 27, 2020 Levi Strauss marks the next phase in corporate paid leave policies February 27, 2020 Scientists who designed an artificially clever robotic that helped youngsters with autism spice up their ..."
2,2,2021-03-26,"Forget ML, AI and Industry 4.0 – obsolescence should be your focus","Forget ML, AI and Industry 4.0 obsolescence should be your focus The world entered a new era of accelerated transformation in the last eighteen months that will continue to evolve and press forward for years to come. Most businesses are playing catchup trying to make sense of a new timeline where the ten years that had been set aside for careful planning and implementation of what was coming up next no longer exists. The next is happening now and, regardless of your industry or seniority, t..."
3,3,2021-03-10,Strategy Analytics: 71% of Smartphones Sold Globally in 2021 will be AI Powered,"Strategy Analytics: 71 of Smartphones Sold Globally in 2021 will be AI Powered BOSTONBUSINESS WIREStrategy Analytics in a newly published report, Smartphones: Global Artificial Intelligence Technologies Forecast to 2025, finds that ondevice Artificial Intelligence AI is being rapidly implemented by smartphone vendors. AI is used in various functions inside smartphones such as intelligent power optimization, imaging, virtual assistants, and to enhance device performance. The report highlights..."
4,4,2020-10-20,Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagnostic Support Application,"Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagnostic Support Application TOKYO, Oct 20, 2020 ACN Newswire Olympus Corporation took part in a groundbreaking project as a business promoter, in cooperation with the Ministry of Internal Affairs and Communications MIC, entitled, Survey Study for International Expansion of AI Diagnosis Support System Using UltraHigh Magnifying Endoscopes in India. The project aims to develop advanced endoscopy di..."


In [8]:
### SAMPLE DATA
#df_news = df_news.sample(frac=0.01, random_state=42)
df_news.shape

## Text Normalization

In [9]:
# function to remove special characters from a sentence
def remove_spc_char(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9 @ . , : - _]', '', text)
    return cleaned_text

In [10]:
# function to tokenize text into words and remove punctuation
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [11]:
# function to remove stopwords from list of tokens
stop_words = stopwords.words('english')

def remove_stopwords(texts):
    cleaned_texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    return cleaned_texts


In [12]:
# function to make ngrams from original tokens
def make_bigrams(texts_wstops):
    bigram = gensim.models.Phrases(texts_wstops, min_count=1, threshold=3)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    texts_nostops = remove_stopwords(texts_wstops)
    return [bigram_mod[doc] for doc in texts_nostops]

def make_trigrams(texts_wstops):
    bigram = gensim.models.Phrases(texts_wstops, min_count=1, threshold=3)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram = gensim.models.Phrases(bigram[texts_wstops], threshold=1)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    texts_nostops = remove_stopwords(texts_wstops)
    return [trigram_mod[bigram_mod[doc]] for doc in texts_nostops]

In [13]:
# function to lemmatize words
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

### Normalize News Titles

In [14]:
%%time

# remove special characters
title_cleaned = np.array(df_news['cleaned title'].apply(remove_spc_char))

# tokenize the news title
title_tokens = list(sent_to_words(title_cleaned))

# remove stopwords
title_token_nostops = remove_stopwords(title_tokens)

# make bigrams and trigrams
title_bigrams = make_bigrams(title_tokens)
title_trigrams = make_trigrams(title_tokens)

# combine tokens and ngrams
df_news['title_tokens'] = list(zip(title_token_nostops, title_bigrams, title_trigrams))
df_news['title_tokens'] = df_news['title_tokens'].apply(lambda x: list(x[0] + x[1] + x[2]))
title_tokens_all = df_news['title_tokens'].tolist()

CPU times: user 50 s, sys: 518 ms, total: 50.5 s
Wall time: 50.7 s


In [15]:
%%time

# lemmatize the tokens keeping only noun, adj, vb, adv
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
title_lemmatized = lemmatization(title_tokens_all, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN'])

df_news['title_lemmatized'] = title_lemmatized
df_news[['cleaned title', 'title_tokens', 'title_lemmatized']].head()

CPU times: user 9min 21s, sys: 3.81 s, total: 9min 24s
Wall time: 9min 27s


Unnamed: 0,cleaned title,title_tokens,title_lemmatized
0,Artificial intelligence improves parking efficiency in Chinese cities,"[artificial, intelligence, improves, parking, efficiency, chinese, cities, artificial_intelligence, improves, parking, efficiency, chinese, cities, artificial_intelligence, improves, parking, efficiency, chinese, cities]","[artificial, intelligence, improve, parking, efficiency, chinese, city, artificial_intelligence, improve, parking, efficiency, chinese, city, artificial_intelligence, improve, parking, efficiency, chinese, city]"
1,Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot,"[children, autism, saw, learning, social, skills, boosted, playing, ai, robot, children, autism, saw, learning, social, skills, boosted, playing, ai, robot, children, autism, saw, learning, social, skills, boosted, playing, ai_robot]","[child, autism, see, learn, social, skill, boost, play, robot, child, autism, see, learn, social, skill, boost, play, robot, child, autism, see, learn, social, skill, boost, play, ai_robot]"
2,"Forget ML, AI and Industry 4.0 – obsolescence should be your focus","[forget, ml, ai, industry, obsolescence, focus, forget, ml, ai, industry, obsolescence, focus, forget, ml, ai, industry, obsolescence, focus]","[forget, ml, ai, industry, obsolescence, focus, forget, ml, ai, industry, obsolescence, focus, forget, ml, ai, industry, obsolescence, focus]"
3,Strategy Analytics: 71% of Smartphones Sold Globally in 2021 will be AI Powered,"[strategy, analytics, smartphones, sold, globally, ai, powered, strategy, analytics, smartphones_sold, globally, ai, powered, strategy, analytics, smartphones_sold, globally, ai_powered]","[strategy, analytic, smartphone, sell, globally, ai, powered, strategy, analytic, smartphones_sold, globally, ai, powered, strategy, analytic, smartphones_sold, globally, ai_powere]"
4,Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagnostic Support Application,"[olympus, support, endoscopic, ai, diagnosis, education, doctors, india, launch, ai, diagnostic, support, application, olympus, support_endoscopic, ai, diagnosis, education, doctors, india, launch, ai, diagnostic, support, application, olympus, support_endoscopic, ai_diagnosis, education, doctors, india, launch, ai, diagnostic, support, application]","[olympu, support, endoscopic, ai, diagnosis, education, doctor, india, launch, ai, diagnostic, support, application, ai, diagnosis, education, doctor, india, launch, ai, diagnostic, support, application, education, doctor, india, launch, ai, diagnostic, support, application]"


### Normalize News Text

In [16]:
%%time

# tokenize the news text
text_cleaned = np.array(df_news['cleaned text'])
text_tokens = list(sent_to_words(text_cleaned))

# remove stopwords
text_token_nostops = remove_stopwords(text_tokens)

# make bigrams and trigrams
text_bigrams = make_bigrams(text_tokens)
text_trigrams = make_trigrams(text_tokens)

# combine tokens and ngrams
df_news['text_tokens'] = list(zip(text_token_nostops, text_bigrams, text_trigrams))
df_news['text_tokens'] = df_news['text_tokens'].apply(lambda x: list(x[0] + x[1] + x[2]))
text_tokens_all = df_news['text_tokens'].tolist()

CPU times: user 55min 52s, sys: 14min 31s, total: 1h 10min 23s
Wall time: 1h 25min 17s


In [18]:
%%time

# lemmatize the tokens keeping only noun, adj, vb, adv
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
text_lemmatized = lemmatization(text_tokens_all, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

df_news['text_lemmatized'] = text_lemmatized
df_news[['cleaned text', 'text_tokens', 'text_lemmatized']].head()

CPU times: user 4h 5min 28s, sys: 1h 54min 45s, total: 6h 14s
Wall time: 14h 34min 12s


Unnamed: 0,cleaned text,text_tokens,text_lemmatized
0,"Artificial intelligence improves parking efficiency in Chinese cities Photo taken on July 1, 2019, shows a sign for electronic toll collection ETC newly set up at a roadside parking space on Yangzhuang road, Shijingshan district, Beijing. Some urban areas of the city started to use ETC system for roadside parking spaces since July 1, 2019. Peoples Daily OnlineLi Wenming Thanks to the application of an artificial intelligence AIempowered roadside electronic toll collection ETC system, Chinas ...","[artificial, intelligence, improves, parking, efficiency, chinese, cities, photo, taken, july, shows, sign, electronic, toll, collection, etc, newly, set, roadside, parking, space, yangzhuang, road, shijingshan, district, beijing, urban, areas, city, started, use, etc, system, roadside, parking, spaces, since, july, peoples, daily, onlineli, wenming, thanks, application, artificial, intelligence, aiempowered, roadside, electronic, toll, collection, etc, system, chinas, capital, city, beijing...","[artificial, intelligence, improve, parking, efficiency, chinese, city, photo, take, show, sign, electronic, toll, collection, newly, set, roadside, parking, space, area, start, use, system, roadside, parking, space, people, daily, onlineli, wenme, thank, application, artificial, intelligence, aiempowere, roadside, electronic, toll, collection, system, china, see, significant, improvement, efficiency, parking, fee, collection, turnover, roadside, parking, spot, order, roadside, parking, traf..."
1,"Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot admin Latest posts by admin see all Mansplaining in conferences: How can we get him to forestall February 27, 2020 Coronavirus Could Explode in the U.S. Overnight Like it Did in Italy February 27, 2020 Levi Strauss marks the next phase in corporate paid leave policies February 27, 2020 Scientists who designed an artificially clever robotic that helped youngsters with autism spice up their ...","[children, autism, saw, learning, social, skills, boosted, playing, ai, robot, admin, latest, posts, admin, see, mansplaining, conferences, get, forestall, february, coronavirus, could, explode, overnight, like, italy, february, levi, strauss, marks, next, phase, corporate, paid, leave, policies, february, scientists, designed, artificially, clever, robotic, helped, youngsters, autism, spice, studying, social, talents, hope, era, may, future, help, others, developmental, dysfunction, learn, ...","[child, autism, see, learn, social, skill, boost, play, robot, late, post, admin, see, mansplaining, conference, get, explode, overnight, mark, next, phase, corporate, pay, leave, policy, scientist, design, artificially, clever, robotic, help, youngster, autism, spice, study, social, talent, era, future, help, other, developmental, dysfunction, learn, notice, youngster, gentle, average, autism, take, domestic, s, refer, socially, assistive, robotic, name, kiwi, month, accord, commentary, way..."
2,"Forget ML, AI and Industry 4.0 obsolescence should be your focus The world entered a new era of accelerated transformation in the last eighteen months that will continue to evolve and press forward for years to come. Most businesses are playing catchup trying to make sense of a new timeline where the ten years that had been set aside for careful planning and implementation of what was coming up next no longer exists. The next is happening now and, regardless of your industry or seniority, t...","[forget, ml, ai, industry, obsolescence, focus, world, entered, new, era, accelerated, transformation, last, eighteen, months, continue, evolve, press, forward, years, come, businesses, playing, catchup, trying, make, sense, new, timeline, ten, years, set, aside, careful, planning, implementation, coming, next, longer, exists, next, happening, regardless, industry, seniority, status, quo, shifted, better, face, back, invited, attend, pompous, meeting, london, brazilian, embassy, along, selec...","[forget, ai, industry, obsolescence, focus, world, enter, new, era, accelerate, transformation, last, month, continue, evolve, press, forward, year, come, business, play, catchup, try, make, sense, new, timeline, year, set, aside, careful, planning, implementation, come, next, long, exist, next, happen, regardless, industry, seniority, status, quo, shift, well, face, back, invite, attend, pompous, meeting, brazilian, embassy, select, lead, name, oil, energy, industry, get, update, go, happen..."
3,"Strategy Analytics: 71 of Smartphones Sold Globally in 2021 will be AI Powered BOSTONBUSINESS WIREStrategy Analytics in a newly published report, Smartphones: Global Artificial Intelligence Technologies Forecast to 2025, finds that ondevice Artificial Intelligence AI is being rapidly implemented by smartphone vendors. AI is used in various functions inside smartphones such as intelligent power optimization, imaging, virtual assistants, and to enhance device performance. The report highlights...","[strategy, analytics, smartphones, sold, globally, ai, powered, bostonbusiness, wirestrategy, analytics, newly, published, report, smartphones, global, artificial, intelligence, technologies, forecast, finds, ondevice, artificial, intelligence, ai, rapidly, implemented, smartphone, vendors, ai, used, various, functions, inside, smartphones, intelligent, power, optimization, imaging, virtual, assistants, enhance, device, performance, report, highlights, fact, ai, become, important, technology...","[strategy, analytic, smartphone, sell, globally, ai, power, bostonbusiness, wirestrategy, analytic, newly, publish, report, smartphone, global, artificial, intelligence, technology, forecast, find, ondevice, artificial, intelligence, ai, rapidly, implement, vendor, ai, use, various, function, smartphone, intelligent, power, optimization, image, virtual, assistant, enhance, device, performance, report, highlight, fact, ai, become, important, technology, modern, smartphone, push, add, ondevice..."
4,"Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagnostic Support Application TOKYO, Oct 20, 2020 ACN Newswire Olympus Corporation took part in a groundbreaking project as a business promoter, in cooperation with the Ministry of Internal Affairs and Communications MIC, entitled, Survey Study for International Expansion of AI Diagnosis Support System Using UltraHigh Magnifying Endoscopes in India. The project aims to develop advanced endoscopy di...","[olympus, support, endoscopic, ai, diagnosis, education, doctors, india, launch, ai, diagnostic, support, application, tokyo, oct, acn, newswire, olympus, corporation, took, part, groundbreaking, project, business, promoter, cooperation, ministry, internal, affairs, communications, mic, entitled, survey, study, international, expansion, ai, diagnosis, support, system, using, ultrahigh, magnifying, endoscopes, india, project, aims, develop, advanced, endoscopy, diagnostics, india, relatively,...","[olympu, support, ai, diagnosis, education, doctor, launch, ai, diagnostic, support, application, corporation, take, part, groundbreaking, project, business, promoter, cooperation, affair, communication, entitle, survey, study, international, expansion, ai, diagnosis, support, system, use, ultrahigh, magnifying, endoscope, project, aim, develop, advanced, endoscopy, diagnostic, relatively, endoscopist, collaboration, cybernet, establish, ai, diagnostic, support, system, major, medical, insti..."


In [19]:
%%time

# save lemmatized news text as parquet file
# path = "gs://nlp-final-project-data/data/"
# df_news.to_parquet(path + 'news_lemmatized.parquet', engine='pyarrow')

## News Article Filtering with TF-IDF Keywords

In [None]:
%%time

# load lemmatized news text from parquet file on GCP
path = "gs://nlp-final-project-data/data/"
df_news = pd.read_parquet(path + 'news_lemmatized.parquet', engine='pyarrow')
df_news.shape

In [None]:
%%time
# read from local file
# path = '/Users/silvia/Desktop/Spring 2023/32018 NLP/Final Project Data/data/news_lemmatized.parquet'
# df_news = pd.read_parquet(path, engine='pyarrow')
# df_news.shape

In [25]:
# drop article with no title tokens after lemmatization
df_news = df_news[df_news['title_lemmatized'].map(len) > 3]

In [21]:
# extract top n keywords from lemmatized text using TF-IDF
def get_keywords_tfidf(text, n):

    # create tf-idf vectorizer object
    vectorizer = TfidfVectorizer()

    # tokenize and build vocab
    tfidf = vectorizer.fit_transform(text)

    # sort weights in descending order and get top n
    sorted_weights = np.argsort(np.asarray(tfidf.sum(axis=0))).flatten()[::-1][:n]

    # get feature names
    feature_names = np.array(vectorizer.get_feature_names())

    # get top n keywords
    keywords = feature_names[sorted_weights]

    return keywords

### Title Keywords

In [26]:
%%time

# extact keywords from news title
df_news['title_keywords'] = df_news['title_lemmatized'].apply(lambda x: get_keywords_tfidf(x, n=10))
df_news[['cleaned title', 'title_lemmatized', 'title_keywords']].head()

CPU times: user 3min 27s, sys: 1.71 s, total: 3min 29s
Wall time: 3min 30s


Unnamed: 0,cleaned title,title_lemmatized,title_keywords
0,Artificial intelligence improves parking efficiency in Chinese cities,"[artificial, intelligence, improve, parking, efficiency, chinese, city, artificial_intelligence, improve, parking, efficiency, chinese, city, artificial_intelligence, improve, parking, efficiency, chinese, city]","[parking, improve, efficiency, city, chinese, artificial_intelligence, intelligence, artificial]"
1,Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot,"[child, autism, see, learn, social, skill, boost, play, robot, child, autism, see, learn, social, skill, boost, play, robot, child, autism, see, learn, social, skill, boost, play, ai_robot]","[social, skill, see, play, learn, child, boost, autism, robot, ai_robot]"
2,"Forget ML, AI and Industry 4.0 – obsolescence should be your focus","[forget, ml, ai, industry, obsolescence, focus, forget, ml, ai, industry, obsolescence, focus, forget, ml, ai, industry, obsolescence, focus]","[obsolescence, ml, industry, forget, focus, ai]"
3,Strategy Analytics: 71% of Smartphones Sold Globally in 2021 will be AI Powered,"[strategy, analytic, smartphone, sell, globally, ai, powered, strategy, analytic, smartphones_sold, globally, ai, powered, strategy, analytic, smartphones_sold, globally, ai_powere]","[strategy, globally, analytic, smartphones_sold, powered, ai, smartphone, sell, ai_powere]"
4,Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagnostic Support Application,"[olympu, support, endoscopic, ai, diagnosis, education, doctor, india, launch, ai, diagnostic, support, application, ai, diagnosis, education, doctor, india, launch, ai, diagnostic, support, application, education, doctor, india, launch, ai, diagnostic, support, application]","[ai, support, launch, india, education, doctor, diagnostic, application, diagnosis, olympu]"


### Text Keywords

In [27]:
%%time

# extract keywords from news text
df_news['text_keywords'] = df_news['text_lemmatized'].apply(lambda x: get_keywords_tfidf(x, n=10))
df_news[['cleaned text', 'text_lemmatized', 'text_keywords']].head()

CPU times: user 16min 34s, sys: 1min 7s, total: 17min 42s
Wall time: 25min 44s


Unnamed: 0,cleaned text,text_lemmatized,text_keywords
0,"Artificial intelligence improves parking efficiency in Chinese cities Photo taken on July 1, 2019, shows a sign for electronic toll collection ETC newly set up at a roadside parking space on Yangzhuang road, Shijingshan district, Beijing. Some urban areas of the city started to use ETC system for roadside parking spaces since July 1, 2019. Peoples Daily OnlineLi Wenming Thanks to the application of an artificial intelligence AIempowered roadside electronic toll collection ETC system, Chinas ...","[artificial, intelligence, improve, parking, efficiency, chinese, city, photo, take, show, sign, electronic, toll, collection, newly, set, roadside, parking, space, area, start, use, system, roadside, parking, space, people, daily, onlineli, wenme, thank, application, artificial, intelligence, aiempowere, roadside, electronic, toll, collection, system, china, see, significant, improvement, efficiency, parking, fee, collection, turnover, roadside, parking, spot, order, roadside, parking, traf...","[system, parking, roadside, city, vehicle, camera, collection, aipark, roadside_parke, eye]"
1,"Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot admin Latest posts by admin see all Mansplaining in conferences: How can we get him to forestall February 27, 2020 Coronavirus Could Explode in the U.S. Overnight Like it Did in Italy February 27, 2020 Levi Strauss marks the next phase in corporate paid leave policies February 27, 2020 Scientists who designed an artificially clever robotic that helped youngsters with autism spice up their ...","[child, autism, see, learn, social, skill, boost, play, robot, late, post, admin, see, mansplaining, conference, get, explode, overnight, mark, next, phase, corporate, pay, leave, policy, scientist, design, artificially, clever, robotic, help, youngster, autism, spice, study, social, talent, era, future, help, other, developmental, dysfunction, learn, notice, youngster, gentle, average, autism, take, domestic, s, refer, socially, assistive, robotic, name, kiwi, month, accord, commentary, way...","[robotic, youngster, kid, child, kiwi, market, autism, learn, crew, talent]"
2,"Forget ML, AI and Industry 4.0 obsolescence should be your focus The world entered a new era of accelerated transformation in the last eighteen months that will continue to evolve and press forward for years to come. Most businesses are playing catchup trying to make sense of a new timeline where the ten years that had been set aside for careful planning and implementation of what was coming up next no longer exists. The next is happening now and, regardless of your industry or seniority, t...","[forget, ai, industry, obsolescence, focus, world, enter, new, era, accelerate, transformation, last, month, continue, evolve, press, forward, year, come, business, play, catchup, try, make, sense, new, timeline, year, set, aside, careful, planning, implementation, come, next, long, exist, next, happen, regardless, industry, seniority, status, quo, shift, well, face, back, invite, attend, pompous, meeting, brazilian, embassy, select, lead, name, oil, energy, industry, get, update, go, happen...","[electronic, come, card, industry, repair, new, system, require, test, business]"
3,"Strategy Analytics: 71 of Smartphones Sold Globally in 2021 will be AI Powered BOSTONBUSINESS WIREStrategy Analytics in a newly published report, Smartphones: Global Artificial Intelligence Technologies Forecast to 2025, finds that ondevice Artificial Intelligence AI is being rapidly implemented by smartphone vendors. AI is used in various functions inside smartphones such as intelligent power optimization, imaging, virtual assistants, and to enhance device performance. The report highlights...","[strategy, analytic, smartphone, sell, globally, ai, power, bostonbusiness, wirestrategy, analytic, newly, publish, report, smartphone, global, artificial, intelligence, technology, forecast, find, ondevice, artificial, intelligence, ai, rapidly, implement, vendor, ai, use, various, function, smartphone, intelligent, power, optimization, image, virtual, assistant, enhance, device, performance, report, highlight, fact, ai, become, important, technology, modern, smartphone, push, add, ondevice...","[ai, smartphone, strategy, analytic, ondevice, power, technology, report, well, become]"
4,"Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagnostic Support Application TOKYO, Oct 20, 2020 ACN Newswire Olympus Corporation took part in a groundbreaking project as a business promoter, in cooperation with the Ministry of Internal Affairs and Communications MIC, entitled, Survey Study for International Expansion of AI Diagnosis Support System Using UltraHigh Magnifying Endoscopes in India. The project aims to develop advanced endoscopy di...","[olympu, support, ai, diagnosis, education, doctor, launch, ai, diagnostic, support, application, corporation, take, part, groundbreaking, project, business, promoter, cooperation, affair, communication, entitle, survey, study, international, expansion, ai, diagnosis, support, system, use, ultrahigh, magnifying, endoscope, project, aim, develop, advanced, endoscopy, diagnostic, relatively, endoscopist, collaboration, cybernet, establish, ai, diagnostic, support, system, major, medical, insti...","[ai, support, diagnostic, train, system, endoscope, doctor, project, use, diagnosis]"


In [28]:
# Combine all keywords into one list across all news titles
title_keywords_all = []
for keywords in df_news['title_keywords']:
    title_keywords_all.extend(keywords)

title_keywords_all = pd.Series(title_keywords_all)
title_keywords_all.value_counts().head(30)


ai                         113353
intelligence                25125
artificial_intelligence     21939
market                      19097
artificial                  15959
chatgpt                     14164
use                         13399
new                         13043
global                      10831
platform                     8208
technology                   8110
machine                      8110
launch                       7791
google                       7788
datum                        6410
learn                        6312
microsoft                    6076
solution                     5757
forecast                     5613
machine_learne               5574
tool                         5317
analysis                     5188
growth                       5183
company                      4494
research                     4423
industry                     4225
science                      4200
help                         4151
tech                         4073
report        

In [29]:
# Combine all keywords into one list across all news article text
text_keywords_all = []
for keywords in df_news['text_keywords']:
    text_keywords_all.extend(keywords)

text_keywords_all = pd.Series(text_keywords_all)
text_keywords_all.value_counts().head(30)

ai              103452
use              54302
technology       41912
company          40686
say              39567
market           31454
new              29309
datum            27475
report           24135
global           17969
solution         17042
business         16878
industry         16735
intelligence     16317
analysis         16270
platform         15647
system           15302
make             14339
chatgpt          14254
research         13529
service          12883
model            12623
growth           12574
help             12434
also             12271
customer         12212
work             11698
product          10457
artificial        9984
tool              9982
Name: count, dtype: int64

In [30]:
# select keywords from top 30 keywords that related to ai, ds, ml and industry forecasting
title_keywords = ['ai', 'ml', 'artificial', 'intelligence', 'artificial_intelligence', 'data', 'science', 'machine', 'machine_learne', 'technology', 'chatgpt', 'model', 'industry', 'forecast', 'market']

text_keywords = ['ai', 'ml', 'artificial', 'intelligence', 'artificial_intelligence', 'data', 'science', 'technology', 'machine', 'chatgpt', 'model', 'industry', 'forecast', 'market']

In [31]:
# Filter out news articles that do not contain any of the keywords

# filter by title keywords
df_news_filtered = df_news[df_news['title_keywords'].apply(lambda x: any([keyword in x for keyword in title_keywords]))].reset_index(drop=True)

# filter by text keywords
df_news_filtered = df_news_filtered[df_news_filtered['text_keywords'].apply(lambda x: any([keyword in x for keyword in text_keywords]))].reset_index(drop=True)


df_news_filtered.shape

(154283, 10)

In [32]:
df_news_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154283 entries, 0 to 154282
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                154283 non-null  int64 
 1   date              154283 non-null  object
 2   cleaned title     154283 non-null  object
 3   cleaned text      154283 non-null  object
 4   title_tokens      154283 non-null  object
 5   title_lemmatized  154283 non-null  object
 6   text_tokens       154283 non-null  object
 7   text_lemmatized   154283 non-null  object
 8   title_keywords    154283 non-null  object
 9   text_keywords     154283 non-null  object
dtypes: int64(1), object(9)
memory usage: 11.8+ MB


## Save Filtered Data

In [33]:
%%time

# save processed news text as parquet file
# path = "gs://nlp-final-project-data/data/"
# df_news_filtered.to_parquet(path + 'news_filtered.parquet', engine='pyarrow')