# NLP Final Project
Sam Ding

In [1]:
# basic data analytics
import pandas as pd
import numpy as np
import sklearn


# nlp modules
import nltk
import spacy

import multiprocessing
import string


import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Turn on inline rendering of pyLDAvis visualisations in notebook output cells.
pyLDAvis.enable_notebook()
import warnings

# warnings.simplefilter('once')
# Silence every warning for the rest of the session. NOTE: this also hides
# deprecation / FutureWarning messages from the libraries used below, so
# switch to the 'once' filter above when debugging unexpected behaviour.
warnings.simplefilter('ignore')

# Leave one core free for the notebook itself, but never go below one worker:
# on a single-core machine `cpu_count() - 1` would be 0, and a pool of zero
# workers cannot do any work.
num_processors = multiprocessing.cpu_count()
workers = max(1, num_processors - 1)

print(f'Using {workers} workers')

In [2]:
# %%time

# df_news_final_project = pd.read_parquet('https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet', engine='pyarrow')
# df_news_final_project.shape

CPU times: user 4.69 s, sys: 4.94 s, total: 9.63 s
Wall time: 1min 3s


(200332, 5)

In [None]:
# # zero-shot classification
# import torch
# from transformers import pipeline
# classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [18]:
# Load the working sample from a local CSV; the file's first column becomes
# the DataFrame index (index_col=0).
# NOTE(review): how sample_600.csv was produced is not shown in this notebook;
# presumably it was sampled from the full parquet dataset referenced in the
# commented-out cell above. Confirm before relying on it.
df_news_final_project = pd.read_csv('sample_600.csv', index_col=0)
# df_news_final_project.shape

In [19]:
# Preview the first three rows to check the loaded columns.
df_news_final_project.head(3)

Unnamed: 0,url,date,language,title,text
39396,https://www.wkms.org/npr-news/npr-news/2022-10...,2022-10-10,en,Artificial intelligence could soon diagnose il...,\n\nArtificial intelligence could soon diagnos...
143316,https://www.wbko.com/prnewswire/2022/08/25/ult...,2022-08-25,en,UltraSight Receives CE Mark for Novel Cardiac ...,UltraSight Receives CE Mark for Novel Cardiac ...
100092,https://www.marketscreener.com/quote/stock/POO...,2022-11-08,en,"IN BRIEF: Poolbeg makes ""significant breakthro...","\n\nIN BRIEF: Poolbeg makes ""significant break..."


In [21]:
# Build `text_clean` from the raw `text` column in one idempotent chain.
#
# Every pattern below is a regular expression, so `regex=True` is passed
# explicitly: from pandas 2.0 onwards `Series.str.replace` defaults to
# `regex=False`, which treats the pattern as a literal string and would leave
# links and HTML-entity remnants in the text without raising any error.
df_news_final_project['text_clean'] = (
    df_news_final_project['text']
    # Clean-up newlines and tabs (each one becomes a single space)
    .str.replace(r'[\n\t]', ' ', regex=True)
    # clean up links; `http\S+` already covers https, and the dot after `www`
    # is escaped so it only matches a literal "www." prefix
    .str.replace(r'http\S+|www\.\S+', '', case=False, regex=True)
    # clean up remnants of web crawls (numeric HTML entities such as &#8217;)
    .str.replace(r'&#\d+;', '', regex=True)
)

In [22]:
# Display the article titles (the input to the NER step below).
df_news_final_project['title']

39396     Artificial intelligence could soon diagnose il...
143316    UltraSight Receives CE Mark for Novel Cardiac ...
100092    IN BRIEF: Poolbeg makes "significant breakthro...
102690    Square Peg aims for the AI sweet spot with lat...
21501     Patch 1.04: A Wagonload of AI · Grand Tacticia...
                                ...                        
12183     Know the Future Opportunities of AI Baby Monit...
69983        EMERGE Consortium awarded grant by European...
9710      Pinecone Recognized as a 2021 Gartner® Cool Ve...
45140     DataRobot and Hexaware Collaborate to Help Cus...
150157    How the KU community feels about ChatGPT and w...
Name: title, Length: 600, dtype: object

In [21]:
# NER
# Run the small English spaCy pipeline over every article title and collect
# the named entities into a long-format frame: one row per detected entity,
# tagged with the index label of the article it came from.
nlp = spacy.load("en_core_web_sm")

# Only the entity spans (`doc.ents`) are read below, so the pipeline components
# named here are switched off for this pass.
skipped_components = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]
title_texts = df_news_final_project['title'].tolist()

docs = nlp.pipe(
    title_texts,
    disable=skipped_components,
    batch_size=200,
    n_process=2,
)

indexlist, entities, labels = [], [], []
row_labels = df_news_final_project.index

# The docs come back in the same order as the titles were passed in, so
# position `n` in the stream corresponds to `row_labels[n]`.
for position, parsed_title in enumerate(docs):
    found = parsed_title.ents
    indexlist.extend([row_labels[position]] * len(found))
    entities.extend(span.text for span in found)
    labels.extend(span.label_ for span in found)

ner_df = pd.DataFrame({"Index": indexlist, "Entities": entities, 'Labels': labels})

In [30]:
# Inspect the extracted entities: one row per entity, keyed by the article's index label.
ner_df

Unnamed: 0,Index,Entities,Labels
0,196584,MBA,WORK_OF_ART
1,70131,Ferrum Health,ORG
2,70131,$6 Million,MONEY
3,70131,Healthcare,ORG
4,98725,10,CARDINAL
...,...,...,...
1237,125981,Replaced Crypto,PERSON
1238,125981,the Hot Tech Topic of,ORG
1239,125981,Davos,GPE
1240,125981,NBC,ORG


In [31]:
# Find the articles whose title produced no ORG entity. Titles for which no
# entity of any kind was found are included, because the comparison is made
# against the full article index.

# index labels of articles with at least one ORG entity in the title
org_rows = ner_df['Labels'].eq('ORG')
with_org_list = list(ner_df.loc[org_rows, 'Index'].unique())

# every other article index label
all_article_ids = set(df_news_final_project.index)
no_org_list = list(all_article_ids - set(with_org_list))

# show the titles of those articles
no_org_rows = df_news_final_project.index.isin(no_org_list)
df_news_final_project.loc[no_org_rows, 'title']

196584    Could AI pass an MBA exam at an Ivy League uni...
165334    Kootenay Silver Announces Drill Results From C...
189220              Novel AI Tool To Thwart Covid Mutations
52969                Singer-songwriter, bard Hall, 85, dies
32886     Russia-Ukraine War: AI Has An Eight-Point Peac...
                                ...                        
31231     Decentralized Machine Learning Price Tops $0.0...
86090     OpenAI: ChatGPT back in Italy after meeting wa...
52514     Europe’s technology industry top artificial in...
25009     We asked ChatGPT and Bing 20 different questio...
17043     Artificial intelligence weapons invites ‘moral...
Name: title, Length: 110, dtype: object