# NLP Final Project
Sam Ding

In [1]:
# basic data analytics
import pandas as pd
import numpy as np
import sklearn
import pickle

# nlp modules
import nltk
import spacy
import re

import multiprocessing
import string

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer


import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
import warnings

# warnings.simplefilter('once')
# NOTE(review): 'ignore' with no category silences every warning raised after this
# point, including deprecation/FutureWarnings emitted by the libraries imported above.
warnings.simplefilter('ignore')

# Leave one core free for the notebook/OS, but never go below a single worker:
# cpu_count() - 1 evaluates to 0 on a one-core machine.
num_processors = multiprocessing.cpu_count()
workers = max(1, num_processors - 1)

print(f'Using {workers} workers')

Using 7 workers


In [2]:
# %%time

# df_news_final_project = pd.read_parquet('https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet', engine='pyarrow')
# df_news_final_project.shape

In [3]:
# # zero-shot classification
# import torch
# from transformers import pipeline
# classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [4]:
# Load the local sample file; index_col=0 makes the first CSV column the row index.
df_news_final_project = pd.read_csv('sample_600.csv', index_col=0)
# df_news_final_project.shape

In [5]:
df_news_final_project.head(3)

Unnamed: 0,url,date,language,title,text
39396,https://www.wkms.org/npr-news/npr-news/2022-10...,2022-10-10,en,Artificial intelligence could soon diagnose il...,\n\nArtificial intelligence could soon diagnos...
143316,https://www.wbko.com/prnewswire/2022/08/25/ult...,2022-08-25,en,UltraSight Receives CE Mark for Novel Cardiac ...,UltraSight Receives CE Mark for Novel Cardiac ...
100092,https://www.marketscreener.com/quote/stock/POO...,2022-11-08,en,"IN BRIEF: Poolbeg makes ""significant breakthro...","\n\nIN BRIEF: Poolbeg makes ""significant break..."


In [6]:
# Clean-up newlines (left disabled: the splitting pattern used further down relies on '\n')
# df_news_final_project['text_clean'] = df_news_final_project['text'].str.replace('\n', ' ')

# clean up tabs (plain literal replacement, no regex involved)
df_news_final_project['text_clean'] = df_news_final_project['text'].str.replace('\t', ' ', regex=False)

# clean up links
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal string
# by default, which would silently leave every link in place.
df_news_final_project['text_clean'] = df_news_final_project['text_clean'].str.replace(
    r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', '', regex=True
)

# clean up remnants of web crawls (numeric HTML entities of the form &#NNN;)
df_news_final_project['text_clean'] = df_news_final_project['text_clean'].str.replace(
    r'&#\d+;', '', case=False, regex=True
)

In [7]:
# Named-entity recognition on the article titles
nlp = spacy.load("en_core_web_md")

# Stream all titles through the pipeline in batches, on two processes,
# with the components listed in `disable` switched off.
docs = nlp.pipe(
    df_news_final_project['title'].tolist(),
    disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"],
    batch_size=200,
    n_process=2
)

# One (row label, entity text, entity label) triple per entity found.
# Docs come back in input order, so position `pos` maps onto the frame's index.
entity_hits = [
    (df_news_final_project.index[pos], ent.text, ent.label_)
    for pos, doc in enumerate(docs)
    for ent in doc.ents
]

indexlist = [hit[0] for hit in entity_hits]
entities = [hit[1] for hit in entity_hits]
labels = [hit[2] for hit in entity_hits]

ner_df = pd.DataFrame({"Index": indexlist, "Entities": entities, 'Labels': labels})

In [8]:
ner_df

Unnamed: 0,Index,Entities,Labels
0,143316,UltraSight,ORG
1,143316,CE Mark,PERSON
2,102690,Square Peg,PERSON
3,102690,AI,ORG
4,21501,1.04,CARDINAL
...,...,...,...
1333,9710,Artificial Intelligence and Machine Learning,ORG
1334,45140,Hexaware Collaborate to,ORG
1335,45140,Help Customers Accelerate,ORG
1336,45140,Journey,PRODUCT


In [9]:
# Row labels of articles whose title produced at least one ORG entity
org_mask = ner_df['Labels'] == 'ORG'
with_org_list = list(ner_df.loc[org_mask, 'Index'].unique())

# ... and every other row label, i.e. titles for which no ORG entity was found
no_org_list = list(set(df_news_final_project.index).difference(with_org_list))

# show the titles of the articles without an ORG entity
df_news_final_project.loc[df_news_final_project.index.isin(no_org_list), 'title']

39396     Artificial intelligence could soon diagnose il...
100092    IN BRIEF: Poolbeg makes "significant breakthro...
21501     Patch 1.04: A Wagonload of AI · Grand Tacticia...
83882     Mohammad Hosseini: Should we bring AI into hos...
151392    SHUTTERSTOCK PARTNERS WITH OPENAI AND LEADS TH...
                                ...                        
9820          Artificial Intelligence Wish List - NewsBreak
119656    Rockies' trade deadline: Trevor Story, Jon Gra...
129684    Can ChatGPT help with investments if you want ...
117860    BuzzFeed to use artificial intelligence for co...
150157    How the KU community feels about ChatGPT and w...
Name: title, Length: 171, dtype: object

In [10]:
df_news_final_project.head(2)

Unnamed: 0,url,date,language,title,text,text_clean
39396,https://www.wkms.org/npr-news/npr-news/2022-10...,2022-10-10,en,Artificial intelligence could soon diagnose il...,\n\nArtificial intelligence could soon diagnos...,\n\nArtificial intelligence could soon diagnos...
143316,https://www.wbko.com/prnewswire/2022/08/25/ult...,2022-08-25,en,UltraSight Receives CE Mark for Novel Cardiac ...,UltraSight Receives CE Mark for Novel Cardiac ...,UltraSight Receives CE Mark for Novel Cardiac ...


In [11]:
# Stand-in for the title inside the body text. It is deliberately shaped like a date
# so that the title position can also be split by the date pattern used later.
TITLE_PLACEHOLDER = 'Hahahah 23, 2026'

# Pair the two columns positionally instead of looking cells up with df[col][label]:
# a chained label lookup returns a Series (not a str) when an index label is duplicated.
notitle = [
    text.replace(title, TITLE_PLACEHOLDER)
    for text, title in zip(df_news_final_project['text_clean'], df_news_final_project['title'])
]

df_news_final_project['text_notitle'] = notitle

# Drop everything from the first "For more information" / "for more information" onwards.
# [Ff] replaces the original [F|f], whose character class also matched a literal '|'.
# n=1 stops at the first occurrence, which is all that is needed to keep the leading part.
df_news_final_project['text_notitle'] = (
    df_news_final_project['text_notitle']
    .str.split(r'[Ff]or more information', n=1)
    .str[0]
)


In [12]:
# Split points: a "<word> <day>, 20xx" date, a "<day> <word> 20xx" date, or a run of newlines.
# Both date alternatives are capturing groups, so re.split also returns the matched date
# (or None for the group that did not take part in the match) between the text pieces.
pattern = r'(\w{3,10}\.*\s\d{1,2}\,*\s20\d{2})|(\d{1,2}\s\w{3,10}\.*\s20\d{2})|\n+'

df_news_final_project['split'] = df_news_final_project['text_notitle'].map(
    lambda text: re.split(pattern, text)
)
# number of pieces per article
df_news_final_project['split_len'] = df_news_final_project['split'].map(len)

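Articles with `split_len == 1` (i.e. not split at all): 116 with the first pattern, 78 after revising the pattern, and none after the latest revision — every article is now splittable.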

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def get_most_similar(index, threshold=0.07):
    '''
    Return the parts of an article body that are lexically similar to its title.

    The pre-split body of row `index` (column `split` of the global
    `df_news_final_project`) is filtered to remove boilerplate-looking segments,
    every remaining segment is scored against the title with TF-IDF cosine
    similarity, and all segments scoring at least `threshold` are joined, in
    their original order, into a single string.

    Parameters
    ----------
    index : label
        Row label in `df_news_final_project`.
    threshold : float, default 0.07
        Minimum cosine similarity to the title for a segment to be kept.

    Returns
    -------
    str or float
        The kept segments, lower-cased, NLTK-tokenised and joined with spaces.
        `np.nan` if no segment survives the filters; an empty string if segments
        were scored but none reached `threshold`.
    '''
    title = df_news_final_project.loc[index, 'title']
    texts = df_news_final_project.loc[index, 'split']

    # re.split yields None for capture groups that did not take part in a match
    texts = [x for x in texts if x is not None]

    # take out texts that are too short
    texts = [x for x in texts if len(x) > 150]

    # strip runs of four capitalised words that are each followed by newlines/tabs
    texts = [re.sub(r'([A-Z][a-z]+(\n|\t)+){4}', '', x) for x in texts]

    # drop texts mentioning "sign up" or "email"
    # ([Ss]/[Ee] instead of [S|s]/[E|e]: the latter also matched a literal '|')
    texts = [x for x in texts if re.search(r'([Ss]ign up)|([Ee]mail)', x) is None]

    # keep only texts containing four consecutive lower-case words
    texts = [x for x in texts if re.search(r'([a-z]+\s){4}', x) is not None]

    # return NA if there are no text splits left
    if not texts:
        return np.nan

    # Tokenize title and texts, then re-join the tokens so the vectorizer sees
    # the same normalised form for both
    title_string = ' '.join(nltk.word_tokenize(title.lower()))
    text_strings = [' '.join(nltk.word_tokenize(text.lower())) for text in texts]

    # Fit TF-IDF on the segments only; the title is projected into that vocabulary
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(text_strings)
    title_vector = vectorizer.transform([title_string])

    # cosine_similarity returns a 1 x n_segments matrix; take its single row
    similarity_scores = cosine_similarity(title_vector, tfidf_matrix)[0]

    kept_segments = [
        segment
        for segment, score in zip(text_strings, similarity_scores)
        if score >= threshold
    ]

    return ' '.join(kept_segments)
    


In [16]:
# Score every article and overwrite text_clean with its title-relevant segments
most_similar = [
    get_most_similar(row_label)
    for row_label in df_news_final_project.index.values
]
df_news_final_project['text_clean'] = most_similar

# rows for which get_most_similar returned NaN are removed
df_news_final_project.dropna(subset=['text_clean'], inplace=True)

df_news_final_project.shape

(574, 9)

In [17]:
from pyspark.sql.functions import udf
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import StopWordsCleaner, Tokenizer
import sparknlp

# start Spark session (through Spark NLP's start() helper)
spark = sparknlp.start()

# create Spark dataframe from the pandas frame
# NOTE(review): the pandas row index is not a column, so it presumably does not
# survive this round trip — confirm before relying on the original labels later.
df_spark = spark.createDataFrame(df_news_final_project)

# Phrases that mark an article as relevant; matching is a case-sensitive substring test.
keywords = ['artificial intelligence', 'machine learning', 'data science', 'data analytics']

def has_keywords(text):
    """Return True if `text` contains at least one entry of `keywords`.

    Missing or non-string values (e.g. None for a null cell) count as
    "no keyword" instead of raising TypeError.
    """
    if not isinstance(text, str):
        return False
    return any(keyword in text for keyword in keywords)

# Imported here so the cell is self-contained; BooleanType is the UDF's declared return type.
from pyspark.sql.types import BooleanType

# udf() defaults to a string return type, which stored the flag as the text 'true'/'false'
# and left the filter comparing a string column with a boolean.
has_keywords_udf = udf(has_keywords, BooleanType())

# add a column to the dataframe that indicates if the article has keywords
df_spark = df_spark.withColumn('has_keywords', has_keywords_udf(df_spark['text_clean']))

# drop articles that do not have keywords (the boolean column is used directly as the predicate)
df_spark = df_spark.filter(df_spark['has_keywords'])

# make df_spark the new df_news_final_project
df_news_final_project = df_spark.toPandas()


:: loading settings :: url = jar:file:/Users/sding/.env/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/sding/.ivy2/cache
The jars for the packages stored in: /Users/sding/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-44619dba-778c-4755-9028-939da756d8ba;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;4.4.2 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found com.google.cloud#google-cloud-storage;2.16.0 in central
	found com.google.guava#guava;31.1-jre in central
	found com.google.guava#failur

In [26]:
df_news_final_project.shape

(350, 10)

In [24]:
len("© 2023 ideastream public media1375 euclid avenue , cleveland , ohio 44115 ( 216 ) 916-6100 | ( 877 ) 399-3307wksu is a public media service licensed to kent state university and operated by ideastream public media . the tech world enthusiastic about the possibilities of artificial intelligence , but where does that leave meta 's plans for the metaverse ? some large companies are already dialing back their plans . © 2023 ideastream public media1375 euclid avenue , cleveland , ohio 44115 ( 216 ) 916-6100 | ( 877 ) 399-3307wksu is a public media service licensed to kent state university and operated by ideastream public media .")

632

In [25]:
# keep only the rows whose cleaned text is longer than 700 characters
text_lengths = df_news_final_project['text_clean'].map(len)
df_news_final_project = df_news_final_project[text_lengths > 700]

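We now have the cleaned text for downstream analysis. Currently there are 587 documents that are non-null and mention at least one of the data-science keywords. (Note: this count does not match the cell outputs shown above — 574 rows after segment selection and 350 after the keyword filter — so it should be re-checked after a fresh Restart & Run All.)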

## Sentiment Analysis

In [27]:
# Export title + cleaned text of the first 150 rows for the sentiment-analysis step
sentiment_sample = df_news_final_project[['title', 'text_clean']].head(150)
sentiment_sample.to_csv('sentanalysis150.csv', index=False)

Unnamed: 0,url,date,language,title,text,text_clean,text_notitle,split,split_len,has_keywords
0,https://www.wbko.com/prnewswire/2022/08/25/ult...,2022-08-25,en,UltraSight Receives CE Mark for Novel Cardiac ...,UltraSight Receives CE Mark for Novel Cardiac ...,at 7:30 am cdt|updated : 1 hour agolandmark ac...,"Hahahah 23, 2026\n\nSkip to contentNewsWeather...","[, Hahahah 23, 2026, None, , None, None, Skip ...",16,true
1,https://www.marketscreener.com/quote/stock/POO...,2022-11-08,en,"IN BRIEF: Poolbeg makes ""significant breakthro...","\n\nIN BRIEF: Poolbeg makes ""significant break...",poolbeg pharma plc - london-based clinical sta...,"\n\nHahahah 23, 2026\n \n\n ...","[, None, None, , Hahahah 23, 2026, None, , Non...",745,true
2,https://forextv.com/top-news/forgerock-powers-...,2020-06-23,en,ForgeRock Powers New Era of Digital Identity w...,\n\nForgeRock Powers New Era of Digital Identi...,"( globe newswire ) — forgerock® , the leading ...","\n\nHahahah 23, 2026\n\n \n\n \n\nBreaking New...","[, None, None, , Hahahah 23, 2026, None, , Non...",295,true
3,https://pantagraph.com/opinion/columnists/moha...,2023-04-14,en,Mohammad Hosseini: Should we bring AI into hos...,\nMohammad Hosseini: Should we bring AI into h...,"recently , two major news stories in the techn...","\nHahahah 23, 2026\n\nSkip to main contentSkip...","[, None, None, , Hahahah 23, 2026, None, , Non...",865,true
4,https://www.nbc11news.com/prnewswire/2021/10/2...,2021-10-26,en,New Data from Samsara Shows How AI and Digital...,New Data from Samsara Shows How AI and Digital...,"/prnewswire/ -- samsara , the pioneer of the c...","Hahahah 23, 2026\n\nSkip to contentNo Wait Wea...","[, Hahahah 23, 2026, None, , None, None, Skip ...",16,true
...,...,...,...,...,...,...,...,...,...,...
359,https://www.businesstoday.in/technology/story/...,2023-01-25,en,Can ChatGPT help with investments if you want ...,Can ChatGPT help with investments if you wan...,news technology can chatgpt help with investme...,"Hahahah 23, 2026 ...","[ , Hahahah 23, 2026, None, ...",376,true
360,https://www.kshb.com/news/national/buzzfeed-to...,2023-01-30,en,BuzzFeed to use artificial intelligence for co...,\nBuzzFeed to use artificial intelligence for ...,file photo shows the entrance to buzzfeed in n...,"\nHahahah 23, 2026\n1 weather alerts\n1 closin...","[, None, None, , Hahahah 23, 2026, None, , Non...",628,true
361,https://www.einpresswire.com/article/614170798...,2023-01-31,en,EMERGE Consortium awarded grant by European...,\n EMERGE Consortium awarded grant by Europea...,project scored first in the eic ’ s pathfinder...,\n EMERGE Consortium awarded grant by Europea...,"[, None, None, EMERGE Consortium awarded gra...",1573,true
362,https://www.kktv.com/prnewswire/2021/10/28/pin...,2021-10-28,en,Pinecone Recognized as a 2021 Gartner® Cool Ve...,Pinecone Recognized as a 2021 Gartner® Cool Ve...,"/prnewswire/ -- pinecone systems inc. , a mach...","Hahahah 23, 2026\n\nSkip to contentNewsWeather...","[, Hahahah 23, 2026, None, , None, None, Skip ...",19,true


In [30]:
print(df_news_final_project.iloc[13]['text'])

A Layman’s Guide to Artificial Intelligence (AI) | HackerNoonStart WritingNotificationssee  moreStart WritingReadDiscover (Sort By)All TopicsTop StoriesTechBriefLive ReactionsNoonificationSuperTagBooksSloggingAll NewslettersRandom StoryDaily ArchiveTechAll Topics (Click Here)AIBlockchainCybersecurityData ScienceDecentralizationFinanceFuturismGamingHardware ReviewsMachine LearningMediaProgrammingEngineeringRoboticsVirtual RealitySoftwareAPIsArchitectureCCodingEngineeringJavaJavascriptNodeJSOpen SourcePHPProgrammingPythonReactRubySQLSwiftTop StoriesLatestDavid Copperfield: Chapter 52 - I Assist At An ExplosionThe Essays of Adam Smith: ADAM SMITH ON THE EXTERNAL SENSES - Of the Sense of TASTINGThe Evolution of Modern Medicine: Chapter III - MEDIAEVAL PRACTICETook - A Twitter Bot that Tweets Books.How to Avoid Git Disasters (Gitstasters) Part 1: Git ResetCompetition vs Tribalism - What's Better for the Future Growth Of Web3Learning Python VariablesGoing from N00b GameFi Investor to Boss Mo