# NLP Final Project
Sam Ding

In [1]:
# basic data analytics
import pandas as pd
import numpy as np
import sklearn
import pickle

# nlp modules
import nltk
import spacy
import re

import multiprocessing
import string

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer


import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
import warnings

# warnings.simplefilter('once')
warnings.simplefilter('ignore')

# Leave one core free for the rest of the system, but never go below one worker:
# on a single-core host `cpu_count() - 1` would be 0.
num_processors = multiprocessing.cpu_count()
workers = max(1, num_processors - 1)

print(f'Using {workers} workers')

Using 7 workers


In [2]:
# %%time

# df_news_final_project = pd.read_parquet('https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet', engine='pyarrow')
# df_news_final_project.shape

In [3]:
# # zero-shot classification
# import torch
# from transformers import pipeline
# classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [4]:
# Load the local sample CSV; its first column is used as the DataFrame index.
df_news_final_project = pd.read_csv('sample_600.csv', index_col=0)
# df_news_final_project.shape

In [5]:
df_news_final_project.head(3)

Unnamed: 0,url,date,language,title,text
39396,https://www.wkms.org/npr-news/npr-news/2022-10...,2022-10-10,en,Artificial intelligence could soon diagnose il...,\n\nArtificial intelligence could soon diagnos...
143316,https://www.wbko.com/prnewswire/2022/08/25/ult...,2022-08-25,en,UltraSight Receives CE Mark for Novel Cardiac ...,UltraSight Receives CE Mark for Novel Cardiac ...
100092,https://www.marketscreener.com/quote/stock/POO...,2022-11-08,en,"IN BRIEF: Poolbeg makes ""significant breakthro...","\n\nIN BRIEF: Poolbeg makes ""significant break..."


In [6]:
# Build `text_clean` from the raw article text.
# Newline clean-up is left disabled; the later split step uses `\n+` as a delimiter.
# df_news_final_project['text_clean'] = df_news_final_project['text'].str.replace('\n', ' ')

# clean up tabs (plain literal replacement, so regex is switched off explicitly)
df_news_final_project['text_clean'] = df_news_final_project['text'].str.replace('\t', ' ', regex=False)

# clean up links
# regex=True must be explicit: since pandas 2.0 `Series.str.replace` treats the pattern
# as a literal string by default, which would leave every URL in place.
# The dot after "www" is escaped so that it only matches a real dot.
df_news_final_project['text_clean'] = df_news_final_project['text_clean'].str.replace(
    r'http\S+|https\S+|www\.\S+', '', case=False, regex=True
)

# clean up remnants of web crawls (numeric HTML entities of the form &#NNN;)
df_news_final_project['text_clean'] = df_news_final_project['text_clean'].str.replace(
    r'&#\d+;', '', case=False, regex=True
)

In [7]:
df_news_final_project['title'].iloc[10]

'Data Science & Strategy Firm Schireson Acquires Stun Creative & Blackbird To Form "Known" -- A New Breed Of Modern Marketing Company'

In [8]:
# NER
# Run spaCy named-entity recognition over every title and collect one row per entity.
nlp = spacy.load("en_core_web_md")

titles = df_news_final_project['title'].tolist()
docs = nlp.pipe(
    titles,
    disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"],
    batch_size=200,
    n_process=2
)

indexlist = []
entities = []
labels = []

# The docs are paired positionally with the DataFrame index labels, so this relies on
# nlp.pipe yielding docs in the same order as the input titles.
for row_label, doc in zip(df_news_final_project.index, docs):
    for ent in doc.ents:
        indexlist.append(row_label)
        entities.append(ent.text)
        labels.append(ent.label_)


ner_df = pd.DataFrame({"Index": indexlist, "Entities": entities, 'Labels': labels})

In [9]:
ner_df

Unnamed: 0,Index,Entities,Labels
0,143316,UltraSight,ORG
1,143316,CE Mark,PERSON
2,102690,Square Peg,PERSON
3,102690,AI,ORG
4,21501,1.04,CARDINAL
...,...,...,...
1333,9710,Artificial Intelligence and Machine Learning,ORG
1334,45140,Hexaware Collaborate to,ORG
1335,45140,Help Customers Accelerate,ORG
1336,45140,Journey,PRODUCT


In [10]:
# Titles for which no entity labelled ORG was found.

# index labels that have at least one ORG entity
org_rows = ner_df.loc[ner_df['Labels'] == 'ORG', 'Index']
with_org_list = list(org_rows.unique())

# every other index label of the main frame has no ORG entity
no_org_list = list(set(df_news_final_project.index).difference(with_org_list))

# get the title for those indexes
df_news_final_project.loc[df_news_final_project.index.isin(no_org_list), 'title']

39396     Artificial intelligence could soon diagnose il...
100092    IN BRIEF: Poolbeg makes "significant breakthro...
21501     Patch 1.04: A Wagonload of AI · Grand Tacticia...
83882     Mohammad Hosseini: Should we bring AI into hos...
151392    SHUTTERSTOCK PARTNERS WITH OPENAI AND LEADS TH...
                                ...                        
9820          Artificial Intelligence Wish List - NewsBreak
119656    Rockies' trade deadline: Trevor Story, Jon Gra...
129684    Can ChatGPT help with investments if you want ...
117860    BuzzFeed to use artificial intelligence for co...
150157    How the KU community feels about ChatGPT and w...
Name: title, Length: 171, dtype: object

In [11]:
df_news_final_project.head(2)

Unnamed: 0,url,date,language,title,text,text_clean
39396,https://www.wkms.org/npr-news/npr-news/2022-10...,2022-10-10,en,Artificial intelligence could soon diagnose il...,\n\nArtificial intelligence could soon diagnos...,\n\nArtificial intelligence could soon diagnos...
143316,https://www.wbko.com/prnewswire/2022/08/25/ult...,2022-08-25,en,UltraSight Receives CE Mark for Novel Cardiac ...,UltraSight Receives CE Mark for Novel Cardiac ...,UltraSight Receives CE Mark for Novel Cardiac ...


In [12]:
# Swap each occurrence of the title inside the article body for a date-shaped
# placeholder, so that the date pattern used by the later split also cuts the text
# at the places where the title stood.
title_col = df_news_final_project['title']
body_col = df_news_final_project['text_clean']

notitle = [
    body_col[row_label].replace(title_col[row_label], 'Hahahah 23, 2026')
    for row_label in df_news_final_project.index.values
]

df_news_final_project['text_notitle'] = notitle


In [13]:
# Split each article on date-like strings ("Month DD, YYYY" or "DD Month YYYY" shapes)
# or on runs of newlines. The two date alternatives are capture groups, so re.split
# also emits the matched date -- or None for a group that did not take part -- between
# the pieces.
pattern = r'(\w{3,10}\.*\s\d{1,2}\,*\s20\d{2})|(\d{1,2}\s\w{3,10}\.*\s20\d{2})|\n+'
splitter = re.compile(pattern)
df_news_final_project['split'] = df_news_final_project['text_notitle'].apply(splitter.split)
df_news_final_project['split_len'] = df_news_final_project['split'].apply(len)

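Rows with a split length of 1 (i.e. the text was not split at all): 116 with the first pattern, 78 after revising the pattern, and none with the current pattern, so every article is now splittable.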

In [78]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def get_most_similar(index, verbose=True, return_scores=True):
    '''
    Score the candidate passages of one article against its title.

    The row's `split` pieces are filtered down to passages that look like running
    prose, and each surviving passage is compared with the title using TF-IDF
    cosine similarity.

    Parameters
    ----------
    index : label from df_news_final_project.index
        Row whose `title` and `split` columns are read.
    verbose : bool, default True
        Print the passages that survive filtering. Pass False when calling this in a
        loop to avoid flooding the cell output.
    return_scores : bool, default True
        True  -> return a DataFrame with one row per surviving passage: `text`
                 (the lower-cased, tokenized passage) and `score` (similarity to the
                 title).
        False -> return one original passage string, chosen from the scores by the
                 rule described next to the selection code below.

    Returns
    -------
    pandas.DataFrame, str or np.nan
        np.nan when no passage survives the filters.
    '''

    title = df_news_final_project['title'][index]
    texts = df_news_final_project['split'][index]

    # re.split emits None for capture groups that did not take part in a match
    texts = [x for x in texts if x is not None]

    # take out texts that are too short
    texts = [x for x in texts if len(x) > 150]

    # strip (not drop) runs of four capitalised words separated by newlines/tabs
    pattern = r'([A-Z][a-z]+(\n|\t)+){4}'
    texts = [re.sub(pattern, '', x) for x in texts]

    # keep only texts containing four consecutive lower-case words
    pattern = r'([a-z]+\s){4}'
    texts = [x for x in texts if re.search(pattern, x) is not None]

    if verbose:
        print('Keeping small letter patterns:\n', texts)

    # return NA if there are no text splits left
    if len(texts) == 0:
        return np.nan

    # Tokenize the title and texts, then re-join the tokens into strings
    tokenized_title = nltk.word_tokenize(title.lower())
    tokenized_texts = [nltk.word_tokenize(passage.lower()) for passage in texts]
    text_strings = [' '.join(tokens) for tokens in tokenized_texts]

    # Fit TF-IDF on the passages only, then project the title into that vocabulary
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(text_strings)
    title_vector = vectorizer.transform([' '.join(tokenized_title)])

    # cosine_similarity returns shape (1, n_passages); take the single row
    scores = cosine_similarity(title_vector, tfidf_matrix)[0]

    if return_scores:
        return pd.DataFrame({"text": text_strings, "score": scores})

    # Selection rule carried over from the previously unreachable tail of this
    # function: take the best-scoring passage, unless there is more than one passage
    # and the best score exceeds 0.2, in which case take the runner-up.
    # NOTE(review): presumably the runner-up is preferred because a very high score
    # means the passage merely repeats the title -- confirm the intent and threshold.
    index_to_return = scores.argmax()
    if len(text_strings) > 1 and scores[index_to_return] > 0.2:
        index_to_return = scores.argsort()[-2]
    return texts[index_to_return]
    


In [None]:
# even integers in [0, 10000), produced directly with a step of 2

even_list = list(range(0, 10000, 2))

In [79]:
get_most_similar(79650).sort_values('score')

Keeping small letter patterns:
 [' (GLOBE NEWSWIRE) — ForgeRock®, the leading provider in digital identity, announced major updates to its identity platform, aimed at creating an identity future where automation empowers organizations to deliver simplified, secure digital experiences. Customers will have new cloud and artificial intelligence (AI) capabilities as part of the ForgeRock Identity Platform that simplifies identity journeys so people can get to what they want faster while still protecting an organization’s security and an individual’s privacy.', 'A recently published report by Forrester Research¹ states: “Accelerated by the recent COVID-19 pandemic, the growth of customer interactions online has increased exponentially. This is making these channels the primary battleground for where your organization delights, retains, or loses customers. If done well, CIAM can help today’s digital businesses gain deep customer insights to successfully deliver new products and services that

Unnamed: 0,text,score
15,this website uses cookies to improve your expe...,0.026076
12,author recent posts globenewswireglobenewswire...,0.035508
9,to learn more about today ’ s news from forger...,0.055612
1,a recently published report by forrester resea...,0.071182
14,before deciding to invest in foreign exchange ...,0.083577
3,"forgerock autonomous identity – today , risk m...",0.095059
6,forgerock ’ s identity platform constantly ada...,0.114804
2,"today , organizations are forced to make trade...",0.117036
10,"speakers include forgerock ceo fran rosch , fo...",0.129826
7,"rex thexton , managing director , applied cybe...",0.141837


In [74]:
get_most_similar(150157).sort_values('score')

Keeping small letter patterns:
 ['Dr. Genelle Belmas, journalism professor specializing in media law at the University of Kansas, talks about ChatGPT during a workshop on Friday, Feb. 24. Belmas had been testing the chatbot and studying its applications in the field of intellectual property.', "ChatGPT, an artificial intelligence tool, has sparked researchers’ curiosity at the University of Kansas and panic on campuses nationwide. As a result, professors are trying to learn to detect cheaters while teaching their students about the chatbot.\xa0Chat Generative Pre-Trained Transformer (ChatGPT), a large-language model technology (also referred to as artificia\xadl intelligence or AI) that came out in November 2022, has capabilities that range from editing news articles to writing poems and novels and even interpreting and generating computer codes. Worry has grown among universities in the U.S. as students utilize the new technology for writing assignments.\xa0\xa0In contrast to the stro

Unnamed: 0,text,score
0,"dr. genelle belmas , journalism professor spec...",0.125103
1,"chatgpt , an artificial intelligence tool , ha...",0.174118
2,articlesafter controversial resolution and res...,0.274902


In [18]:
df_news_final_project.index[5]

79650

In [19]:
df_news_final_project['text'][79650]

"\n\nForgeRock Powers New Era of Digital Identity with Artificial Intelligence and Cloud - ForexTV\n\n \n\n \n\nBreaking News\n\nSugarmade Secures Cultivation Facility Worth Up to $1.6M in Additional Monthly BudCars Cannabis Delivery Sales\nEbix Announces Integration with SIMON; Expands InsurTech Platform with New e-App Capabilities\nHYCU® Announces Availability of 1-click Test Drive for Nutanix Mine with HYCU\nAvid Bioservices Names Nicholas Green as President and Chief Executive Officer\nU.S. ANIMAL WELFARE ORGANIZATIONS LAUNCH HUMAN ANIMAL SUPPORT SERVICES PILOT\nBIO-key Extends Deployment of Biometric Authentication Solution to a West Coast State; Securing Access to Centralized Voter Registration Database\nCatalyst Pharmaceuticals Appoints Jeffrey Del Carmen as Chief Commercial Officer\n*France Jun Flash Factory PMI 52.1 Vs. 40.6 In May, Consensus 46.0\n*France Jun Flash Services PMI 50.3 Vs. 31.1 In May, Consensus 44.2\n*France Jun Flash Composite PMI 51.3 Vs. 32.1 In May, Consens

In [20]:
get_most_similar(df_news_final_project.index[5])

Keeping small letter patterns:
 [' (GLOBE NEWSWIRE) — ForgeRock®, the leading provider in digital identity, announced major updates to its identity platform, aimed at creating an identity future where automation empowers organizations to deliver simplified, secure digital experiences. Customers will have new cloud and artificial intelligence (AI) capabilities as part of the ForgeRock Identity Platform that simplifies identity journeys so people can get to what they want faster while still protecting an organization’s security and an individual’s privacy.', 'A recently published report by Forrester Research¹ states: “Accelerated by the recent COVID-19 pandemic, the growth of customer interactions online has increased exponentially. This is making these channels the primary battleground for where your organization delights, retains, or loses customers. If done well, CIAM can help today’s digital businesses gain deep customer insights to successfully deliver new products and services that

'ForgeRock Identity Cloud – Customers from finance, healthcare and retail sectors are emerging as pioneers in deploying ForgeRock’s industry-first identity platform as a cloud-delivered service. These organizations selected ForgeRock Identity Cloud because they were seeking a no-compromise solution that offers a full-suite of identity capabilities required by large enterprises, with the flexibility to consume as a service, or deploy anywhere with the push of a button. Additionally, it’s the only solution to provide the benefits of an on-premises deployment from a multi-tenant cloud service, including customer data isolation to provide superior security and scalability.'

In [52]:
text = '                                   NDTV   Business   हिन्दी   Movies   Cricket   Lifestyle   Health   Food   Tech   HOP   Webstories   Auto   বাংলা   தமிழ்   Apps   Trains   Art                 Sections   Coronavirus   Live TV   Latest   India     Opinion   Cities   World     OffBeat   Videos   Trends   Schedule     Education   Science   People   Indians\xa0Abroad     South   Cheat\xa0Sheet   Photos   Weather   News             Alerts         LatestExamsSchoolCampus                                                            Search              '
# count the upper-case ASCII letters in `text`
sum(1 for _ in re.finditer(r'[A-Z]', text))

51

In [21]:
# one get_most_similar() result per row, in DataFrame index order
most_similar = [
    get_most_similar(row_label)
    for row_label in df_news_final_project.index.values
]

Keeping small letter patterns:
 ["Allison LongYael Bensoussan, MD, is part of the USF Health's department of Otolaryngology - Head & Neck Surgery. She's leading an effort to collect voice data that can be used to diagnose illnesses.", 'The National Institutes of Health is funding a massive research project to collect voice data and develop an AI that could diagnose people based on their speech. \r', "Everything from your vocal cord vibrations to breathing patterns when you speak offers potential information about your health, says laryngologist Dr. Yael Bensoussan, the director of the University of South Florida's Health Voice Center and a leader on the study. \r", '"We asked experts: Well, if you close your eyes when a patient comes in, just by listening to their voice, can you have an idea of the diagnosis they have?" Bensoussan says. "And that\'s where we got all our information." \r', "Someone who speaks low and slowly might have Parkinson's disease. Slurring is a sign of a stroke.

In [22]:
most_similar

["Everything from your vocal cord vibrations to breathing patterns when you speak offers potential information about your health, says laryngologist Dr. Yael Bensoussan, the director of the University of South Florida's Health Voice Center and a leader on the study. \r",
 ' /PRNewswire/ -- UltraSight, an Israeli-based digital health pioneer transforming cardiac imaging through the power of artificial intelligence, announced it has obtained a CE Mark for its AI guidance software for cardiac ultrasound. Today\'s milestone comes just ahead of the European Society of Cardiology (ESC) Congress in Barcelona and aligns with its call for action to achieve heart health in Europe by 2025.UltraSight(PRNewswire)UltraSight\'s technology allows medical professionals, regardless of their sonography experience, to successfully capture diagnostic quality ultrasound images of the heart. The technology can be used at the point of care, paving the way for more widespread detection of cardiovascular diseas

In [23]:
# Replace the regex-cleaned text with the per-row get_most_similar() results collected in `most_similar`.
df_news_final_project['text_clean'] = most_similar

In [24]:
df_news_final_project.iloc[488]['title']

'Artificial Intelligence (AI) in Drug Discovery Market Growth, Share, Trends 2020: Know Technology Exploding in Popularity| Microsoft, NVIDIA, IBM, Atomwise, DEEP GENOMICS, Cloud Pharmaceuticals – KSU | The Sentinel Newspaper'

In [25]:
df_news_final_project.iloc[488]['text']

'\nArtificial Intelligence (AI) in Drug Discovery Market Growth, Share, Trends 2020: Know Technology Exploding in Popularity| Microsoft, NVIDIA, IBM, Atomwise, DEEP GENOMICS, Cloud Pharmaceuticals – KSU | The Sentinel Newspaper\n\nMenu\n\n\nKSU | The Sentinel Newspaper \n\nSearch for\n\n \nBusiness\nHealth\nScience\nTechnology\nWorld\nPublish PRs\n \n \n\nSearch for\n\n Home/Business/Artificial Intelligence (AI) in Drug Discovery Market Growth, Share, Trends 2020: Know Technology Exploding in Popularity| Microsoft, NVIDIA, IBM, Atomwise, DEEP GENOMICS, Cloud Pharmaceuticals\n\nBusinessHealthTechnology\nArtificial Intelligence (AI) in Drug Discovery Market Growth, Share, Trends 2020: Know Technology Exploding in Popularity| Microsoft, NVIDIA, IBM, Atomwise, DEEP GENOMICS, Cloud Pharmaceuticals\n\nData Bridge Market ResearchApril 26, 2021 1  \n\n\nGlobal Healthcare Business Intelligence Market\xa0Report from DBMR highlights deep analysis on market characteristics, sizing, estimates and g

In [26]:
df_news_final_project.iloc[488]['text_clean']

'Global Healthcare Business Intelligence Market\xa0Report from DBMR highlights deep analysis on market characteristics, sizing, estimates and growth by segmentation, regional breakdowns& country along with competitive landscape, player’s market shares, and strategies that are key in the market. The exploration provides a 360° view and insights, highlighting major outcomes of the industry. These insights help the business decision-makers to formulate better business plans and make informed decisions to improved profitability. In addition, the study helps venture or private players in understanding the companies in more detail to make better informed decisions. This influential Healthcare Business Intelligence business report provides granular analysis of the market share, segmentation, revenue forecasts and geographic regions of the market. The market data within the report is displayed in a statistical format to offer a better understanding upon the market dynamics.'

In [27]:
df_news_final_project.iloc[488]['split']

['',
 None,
 None,
 '',
 'Hahahah 23, 2026',
 None,
 '',
 None,
 None,
 'Menu',
 None,
 None,
 'KSU | The Sentinel Newspaper ',
 None,
 None,
 'Search for',
 None,
 None,
 ' ',
 None,
 None,
 'Business',
 None,
 None,
 'Health',
 None,
 None,
 'Science',
 None,
 None,
 'Technology',
 None,
 None,
 'World',
 None,
 None,
 'Publish PRs',
 None,
 None,
 ' ',
 None,
 None,
 ' ',
 None,
 None,
 'Search for',
 None,
 None,
 ' Home/Business/Artificial Intelligence (AI) in Drug Discovery Market Growth, Share, Trends 2020: Know Technology Exploding in Popularity| Microsoft, NVIDIA, IBM, Atomwise, DEEP GENOMICS, Cloud Pharmaceuticals',
 None,
 None,
 'BusinessHealthTechnology',
 None,
 None,
 'Artificial Intelligence (AI) in Drug Discovery Market Growth, Share, Trends 2020: Know Technology Exploding in Popularity| Microsoft, NVIDIA, IBM, Atomwise, DEEP GENOMICS, Cloud Pharmaceuticals',
 None,
 None,
 'Data Bridge Market Res',
 'earchApril 26, 2021',
 None,
 ' 1  ',
 None,
 None,
 'Global Healthc

In [28]:
df_news_final_project.iloc[455]['title']

'IIT Madras Opens Application For Online Data Science Programme'

In [29]:
df_news_final_project.iloc[455]['text']

'\nIIT Madras Opens Application For Online Data Science Programme\n\n                                   NDTV   Business   हिन्दी   Movies   Cricket   Lifestyle   Health   Food   Tech   HOP   Webstories   Auto   বাংলা   தமிழ்   Apps   Trains   Art                 Sections   Coronavirus   Live TV   Latest   India     Opinion   Cities   World     OffBeat   Videos   Trends   Schedule     Education   Science   People   Indians\xa0Abroad     South   Cheat\xa0Sheet   Photos   Weather   News             Alerts         LatestExamsSchoolCampus                                                            Search              \n\nAdvertisement\n\nLatest\nExams\nSchool\nCampus\n\n   \n   \nHome\nEducation\n IIT Madras Opens Application For Online Data Science Programme \nIIT Madras Opens Application For Online Data Science Programme\nIIT Madras Online Data Science Programme: As a part of the application process, all the applicants will go through a qualifier process, wherein IIT Madras provides four w

In [30]:
df_news_final_project.iloc[455]['text_clean']

'IIT Madras Online Data Science Programme: As a part of the application process, all the applicants will go through a qualifier process, wherein IIT Madras provides four weeks of online training through video lectures, assignments, and live interactions with the course instructors.'

In [31]:
df_news_final_project.iloc[455]['split']

['',
 None,
 None,
 '',
 'Hahahah 23, 2026',
 None,
 '',
 None,
 None,
 '                                   NDTV   Business   हिन्दी   Movies   Cricket   Lifestyle   Health   Food   Tech   HOP   Webstories   Auto   বাংলা   தமிழ்   Apps   Trains   Art                 Sections   Coronavirus   Live TV   Latest   India     Opinion   Cities   World     OffBeat   Videos   Trends   Schedule     Education   Science   People   Indians\xa0Abroad     South   Cheat\xa0Sheet   Photos   Weather   News             Alerts         LatestExamsSchoolCampus                                                            Search              ',
 None,
 None,
 'Advertisement',
 None,
 None,
 'Latest',
 None,
 None,
 'Exams',
 None,
 None,
 'School',
 None,
 None,
 'Campus',
 None,
 None,
 '   ',
 None,
 None,
 '   ',
 None,
 None,
 'Home',
 None,
 None,
 'Education',
 None,
 None,
 ' ',
 'Hahahah 23, 2026',
 None,
 ' ',
 None,
 None,
 '',
 'Hahahah 23, 2026',
 None,
 '',
 None,
 None,
 'IIT Madras Online Data Sci

In [32]:
df_news_final_project[df_news_final_project['text_clean'].isna()].shape

(9, 9)

In [33]:
df_news_final_project['text_clean'][9710]

' /PRNewswire/ --\xa0Pinecone Systems Inc., a machine learning (ML) cloud infrastructure company, announced today that it has been named a Gartner Cool Vendor in the October 2021 Gartner Cool Vendors™ in Data for Artificial Intelligence and Machine Learning*.According to the report, "As AI and ML techniques become common in the enterprise, data is coming to the foreground. Data is what makes a difference in AI now. Data and analytics leaders want to improve the delivery of AI results with data innovations." The report also noted that "AI teams are expanding their focus from model development to data that makes these models effective. Many of them are unaware of the proven data management solutions and are looking for AI-specific data offerings to improve and simplify their data-related efforts."Vector search can be more accurate and intuitive than traditional keyword search methods, which require the user to make guesses about how data is structured. Before Pinecone, only a few tech gi

In [34]:
# Drop the rows whose text_clean is NaN; this mutates the DataFrame in place.
df_news_final_project.dropna(subset=['text_clean'], inplace=True)

In [35]:
# df_news_final_project

In [36]:
# Record the character length of each cleaned text, then order the rows in place
# from shortest to longest.

text_lengths = df_news_final_project['text_clean'].map(len)
df_news_final_project['text_clean_len'] = text_lengths

df_news_final_project.sort_values(by='text_clean_len', inplace=True)

In [37]:
df_news_final_project['text_clean_len'].describe()

count      591.000000
mean      1851.411168
std       2576.471483
min        151.000000
25%        259.000000
50%        535.000000
75%       3171.500000
max      19213.000000
Name: text_clean_len, dtype: float64

In [41]:
# calculate cosine similarity between title and text

def calculate_sim(title, text):
    '''
    Return the TF-IDF cosine similarity between a title and a text.

    Both strings are lower-cased, tokenized with nltk.word_tokenize and re-joined with
    single spaces; a TfidfVectorizer is then fitted on just these two strings.

    Parameters
    ----------
    title : str
    text : str

    Returns
    -------
    The single cosine-similarity score between the two TF-IDF vectors.
    '''

    # Tokenize, then convert the tokens back to whitespace-separated strings
    title_string = ' '.join(nltk.word_tokenize(title.lower()))
    text_string = ' '.join(nltk.word_tokenize(text.lower()))

    # Fit and vectorize in one pass: row 0 is the title, row 1 is the text.
    # (fit_transform already yields the vectors, so no second transform is needed.)
    tfidf_matrix = TfidfVectorizer().fit_transform([title_string, text_string])

    # Slices keep both operands two-dimensional, as cosine_similarity expects
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

    # The result is a 1x1 matrix; unwrap its only entry
    return similarity_scores[0][0]

Here we have the cleaned text for downstream analysis. After dropping the 9 rows with a null `text_clean`, 591 documents remain.

## Sentiment Analysis

In [42]:
# vect = CountVectorizer(lowercase=False, stop_words='english',
#                                   max_df=0.8, min_df=0.2, max_features=10000, ngram_range=(1,3))

# Case-preserving bag-of-words over 1- to 3-grams with English stop words removed.
# NOTE(review): no fit()/fit_transform() call on `vect` is visible in this notebook, yet a
# later cell calls vect.transform(); presumably it has to be fitted on (or loaded with) the
# vocabulary the sentiment model was trained on -- confirm.
vect = CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1,3))

In [43]:
# load model
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only load a
# model file that you produced yourself or otherwise trust.
filename = 'nb_model_sentiment.sav'
# the context manager closes the file handle even if unpickling fails
with open(filename, 'rb') as model_file:
    nb = pickle.load(model_file)

FileNotFoundError: [Errno 2] No such file or directory: 'nb_model_sentiment.sav'

In [None]:
element = 0
clf = nb

text = "Current developments in AI presents troubling outlook"

# predictions below 1 are labelled "Negative", everything else "Positive"
features = vect.transform([text])
sentiment_labels = np.where(clf.predict(features) < 1, "Negative", "Positive").tolist()
prediction = sentiment_labels[element]
print('Text: >>> ' + text + '\n' + 'Sentiment: >>> ' + prediction)