# NLP Final Project
Sam Ding

In [158]:
# basic data analytics
import pandas as pd
import numpy as np
import sklearn
import pickle

# nlp modules
import nltk
import spacy
import re

import multiprocessing
import string

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer


import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
import warnings

# warnings.simplefilter('once')
warnings.simplefilter('ignore')

# Leave one core free for the notebook/OS, but never go below one worker:
# on a single-core machine `cpu_count() - 1` would be 0.
num_processors = multiprocessing.cpu_count()
workers = max(1, num_processors - 1)

print(f'Using {workers} workers')

Using 7 workers


In [159]:
# %%time

# df_news_final_project = pd.read_parquet('https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet', engine='pyarrow')
# df_news_final_project.shape

In [160]:
# # zero-shot classification
# import torch
# from transformers import pipeline
# classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [161]:
# Load the 600-article sample; index_col=0 makes the first CSV column the DataFrame index.
df_news_final_project = pd.read_csv('sample_600.csv', index_col=0)
# df_news_final_project.shape

In [162]:
df_news_final_project.head(3)

Unnamed: 0,url,date,language,title,text
39396,https://www.wkms.org/npr-news/npr-news/2022-10...,2022-10-10,en,Artificial intelligence could soon diagnose il...,\n\nArtificial intelligence could soon diagnos...
143316,https://www.wbko.com/prnewswire/2022/08/25/ult...,2022-08-25,en,UltraSight Receives CE Mark for Novel Cardiac ...,UltraSight Receives CE Mark for Novel Cardiac ...
100092,https://www.marketscreener.com/quote/stock/POO...,2022-11-08,en,"IN BRIEF: Poolbeg makes ""significant breakthro...","\n\nIN BRIEF: Poolbeg makes ""significant break..."


In [163]:
# Normalise whitespace and strip crawl artefacts from the raw article text.
# `regex=` is passed explicitly on every call: pandas 2.0 changed the default of
# Series.str.replace from regex=True to regex=False, under which the URL and
# HTML-entity patterns below would only be matched as literal strings.

# Clean-up newlines and tabs (plain literal replacements)
df_news_final_project['text_clean'] = (
    df_news_final_project['text']
    .str.replace('\n', ' ', regex=False)
    .str.replace('\t', ' ', regex=False)
)

# clean up links; `http\S+` already covers https, and the dot after "www" is
# escaped so that it only matches a literal "."
df_news_final_project['text_clean'] = df_news_final_project['text_clean'].str.replace(
    r'http\S+|www\.\S+', '', case=False, regex=True
)

# clean up remnants of web crawls (numeric HTML entities of the form &#123;)
df_news_final_project['text_clean'] = df_news_final_project['text_clean'].str.replace(
    r'&#\d+;', '', case=False, regex=True
)

In [164]:
df_news_final_project['title'].iloc[10]

'Data Science & Strategy Firm Schireson Acquires Stun Creative & Blackbird To Form "Known" -- A New Breed Of Modern Marketing Company'

In [165]:
# NER
# Load the `en_core_web_md` spaCy pipeline; only the entities it finds are used below.
nlp = spacy.load("en_core_web_md")

# Pipeline components switched off while the titles are processed.
skipped_components = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]

# Stream every title through the pipeline (batches of 200, two processes).
docs = nlp.pipe(
    df_news_final_project['title'].tolist(),
    disable=skipped_components,
    batch_size=200,
    n_process=2,
)

# One entry per detected entity, kept as three parallel lists:
# the article's index label, the entity text and the entity label.
indexlist, entities, labels = [], [], []
row_labels = df_news_final_project.index

for position, doc in enumerate(docs):
    found = doc.ents
    indexlist.extend([row_labels[position]] * len(found))
    entities.extend(ent.text for ent in found)
    labels.extend(ent.label_ for ent in found)

ner_df = pd.DataFrame({"Index": indexlist, "Entities": entities, 'Labels': labels})

In [166]:
ner_df

Unnamed: 0,Index,Entities,Labels
0,143316,UltraSight,ORG
1,143316,CE Mark,PERSON
2,102690,Square Peg,PERSON
3,102690,AI,ORG
4,21501,1.04,CARDINAL
...,...,...,...
1333,9710,Artificial Intelligence and Machine Learning,ORG
1334,45140,Hexaware Collaborate to,ORG
1335,45140,Help Customers Accelerate,ORG
1336,45140,Journey,PRODUCT


In [167]:
# find indexes where labels have no ORG

# Index labels of the articles whose title produced at least one ORG entity.
org_rows = ner_df['Labels'].eq('ORG')
with_org_list = list(ner_df.loc[org_rows, 'Index'].unique())

# All remaining index labels, i.e. articles whose title has no ORG entity.
no_org_list = list(set(df_news_final_project.index).difference(with_org_list))

# Show the titles of those articles.
df_news_final_project.loc[df_news_final_project.index.isin(no_org_list), 'title']

39396     Artificial intelligence could soon diagnose il...
100092    IN BRIEF: Poolbeg makes "significant breakthro...
21501     Patch 1.04: A Wagonload of AI · Grand Tacticia...
83882     Mohammad Hosseini: Should we bring AI into hos...
151392    SHUTTERSTOCK PARTNERS WITH OPENAI AND LEADS TH...
                                ...                        
9820          Artificial Intelligence Wish List - NewsBreak
119656    Rockies' trade deadline: Trevor Story, Jon Gra...
129684    Can ChatGPT help with investments if you want ...
117860    BuzzFeed to use artificial intelligence for co...
150157    How the KU community feels about ChatGPT and w...
Name: title, Length: 171, dtype: object

In [168]:
df_news_final_project.head(2)

Unnamed: 0,url,date,language,title,text,text_clean
39396,https://www.wkms.org/npr-news/npr-news/2022-10...,2022-10-10,en,Artificial intelligence could soon diagnose il...,\n\nArtificial intelligence could soon diagnos...,Artificial intelligence could soon diagnose ...
143316,https://www.wbko.com/prnewswire/2022/08/25/ult...,2022-08-25,en,UltraSight Receives CE Mark for Novel Cardiac ...,UltraSight Receives CE Mark for Novel Cardiac ...,UltraSight Receives CE Mark for Novel Cardiac ...


In [169]:
# Replace each article's own headline inside its cleaned text with a fake date string.
# The placeholder is shaped like a date so that the date-pattern split applied to
# 'text_notitle' also cuts the text at the position where the title used to be.
TITLE_PLACEHOLDER = 'Hahahah 23, 2026'

# The two columns are paired positionally instead of being looked up label by label:
# a lookup such as df['col'][i] returns a Series (not a str) when an index label is
# duplicated. An empty or missing title leaves the text untouched, because
# str.replace('', x) would insert the placeholder between every character.
notitle = [
    text.replace(title, TITLE_PLACEHOLDER) if isinstance(title, str) and title else text
    for text, title in zip(
        df_news_final_project['text_clean'], df_news_final_project['title']
    )
]

df_news_final_project['text_notitle'] = notitle


In [170]:
# Date-like strings in two layouts: "<word> <day>, 20yy" or "<day> <word> 20yy".
pattern = r'(\w{3,10}\.*\s\d{1,2}\,*\s20\d{2})|(\d{1,2}\s\w{3,10}\.*\s20\d{2})'
date_splitter = re.compile(pattern)

# Cut every article at each date-like string. Because the pattern has two capturing
# groups, the result also contains the captured date text, plus None for the group
# that did not take part in a given match.
df_news_final_project['split'] = df_news_final_project['text_notitle'].map(date_splitter.split)

# Number of pieces per article (1 means no date-like string was found).
df_news_final_project['split_len'] = df_news_final_project['split'].map(len)

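Number of articles that could not be split (`split_len == 1`): 116 with the first pattern, 78 after the first revision, and 0 with the current pattern, so every article is now splittable.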

In [202]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def get_most_similar(index, df=None, min_length=600):
    '''
    Return the chunk of an article that best matches the article's title.

    The row's 'split' entry (a list of text chunks that may contain None) is
    filtered down to candidate chunks; the candidate with the highest TF-IDF
    cosine similarity to the row's 'title' is returned.

    Parameters
    ----------
    index : index label
        Label of the row to process.
    df : pandas.DataFrame, optional
        Frame holding the 'title' and 'split' columns. Defaults to the notebook's
        global ``df_news_final_project``.
    min_length : int, optional
        Chunks must be longer than this many characters to be kept (default 600).

    Returns
    -------
    str or float
        The selected chunk, or ``np.nan`` when no chunk passes the filters.
    '''
    if df is None:
        df = df_news_final_project

    # Single label-based lookups instead of chained df[col][label] indexing.
    title = df.loc[index, 'title']
    chunks = df.loc[index, 'split']

    # Drop None entries (re.split emits them for capture groups that did not match).
    chunks = [chunk for chunk in chunks if chunk is not None]

    # NOTE(review): str.count() takes a literal substring, not a regex, so the right-hand
    # side counts occurrences of the text "[A-Z]" rather than capital letters. In practice
    # this keeps every chunk that contains at least one period. The original comments
    # describe a capitals-vs-periods comparison; counting real capitals would reject
    # ordinary prose, so the existing behaviour is kept here - confirm the intended rule.
    chunks = [chunk for chunk in chunks if chunk.count('.') > chunk.count('[A-Z]')]

    # Take out chunks that are too short.
    chunks = [chunk for chunk in chunks if len(chunk) > min_length]

    # Keep only chunks that contain three consecutive capitalised words.
    # NOTE(review): the original comment said "drop texts with Tab patterns", but the
    # condition has always KEPT the matching chunks; behaviour unchanged - confirm intent.
    capitalised_run = re.compile(r'([A-Z][a-z]+(\n|\t|\s)+){3}')
    chunks = [chunk for chunk in chunks if capitalised_run.search(chunk) is not None]

    # Nothing left to choose from.
    if not chunks:
        return np.nan

    # With a single candidate the ranking below can only pick index 0,
    # so skip tokenising and vectorising altogether.
    if len(chunks) == 1:
        return chunks[0]

    # Tokenize with nltk and re-join, so title and chunks are normalised the same way
    # before TfidfVectorizer applies its own tokenisation.
    title_string = ' '.join(nltk.word_tokenize(title.lower()))
    text_strings = [' '.join(nltk.word_tokenize(chunk.lower())) for chunk in chunks]

    try:
        # Fit TF-IDF on the candidate chunks and project the title into the same space.
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(text_strings)
        title_vector = vectorizer.transform([title_string])

        # Row 0 holds the similarity of the title to every candidate chunk.
        similarity_scores = cosine_similarity(title_vector, tfidf_matrix)
        most_similar_index = int(similarity_scores.argmax())
    except ValueError:
        # TfidfVectorizer raises ValueError when it cannot build a vocabulary
        # (e.g. only stop words or punctuation); fall back to the first candidate.
        # Any other exception is a real problem and is allowed to propagate.
        most_similar_index = 0

    # keep most similar text
    return chunks[most_similar_index]


In [203]:
# Select one text chunk (or NaN) per article, walking the rows in index order.
most_similar = [
    get_most_similar(row_label)
    for row_label in df_news_final_project.index.values
]

In [204]:
# Overwrite 'text_clean' with the chunk chosen by get_most_similar for each row
# (NaN where no chunk passed the filters). The previous 'text_clean' values are replaced.
df_news_final_project['text_clean'] = most_similar

In [205]:
df_news_final_project.iloc[324]['title']

'SECO invests in the Artificial Intelligence of Oro Networks'

In [208]:
df_news_final_project.iloc[324]['text']

'\n\n \n\nSECO invests in the Artificial Intelligence of Oro Networks\nResources\n \nBlog\n\n\nJournalists\nLog In\n \nSign Up\n \nData Privacy\n\n\nSend a Release\n \nNews\n\n\nProducts \nOverview\n\n\nDistribution by PR Newswire\n\n\nCision Communications Cloud®\n\n\nCision IR\n\n\nAll Products\n\nContact \nGeneral Inquiries\n\n\nRequest a Demo\n\n\nEditorial Bureaus\n\n\nPartnerships\n\n\nMedia Inquiries\n\n\nWorldwide Offices\n\nSearch\n\xa0\nSearch\nWhen typing in this field, a list of search results will appear and be automatically updated as you type.\n\nSearching for your content...\nNo results found. Please change your search terms and try again.\n\n\n \nNews in Focus\n\nBrowse News Releases\n\nAll News Releases\nAll Public Company\nEnglish-only\n\n\nNews Releases Overview \n\nMultimedia Gallery\n\nAll Multimedia\nAll Photos\nAll Videos\n\n\n  Multimedia Gallery Overview \n\nTrending Topics\n\nAll Trending Topics\nBusiness & Money\n\n\nAuto & TransportationAll Automotive & Tra

In [207]:
df_news_final_project.iloc[324]['split']

['     ',
 'Hahahah 23, 2026',
 None,
 " Resources   Blog   Journalists Log In   Sign Up   Data Privacy   Send a Release   News   Products  Overview   Distribution by PR Newswire   Cision Communications Cloud®   Cision IR   All Products  Contact  General Inquiries   Request a Demo   Editorial Bureaus   Partnerships   Media Inquiries   Worldwide Offices  Search \xa0 Search When typing in this field, a list of search results will appear and be automatically updated as you type.  Searching for your content... No results found. Please change your search terms and try again.     News in Focus  Browse News Releases  All News Releases All Public Company English-only   News Releases Overview   Multimedia Gallery  All Multimedia All Photos All Videos     Multimedia Gallery Overview   Trending Topics  All Trending Topics Business & Money   Auto & TransportationAll Automotive & Transportation Aerospace, Defense Air Freight Airlines & Aviation Automotive Maritime & Shipbuilding Railroads and Inter

In [206]:
df_news_final_project.iloc[324]['text_clean']

' /PRNewswire/ -- SECO S.p.A. ("SECO"), reference player in the field of technological innovation and Internet of Things solutions, today announced the signing of a binding agreement for the acquisition of the assets of Oro Networks LLC and its subsidiaries ("Oro").  SECO acquires Oro Networks - Ajay Malik (CEO Oro Networks), Massimo Mauri (CEO SECO)  Founded in 2018 by Ajay Malik, previously serving as executive manager in companies like Google, Cisco and author of several publications on the AI, Oro designs and develops Artificial Intelligence solutions which are made available on a Software-as-a-Service basis. Oro\'s solutions allow to rapidly connect and cloud manage any hardware for smart control, monitoring and display of actionable insights using machine learning, deep learning, predictive analytics and data mining. Thanks to Oro\'s real-time edge AI solution, any device can be turned into a cloud managed intelligent endpoint, and customized AI pipelines can be easily deployed o

In [211]:
df_news_final_project[df_news_final_project['text_clean'].isna()].shape

(14, 9)

In [212]:
# Remove, in place, the rows whose 'text_clean' is NaN (no text chunk was selected for them).
df_news_final_project.dropna(subset=['text_clean'], inplace=True)

In [213]:
df_news_final_project

(586, 9)

The cleaned text is now ready for downstream analysis: 586 of the 600 sampled documents have a non-null `text_clean`.

## Sentiment Analysis

In [181]:
# vect = CountVectorizer(lowercase=False, stop_words='english',
#                                   max_df=0.8, min_df=0.2, max_features=10000, ngram_range=(1,3))

# Count-based features: case preserved, English stop words removed, 1- to 3-grams.
# NOTE(review): no fit() call on `vect` is visible in this section; transform() needs a
# vocabulary, presumably the one the loaded model was trained with - confirm where it comes from.
vect = CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1,3))

In [182]:
# load model
# NOTE: pickle.load can execute arbitrary code contained in the file -
# only load model files that come from a trusted source.
filename = 'nb_model_sentiment.sav'

# A context manager closes the file handle even if unpickling fails.
with open(filename, 'rb') as model_file:
    nb = pickle.load(model_file)

FileNotFoundError: [Errno 2] No such file or directory: 'nb_model_sentiment.sav'

In [None]:
# Classify one example sentence with the loaded model.
clf = nb
element = 0  # position of the prediction to report (only one text is passed in)

text = "Current developments in AI presents troubling outlook"

# Predictions below 1 are reported as "Negative", everything else as "Positive".
features = vect.transform([text])
sentiment_labels = np.where(clf.predict(features) < 1, "Negative", "Positive").tolist()
prediction = sentiment_labels[element]

print('Text: >>> ' + text + '\n' + 'Sentiment: >>> ' + prediction)