# NLP Final Project
Sam Ding

In [1]:
# basic data analytics
import pandas as pd
import numpy as np
import sklearn
import pickle

# nlp modules
import nltk
import spacy
import re

import multiprocessing
import string

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer


import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
import warnings

# warnings.simplefilter('once')
warnings.simplefilter('ignore')

num_processors = multiprocessing.cpu_count()
num_processors

workers = num_processors-1

print(f'Using {workers} workers')

Using 7 workers


In [2]:
# %%time

# df_news_final_project = pd.read_parquet('https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet', engine='pyarrow')
# df_news_final_project.shape

In [3]:
# # zero-shot classification
# import torch
# from transformers import pipeline
# classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [4]:
df_news_final_project = pd.read_csv('sample_600.csv', index_col=0)
# df_news_final_project.shape

In [5]:
df_news_final_project.head(2)

Unnamed: 0,url,date,language,title,text
39396,https://www.wkms.org/npr-news/npr-news/2022-10...,2022-10-10,en,Artificial intelligence could soon diagnose il...,\n\nArtificial intelligence could soon diagnos...
143316,https://www.wbko.com/prnewswire/2022/08/25/ult...,2022-08-25,en,UltraSight Receives CE Mark for Novel Cardiac ...,UltraSight Receives CE Mark for Novel Cardiac ...


In [6]:
# Clean-up newlines
# df_news_final_project['text_clean'] = df_news_final_project['text'].str.replace('\n', ' ')

# clean up tabs
df_news_final_project['text_clean'] = df_news_final_project['text'].str.replace('\t', ' ')

# clean up links
df_news_final_project['text_clean'] = df_news_final_project['text_clean'].str.replace(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', '')

# clean up remnants of web crawls
df_news_final_project['text_clean'] = df_news_final_project['text_clean'].str.replace(r'&#\d+;', '', case=False)

In [7]:
notitle = []
for i in df_news_final_project.index.values:
    notitle.append(df_news_final_project['text_clean'][i].replace(df_news_final_project['title'][i], 'Hahahah 23, 2026')) # replace by this chunk so title can also be split by pattern

df_news_final_project['text_notitle'] = notitle

# drop everything after string 'for more information'ABC

df_news_final_project['text_notitle'] = df_news_final_project['text_notitle'].str.split(r'[F|f]or more information', expand=True)[0]


In [8]:
pattern = r'(\w{3,10}\.*\s\d{1,2}\,*\s20\d{2})|(\d{1,2}\s\w{3,10}\.*\s20\d{2})|\n+'

df_news_final_project['split'] = df_news_final_project['text_notitle'].apply(lambda x: re.split(pattern=pattern, string=x))
df_news_final_project['split_len'] = df_news_final_project['split'].apply(lambda x: len(x))

In [None]:
# filter df_news_final_project so that it only contains articles that mentions "artificial intelligence" or 'data'
df_news_final_project = df_news_final_project[df_news_final_project['text_clean'].str.contains('artificial intelligence|data')]

len 1 was 116, after new pattern 78, after new pattern all splitable.

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def get_most_similar(index):
    '''
    This function takes in an index of the dataframe and returns the most similar text to the title,
    filtering out other unnecessary texts.
    '''

    title = df_news_final_project['title'][index]
    texts = df_news_final_project['split'][index]

    # print('Original:\n', texts)

    # compare capital letters and periods, drop if there are more capital letters in the texts
    texts = [x for x in texts if x != None]

    # print('Dropping None:\n', texts)

    # print('Dropping more period than capital:\n', texts)

    # take out texts that are too short
    texts = [x for x in texts if len(x) > 150]

    # print('Dropping too short:\n', texts)

    # drop texts with Tab patterns
    pattern = r'([A-Z][a-z]+(\n|\t)+){4}'
    texts =  [re.sub(pattern, '', x) for x in texts]

    # drop texts with Sign up or email patterns
    texts = [x for x in texts if re.search(r'([S|s]ign up)|([E|e]mail)', x) == None]

    # print('Dropping Tab patterns:\n', texts)

    # keep texts with small letter patterns
    pattern = r'([a-z]+\s){4}'
    texts = [x for x in texts if re.search(pattern, x) != None]

    # print('Keeping small letter patterns:\n', texts)


    # return NA if there are no text splits left
    if len(texts) == 0:
        return np.nan

    # Tokenize the title and texts
    tokenized_title = nltk.word_tokenize(title.lower())
    tokenized_texts = [nltk.word_tokenize(text.lower()) for text in texts]

    # Convert the tokenized texts to strings
    text_strings = [' '.join(tokens) for tokens in tokenized_texts]

    # print('Tokenized texts:\n', len(text_strings))

    index_to_return = 0

    # Create a TF-IDF vectorizer and fit it to the text strings
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(text_strings)
        # print(tfidf_matrix.shape)

    # Compute the cosine similarity between the title and each text
    title_vector = vectorizer.transform([' '.join(tokenized_title)])
    similarity_scores = cosine_similarity(title_vector, tfidf_matrix)

    most_similar_index = similarity_scores.argmax()
        
    most_similar_score = similarity_scores.max()
    index_to_return = most_similar_index
        
    list_articles = []
    list_scores = []

    for i in range(0,len(similarity_scores[0])):
        if similarity_scores[0][i] >= 0.07:
            list_articles.append(text_strings[i])
            list_scores.append(similarity_scores[0][i])
        # print(len(text_strings), len(similarity_scores[0]))
                
        # if len(text_strings) > 1:
        #     second_most_similar_index = similarity_scores.argsort()[0][-2]
        #     # print(similarity_scores)
        #     second_most_similar_score = similarity_scores[0][second_most_similar_index]
        #     if most_similar_score > 0.2:
        #         index_to_return = second_most_similar_index
        # # return texts[index_to_return]
        # return 

    # except:
    #     pass
    

    return ' '.join(list_articles)
    


In [10]:
most_similar = []

for i in df_news_final_project.index.values:
    most_similar.append(get_most_similar(i))

df_news_final_project['text_clean'] = most_similar

df_news_final_project.dropna(subset=['text_clean'], inplace=True)

df_news_final_project.shape

(574, 9)

In [11]:
pd.Series(df_news_final_project.index)

0       39396
1      143316
2      100092
3       21501
4       79650
        ...  
569     12183
570     69983
571      9710
572     45140
573    150157
Length: 574, dtype: int64

In [12]:
def extract_entities(indices):
    entities = []
    batch_size = len(indices)
    texts = df_news_final_project.loc[indices, 'text_clean']
    docs = nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])

    for i in range(0,batch_size):
        index = indices[i]
        doc = next(docs)
        for ent in doc.ents:
            if ent.label_ in ['PERSON', 'ORG', 'PRODUCT', 'EVENT']:
                entities.append((index, ent.text, ent.label_))

    return entities

In [17]:
from pandarallel import pandarallel

# Initialize Pandarallel
pandarallel.initialize()
# divide index into batches

nlp = spacy.load("en_core_web_md")

index_batches = np.array_split(df_news_final_project.index.values, 100)

# apply function to each batch

ner_df = pd.Series(index_batches).parallel_apply(extract_entities)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [18]:
# Create a dataframe from the list of entities
ner_df = pd.DataFrame([item for sublist in ner_df for item in sublist], columns=['index', 'entity', 'label'])

In [19]:
# search for specific entities with chatgpt
ner_df.groupby('entity').count().sort_values(by='index', ascending=False).head(50)

Unnamed: 0_level_0,index,label
entity,Unnamed: 1_level_1,Unnamed: 2_level_1
chatgpt,258,258
microsoft,171,171
covid-19,93,93
ibm,76,76
https,69,69
gray media group,68,68
"gray television , inc",68,68
google,65,65
amazon,47,47
fda,23,23


In [20]:
ner_df

Unnamed: 0,index,entity,label
0,39396,allison longyael bensoussan,PERSON
1,39396,the usf health 's,ORG
2,39396,dr.,PERSON
3,39396,yael bensoussan,PERSON
4,39396,the university of south florida 's,ORG
...,...,...,...
7497,150157,articlesafter,ORG
7498,150157,farjayhawks,ORG
7499,150157,mountaineersgradey dick,ORG
7500,150157,winkansas softball,ORG


In [21]:
# merge with ner_df
ner_df.merge(df_news_final_project['date'], left_on='index', right_index=True)

Unnamed: 0,index,entity,label,date
0,39396,allison longyael bensoussan,PERSON,2022-10-10
1,39396,the usf health 's,ORG,2022-10-10
2,39396,dr.,PERSON,2022-10-10
3,39396,yael bensoussan,PERSON,2022-10-10
4,39396,the university of south florida 's,ORG,2022-10-10
...,...,...,...,...
7497,150157,articlesafter,ORG,2023-02-27
7498,150157,farjayhawks,ORG,2023-02-27
7499,150157,mountaineersgradey dick,ORG,2023-02-27
7500,150157,winkansas softball,ORG,2023-02-27


In [25]:
ner_df[ner_df['label'].isin(['PERSON', 'ORG', 'PRODUCT'])].merge(df_news_final_project['date'], left_on='index', right_index=True)

Unnamed: 0,index,entity,label,date
0,39396,allison longyael bensoussan,PERSON,2022-10-10
1,39396,the usf health 's,ORG,2022-10-10
2,39396,dr.,PERSON,2022-10-10
3,39396,yael bensoussan,PERSON,2022-10-10
4,39396,the university of south florida 's,ORG,2022-10-10
...,...,...,...,...
7497,150157,articlesafter,ORG,2023-02-27
7498,150157,farjayhawks,ORG,2023-02-27
7499,150157,mountaineersgradey dick,ORG,2023-02-27
7500,150157,winkansas softball,ORG,2023-02-27


In [26]:
df_news_final_project.index

Index([ 39396, 143316, 100092,  21501,  79650,  83882, 146180,  45208, 139248,
         9867,
       ...
       129684, 102030, 130396, 117860, 114660,  12183,  69983,   9710,  45140,
       150157],
      dtype='int64', length=574)

## NER

In [27]:
ner_df

Unnamed: 0,index,entity,label
0,39396,allison longyael bensoussan,PERSON
1,39396,the usf health 's,ORG
2,39396,dr.,PERSON
3,39396,yael bensoussan,PERSON
4,39396,the university of south florida 's,ORG
...,...,...,...
7497,150157,articlesafter,ORG
7498,150157,farjayhawks,ORG
7499,150157,mountaineersgradey dick,ORG
7500,150157,winkansas softball,ORG


In [29]:
df_news_final_project.shape

(574, 9)

In [30]:
len("© 2023 ideastream public media1375 euclid avenue , cleveland , ohio 44115 ( 216 ) 916-6100 | ( 877 ) 399-3307wksu is a public media service licensed to kent state university and operated by ideastream public media . the tech world enthusiastic about the possibilities of artificial intelligence , but where does that leave meta 's plans for the metaverse ? some large companies are already dialing back their plans . © 2023 ideastream public media1375 euclid avenue , cleveland , ohio 44115 ( 216 ) 916-6100 | ( 877 ) 399-3307wksu is a public media service licensed to kent state university and operated by ideastream public media .")

632

In [31]:
# drop text clean that are less than 700 characters
df_news_final_project = df_news_final_project[df_news_final_project['text_clean'].apply(lambda x: len(x)) > 700]

Here we have the cleaned text for downstream analysis. Currently there are 587 documents that are not null and have the mentions of those words in Data Science.

## Sentiment Analysis

In [33]:
# pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [34]:
# targeted sentiment analysis

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

In [55]:
t = df_news_final_project['text_clean'].iloc[3]
sia.polarity_scores(t)

{'neg': 0.11, 'neu': 0.748, 'pos': 0.142, 'compound': 0.6798}

In [58]:
# extract each sentence

from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(t)

sentences

['the campaign ai now recruits more intelligently , and transfers units to threatened theaters actively .',
 'it is more active in construction and management of its fleets , and will also consider naval invasions against poorly protected shores .',
 'capital city defense behavior , defensive operations , weapons production and distribution among the troops , replacing commanders with better ones , and organizing early armies into grand armies with corps organization are also among the upgrades .',
 'the battle ai handles its troops better .',
 'in attack , it will bombard enemy positions with its artillery while waiting for more troops to arrive , before moving in to close combat , if required .',
 "it is more active in defensive battles to counter player 's flanking and encirclement movements with timely withdrawals , while also considering keeping a reserve to counter player 's maneuvers .",
 "all in all , there will be less confusion in the ai 's ranks during battles .",
 'along co

In [108]:
from ktrain.text.sentiment import SentimentAnalyzer
classifier = SentimentAnalyzer()



In [78]:
list(classifier.predict(sent).keys())[0]

'POSITIVE'

In [125]:
text = sents.pop(0)
# take the latter half of the text by length
text = text[len(text[0])//2:]

In [141]:
# show the 50 characters to the end
item[-600:]

" and gradient ventures ( google 's ai-focused fund ) . the funding will drive further development and commercialization of rad ai omni and rad ai continuity , the company 's first core offerings on its ai platform , and advance rad ai 's mission to empower radiologists with ai — saving them time , reducing burnout , and helping to improve the quality of patient care.rad ai logohow rad ai helps radiologists and improves patient carefounded in 2018 , rad ai has seen rapid adoption of its ai platform , and is already in use at 7 of the 10 largest private radiology practices in the united states ."

In [231]:
from ktrain.text.sentiment import SentimentAnalyzer
from textblob import TextBlob
classifier = SentimentAnalyzer()

def detect_sentiment(indices):
    sample = df_news_final_project.loc[indices, 'text_clean']
    sentiments = []
    batch_size = len(indices)
    
    for i in range(0,batch_size):
        # print('a')
        index = list(indices)[i]
        # print('b')
        sents = TextBlob(df_news_final_project['text_clean'][index]).sentences
        sents = [sentence.raw for sentence in sents]
        # print('c')
        item = ' '.join(sents[:2])
        # print('d')
        # take the bottom 500 characters
        item = item[-500:]
        # print('e')
        # print(item)
        sentiments.append(list(classifier.predict(item).keys())[0])
        # print('finished index', index)
        
    
    return sentiments

In [241]:
from concurrent.futures import ThreadPoolExecutor
def parallel_detect_sentiment(indices):
    with ThreadPoolExecutor() as executor:
        results = executor.map(detect_sentiment, indices)
    return list(results)

# Example usage:
indices = np.array_split(df_news_final_project.index.values, 100)[:10]
sentiments = parallel_detect_sentiment(indices)
print(sentiments)

[['NEUTRAL', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEUTRAL'], ['POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE'], ['NEUTRAL', 'NEGATIVE', 'POSITIVE', 'NEUTRAL', 'POSITIVE'], ['NEUTRAL', 'NEUTRAL', 'NEUTRAL', 'NEUTRAL', 'NEUTRAL'], ['POSITIVE', 'POSITIVE', 'NEUTRAL', 'NEUTRAL', 'NEUTRAL'], ['NEUTRAL', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE'], ['POSITIVE', 'NEUTRAL', 'NEUTRAL', 'POSITIVE', 'NEUTRAL'], ['POSITIVE', 'NEGATIVE', 'NEUTRAL', 'POSITIVE', 'NEUTRAL'], ['POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'NEUTRAL'], ['NEUTRAL', 'NEUTRAL', 'NEUTRAL', 'POSITIVE', 'NEUTRAL']]


In [211]:
list(classifier.predict(item).keys())

['NEUTRAL']

In [233]:
detect_sentiment(np.array([ 39396, 143316, 100092,  79650,  83882]))

['NEUTRAL', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEUTRAL']

In [216]:
df_news_final_project['title'][83882]

'Mohammad Hosseini: Should we bring AI into hospitals? Let’s find the middle ground'

In [160]:
np.array_split(df_news_final_project.index.values, 100)[:2]

[array([ 39396, 143316, 100092,  79650,  83882]),
 array([146180,  45208, 139248,   9867, 151392])]

In [234]:
%%time
from pandarallel import pandarallel
import time

# start_time = time.time()

# Initialize Pandarallel

classifier = SentimentAnalyzer()

# divide index into batches

index_batches = np.array_split(df_news_final_project.index.values, 100)[:10]


CPU times: user 1.32 s, sys: 535 ms, total: 1.86 s
Wall time: 1.27 s


In [237]:
index_batches

[array([ 39396, 143316, 100092,  79650,  83882]),
 array([146180,  45208, 139248,   9867, 151392]),
 array([134684,  36669, 131661,  85628, 150842]),
 array([ 66510,  53116, 105431, 147677,  47349]),
 array([ 64127,   3009, 196728,  80755, 108350]),
 array([ 18977, 190443,  79302,  28138,  90070]),
 array([  9097,  97881,  53612, 157201,  26861]),
 array([116294, 157031,  41667, 165204,  62685]),
 array([154932, 142833,  66542, 106304,  15151]),
 array([185684, 153973, 130908,  31550,  96694])]

In [235]:
from joblib import Parallel, delayed
def parallel_detect_sentiment(index_batches):
    sent_df = Parallel(n_jobs=7)(delayed(detect_sentiment)(batch) for batch in index_batches)
    return sent_df

sent_df = parallel_detect_sentiment(index_batches)



In [236]:
len(sent_df)

10

In [151]:
sample = list(df_news_final_project.sample(10)['text_clean'])

new_sample = []

for item in sample:
    sents = nltk.sent_tokenize(item)
    item = ' '.join(sents[:3])
    # take the bottom 500 characters
    item = item[-500:]
    print(item)
    new_sample.append(item)

liu jin , 31 , has never learned to paint . this “ painting ” was actually generated by him using software – ai painting websites and tools currently popular on social media have the ability to “ turn a sentence into a painting ” . how does ai paint ?
s vice president of engineering . sudeep is an accomplished technology leader with over 18 years of experience in the computer vision industry . in his new role he will be located in bengaluru , india and focused on building the next generation of products and services for the artificial intelligence industry.throughout his career sudeep focused on building computational imaging products and solutions at the intersection of computer vision , imaging , edge computing and autonomous capabilities .
 addition to dayton gastroenterology , a first for independent practices in dayton , stems from the physicians ' commitment to the continuous elevation of quality of care provided to gi patients throughout the greater dayton region . dr. appalanen

In [152]:
sentiments = []
for sent in new_sample:
    sentiments.append(list(classifier.predict(sent).keys())[0])

liu jin , 31 , has never learned to paint . this “ painting ” was actually generated by him using software – ai painting websites and tools currently popular on social media have the ability to “ turn a sentence into a painting ” . how does ai paint ?
s vice president of engineering . sudeep is an accomplished technology leader with over 18 years of experience in the computer vision industry . in his new role he will be located in bengaluru , india and focused on building the next generation of products and services for the artificial intelligence industry.throughout his career sudeep focused on building computational imaging products and solutions at the intersection of computer vision , imaging , edge computing and autonomous capabilities .
 addition to dayton gastroenterology , a first for independent practices in dayton , stems from the physicians ' commitment to the continuous elevation of quality of care provided to gi patients throughout the greater dayton region . dr. appalanen

In [153]:
sentiments

['NEUTRAL',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL',
 'NEUTRAL']

In [154]:
sample

["liu jin , 31 , has never learned to paint . this “ painting ” was actually generated by him using software – ai painting websites and tools currently popular on social media have the ability to “ turn a sentence into a painting ” . how does ai paint ? the reporter tried the overseas website dreamstudio . after adjusting the frame size and other information , the reporter entered keywords such as “ rose ” , “ clouds ” and “ river ” in the blank column of “ dream ” , and used “ oil painting ” as the style . after waiting for more than ten seconds , the ai \u200b\u200b\u200b\u200bgenerated a picture landscape oil painting in traditional style : pink roses grow on the banks of the river , and clouds under the setting sun are reflected in the river . then the reporter tried the mobile phone software ai dream generator . in addition to describing the picture scene , this mobile phone software also has different styles such as chinese style , cyberpunk style , oil painting style , dark styl

In [83]:
# get mode of the items
from collections import Counter

sentiments

counter = Counter(sentiments)
most_common_item = counter.most_common(1)[0][0]

In [84]:
most_common_item

'NEUTRAL'

In [None]:
# stopwords



In [93]:
%%time

import time
from nltk.tokenize import word_tokenize
from collections import defaultdict

# Define the number of topics
num_topics = 10

# Preprocess your news articles and create a list of documents
# Each document should be a list of tokens (words) representing an article
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

doc_complete = df_news_final_project.head(500)['text_clean'].values.tolist()

def clean(doc):
    if type(doc) != str:
        return ''
    doc = ' '.join([i for i in doc.split() if len(i) < 20])
    doc = ' '.join([i for i in doc.split() if len(i) > 1])
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in word_tokenize(punc_free))
    normalized = normalized.replace('’', '')
    normalized = normalized.replace('“', '')
    normalized = normalized.replace('”', '')
    return normalized

# Preprocess your documents
start_time = time.time()
cleaned_documents = [clean(doc) for doc in doc_complete]
clean_time = time.time() - start_time
print("Cleaning done in", clean_time, "seconds")

# Split the cleaned documents into tokens
start_time = time.time()
tokenized_documents = [doc.split() for doc in cleaned_documents]
tok_time = time.time() - start_time
print("Tokenization done in", tok_time, "seconds")

# Calculate word frequencies
start_time = time.time()
word_freq = defaultdict(int)
for document in tokenized_documents:
    for word in document:
        word_freq[word] += 1
freq_time = time.time() - start_time
print("Frequency Calculation Time:", freq_time, "seconds")

# Set the threshold for popular words
threshold = 0.5  # Adjust this value according to your needs

# Create a list of stop words based on the threshold
start_time = time.time()
stop_words = [word for word, freq in word_freq.items() if freq / len(tokenized_documents) > threshold]
stop_words_time = time.time() - start_time
print("Stop Words Creation Time:", stop_words_time, "seconds")

# Filter out stop words from the tokenized documents
start_time = time.time()
filtered_documents = [[word for word in document if word not in stop_words] for document in tokenized_documents]
filtering_time = time.time() - start_time
print("Stop Words Filtering Time:", filtering_time, "seconds")

# Create a dictionary from the preprocessed and filtered documents
start_time = time.time()
dictionary = corpora.Dictionary(filtered_documents)
dictionary_creation_time = time.time() - start_time
print("Dictionary Creation Time:", dictionary_creation_time, "seconds")

# Convert the dictionary into a bag-of-words representation
start_time = time.time()
corpus = [dictionary.doc2bow(doc) for doc in filtered_documents]
corpus_creation_time = time.time() - start_time
print("Corpus Creation Time:", corpus_creation_time, "seconds")

Cleaning done in 1.1734158992767334 seconds
Tokenization done in 0.011795759201049805 seconds
Frequency Calculation Time: 0.027017831802368164 seconds
Stop Words Creation Time: 0.0022242069244384766 seconds
Stop Words Filtering Time: 0.11146306991577148 seconds
Dictionary Creation Time: 0.11047101020812988 seconds
Corpus Creation Time: 0.06594204902648926 seconds
CPU times: user 1.46 s, sys: 44.7 ms, total: 1.5 s
Wall time: 1.5 s


In [95]:
all_documents = [' '.join(item) for item in filtered_documents]

In [96]:


tm = ktrain.text.get_topic_model(
    texts=all_documents,
    n_topics=num_topics, 
    n_features=10000,
    min_df=5,
    max_df=0.5,
    stop_words='english',
    model_type='lda',
    lda_max_iter=5,
    verbose=1)



lang: en
preprocessing texts...
fitting model...
iteration: 1 of max_iter: 5
iteration: 2 of max_iter: 5
iteration: 3 of max_iter: 5
iteration: 4 of max_iter: 5
iteration: 5 of max_iter: 5
done.


In [97]:
tm.print_topics()

topic 0 | patient medical clinical care cookie disease healthcare insurance lender imaging
topic 1 | communication project analytics guided cloud regulated course ibm transaction government
topic 2 | player forecast trend region provides study opportunity healthcare supply strategy
topic 3 | creative asset investment management insight campaign rain brand marketer morgan
topic 4 | tesla musk elon day alto ceo bot selfdriving great event
topic 5 | trump say tech investment organization offer region release way current
topic 6 | organization release security cloud digital capability today press enterprise support
topic 7 | study medical microsoft problem program bank today digital center financial
topic 8 | openai leading big microsoft africa organization analytics sector ceo access
topic 9 | openai say google microsoft search news tech way video question


In [98]:
from ktrain.text.zsl import ZeroShotClassifier

zsl = ZeroShotClassifier()
labels=['healthcare', 'self-driving', 'brand strategy', 'technology']



In [99]:
df_news_final_project['text_clean'].iloc[0]

"allison longyael bensoussan , md , is part of the usf health 's department of otolaryngology - head & neck surgery . she 's leading an effort to collect voice data that can be used to diagnose illnesses . the national institutes of health is funding a massive research project to collect voice data and develop an ai that could diagnose people based on their speech . everything from your vocal cord vibrations to breathing patterns when you speak offers potential information about your health , says laryngologist dr. yael bensoussan , the director of the university of south florida 's health voice center and a leader on the study . `` we asked experts : well , if you close your eyes when a patient comes in , just by listening to their voice , can you have an idea of the diagnosis they have ? '' bensoussan says . `` and that 's where we got all our information . '' someone who speaks low and slowly might have parkinson 's disease . slurring is a sign of a stroke . scientists could even di

In [104]:
df_news_final_project['title'].iloc[1]

'UltraSight Receives CE Mark for Novel Cardiac AI Technology'

In [124]:
doc = df_news_final_project['title'].iloc[1]
pred = zsl.predict(doc, labels=labels, include_labels=True)

# find closet topic
max(pred, key=lambda x: x[1])[0]

'technology'

In [86]:

# Classify new articles based on topics
new_articles = [
    "AI's applications in the healthcare industry",
    "How AI is revolutionizing the finance sector",
    "The impact of AI on the retail industry",
    "Transportation and AI: A futuristic combination"
]

preprocessed_new_articles = preprocessor.preprocess_text(new_articles)

predicted_topics = model.predict(preprocessed_new_articles)

for article, topic_id in zip(new_articles, predicted_topics):
    print(f"Article: {article}")
    print(f"Predicted Topic ID: {topic_id}")
    print()

'the campaign ai now recruits more intelligently, and transfers units to threatened theaters actively. capital city defense behavior, defensive operations, weapons production and distribution among the troops are also among the upgrades. along comes also a long list of bug fixes, balancing, ui and other improvements.'

In [None]:
df_news_final_project