In [2]:
# basic data analytics
import pandas as pd
import numpy as np
import sklearn
import pickle

# nlp modules
import nltk
import spacy
import re

import multiprocessing
import string

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer


import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
import warnings

# warnings.simplefilter('once')
warnings.simplefilter('ignore')

num_processors = multiprocessing.cpu_count()
num_processors

workers = num_processors-1

print(f'Using {workers} workers')

Using 7 workers


In [3]:
%%time
df_news_final_project = pd.read_parquet('https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet', engine='pyarrow')
df_news_final_project.shape

(200332, 5)

In [12]:
# df_news_final_project.to_csv("/home/jupyter/df_news.csv")

In [4]:
df_news_final_project.head(3)

Unnamed: 0,url,date,language,title,text
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...
1,http://newsparliament.com/2020/02/27/children-...,2020-02-27,en,Children With Autism Saw Their Learning and So...,\nChildren With Autism Saw Their Learning and ...
2,http://www.dataweek.co.za/12835r,2021-03-26,en,"Forget ML, AI and Industry 4.0 – obsolescence ...","\n\nForget ML, AI and Industry 4.0 – obsolesce..."


In [8]:
%%time
# clean up tabs
df_news_final_project['text_clean'] = df_news_final_project['text'].apply(lambda x: x.replace('\t', ' '))

# clean up links
df_news_final_project['text_clean'] = df_news_final_project['text_clean'].apply(lambda x: x.replace(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', ''))

# clean up remnants of web crawls
df_news_final_project['text_clean'] = df_news_final_project['text_clean'].apply(lambda x: x.replace(r'&#\d+;', ''))

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs


In [9]:
df_news_final_project.head(2)

Unnamed: 0,url,date,language,title,text,text_clean
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,\n\nArtificial intelligence improves parking e...
1,http://newsparliament.com/2020/02/27/children-...,2020-02-27,en,Children With Autism Saw Their Learning and So...,\nChildren With Autism Saw Their Learning and ...,\nChildren With Autism Saw Their Learning and ...


In [10]:
%%time
notitle = []
for i in df_news_final_project.index.values:
    notitle.append(df_news_final_project['text_clean'][i].replace(df_news_final_project['title'][i], 'Hahahah 23, 2026')) # replace by this chunk so title can also be split by pattern

df_news_final_project['text_notitle'] = notitle

# drop everything after string 'for more information'ABC
df_news_final_project['text_notitle'] = df_news_final_project['text_notitle'].str.split(r'[F|f]or more information', expand=True)[0]


CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.15 µs


In [12]:
%%time

pattern = r'(\w{3,10}\.*\s\d{1,2}\,*\s20\d{2})|(\d{1,2}\s\w{3,10}\.*\s20\d{2})|\n+'
df_news_final_project['split'] = df_news_final_project['text_notitle'].apply(lambda x: re.split(pattern=pattern, string=x))
df_news_final_project['split_len'] = df_news_final_project['split'].apply(lambda x: len(x))

CPU times: user 6min 3s, sys: 1.89 s, total: 6min 5s
Wall time: 6min 5s


In [13]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def get_most_similar(index):
    '''
    This function takes in an index of the dataframe and returns the most similar text to the title,
    filtering out other unnecessary texts.
    '''

    title = df_news_final_project['title'][index]
    texts = df_news_final_project['split'][index]

    # print('Original:\n', texts)

    # drop none ones
    texts = [x for x in texts if x != None]

    # print('Dropping None:\n', texts)

    # print('Dropping more period than capital:\n', texts)

    # take out texts that are too short
    texts = [x for x in texts if len(x) > 150]

    # print('Dropping too short:\n', texts)

    # drop texts with Tab patterns
    pattern = r'([A-Z][a-z]+(\n|\t)+){4}'
    texts =  [re.sub(pattern, '', x) for x in texts]

    # drop texts with Sign up or email patterns
    texts = [x for x in texts if re.search(r'([S|s]ign up)|([E|e]mail)', x) == None]

    # print('Dropping Tab patterns:\n', texts)

    # keep texts with small letter patterns
    pattern = r'([a-z]+\s){4}'
    texts = [x for x in texts if re.search(pattern, x) != None]

    # print('Keeping small letter patterns:\n', texts)


    # return NA if there are no text splits left
    if len(texts) == 0:
        return np.nan

    # Tokenize the title and texts
    tokenized_title = nltk.word_tokenize(title.lower())
    tokenized_texts = [nltk.word_tokenize(text.lower()) for text in texts]

    # Convert the tokenized texts to strings
    text_strings = [' '.join(tokens) for tokens in tokenized_texts]

    # print('Tokenized texts:\n', len(text_strings))

    # Create a TF-IDF vectorizer and fit it to the text strings
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(text_strings)
        # print(tfidf_matrix.shape)

    # Compute the cosine similarity between the title and each text
    title_vector = vectorizer.transform([' '.join(tokenized_title)])
    similarity_scores = cosine_similarity(title_vector, tfidf_matrix)

    most_similar_index = similarity_scores.argmax()
        
    most_similar_score = similarity_scores.max()
    index_to_return = most_similar_index
        
    list_articles = []
    list_scores = []

    for i in range(0,len(similarity_scores[0])):
        if similarity_scores[0][i] >= 0.07:
            list_articles.append(text_strings[i])
            list_scores.append(similarity_scores[0][i])
        # print(len(text_strings), len(similarity_scores[0]))
                
        # if len(text_strings) > 1:
        #     second_most_similar_index = similarity_scores.argsort()[0][-2]
        #     # print(similarity_scores)
        #     second_most_similar_score = similarity_scores[0][second_most_similar_index]
        #     if most_similar_score > 0.2:
        #         index_to_return = second_most_similar_index
        # # return texts[index_to_return]
        # return 

    # except:
    #     pass
    

    return ' '.join(list_articles)
    


In [18]:
%%time
import time
from multiprocessing import Pool

most_similar = []
cnt = 0
batch_size = 500
total_batches = (len(df_news_final_project) + batch_size - 1) // batch_size

def process_item(i):
    return get_most_similar(i)

with Pool() as pool:
    for batch in range(total_batches):
        start_time = time.time()
        batch_indices = df_news_final_project.index.values[batch * batch_size: (batch + 1) * batch_size]
        results = pool.imap(process_item, batch_indices)

        for result in results:
            most_similar.append(result)

        end_time = time.time()
        batch_time = end_time - start_time
        print(f"Batch {batch + 1}/{total_batches} - Time spent: {batch_time:.2f} seconds")
    
df_news_final_project['text_clean'] = most_similar
df_news_final_project.dropna(subset=['text_clean'], inplace=True)
df_news_final_project.shape

Batch 1/401 - Time spent: 1.32 seconds
Batch 2/401 - Time spent: 0.93 seconds
Batch 3/401 - Time spent: 1.03 seconds
Batch 4/401 - Time spent: 0.90 seconds
Batch 5/401 - Time spent: 0.99 seconds
Batch 6/401 - Time spent: 0.87 seconds
Batch 7/401 - Time spent: 0.98 seconds
Batch 8/401 - Time spent: 0.91 seconds
Batch 9/401 - Time spent: 0.94 seconds
Batch 10/401 - Time spent: 0.93 seconds
Batch 11/401 - Time spent: 1.00 seconds
Batch 12/401 - Time spent: 0.95 seconds
Batch 13/401 - Time spent: 0.96 seconds
Batch 14/401 - Time spent: 0.94 seconds
Batch 15/401 - Time spent: 0.97 seconds
Batch 16/401 - Time spent: 0.95 seconds
Batch 17/401 - Time spent: 0.99 seconds
Batch 18/401 - Time spent: 0.91 seconds
Batch 19/401 - Time spent: 0.93 seconds
Batch 20/401 - Time spent: 0.91 seconds
Batch 21/401 - Time spent: 0.95 seconds
Batch 22/401 - Time spent: 0.97 seconds
Batch 23/401 - Time spent: 0.93 seconds
Batch 24/401 - Time spent: 0.95 seconds
Batch 25/401 - Time spent: 0.89 seconds
Batch 26/

(190013, 9)

In [19]:
df_news_final_project.head(10)

Unnamed: 0,url,date,language,title,text,text_clean,text_notitle,split,split_len
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,thanks to the application of an artificial int...,"\n\nHahahah 23, 2026\n\nHome\nChina Politics\n...","[, None, None, , Hahahah 23, 2026, None, , Non...",190
1,http://newsparliament.com/2020/02/27/children-...,2020-02-27,en,Children With Autism Saw Their Learning and So...,\nChildren With Autism Saw Their Learning and ...,scientists who designed an artificially clever...,"\nHahahah 23, 2026\n \n\nSkip to content\n T...","[, None, None, , Hahahah 23, 2026, None, , Non...",571
2,http://www.dataweek.co.za/12835r,2021-03-26,en,"Forget ML, AI and Industry 4.0 – obsolescence ...","\n\nForget ML, AI and Industry 4.0 – obsolesce...",the world entered a new era of accelerated tra...,"\n\nHahahah 23, 2026\nHome\nAbout us\nBack iss...","[, None, None, , Hahahah 23, 2026, None, , Non...",457
3,http://www.homeoffice.consumerelectronicsnet.c...,2021-03-10,en,Strategy Analytics: 71% of Smartphones Sold Gl...,\n\nStrategy Analytics: 71% of Smartphones Sol...,boston– ( business wire ) –strategy analytics ...,"\n\nHahahah 23, 2026\n \nSkip to content\n\nCo...","[, None, None, , Hahahah 23, 2026, None, , Non...",91
4,http://www.itbusinessnet.com/2020/10/olympus-t...,2020-10-20,en,Olympus to Support Endoscopic AI Diagnosis Edu...,\n\nOlympus to Support Endoscopic AI Diagnosis...,– ( acn newswire ) – olympus corporation took ...,"\n\nHahahah 23, 2026\n \nSkip to content\n\nIT...","[, None, None, , Hahahah 23, 2026, None, , Non...",1468
5,http://www.mysmartrend.com/news-briefs/technic...,2020-04-17,en,Cr Bard Inc Has Returned 48.9% Since SmarTrend...,\n\nCr Bard Inc Has Returned 48.9% Since SmarT...,smartrend identified an uptrend for cr bard in...,"\n\nHahahah 23, 2026\n \nOWL LOGIN / \n ...","[, None, None, , Hahahah 23, 2026, None, , Non...",184
6,http://www.pentictonherald.ca/entertainment/na...,2020-12-08,en,From the Bard to broadcaster: Stratford Festiv...,\n\nFrom the Bard to broadcaster: Stratford Fe...,". stratford , ont. , stage star alexis gordon ...","\n\nHahahah 23, 2026\n\nYou have permission to...","[, None, None, , Hahahah 23, 2026, None, , Non...",859
7,http://www.peripherals.consumerelectronicsnet....,2021-02-25,en,MulticoreWare Inc. Becomes CEVA’s Trusted Part...,\n\nMulticoreWare Inc. Becomes CEVA’s Trusted ...,"multicoreware inc. , joins the cevanet™ partne...","\n\nHahahah 23, 2026\n\n \nSkip to content\n\n...","[, None, None, , Hahahah 23, 2026, None, , Non...",76
8,http://www.productivityapps.itbusinessnet.com/...,2020-06-23,en,Applitools Visual AI Reaches One Billion Image...,\n\nApplitools Visual AI Reaches One Billion I...,/prnewswire/ — applitools ( https : //applitoo...,"\n\nHahahah 23, 2026\n\n \nSkip to content\n\n...","[, None, None, , Hahahah 23, 2026, None, , Non...",124
9,https://3wnews.org/uncategorised/1351502/artif...,2020-06-14,en,Artificial Intelligence In Behavioral And Ment...,\nArtificial Intelligence In Behavioral And Me...,artificial intelligence in behavioral and ment...,"\nHahahah 23, 2026\n\nSkip to content\nSunday,...","[, None, None, , Hahahah 23, 2026, None, , Non...",253


In [20]:
df_news_final_project.to_csv('/home/jupyter/df_news.csv')

In [21]:
# df_news_final_project

Unnamed: 0,url,date,language,title,text,text_clean,text_notitle,split,split_len
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,thanks to the application of an artificial int...,"\n\nHahahah 23, 2026\n\nHome\nChina Politics\n...","[, None, None, , Hahahah 23, 2026, None, , Non...",190
1,http://newsparliament.com/2020/02/27/children-...,2020-02-27,en,Children With Autism Saw Their Learning and So...,\nChildren With Autism Saw Their Learning and ...,scientists who designed an artificially clever...,"\nHahahah 23, 2026\n \n\nSkip to content\n T...","[, None, None, , Hahahah 23, 2026, None, , Non...",571
2,http://www.dataweek.co.za/12835r,2021-03-26,en,"Forget ML, AI and Industry 4.0 – obsolescence ...","\n\nForget ML, AI and Industry 4.0 – obsolesce...",the world entered a new era of accelerated tra...,"\n\nHahahah 23, 2026\nHome\nAbout us\nBack iss...","[, None, None, , Hahahah 23, 2026, None, , Non...",457
3,http://www.homeoffice.consumerelectronicsnet.c...,2021-03-10,en,Strategy Analytics: 71% of Smartphones Sold Gl...,\n\nStrategy Analytics: 71% of Smartphones Sol...,boston– ( business wire ) –strategy analytics ...,"\n\nHahahah 23, 2026\n \nSkip to content\n\nCo...","[, None, None, , Hahahah 23, 2026, None, , Non...",91
4,http://www.itbusinessnet.com/2020/10/olympus-t...,2020-10-20,en,Olympus to Support Endoscopic AI Diagnosis Edu...,\n\nOlympus to Support Endoscopic AI Diagnosis...,– ( acn newswire ) – olympus corporation took ...,"\n\nHahahah 23, 2026\n \nSkip to content\n\nIT...","[, None, None, , Hahahah 23, 2026, None, , Non...",1468
...,...,...,...,...,...,...,...,...,...
200327,https://www.zazoom.it/2020-12-14/comviva-and-o...,2020-12-14,en,Comviva and Ooredoo Kuwait form strategic part...,\n\nComviva and Ooredoo Kuwait form strategic ...,"/prnewswire/ comviva , the global leader in mo...","\n\nHahahah 23, 2026\n\nAccedi\n\n\nHome\n\n\n...","[, None, None, , Hahahah 23, 2026, None, , Non...",205
200328,https://www.zdnet.com/article/afp-used-clearvi...,2020-04-15,en,AFP used Clearview AI facial recognition softw...,AFP used Clearview AI facial recognition softw...,the australian federal police ( afp ) has admi...,"Hahahah 23, 2026\n\n ...","[, Hahahah 23, 2026, None, , None, None, ...",1318
200329,https://www.zdnet.com/article/from-data-to-kno...,2020-12-22,en,From data to knowledge and AI via graphs: Tech...,\n\nFrom data to knowledge and AI via graphs: ...,"in the new knowledge-based digital world , enc...","\n\nHahahah 23, 2026\n\n \n\n ...","[, None, None, , Hahahah 23, 2026, None, , Non...",1030
200330,https://yourstory.com/2020/10/ys-learn-helping...,2020-10-07,en,[YS Learn] From helping diabetic patients to m...,[YS Learn] From helping diabetic patients to m...,by sindhu kashyaap|7th oct 2020the minimum via...,"Hahahah 23, 2026LOGINYourStoryEducationHerStor...","[, Hahahah 23, 2026, None, LOGINYourStoryEduca...",37
