# Importing and Cleaning data
*Articles used are from a financial news dataset*

In [None]:
# Extract the article archive into /content/data.
# NOTE(review): the loading cell further down reads from a different, absolute
# path rather than /content/data — confirm which location is intended.
!unzip /content/2013-01-.zip -d /content/data

## Importing data

In [3]:
import pandas as pd
import os
import re
from datetime import datetime
from tqdm import tqdm

def list_files(dir):
    """Read every file that lives under a date-named directory of ``dir``.

    Walks ``dir`` recursively. Files are loaded only from directories whose
    path contains a ``YYYY-MM-DD`` component; files in any other directory
    are skipped.

    Args:
        dir: Root directory to walk.

    Returns:
        list[str]: The contents of each matching file, in ``os.walk`` order.

    Raises:
        ValueError: If the date-like component of a directory that holds
            files is not a real calendar date (e.g. ``2013-13-45``).
    """
    text = []
    for root, dirs, files in tqdm(os.walk(dir)):
        if not files:
            continue
        # The date belongs to the directory, so match once per directory
        # instead of once per file.
        match = re.search(r"\d{4}-\d{2}-\d{2}", root)
        if match is None:
            continue
        # Parsed only to validate the date; the value itself is not stored.
        datetime.strptime(match.group(), "%Y-%m-%d")
        for name in files:
            # `with` guarantees the handle is closed even if read() raises.
            with open(os.path.join(root, name), "r") as txtfile:
                text.append(txtfile.read())
    return text

# Folder holding the January 2013 articles; named separately so the location
# is easy to spot and change.
articles_root = "/home/prairit/Documents/20061020_20131126_bloomberg_news/2013-01-"
text = list_files(articles_root)
len(text)

14it [00:00, 52.91it/s]


3831

## Cleaning data

In [4]:
# Stage 1: drop URLs, trim the ends, and flatten newlines into spaces.
text_clean = [
    re.sub(r'http\S+', '', raw).strip().replace('\n', ' ')
    for raw in text
]
# Stage 2: drop e-mail addresses, then blank out backticks and double dashes,
# trimming the ends before each replacement.
docs = [
    re.sub(r'[\w\.-]+@[\w\.-]+', '', article)
    .strip().replace('`', ' ')
    .strip().replace('--', ' ')
    for article in text_clean
]

In [5]:
docs[0]

'  Gunmen Kill 7 During Mosque Prayers in Nigerian Villages   B y   A r d o   H a z z a d   2013-01-06T15:47:00Z    Unidentified attackers opened fire during prayers yesterday in 3 villages in northwestern Zamfara state, Ibrahim Birnin Magaji, the state’s Commissioner of Information, says by phone. * Gunmen killed 7 worshipers, wounded others: Magaji  To contact the reporter on this story: Ardo Hazzad in Bauchi at      To contact the editor responsible for this story: Antony Sguazzin at'

# Summarize all the documents

For BERTopic to process the articles properly, each article needs to be at most 512 tokens long, because BERTopic uses DistilBERT under the hood. We therefore summarize every article before topic modelling.

In [6]:
!pip install bert-extractive-summarizer
!pip install sentencepiece

Collecting bert-extractive-summarizer
  Downloading bert_extractive_summarizer-0.7.1-py3-none-any.whl (18 kB)
Collecting spacy
  Downloading spacy-3.0.6-cp38-cp38-manylinux2014_x86_64.whl (13.0 MB)
[K     |████████████████████████████████| 13.0 MB 5.7 MB/s eta 0:00:01
Collecting thinc<8.1.0,>=8.0.3
  Downloading thinc-8.0.3-cp38-cp38-manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 6.3 MB/s eta 0:00:01
[?25hCollecting wasabi<1.1.0,>=0.8.1
  Downloading wasabi-0.8.2-py3-none-any.whl (23 kB)
Collecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Collecting pydantic<1.8.0,>=1.7.1
  Downloading pydantic-1.7.3-cp38-cp38-manylinux2014_x86_64.whl (12.2 MB)
[K     |████████████████████████████████| 12.2 MB 4.5 MB/s eta 0:00:01
[?25hCollecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.5-cp38-cp38-manylinux2014_x86_64.whl (130 kB)
[K     |████████████████████████████████| 130 kB 11.0 MB/s eta 0:00:01
[?25hCollecting srsly<

In [7]:
from summarizer import Summarizer

In [8]:
model = Summarizer()

Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





In [9]:
# Summarize each cleaned article down to (at most) 10 sentences.
summarized_docs = []
for doc in tqdm(docs):
    # The summarizer already returns the selected sentences as one string, so
    # no extra join is needed. Appending per document keeps the summaries
    # produced so far if the run is interrupted.
    summarized_docs.append(model(doc, num_sentences=10))

  2%|▏         | 89/3831 [01:12<50:34,  1.23it/s]  


KeyboardInterrupt: 

In [None]:
docs[0]

In [None]:
summarized_docs[0]

'Palm Oil Imports by China to Drop, Boosting World Stockpiles   B y   B l o o m b e r g   N e w s   2013-01-14T09:35:42Z    Palm oil  imports  by  China , the world’s biggest cooking oil consumer, are set to plunge this month after the government imposed more stringent inspections on shipments, potentially increasing global inventories. China’s quality watchdog, the General Administration of Quality Supervision, Inspection and Quarantine, toughened inspections on imports of cooking oils from Jan. 1 to improve food safety. Futures slumped 23 percent last year as stockpiles expanded in Malaysia and in  Indonesia , the biggest producer, and economic slowdowns in Europe and China curbed demand. Rabobank International said on Jan. 10 that uncertainty about Chinese demand may weigh on prices on the Malaysia Derivatives Exchange. Price Gain  Prices rose as much as 1.4 percent to 2,402 ringgit ($796) on the bourse in  Kuala Lumpur  today and traded at 2,371 ringgit by 5:34 p.m. after Plantatio

In [None]:
# Wrap the summaries in a one-column DataFrame (one row per article) so they
# can be inspected and saved.
df = pd.DataFrame(summarized_docs)
# Alternative: build the frame from the unsummarized `docs` instead.
# df = pd.DataFrame(docs)

In [None]:
df.head()

Unnamed: 0,0
0,"Palm Oil Imports by China to Drop, Boosting Wo..."
1,Australian Dollar Rises on Asian Stock Gains; ...
2,Lincoln Rolls Out Range Rover-Like SUV to Lure...
3,"Patriots, Falcons Advance to NFL’s Conference ..."
4,Qatar National Bank Boosts Dividend After Net ...


In [None]:
df.to_csv('summarized_docs.csv', index=False) #to save result

# Train BERTopic Using Summarized Documents

In [None]:
!pip install bertopic

In [None]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# Prepare custom models
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', 
                        cluster_selection_method='eom', prediction_data=True)
# UMAP is stochastic; fixing random_state keeps the reduced embeddings, and
# therefore the topic ids referenced later in the notebook, stable across runs.
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine',
                  random_state=42)
# Count unigrams and bigrams, ignoring English stop words, for topic keywords.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# Pass the custom models to BERTopic
topic_model = BERTopic(umap_model=umap_model, 
                       hdbscan_model=hdbscan_model, 
                       vectorizer_model=vectorizer_model)

In [None]:
# len(summarized_docs)

In [None]:
topics, probabilities = topic_model.fit_transform(summarized_docs)

## Visualising the topics

In [None]:
topic_model.visualize_topics()

## Find topics most similar to a search term

In [None]:
similar_topics, similarity = topic_model.find_topics("carbon emissions", top_n=5)
topic_model.get_topic(similar_topics[0])

[('eu', 0.032285876295559277),
 ('european', 0.021740826615736827),
 ('european union', 0.015744156650301808),
 ('carbon permits', 0.011992824352707382),
 ('emissions', 0.009444161624893565),
 ('union carbon', 0.009369742357744059),
 ('carbon allowances', 0.008836312195045964),
 ('futures europe', 0.008738363355624777),
 ('eu carbon', 0.008615075018982435),
 ('german', 0.008553656266004114)]

In [None]:
topic_model.find_topics("carbon emissions", top_n=5)

([22, 61, 66, 3, 21],
 [0.6323795982250098,
  0.5007317997897678,
  0.4451454274526525,
  0.435304818493492,
  0.43474756370208684])

## Return the top n words for a specific topic and their c-TF-IDF scores

In [None]:
topic_model.get_topic(22)

[('eu', 0.032285876295559277),
 ('european', 0.021740826615736827),
 ('european union', 0.015744156650301808),
 ('carbon permits', 0.011992824352707382),
 ('emissions', 0.009444161624893565),
 ('union carbon', 0.009369742357744059),
 ('carbon allowances', 0.008836312195045964),
 ('futures europe', 0.008738363355624777),
 ('eu carbon', 0.008615075018982435),
 ('german', 0.008553656266004114)]

## Displaying documents

In [None]:
def find_docs(topic_id, topic_assignments=None):
    """Return the positions of all documents assigned to ``topic_id``.

    Args:
        topic_id: Topic label to look for.
        topic_assignments: Per-document topic labels. Defaults to the
            notebook-level ``topics`` list, so existing
            ``find_docs(topic_id)`` calls keep working.

    Returns:
        list[int]: Indices ``i`` with ``topic_assignments[i] == topic_id``,
        in ascending order (empty if the topic has no documents).
    """
    if topic_assignments is None:
        # Fall back to the labels produced by the fit_transform cell.
        topic_assignments = topics
    return [
        idx
        for idx, assigned in enumerate(topic_assignments)
        if assigned == topic_id
    ]

In [None]:
found_docs = find_docs(22)

In [None]:
# Preview at most five documents of the topic; slicing avoids an IndexError
# when the topic has fewer than five members.
for doc_idx in found_docs[:5]:
    print(f"Document: {doc_idx}")
    print("-----------")
    print(summarized_docs[doc_idx])
    print("-----------")
    print()

Document: 227
-----------
EU Sees Next Round of Iranian Nuclear Talks ‘Very Soon’   B y   J a m e s   G . N e u g e r   2013-01-04T11:17:53Z    Talks between world powers and Iran over its nuclear program are likely to resume “very soon,” the European Union said. “They’re negotiating the modalities and the details for the next round to be held very soon,” Sebastien Brabant, a spokesman for EU foreign policy chief  Catherine Ashton , told reporters in Brussels today. Ashton represents the five permanent United Nations Security Council countries plus Germany in the nuclear talks. To contact the editor responsible for this story: James G. Neuger at
-----------

Document: 236
-----------
EU Carbon Permits Post Second Weekly Drop Ahead of Supply Boost   B y   M a t h e w   C a r r   2013-01-04T16:39:12Z    European Union carbon permits dropped a second week before a supply increase starting Jan. 7. EU contracts for December were unchanged at 6.40 euros ($8.35) a metric ton on  London ’s ICE