In [1]:
import requests

import json

import re

import pandas as pd

url = 'https://publicapi.schroders.com/schroders/external-production/public/api/v1/Contents/articles/recent?country=uk&Language=en&AudienceId=358859'

r = requests.get(url)

payload_dict = json.loads(r.text)

# print(payload_dict)

recentArticles_list = payload_dict['recentArticles']

# recent articles

TitleOfText_list = [dict_key['title'] for dict_key in recentArticles_list if 'title' in dict_key]

BodyOfText_list = [dict_key['body'] for dict_key in recentArticles_list if 'body' in dict_key]

# print(BodyOfText_list)


df1 = pd.DataFrame(TitleOfText_list, columns=['Title'])

df2 = pd.DataFrame(BodyOfText_list, columns=['Body'])

df = pd.concat([df1, df2], axis=1, join='inner')

df['Body'] = df['Body'].str.replace(r'<[^<>]*>', '', regex=True)

df["Body"] = df['Body'].str.replace('[^\w\s]', '')

df['Title'] = df['Title'].str.lower()

df['Body'] = df['Body'].str.lower()

df = df.dropna()

print(df)

## Preparing Text for LDA Analysis

import gensim

from gensim.utils import simple_preprocess

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

import gensim.corpora as corpora

stop_words = stopwords.words('english')

stop_words.extend(['from', 'subject', 're', 'edu', 'use'])


def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations

        yield (gensim.utils.simple_preprocess(str(sentence), deacc=True))


def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc))

             if word not in stop_words] for doc in texts]


data = df.Body.values.tolist()

data_words = list(sent_to_words(data))

# remove stop words

data_words = remove_stopwords(data_words)

# print(data_words[:1][0][:30])


# Create Dictionary

id2word = corpora.Dictionary(data_words)

# Create Corpus

texts = data_words

# Term Document Frequency

corpus = [id2word.doc2bow(text) for text in texts]

# View

print(corpus[:1][0][:30])

## LDA Model Training

from pprint import pprint

# number of topics

num_topics = 3

# Build LDA model

lda_model = gensim.models.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)

pprint(lda_model.print_topics())

doc_lda = lda_model[corpus]

##Analyzing LDA Model

from gensim.models import CoherenceModel

import pyLDAvis


import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# Compute Perplexity

print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score

coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')

coherence_lda = coherence_model_lda.get_coherence()

print('\nCoherence Score: ', coherence_lda)


pyLDAvis.enable_notebook()

vis = gensimvis.prepare(lda_model, corpus, id2word)

vis


  df["Body"] = df['Body'].str.replace('[^\w\s]', '')


                                               Title  \
0  why bargain hunters should be shopping in euro...   
1       bank of england shows new focus on inflation   
2  ten thought-provoking books for your summer re...   
3  can long short funds offer a port in a stock m...   
4                      uk interest rates: what next?   
5                 monthly markets review - july 2022   
6  peter harrison: why we must make nature invest...   
7  four charts that make the case for value inves...   
8               cop27: a quick guide to common terms   

                                                Body  
0  the value investor has become something of an ...  
1  the bank of england boe has raised its main po...  
2  looking for something to feed your thoughts an...  
3  to say 2022 has been a difficult year for inve...  
4  thursday 4 august ndash inflation unlikely to ...  
5  the month in summary\ndeveloped market shares ...  
6  because we donrsquot quantify the benefits of ...  

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saracheakdkaipejchara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[(0, 3), (1, 1), (2, 3), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 2), (29, 1)]
[(0,
  '0.006*"interest" + 0.006*"rates" + 0.006*"inflation" + 0.005*"climate" + '
  '0.005*"ndash" + 0.004*"change" + 0.004*"higher" + 0.004*"carbon" + '
  '0.003*"investment" + 0.003*"uk"'),
 (1,
  '0.009*"inflation" + 0.007*"value" + 0.007*"rates" + 0.007*"market" + '
  '0.006*"interest" + 0.005*"ndash" + 0.005*"bank" + 0.005*"growth" + '
  '0.004*"us" + 0.004*"july"'),
 (2,
  '0.008*"inflation" + 0.007*"rates" + 0.006*"interest" + 0.005*"market" + '
  '0.005*"ndash" + 0.004*"companies" + 0.004*"rate" + 0.004*"bank" + '
  '0.004*"climate" + 0.004*"uk"')]

Perplexity:  -7.983673963541395

Coherence Score:  0.29673190250278425


  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
