# Reuters Data Collection & Processing

This notebook serve the purpose to fetch the data from the reuters webpage using their query api and crawling methods

Disclaimer: Crawler is legal here. If the user can see the data without boting (login), then you can fetch it with normal requests

# Imports 

In [None]:
import wikipediaapi

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import nltk
import re


# Constant params 

In [None]:
MAX_PAGE_SIZE = 100
REUTERS_QUERY_URL = 'https://www.reuters.com/pf/api/v3/content/fetch/articles-by-search-v2'

# Get keywords related to a company.

## Get entities from the text

In [None]:
nltk.download('averaged_perceptron_tagger')

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)

# used only for texting
text_tesla = """
Tesla, Inc. is an American multinational automotive and clean energy company headquartered in Austin, Texas, which designs, manufactures and sells electric vehicles, stationary battery energy storage devices from home to grid-scale, solar panels and solar shingles, and related products and services.
Tesla was incorporated in July 2003 by Martin Eberhard and Marc Tarpenning as Tesla Motors. The company's name is a tribute to inventor and electrical engineer Nikola Tesla. In February 2004 Elon Musk joined as the company's largest shareholder and in 2008 he was named CEO. In 2008, the company began production of its first car model, the Roadster sports car, followed by the Model S sedan in 2012, the Model X SUV in 2015, the Model 3 sedan in 2017, the Model Y crossover in 2020, the Tesla Semi truck in 2022 and the Cybertruck pickup truck in 2023. The Model 3 is the all-time bestselling plug-in electric car worldwide, and in June 2021 became the first electric car to sell 1 million units globally. In 2023, the Model Y was the best-selling vehicle, of any kind, globally.Tesla is one of the world's most valuable companies. In October 2021, Tesla's market capitalization temporarily reached $1 trillion, the sixth company to do so in U.S. history. As of 2023, it is the world's most valuable automaker. In 2022, the company led the battery electric vehicle market, with 18% share.
Tesla has been the subject of lawsuits, government scrutiny, and journalistic criticism, stemming from allegations of whistleblower retaliation, worker rights violations, product defects, and Musk's many controversial statements.
"""

# convert the wordpiece into one
def combine_bert_word_piece(doc, entities):    
    for entity in entities:
        if entity['word'].startswith("#"):
        
            # find the entity before 
            before_entities = [*filter(lambda ent: ent["end"] == entity["start"], entities)]
            
            # if the port of the word is found
            if len(before_entities) > 0:
                # get the entity
                before_entity = before_entities[0]
                
                # check if the combination exists in the text 
                combined = (before_entity["word"] + entity["word"].replace("##",""))
                                
                # if the combination exists in the document, get it 
                if combined in doc:
                    # remove both entites which are not relevant for further processing
                    entities = [*filter(lambda ent: ent["end"] != entity["start"] and entity["word"] != ent["word"],entities)]
                    
                    # replace it with a word relevant to the t'ext
                    entity["word"] = combined
                    entities.append(entity)
            else:       # if the part does not have a before part, remove it 
                entities = [*filter(lambda ent: entity['word'] != ent["word"],entities)]
                
    return entities

def remove_duplicates_entities(entities): 
    new_entities = []
    for entity in entities:
        word = entity["word"]
        if word not in new_entities:
            new_entities.append(word)
            
    return new_entities

# leave only the misc in the text
ner_entities = nlp(text_tesla)
# print(ner_entities)

# combine the remaining bert word pieces if any are left out
ner_entities = combine_bert_word_piece(text_tesla, ner_entities)

# remove duplicates entities
ner_entities = remove_duplicates_entities(ner_entities)

print(ner_entities)


# Get keywords from the text

In [None]:
tokenizer_keyword = AutoTokenizer.from_pretrained("yanekyuk/bert-keyword-extractor")
model_keyword = AutoModelForTokenClassification.from_pretrained("yanekyuk/bert-keyword-extractor")

nlp_keyword = pipeline("token-classification", model=model_keyword, tokenizer=tokenizer_keyword, grouped_entities=True)
 
# get all keywords from the text
    
def complete_incomplete_keywords(keywords, doc):
    
    new_keywords_array = []
    for keyword in keywords:
        # first detect if keyword is part of a whole word.
        patter_whole_word = rf"[,.;\"'\/\\\s]({keyword})[,.;\"'\/\\\s]"
        is_whole_word = bool(re.search(patter_whole_word, doc))
            
        # if is part of a complete word, append, else complete the word
        if is_whole_word:
            new_keywords_array.append(keyword)
        else:
            if keyword in doc:
                pat_in_word = rf"(?<=[,.;\"'\/\\\s])(\w*){keyword}(\w*)(?=[,.;\"'\/\\\s])"
                match = re.search(pat_in_word,doc)
            
                if match is not None:
                    new_full_key_word = f"{match.groups(1)[0]}{keyword}{match.groups(1)[1]}"
                    # print(f"Keyword inside a full word found: {match.groups(1)[0]}{keyword}{match.groups(1)[1]}")
                    new_keywords_array.append(new_full_key_word)
            else:
                #TODO: match words with space a special character after
                special_char_keyword = keyword.replace(" ", "")
                if special_char_keyword in doc:
                    pat_in_word = rf"(?<=[,.;\"'\/\\\s])(\w*){special_char_keyword}(\w*)(?=[,.;\"'\/\\\s])"
                    match = re.search(pat_in_word,doc)
                
                    if match is not None:
                        new_full_key_word = f"{match.groups(1)[0]}{special_char_keyword}{match.groups(1)[1]}"
                        # print(f"Keyword inside a special character: {match.groups(1)[0]}{special_char_keyword}{match.groups(1)[1]}")
                        new_keywords_array.append(new_full_key_word)
                else: 
                    print("Invalid keyword: ", special_char_keyword)
               
    return new_keywords_array

# join with entities list
def join_key_token_lists(token_list_a, token_list_b):
    for keyword in token_list_a:
        if keyword not in token_list_b:
            token_list_b.append(keyword)

    return token_list_b

def get_keywords_from_text(txt):
    res = combine_bert_word_piece(text_tesla,nlp_keyword(txt))
    res = [*map(lambda k_word: k_word["word"], res)]
    return res

def remove_invalid_keywords(keywords):    
    return list(filter(lambda k_word: not bool(re.search(rf"^[-,;.:\"'\/\\\s]*$",k_word)), keywords))

keyword_result = get_keywords_from_text(text_tesla)

key_tokens = ner_entities.copy()

keyword_result = join_key_token_lists(keyword_result,key_tokens)
keyword_result = complete_incomplete_keywords(keyword_result, text_tesla)

print(keyword_result)

## Single Function to extract from wikipedia page

In [None]:
def get_wikipedia_sections(wikipedia):
    sections_content = wikipedia_sections(wikipedia.sections, 0, [] , {})    
    sections_content["main"] = wikipedia.summary
    return sections_content

def get_wikipedia_full_page(wikipedia):
    sections_content = wikipedia_sections(wikipedia.sections, 0, [] , {})
    text_full = []
    for section in sections_content:
        text_full.append(sections_content[section])
    return text_full

def wikipedia_sections(sections, level, sections_list, out):
    # get all the sections 
    for s in sections:
        if s.title not in out:
            out[s.title] = s.text
        
        wikipedia_sections(s.sections, level + 1, sections_list, out)
        
    return out

    

In [None]:
def remove_duplicate_keywords(keywords):
    new_keywords = []
    for keyword in keywords:
        if keyword not in new_keywords:
            new_keywords.append(keyword)

    return new_keywords

def get_wikipedia_page(name):
    # get wikipedia page content
    wiki_page = wikipediaapi.Wikipedia('Sebastian Tatar (sebi.tatar2@gmail.com)', 'en')
    return wiki_page.page(name)

# get keywords from wikipedia page
def get_relevant_words_from_company(name):
    
    # get wikipedia page content
    wiki_page = get_wikipedia_page(name)
   
    sections = get_wikipedia_sections(wiki_page)
    section_main = sections["main"]
            
    # get the name entity
    named_entities = nlp(section_main)
    # combine the remaining bert word pieces if any are left out
    named_entities = combine_bert_word_piece(section_main, named_entities)
    
    # remove duplicates entities
    named_entities = remove_duplicates_entities(named_entities)
    named_entities = complete_incomplete_keywords(named_entities, section_main)

    # get company keywords
    keywords = get_keywords_from_text(section_main)
    #print(keywords)
    keywords = remove_duplicate_keywords(keywords)
    keywords = complete_incomplete_keywords(keywords, section_main)
    keywords = remove_invalid_keywords(keywords)
    
    print(keywords)

    # TODO: Topic Modeling
 
    return {
        "entities": named_entities,
        "keywords": keywords,
        "topics":[],
        "combined": join_key_token_lists(named_entities,keywords),
    }

keywords_tesla = get_relevant_words_from_company("Tesla, Inc.")
keywords_nvidia = get_relevant_words_from_company("Nvidia")
#keywords_microsoft = get_relevant_words_from_company("Microsoft")
#keywords_proctor_gamble = get_relevant_words_from_company("Procter & Gamble")
#keywords_coca_cola = get_relevant_words_from_company("Coca-Cola")

# print(keywords_tesla)


In [None]:
import numpy as np
from sklearn import preprocessing

keywords_tfidf = keywords_tesla.copy()


def find_num_occurrences(word, string):
    return string.lower().count(word.lower())

def get_keywords_weight_from_corpus(corpus, keywords):
    
    words_weight_arr = {}
    for word in keywords:
        word_count = 0
        for corp in corpus:
            sum_in_line = find_num_occurrences(word, corp)
            word_count = word_count + sum_in_line
            # print(word_count)

        if word in words_weight_arr:
            words_weight_arr[word] = words_weight_arr[word] + word_count
        else:
            words_weight_arr[word] = word_count
    
    key_arr = list(words_weight_arr.keys())
    values_arr = list(words_weight_arr.values())
    values_arr = preprocessing.normalize([np.array(values_arr)])[0].tolist()

    words_weight_arr = {key_arr[i]: values_arr[i] for i in range(len(key_arr))}

    return sorted(words_weight_arr.items(), key=lambda item: item[1])
    
wiki_page_tesla = get_wikipedia_page("Tesla, Inc.")
corpus_tesla = get_wikipedia_full_page(wiki_page_tesla)[1:] # first is empty

words_weights = get_keywords_weight_from_corpus(corpus_tesla, keywords_tesla['combined'])

for words_weight in words_weights:
    print(words_weight)


In [None]:
reference_title = "Elon Musk's Tesla overhaul hits executive bench he touted"

reference_text = 'May 1 (Reuters) - Just over a year ago, Elon Musk shared the stage at Tesla\'s (TSLA.O), opens new tab investor day in Texas with 16 executives who gave detailed presentations on the company\'s technology and growth plans, then lined up behind their boss in a show of solidarity.,\"We\'ve obviously got significant bench strength here," Musk said at the time, responding to investor concerns that the world\'s most valuable automaker was too much a one-man show. Now, at least five members of that team are gone, a Reuters analysis shows. Tesla, Musk and the 16 executives on the stage last year could not be reached for comment. Musk in a recent email to senior managers outlined plans to lay off hundreds more employees, including two top executives, the Information reported. "Hopefully these actions are making it clear that we need to be absolutely hard core about headcount and cost reduction," Musk wrote in the email, the report said. Two senior executives who flanked Musk on investor day last year are gone: Zach Kirkhorn, former CFO, resigned with a nondisclosure agreement, according to Tesla regulatory filings. Drew Baglino, Tesla\'s former chief battery engineer, left in the wave of layoffs Musk ordered last month. Baglino dumped $181 million in Tesla stock as he left. Rebecca Tinucci, who headed up Tesla\'s charging team, was one of two women on stage for the investor day last March. "We have understood since Day One that a great charging experience is the linchpin to electric vehicle adoption," Tinucci said as she walked onstage. In the subsequent year, nearly all rival automakers in the United States agreed to adopt Tesla\'s charging standards and cut deals to let their EV buyers charge at Tesla stations. Tinucci and much of her team were sacked this week. In a posting on his social media platform X, Musk said Tesla plans "to grow the Supercharger network, just at a slower pace for new locations and more focus on 100% uptime and expansion of existing locations." Another executive on the stage who left was Colin Campbell, the former vice president of powertrain engineering. The loss of so many executives is something the Tesla board should be monitoring, said Charles Elson, founding director of the Weinberg Center for Corporate Governance at the University of Delaware. "Lots of departures very quickly suggest a problematic leadership style," he said. "You shouldn\'t lose that many people that quickly." With Tesla\'s revenue, profit and share price falling, Musk has reasserted his dominance at the company. For some investors, that is more important than the executive churn. "Elon\'s not there and we have this turnover? That\'s very bad," said Gene Munster, managing partner with Deepwater Asset Management and a Tesla investor. "If Elon\'s there, he\'s going to draw on talent to keep things going so it really all comes down to Elon remaining a part of the story." Musk has signaled significant strategy shifts in response to falling sales and tougher competition - changes that could leave out executives running operations no longer central to the new plans. Tesla\'s future lies in artificial intelligence and robotaxis, not conventional auto manufacturing, Musk told investors in April. Musk is putting action behind those words. He has ordered a 10% cut in staff and scrapped plans for a new, low-cost line of vehicles in favor of revamping existing models to develop lower-priced entries. Tesla said it will pause construction of new factories until the company\'s sales had reached 3 million vehicles a year - enough to fill up the automaker\'s existing production operations. "If you buy the narrative that Tesla is an AI company fundamentally, it may not be cause for concern," said K.C. Boyce, vice president at data analytics and advisory firm Escalent. "It fits into the idea of sizing and resourcing the business correctly to deliver on the promise of full self-driving and robotaxi." Other senior Tesla executives, who were not among those onstage during the 2023 investor day, have left in recent weeks. Daniel Ho, a former Ford executive and 10-year Tesla veteran who had been director of new car programs, is no longer with the company. Rohan Patel, a former Obama administration official who had been Tesla VP for public policy and key to expansion plans for India, said he is leaving. Another executive to exit was Allie Arebalo, Tesla\'s senior director of human resources, two people familiar with the matter said on Wednesday. Martin Viecha, head of investor relations who also was on the stage last year with Musk, announced his departure at the end of an April 24 conference call with analysts. Unlike most of the other departed executives, Viecha received a warm sendoff from Musk. "The reason I reached out to you was because I thought your analysis of Tesla was the best that I had seen," Musk said on the call. Some analysts said the executive team is critical given the challenges faced by the EV maker. "Having a strong bench behind Musk is important at this pivotal time given the Category 5 storm that Tesla\'s going through," Wedbush Securities analyst Dan Ives said.'

def calculate_text_relevance(title, text, keywords_weighted):
    relevance_score = 0
    for keyword in keywords_weighted:
        keyword_count = text.count(keyword[0])
        relevance_score += keyword_count * keyword[1]
    return relevance_score
        
calculate_text_relevance(reference_title, reference_text, words_weights)
        

## Sentiment Analysis for Text


In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import scipy
import torch

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
tokenizer.model_max_length = 2540
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")


reference_text_small = 'Just over a year ago, Elon Musk shared the stage at Tesla\'s (TSLA.O), opens new tab investor day in Texas with 16 executives who gave detailed presentations on the company\'s technology and growth plans, then lined up behind their boss in a show of solidarity.,\"We\'ve obviously got significant bench strength here," Musk said at the time, responding to investor concerns that the world\'s most valuable automaker was too much a one-man show. Now, at least five members of that team are gone, a Reuters analysis shows. Tesla, Musk and the 16 executives on the stage last year could not be reached for comment. Musk in a recent email to senior managers outlined plans to lay off hundreds more employees, including two top executives, the Information reported. "Hopefully these actions are making it clear that we need to be absolutely hard core about headcount and cost reduction," Musk wrote in the email, the report said. Two senior executives who flanked Musk on investor day last year are gone: Zach Kirkhorn, former CFO, resigned with a nondisclosure agreement, according to Tesla regulatory filings. Drew Baglino, Tesla\'s former chief battery engineer, left in the wave of layoffs Musk ordered last month. Baglino dumped $181 million in Tesla stock as he left. Rebecca Tinucci, who headed up Tesla\'s charging team, was one of two women on stage for the investor day last March. "We have understood since Day One that a great charging experience is the linchpin to electric vehicle adoption," Tinucci said as she walked onstage. '

In [60]:
import asyncio

# Needed this code to get the sentiment score:
# https://datascience.stackexchange.com/a/112446
async def get_sentiment_score_text(text, sentiment_score: list):
    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        logits = model(**inputs).logits

    scores = {k: v for k, v in zip(model.config.id2label.values(), scipy.special.softmax(logits.numpy().squeeze()))}

    sentiment_score.append(scores)

    return scores

async def get_sentiment_score_large_text(text):
    sentences = nltk.sent_tokenize(text)

    task_list = []
    
    sentiment_scores = []   
        
    for sentence in sentences:
        task_list.append(asyncio.create_task(get_sentiment_score_text(sentence, sentiment_scores)))
        # sentiment_scores.append(sentence_score)

    await asyncio.gather(*task_list)
        
    # Average the sentiment scores
    avg_sentiment_score = {k: sum(d[k] for d in sentiment_scores)/len(sentiment_scores) for k in sentiment_scores[0]}

    return avg_sentiment_score

sentiment_score = await get_sentiment_score_large_text(reference_text)

print(sentiment_score)

{'positive': 0.18652974384335372, 'negative': 0.4256132437537114, 'neutral': 0.38785701694014746}
