# Reuters Data Collection & Processing

This notebook serve the purpose to fetch the data from the reuters webpage using their query api and crawling methods

Disclaimer: Crawler is legal here. If the user can see the data without boting (login), then you can fetch it with normal requests

# Imports 

In [99]:
from lxml import etree
from io import StringIO, BytesIO

import requests
from pygments.lexer import combined

import wikipediaapi

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Constant params 

In [100]:
MAX_PAGE_SIZE = 100
REUTERS_QUERY_URL = 'https://www.reuters.com/pf/api/v3/content/fetch/articles-by-search-v2'

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

# Get keywords related to a company.

## Get entities from the text

In [None]:


nltk.download('averaged_perceptron_tagger')

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)

text = """
Tesla, Inc. is an American multinational automotive and clean energy company headquartered in Austin, Texas, which designs, manufactures and sells electric vehicles, stationary battery energy storage devices from home to grid-scale, solar panels and solar shingles, and related products and services.
Tesla was incorporated in July 2003 by Martin Eberhard and Marc Tarpenning as Tesla Motors. The company's name is a tribute to inventor and electrical engineer Nikola Tesla. In February 2004 Elon Musk joined as the company's largest shareholder and in 2008 he was named CEO. In 2008, the company began production of its first car model, the Roadster sports car, followed by the Model S sedan in 2012, the Model X SUV in 2015, the Model 3 sedan in 2017, the Model Y crossover in 2020, the Tesla Semi truck in 2022 and the Cybertruck pickup truck in 2023. The Model 3 is the all-time bestselling plug-in electric car worldwide, and in June 2021 became the first electric car to sell 1 million units globally. In 2023, the Model Y was the best-selling vehicle, of any kind, globally.Tesla is one of the world's most valuable companies. In October 2021, Tesla's market capitalization temporarily reached $1 trillion, the sixth company to do so in U.S. history. As of 2023, it is the world's most valuable automaker. In 2022, the company led the battery electric vehicle market, with 18% share.
Tesla has been the subject of lawsuits, government scrutiny, and journalistic criticism, stemming from allegations of whistleblower retaliation, worker rights violations, product defects, and Musk's many controversial statements.
"""

# convert the wordpiece into one
def combine_bert_word_piece(doc, entities):    
    for entity in entities:
        if entity['word'].startswith("#"):
        
            # find the entity before 
            before_entities = [*filter(lambda ent: ent["end"] == entity["start"], entities)]
            
            # if the port of the word is found
            if len(before_entities) > 0:
                # get the entity
                before_entity = before_entities[0]
                
                # check if the combination exists in the text 
                combined = (before_entity["word"] + entity["word"].replace("##",""))
                                
                # if the combination exists in the document, get it 
                if combined in doc:
                    # remove both entites which are not relevant for further processing
                    entities = [*filter(lambda ent: ent["end"] != entity["start"] and entity["word"] != ent["word"],entities)]
                    
                    # replace it with a word relevant to the t'ext
                    entity["word"] = combined
                    entities.append(entity)
            else:       # if the part does not have a before part, remove it 
                entities = [*filter(lambda ent: entity['word'] != ent["word"],entities)]
                
    return entities

def remove_duplicates_entities(entities): 
    new_entities = []
    for entity in entities:
        word = entity["word"]
        if word not in new_entities:
            new_entities.append(word)
            
    return new_entities

# leave only the misc in the text
ner_entities = nlp(text)
# print(ner_entities)

# combine the remaining bert word pieces if any are left out
ner_entities = combine_bert_word_piece(text, ner_entities)

# remove duplicates entities
ner_entities = remove_duplicates_entities(ner_entities)

print(ner_entities)


# Get keywords from the text

In [102]:
tokenizer_keyword = AutoTokenizer.from_pretrained("yanekyuk/bert-keyword-extractor")
model_keyword = AutoModelForTokenClassification.from_pretrained("yanekyuk/bert-keyword-extractor")

nlp_keyword = pipeline("token-classification", model=model_keyword, tokenizer=tokenizer_keyword, grouped_entities=True)
 
# get all keywords from the text

def get_keywords_from_text(txt):
    res = combine_bert_word_piece(text,nlp_keyword(txt))
    res = [*map(lambda k_word: k_word["word"], res)]
    return res

keyword_result = get_keywords_from_text(text)

key_tokens = ner_entities.copy()

# join with entities list
def join_key_token_lists(token_list_a, token_list_b):
    for keyword in token_list_a:
        if keyword not in token_list_b:
            token_list_b.append(keyword)

    return token_list_b

keyword_result = join_key_token_lists(keyword_result,key_tokens)

print(keyword_result)

['Tesla, Inc', 'American', 'Austin', 'Texas', 'Tesla', 'Martin Eberhard', 'Marc Tarp', 'Tesla Motors', 'Nikola Tesla', 'Roadster', 'Model S', 'Model X', 'Model 3', 'Model Y', 'Cy', 'U. S.', 'Musk', 'Elon Musk', 'Tesla Semi', 'Tesla, Inc.', 'clean energy', 'Austin, Texas', 'stationary battery', 'energy', 'solar panels', 'Marc Tarpenning', 'sports car', 'Semi truck', 'pickup truck', 'plug -', 'car', 'electric']


## Single Function to extract from wikipedia page

In [103]:
def get_wikipedia_sections(wikipedia):
    sections_content = wikipedia_sections(wikipedia.sections,0,[] , {})    
    sections_content["main"] = wikipedia.summary
    return sections_content

def get_wikipedia_full_page(wikipedia):
    sections_content = wikipedia_sections(wikipedia.sections,0,[] , {})
    text_full = []
    for section in sections_content:
        text_full.append(sections_content[section])
    return text_full

def wikipedia_sections(sections, level, sections_list, out):
    # get all the sections 
    for s in sections:
        if s.title not in out:
            out[s.title] = s.text
        
        wikipedia_sections(s.sections, level + 1, sections_list, out)
        
    return out

    

In [105]:

def remove_duplicate_keywords(keywords):
    new_keywords = []
    for keyword in keywords:
        if keyword not in new_keywords:
            new_keywords.append(keyword)

    return new_keywords

def get_wikipedia_page(name):
    # get wikipedia page content
    wiki_page = wikipediaapi.Wikipedia('Sebastian Tatar (sebi.tatar2@gmail.com)', 'en')
    return wiki_page.page(name)

# get keywords from wikipedia page
def get_relevant_words_from_company(name):
    
    # get wikipedia page content
    wiki_page = get_wikipedia_page(name)
   
    sections = get_wikipedia_sections(wiki_page)
    section_main = sections["main"]
            
    # get the name entity
    named_entities = nlp(section_main)
    # combine the remaining bert word pieces if any are left out
    named_entities = combine_bert_word_piece(example, named_entities)
    
    # remove duplicates entities
    named_entities = remove_duplicates_entities(named_entities)
    
    # get company keywords
    keywords = get_keywords_from_text(section_main)
    #print(keywords)
    keywords = remove_duplicate_keywords(keywords)
    
    # TODO: Topic Modeling
 
    return {
        "entities": named_entities,
        "keywords": keywords,
        "topics":[],
        "combined": join_key_token_lists(named_entities,keywords),
    }

keywords_tesla = get_relevant_words_from_company("Tesla, Inc.")
#keywords_nvidia = get_relevant_words_from_company("Nvidia")
#keywords_microsoft = get_relevant_words_from_company("Microsoft")
#keywords_proctor_gamble = get_relevant_words_from_company("Procter & Gamble")
#keywords_coca_cola = get_relevant_words_from_company("Coca-Cola")

print(keywords_tesla)


{'entities': ['Tesla, Inc', 'American', 'Austin', 'Texas', 'Tesla', 'Martin Eberhard', 'Marc Tarp', 'Tesla Motors', 'Nikola Tesla', 'Roadster', 'Model S', 'Model X', 'Model 3', 'Model Y', 'Tesla Semi', 'Cy', 'U. S.', 'Musk', 'Elon Musk'], 'keywords': ['Tesla', '.', 'clean energy', 'Austin, Texas', 'stationary battery', 'energy storage', 'Martin Eberhard', 'Marc Tarpenning', 'Nikola Tesla', 'Elon Musk', 'Roadster', 'sports car', 'Semi truck', 'plug -', 'electric', 'Tesla, Inc', 'American', 'Austin', 'Texas', 'Marc Tarp', 'Tesla Motors', 'Model S', 'Model X', 'Model 3', 'Model Y', 'Tesla Semi', 'Cy', 'U. S.', 'Musk'], 'topics': [], 'combined': ['Tesla', '.', 'clean energy', 'Austin, Texas', 'stationary battery', 'energy storage', 'Martin Eberhard', 'Marc Tarpenning', 'Nikola Tesla', 'Elon Musk', 'Roadster', 'sports car', 'Semi truck', 'plug -', 'electric', 'Tesla, Inc', 'American', 'Austin', 'Texas', 'Marc Tarp', 'Tesla Motors', 'Model S', 'Model X', 'Model 3', 'Model Y', 'Tesla Semi', '

In [111]:
from sklearn.feature_extraction.text import TfidfVectorizer

keywords_tfidf = keywords_tesla.copy()

def get_keywords_weight_from_docs(documents, keywords):
    tfidf = TfidfVectorizer()
    tfidf.fit_transform(keywords.apply(lambda x: " ".join(x))).toarray()

    print(res[1])
    
wiki_page_tesla = get_wikipedia_page("Tesla, Inc.")
documents_tesla = get_wikipedia_full_page(wiki_page_tesla)[1:] # first is empty

get_keywords_weight_from_docs(documents_tesla, keywords_tesla)

