# Text Processing

Text processing using multiple different models and techniques to extract relevant information from the text.

# Imports 

In [93]:
import wikipediaapi

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
import re
import duckdb
import os.path
from dateutil import parser
import yfinance as yf
from datetime import datetime, timedelta
import sys
import io
import contextlib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Constant params 

In [94]:
MAX_PAGE_SIZE = 100
REUTERS_QUERY_URL = 'https://www.reuters.com/pf/api/v3/content/fetch/articles-by-search-v2'

# Get keywords related to a company.

## Get entities from the text

In [95]:
nltk.download('averaged_perceptron_tagger')

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)

# used only for texting
text_tesla = """
Tesla, Inc. is an American multinational automotive and clean energy company headquartered in Austin, Texas, which designs, manufactures and sells electric vehicles, stationary battery energy storage devices from home to grid-scale, solar panels and solar shingles, and related products and services.
Tesla was incorporated in July 2003 by Martin Eberhard and Marc Tarpenning as Tesla Motors. The company's name is a tribute to inventor and electrical engineer Nikola Tesla. In February 2004 Elon Musk joined as the company's largest shareholder and in 2008 he was named CEO. In 2008, the company began production of its first car model, the Roadster sports car, followed by the Model S sedan in 2012, the Model X SUV in 2015, the Model 3 sedan in 2017, the Model Y crossover in 2020, the Tesla Semi truck in 2022 and the Cybertruck pickup truck in 2023. The Model 3 is the all-time bestselling plug-in electric car worldwide, and in June 2021 became the first electric car to sell 1 million units globally. In 2023, the Model Y was the best-selling vehicle, of any kind, globally.Tesla is one of the world's most valuable companies. In October 2021, Tesla's market capitalization temporarily reached $1 trillion, the sixth company to do so in U.S. history. As of 2023, it is the world's most valuable automaker. In 2022, the company led the battery electric vehicle market, with 18% share.
Tesla has been the subject of lawsuits, government scrutiny, and journalistic criticism, stemming from allegations of whistleblower retaliation, worker rights violations, product defects, and Musk's many controversial statements.
"""

# convert the wordpiece into one
def combine_bert_word_piece(doc, entities):    
    for entity in entities:
        if entity['word'].startswith("#"):
        
            # find the entity before 
            before_entities = [*filter(lambda ent: ent["end"] == entity["start"], entities)]
            
            # if the port of the word is found
            if len(before_entities) > 0:
                # get the entity
                before_entity = before_entities[0]
                
                # check if the combination exists in the text 
                combined = (before_entity["word"] + entity["word"].replace("##",""))
                                
                # if the combination exists in the document, get it 
                if combined in doc:
                    # remove both entites which are not relevant for further processing
                    entities = [*filter(lambda ent: ent["end"] != entity["start"] and entity["word"] != ent["word"],entities)]
                    
                    # replace it with a word relevant to the t'ext
                    entity["word"] = combined
                    entities.append(entity)
            else:       # if the part does not have a before part, remove it 
                entities = [*filter(lambda ent: entity['word'] != ent["word"],entities)]
                
    return entities

def remove_duplicates_entities(entities): 
    new_entities = []
    for entity in entities:
        word = entity["word"]
        if word not in new_entities:
            new_entities.append(word)
            
    return new_entities

# leave only the misc in the text
ner_entities = nlp(text_tesla)
# print(ner_entities)

# combine the remaining bert word pieces if any are left out
ner_entities = combine_bert_word_piece(text_tesla, ner_entities)

# remove duplicates entities
ner_entities = remove_duplicates_entities(ner_entities)

print(ner_entities)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sebastiantatar/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['Tesla, Inc', 'American', 'Austin', 'Texas', 'Tesla', 'Martin Eberhard', 'Marc Tarp', 'Tesla Motors', 'Nikola Tesla', 'Roadster', 'Model S', 'Model X', 'Model 3', 'Model Y', 'Cy', 'U. S.', 'Musk', 'Elon Musk', 'Tesla Semi']


# Get keywords from the text

In [96]:
tokenizer_keyword = AutoTokenizer.from_pretrained("yanekyuk/bert-keyword-extractor")
model_keyword = AutoModelForTokenClassification.from_pretrained("yanekyuk/bert-keyword-extractor")
nlp_keyword = pipeline("token-classification", model=model_keyword, tokenizer=tokenizer_keyword, grouped_entities=True)
 
# get all keywords from the text
    
def complete_incomplete_keywords(keywords, doc):
    
    new_keywords_array = []
    for keyword in keywords:
        # first detect if keyword is part of a whole word.
        patter_whole_word = rf"[,.;\"'\/\\\s]({keyword})[,.;\"'\/\\\s]"
        is_whole_word = bool(re.search(patter_whole_word, doc))
            
        # if is part of a complete word, append, else complete the word
        if is_whole_word:
            new_keywords_array.append(keyword)
        else:
            if keyword in doc:
                pat_in_word = rf"(?<=[,.;\"'\/\\\s])(\w*){keyword}(\w*)(?=[,.;\"'\/\\\s])"
                match = re.search(pat_in_word,doc)
            
                if match is not None:
                    new_full_key_word = f"{match.groups(1)[0]}{keyword}{match.groups(1)[1]}"
                    # print(f"Keyword inside a full word found: {match.groups(1)[0]}{keyword}{match.groups(1)[1]}")
                    new_keywords_array.append(new_full_key_word)
            else:
                #TODO: match words with space a special character after
                special_char_keyword = keyword.replace(" ", "")
                if special_char_keyword in doc:
                    pat_in_word = rf"(?<=[,.;\"'\/\\\s])(\w*){special_char_keyword}(\w*)(?=[,.;\"'\/\\\s])"
                    match = re.search(pat_in_word,doc)
                
                    if match is not None:
                        new_full_key_word = f"{match.groups(1)[0]}{special_char_keyword}{match.groups(1)[1]}"
                        # print(f"Keyword inside a special character: {match.groups(1)[0]}{special_char_keyword}{match.groups(1)[1]}")
                        new_keywords_array.append(new_full_key_word)
                else: 
                    print("Invalid keyword: ", special_char_keyword)
               
    return new_keywords_array

# join with entities list
def join_key_token_lists(token_list_a, token_list_b):
    for keyword in token_list_a:
        if keyword not in token_list_b:
            token_list_b.append(keyword)

    return token_list_b

def get_keywords_from_text(txt):
    res = combine_bert_word_piece(text_tesla,nlp_keyword(txt))
    res = [*map(lambda k_word: k_word["word"], res)]
    return res

def remove_invalid_keywords(keywords):    
    return list(filter(lambda k_word: not bool(re.search(rf"^[-,;.:\"'\/\\\s]*$",k_word)), keywords))

keyword_result = get_keywords_from_text(text_tesla)

key_tokens = ner_entities.copy()

keyword_result = join_key_token_lists(keyword_result,key_tokens)
keyword_result = complete_incomplete_keywords(keyword_result, text_tesla)

print(keyword_result)

['Tesla, Inc', 'American', 'Austin', 'Texas', 'Tesla', 'Martin Eberhard', 'Marc Tarpenning', 'Tesla Motors', 'Nikola Tesla', 'Roadster', 'Model S', 'Model X', 'Model 3', 'Model Y', 'Cybertruck', 'U.S.', 'Musk', 'Elon Musk', 'Tesla Semi', 'Tesla, Inc.', 'clean energy', 'Austin, Texas', 'stationary battery', 'energy', 'solar panels', 'Marc Tarpenning', 'sports car', 'Semi truck', 'pickup truck', 'plug-in', 'car', 'electric']


## Single Function to extract from wikipedia page

In [97]:
def get_wikipedia_sections(wikipedia):
    sections_content = wikipedia_sections(wikipedia.sections, 0, [] , {})    
    sections_content["main"] = wikipedia.summary
    return sections_content

def get_wikipedia_full_page(wikipedia):
    sections_content = wikipedia_sections(wikipedia.sections, 0, [] , {})
    text_full = []
    for section in sections_content:
        text_full.append(sections_content[section])
    return text_full

def wikipedia_sections(sections, level, sections_list, out):
    # get all the sections 
    for s in sections:
        if s.title not in out:
            out[s.title] = s.text
        
        wikipedia_sections(s.sections, level + 1, sections_list, out)
        
    return out

    

In [98]:
import numpy as np
from sklearn import preprocessing

def find_num_occurrences(word, string):
    return string.lower().count(word.lower())

def get_keywords_weight_from_corpus(corpus, keywords):
    words_weight_arr = {}
    for word in keywords:
        word_count = 0
        for corp in corpus:
            sum_in_line = find_num_occurrences(word, corp)
            word_count = word_count + sum_in_line
            # print(word_count)

        if word in words_weight_arr:
            words_weight_arr[word] = words_weight_arr[word] + word_count
        else:
            words_weight_arr[word] = word_count
    
    key_arr = list(words_weight_arr.keys())
    values_arr = list(words_weight_arr.values())
    values_arr = preprocessing.normalize([np.array(values_arr)])[0].tolist()

    words_weight_arr = {key_arr[i]: values_arr[i] for i in range(len(key_arr))}

    return sorted(words_weight_arr.items(), key=lambda item: item[1])
    
def get_company_keywords(company_name):
    wiki_page = get_wikipedia_page(company_name)
    corpus = get_wikipedia_full_page(wiki_page)[1:] # first is empty
    relevant_company_words = get_relevant_words_from_company(company_name)['combined']
    return get_keywords_weight_from_corpus(corpus, relevant_company_words)

words_weights_tesla = get_company_keywords("Tesla, Inc.")

for words_weight in words_weights_tesla:
    print(words_weight)

('clean energy', 0.0)
('stationary battery', 0.0)
('artificial intelligence', 0.0)
('Nikola Tesla', 0.0)
('Forbes Global 2000', 0.0)
('Austin, Texas', 0.0024113192578796424)
('Martin Eberhard', 0.004822638515759285)
('Marc Tarpenning', 0.004822638515759285)
('Austin', 0.004822638515759285)
('sports car', 0.007233957773638926)
('Tesla Motors', 0.007233957773638926)
('U.S.', 0.00964527703151857)
('Inc.', 0.012056596289398211)
('robotics', 0.012056596289398211)
('Tesla Semi', 0.012056596289398211)
('Cybertruck', 0.012056596289398211)
('American', 0.016879234805157496)
('Semi', 0.02170187332091678)
('Texas', 0.026524511836676066)
('Roadster', 0.03134715035243535)
('Elon Musk', 0.03858110812607428)
('Model X', 0.040992427383953915)
('Model Y', 0.0458150658997132)
('energy', 0.060282981446991055)
('Model S', 0.08198485476790783)
('Model 3', 0.08439617402578747)
('Musk', 0.14226783621489889)
('Tesla', 0.9765842994412551)


## Sentiment Analysis for Text


In [99]:
# Use a pipeline as a high-level helper
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import scipy
import torch

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
tokenizer.model_max_length = 2540
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

reference_text_small = 'Just over a year ago, Elon Musk shared the stage at Tesla\'s (TSLA.O), opens new tab investor day in Texas with 16 executives who gave detailed presentations on the company\'s technology and growth plans, then lined up behind their boss in a show of solidarity.,\"We\'ve obviously got significant bench strength here," Musk said at the time, responding to investor concerns that the world\'s most valuable automaker was too much a one-man show. Now, at least five members of that team are gone, a Reuters analysis shows. Tesla, Musk and the 16 executives on the stage last year could not be reached for comment. Musk in a recent email to senior managers outlined plans to lay off hundreds more employees, including two top executives, the Information reported. "Hopefully these actions are making it clear that we need to be absolutely hard core about headcount and cost reduction," Musk wrote in the email, the report said. Two senior executives who flanked Musk on investor day last year are gone: Zach Kirkhorn, former CFO, resigned with a nondisclosure agreement, according to Tesla regulatory filings. Drew Baglino, Tesla\'s former chief battery engineer, left in the wave of layoffs Musk ordered last month. Baglino dumped $181 million in Tesla stock as he left. Rebecca Tinucci, who headed up Tesla\'s charging team, was one of two women on stage for the investor day last March. "We have understood since Day One that a great charging experience is the linchpin to electric vehicle adoption," Tinucci said as she walked onstage. '



In [100]:
def remove_duplicate_keywords(keywords):
    new_keywords = []
    for keyword in keywords:
        if keyword not in new_keywords:
            new_keywords.append(keyword)

    return new_keywords

def get_wikipedia_page(name):
    # get wikipedia page content
    wiki_page = wikipediaapi.Wikipedia('Sebastian Tatar (sebi.tatar2@gmail.com)', 'en')
    return wiki_page.page(name)

# get keywords from wikipedia page
def get_relevant_words_from_company(name):
    
    # get wikipedia page content
    wiki_page = get_wikipedia_page(name)
   
    sections = get_wikipedia_sections(wiki_page)
    section_main = sections["main"]
            
    # get the name entity
    named_entities = nlp(section_main)
    # combine the remaining bert word pieces if any are left out
    named_entities = combine_bert_word_piece(section_main, named_entities)
    
    # remove duplicates entities
    named_entities = remove_duplicates_entities(named_entities)
    named_entities = complete_incomplete_keywords(named_entities, section_main)

    # get company keywords
    keywords = get_keywords_from_text(section_main)
    
    keywords = remove_duplicate_keywords(keywords)
    keywords = complete_incomplete_keywords(keywords, section_main)
    keywords = remove_invalid_keywords(keywords)
     
    return {
        "entities": named_entities,
        "keywords": keywords,
        "topics":[],
        "combined": join_key_token_lists(named_entities,keywords),
    }

keywords_tesla = get_relevant_words_from_company("Tesla, Inc.")
keywords_nvidia = get_relevant_words_from_company("Nvidia")
#keywords_microsoft = get_relevant_words_from_company("Microsoft")
#keywords_proctor_gamble = get_relevant_words_from_company("Procter & Gamble")
#keywords_coca_cola = get_relevant_words_from_company("Coca-Cola")

# print(keywords_tesla)

In [102]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

reference_title = "Elon Musk's Tesla overhaul hits executive bench he touted"

reference_text = 'May 1 (Reuters) - Just over a year ago, Elon Musk shared the stage at Tesla\'s (TSLA.O), opens new tab investor day in Texas with 16 executives who gave detailed presentations on the company\'s technology and growth plans, then lined up behind their boss in a show of solidarity.,\"We\'ve obviously got significant bench strength here," Musk said at the time, responding to investor concerns that the world\'s most valuable automaker was too much a one-man show. Now, at least five members of that team are gone, a Reuters analysis shows. Tesla, Musk and the 16 executives on the stage last year could not be reached for comment. Musk in a recent email to senior managers outlined plans to lay off hundreds more employees, including two top executives, the Information reported. "Hopefully these actions are making it clear that we need to be absolutely hard core about headcount and cost reduction," Musk wrote in the email, the report said. Two senior executives who flanked Musk on investor day last year are gone: Zach Kirkhorn, former CFO, resigned with a nondisclosure agreement, according to Tesla regulatory filings. Drew Baglino, Tesla\'s former chief battery engineer, left in the wave of layoffs Musk ordered last month. Baglino dumped $181 million in Tesla stock as he left. Rebecca Tinucci, who headed up Tesla\'s charging team, was one of two women on stage for the investor day last March. "We have understood since Day One that a great charging experience is the linchpin to electric vehicle adoption," Tinucci said as she walked onstage. In the subsequent year, nearly all rival automakers in the United States agreed to adopt Tesla\'s charging standards and cut deals to let their EV buyers charge at Tesla stations. Tinucci and much of her team were sacked this week. In a posting on his social media platform X, Musk said Tesla plans "to grow the Supercharger network, just at a slower pace for new locations and more focus on 100% uptime and expansion of existing locations." Another executive on the stage who left was Colin Campbell, the former vice president of powertrain engineering. The loss of so many executives is something the Tesla board should be monitoring, said Charles Elson, founding director of the Weinberg Center for Corporate Governance at the University of Delaware. "Lots of departures very quickly suggest a problematic leadership style," he said. "You shouldn\'t lose that many people that quickly." With Tesla\'s revenue, profit and share price falling, Musk has reasserted his dominance at the company. For some investors, that is more important than the executive churn. "Elon\'s not there and we have this turnover? That\'s very bad," said Gene Munster, managing partner with Deepwater Asset Management and a Tesla investor. "If Elon\'s there, he\'s going to draw on talent to keep things going so it really all comes down to Elon remaining a part of the story." Musk has signaled significant strategy shifts in response to falling sales and tougher competition - changes that could leave out executives running operations no longer central to the new plans. Tesla\'s future lies in artificial intelligence and robotaxis, not conventional auto manufacturing, Musk told investors in April. Musk is putting action behind those words. He has ordered a 10% cut in staff and scrapped plans for a new, low-cost line of vehicles in favor of revamping existing models to develop lower-priced entries. Tesla said it will pause construction of new factories until the company\'s sales had reached 3 million vehicles a year - enough to fill up the automaker\'s existing production operations. "If you buy the narrative that Tesla is an AI company fundamentally, it may not be cause for concern," said K.C. Boyce, vice president at data analytics and advisory firm Escalent. "It fits into the idea of sizing and resourcing the business correctly to deliver on the promise of full self-driving and robotaxi." Other senior Tesla executives, who were not among those onstage during the 2023 investor day, have left in recent weeks. Daniel Ho, a former Ford executive and 10-year Tesla veteran who had been director of new car programs, is no longer with the company. Rohan Patel, a former Obama administration official who had been Tesla VP for public policy and key to expansion plans for India, said he is leaving. Another executive to exit was Allie Arebalo, Tesla\'s senior director of human resources, two people familiar with the matter said on Wednesday. Martin Viecha, head of investor relations who also was on the stage last year with Musk, announced his departure at the end of an April 24 conference call with analysts. Unlike most of the other departed executives, Viecha received a warm sendoff from Musk. "The reason I reached out to you was because I thought your analysis of Tesla was the best that I had seen," Musk said on the call. Some analysts said the executive team is critical given the challenges faced by the EV maker. "Having a strong bench behind Musk is important at this pivotal time given the Category 5 storm that Tesla\'s going through," Wedbush Securities analyst Dan Ives said.'

reference_text_2 = "Please use the sharing tools found via the share button at the top or side of articles. Copying articles to share with others is a breach of FT.com T&Cs and Copyright Policy. Email licensing@ft.com to buy additional rights. Subscribers may share up to 10 or 20 articles per month using the gift article service.Hello from New York. Europe today is coming to grips with the significant wins by far-right parties in the EU elections. As our colleagues reported on Sunday, “kiss goodbye to the European Green Deal”. While the significance of these elections is still being understood, they could deal a major blow to renewable energy and sustainable investing on the continent that is the world’s leader in both. For today, I have an article about one aspect of renewable energy that does not get much visibility: storage. And while we might associate batteries with electric vehicles, batteries are crucial to increasing electricity demand as well. Battery storage companies are booming, but can they catch Tesla? When most people think of Tesla they picture its electric cars (or maybe a Cybertruck. I saw my first in New York last week). And when most people think of Tesla’s competitors, Ford, General Motors and other auto manufacturers come to mind. But not Julian Nebreda, chief executive of Fluence Energy, a Virginia-based, publicly traded energy storage provider. “Tesla is probably our biggest competitor,” Nebreda told me. It is all due to Tesla’s significant — and perhaps under-appreciated — battery storage division. Tesla launched a battery storage business in 2015. In April this year, Tesla reported record energy storage revenues of $1.6bn for the first quarter. “Energy generation and storage remains our highest margin business,” the company said in an earnings report. Fluence went public in 2021 and was started by AES, a Virginia utility, and German energy company Siemens. AES started work on lithium-ion energy storage in 2007, and now Fluence is one of Wall Street’s favourite companies as it hopes to benefit from booming electricity demand underpinned by artificial intelligence. “Data centre demand is not new. What is new is the urgency,” Nebreda said. “That is where we are going to play in terms of speed and getting permits, in terms of efficiency, we are the technology that is the fastest. ”While batteries are not as visually appealing as wind turbines or solar panels, they can play a crucial role in maintaining stable supply in electricity grids that are set to rely increasingly on intermittent renewable sources. Growth in batteries dominated almost all other clean energy technologies in 2023, the International Energy Agency said in an April report. Last year, battery deployment in the energy sector increased by more than 130 per cent year over year. And while many people might associate batteries with electric cars, the energy sector accounts for more than 90 per cent of overall battery demand. The AI revolution has undeniably captivated investors around the world. Chipmaker Nvidia’s market capitalisation briefly hit $3tn last week, worth the entire French Cac 40 stocks index plus most of Germany’s Dax. Investors have been eyeing under-appreciated, “picks and shovels” companies to bet on the AI boom. Storage companies such as Fluence are rising as one such business. “We believe [Fluence] should appeal to a wide swath of investors,” those both in traditional energy and environmental, social and governance (ESG), JPMorgan said in a May 9 research report. Surging investor interest in energy storage has opened doors to different technologies. Lithium batteries such as the ones Tesla makes are not the only way to bottle up electricity. Form Energy, a Massachusetts-based storage business, sells iron-air battery technology. Form argues its batteries can last much longer than lithium and are cheaper. In December, Form won a $30mn grant from California for a project that will discharge power into the state’s grid for 100 hours. The award is part of a California Energy Commission programme dedicated specifically to investing in long-duration energy storage. And in Minnesota, Xcel Energy, a utility, is partnering with Form to deploy two iron-air battery systems at retired coal plants. “Our 100-hour, iron-air battery is designed to deliver the demand dexterity needed to allow large industrial loads to be connected quickly,” Mateo Jaramillo, Form Energy’s chief executive, told me. I spoke to Jaramillo on a video call and it was my first interview with a chief executive who was wearing a yellow safety vest. He was calling me from Form’s West Virginia facility that is under construction. According to Jaramillo, technologies already exist today that can be deployed quickly and at scale to meet rising electricity demand. For example, the excitement and investor dollars in small nuclear reactors might be overly optimistic, he said. These small modular reactors “while promising as a new grid resource, still have uncertain timelines and cost profiles”, he said. One of the challenges for companies in the renewable energy ecosystem is trade restrictions. “Geopolitics is a headwind,” Nebreda said, since some Fluence parts are coming from China. “We work with a global supply chain and that is important to deliver products at good prices,” he said. “A world in which you cannot send things around effectively is a worry.” Another hurdle is cheap electricity. In some parts of the US, energy capacity costs “have just fallen through the floor,” PJ Deschenes, a managing director in Nomura’s “greentech” division, told me. For some investors, there was not enough pricing power to motivate new battery investments on their own, “and that is a problem”, he said. “You will ultimately need a more clear payment for capacity in the system,” Deschenes said.If electricity grids are hit by extreme weather or crack because of a crisis, surging prices could force the need for more batteries, he said. “It is one thing if you are paying 20 cents a kilowatt hour. It is another if you have to pay $5.” The biggest challenge for storage companies might be keeping up with Tesla. Nebreda acknowledged that Tesla also did residential battery storage, and “clearly their cost of capital is much better than ours”. As critical as batteries were for renewable energy storage, “the challenge is how new entrants compete with incumbents such as Tesla”, as well as BYD, China’s rival electric-vehicle manufacturer, said Arash Nazhad, a managing director at Moelis who co-leads the company’s cleantech group. Elon Musk’s company is clearly a juggernaut. But the evolution of competing energy storage companies will have ramifications for the global clean energy transition in ways that do not involve electric vehicles. (Patrick Temple-West)"

reference_text_3 = "We get a pandas series containing all of the rows information; inconveniently, though, it is shown on different lines. To get a DataFrame, we have to put the RU sting in another pair of brackets. We can also select multiple rows at the same time. Suppose you want to also include India and China. Simply add those row labels to the list."

def calculate_text_relevance(title, text, keywords_weighted):
    relevance_score = 0
    for keyword in keywords_weighted:
        keyword_count = text.count(keyword[0])
        relevance_score += keyword_count * keyword[1]
    return relevance_score

def preprocess_text(text):
    return [word.lower() for word in text.split() if word not in stop_words]

def calculate_text_relevance_tf_idf(title, text, keywords):
    vectorizer = TfidfVectorizer()
    vectorizer.fit([text])

    document_vectors = vectorizer.transform([text])
    keyword_vector = vectorizer.transform([" ".join(keywords)])

    cosine_similarity_score = cosine_similarity(document_vectors, keyword_vector)

    cosine_similarity_score = document_vectors.dot(keyword_vector.transpose())

    # Print relevance scores (similarity values) for each document
    #for i, score in enumerate(cosine_similarity_score.tolist()[0]):
    #    print(f"Relevance score for document {i+1}: {score}")

    return cosine_similarity_score.toarray()[0][0]

def tf_idf_test(text, words_weights):
    # Define your keywords and documents (text as list)
    documents = ["This is a text about machine learning.", "This is another text about natural language processing."]
    # calculate the relevant keywords here

    # Create a dictionary to store your custom term frequencies
    custom_tf = {word: words_weights[word] for word in words_weights if word in words_weights}
    # for doc in documents:
    #     for word in keywords:
    #         if word in doc.lower():  # Case insensitive search
    #             custom_tf[word] += 1  # Increment custom frequency for the keyword

    # print(custom_tf)

    # Convert custom TF to a sparse matrix (using dictionary from scipy)
    from scipy.sparse import csr_matrix

    # Extract data for sparse matrix (row, col, data)
    data = list(custom_tf.values())
    row = [0] * len(data)  # All elements in the first row (since it's a single document)
    col = range(len(keywords))  # Column index for each keyword
    
    # Create custom TF matrix as csr_matrix
    custom_tf_matrix = csr_matrix((data, (row, col)), shape=(1, len(keywords)))

    # Create TF-IDF vectorizer (without calculating TF)
    vectorizer = TfidfVectorizer(use_idf=True)
    
    # Fit the vectorizer on documents to get IDF weights
    vectorizer.fit(documents)
    
    # Get IDF weights as a matrix
    idf_matrix = vectorizer.idf_

    # Combine custom TF and IDF for custom TF-IDF matrix
    custom_tfidf_matrix = custom_tf_matrix * idf_matrix

    cosine_sim = cosine_similarity(custom_tfidf_matrix, custom_tfidf_matrix)
    return cosine_sim.toarray()[0][0]


# calculate_text_relevance(reference_title, reference_text, words_weights)
score = calculate_text_relevance_tf_idf(reference_title, reference_text_2, keywords_tesla['combined'])    
print(score)

score_tfidf_test = tf_idf_test(reference_text_2,words_weights_tesla)
print(score_tfidf_test)

0.2146071724290385


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sebastiantatar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


TypeError: tf_idf_test() takes 1 positional argument but 2 were given

In [85]:
import asyncio

# Needed this code to get the sentiment score:
# https://datascience.stackexchange.com/a/112446
async def get_sentiment_score_text(text, sentiment_score: list):
    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        logits = model(**inputs).logits

    scores = {k: v for k, v in zip(model.config.id2label.values(), scipy.special.softmax(logits.numpy().squeeze()))}

    sentiment_score.append(scores)

    return scores

async def get_sentiment_score_large_text(text):
    sentences = nltk.sent_tokenize(text)

    task_list = []
    
    sentiment_scores = []
        
    for sentence in sentences:
        task_list.append(asyncio.create_task(get_sentiment_score_text(sentence, sentiment_scores)))
        # sentiment_scores.append(sentence_score)

    await asyncio.gather(*task_list)
        
    # Average the sentiment scores
    avg_sentiment_score = {k: sum(d[k] for d in sentiment_scores)/len(sentiment_scores) for k in sentiment_scores[0]}

    return avg_sentiment_score

sentiment_score = await get_sentiment_score_large_text(reference_text)

print(sentiment_score)

{'positive': 0.1865296641078133, 'negative': 0.4256132753112186, 'neutral': 0.3878570661808436}


In [78]:
class NewsMetaRepository:
    def __init__(self, csv_file=None):
        self.connection = duckdb.connect(database=':memory:', read_only=False)
        if os.path.isfile(f'{csv_file}/news_meta.csv'):
            try:
                self.connection.execute(f"IMPORT DATABASE '{csv_file}';")
            except Exception as e:
                self._create_table()
                self.connection.execute(f"EXPORT DATABASE '{csv_file}';")
        else:
            self._create_table()

    def _create_table(self):
        self.connection.execute("CREATE TABLE news_meta (id VARCHAR, title VARCHAR UNIQUE, url VARCHAR, timestamp VARCHAR UNIQUE,term VARCHAR)")
        self.connection.execute("CREATE SEQUENCE id_sequence START 1 INCREMENT BY 1;")

    def insert(self, news_meta):
        self.connection.execute("PREPARE insert_meta AS "
                                "INSERT INTO news_meta VALUES (nextval('id_sequence'), ?, ?, ?, ?) ON CONFLICT DO NOTHING;")
        self.connection.execute(f"EXECUTE insert_meta('{news_meta['title']}', '{news_meta['url']}', '{news_meta['timestamp']}', '{news_meta['term']}');")

    def select_all(self):
        return self.connection.execute("SELECT * FROM news_meta").fetchdf()

    def select_by_id(self, id):
        return self.connection.execute("SELECT * FROM news_meta WHERE id = ?", id).fetchdf()

    def select_by_title(self, title):
        return self.connection.execute("SELECT * FROM news_meta WHERE title = ?", title).fetchdf()

    def select_by_url(self, url):
        return self.connection.execute("SELECT * FROM news_meta WHERE url = ?", url).fetchdf()

    def select_by_term(self, source):
        self.connection.execute("PREPARE select_by_source AS "
                                "SELECT * FROM news_meta WHERE term = ?")
        return self.connection.execute(f"EXECUTE select_by_source('{source}');").fetchdf()
    
    def select_by_date_order(self):
        return self.connection.execute("SELECT * FROM news_meta ORDER BY strptime(timestamp, '%Y-%m-%dT%H:%M:%S%z') DESC").fetchdf()

    def select_by_date(self, date_from, date_to):
        return self.connection.execute(f"SELECT * FROM news_meta WHERE strptime(timestamp, '%Y-%m-%dT%H:%M:%S%z') BETWEEN strptime('{date_from}','%Y-%m-%d') AND strptime('{date_to}','%Y-%m-%d')").fetchdf()

    def delete_all(self):
        self.connection.execute("DELETE FROM news_meta")

    def export(self, csv_file):
        self.connection.execute(f"EXPORT DATABASE '{csv_file}';")

    def close(self):
        self.connection.close()
    
class PredictionInputRepository:
    def __init__(self, csv_file_path=None):
        self.connection = duckdb.connect(database=':memory:', read_only=False)
        if os.path.isfile(f'{csv_file_path}/prediction_input.csv'):
            try:
                self.connection.execute(f"IMPORT DATABASE '{csv_file_path}';")
            except Exception as e:
                self._create_table()
                self.connection.execute(f"EXPORT DATABASE '{csv_file_path}';")
        else:
            self._create_table()

    def _create_table(self):
        self.connection.execute("CREATE TABLE prediction_input (id VARCHAR UNIQUE, articleId INTEGER UNIQUE, timestamp VARCHAR, close DOUBLE, relevance DOUBLE, sentimentPositive DOUBLE, sentimentNegative DOUBLE, sentimentNeutral DOUBLE)")
        self.connection.execute("CREATE SEQUENCE id_sequence START 1 INCREMENT BY 1;")

    def insert(self, prediction_input):
        self.connection.execute("PREPARE insert_prediction_input AS "
                                "INSERT INTO prediction_input VALUES (nextval('id_sequence'), ?, ?, ?, ?, ?, ?, ?) ON CONFLICT DO NOTHING;")
        self.connection.execute(f"EXECUTE insert_prediction_input('{prediction_input['articleId']}', '{prediction_input['timestamp']}', {prediction_input['close']}, {prediction_input['relevance']}, {prediction_input['sentimentPositive']}, {prediction_input['sentimentNegative']}, {prediction_input['sentimentNeutral']});")

    def select_all(self):
        return self.connection.execute("SELECT * FROM prediction_input").fetchdf()

    def select_by_date(self, date_from, date_to):
        return self.connection.execute(f"SELECT * FROM prediction_input WHERE strptime(timestamp, '%Y-%m-%dT%H:%M:%S%z') BETWEEN strptime('{date_from}','%Y-%m-%d') AND strptime('{date_to}','%Y-%m-%d')").fetchdf()

    def export(self, csv_file):
        self.connection.execute(f"EXPORT DATABASE '{csv_file}';")
    

In [82]:
company = "Tesla, Inc."
ticker = "TSLA"
source = "ft"

prediction_input_repo = PredictionInputRepository(f"prediction_input/{source}")

words_weights_score = get_company_keywords("Tesla, Inc.")

def get_date_start_end_ticker(date):
    date_formats = ["%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%dT%H:%M:%S%z"]
    output_format = "%Y-%m-%d"
    
    for date_format in date_formats:
        try:
            start_date = datetime.strptime(date, date_format)
            end_date = start_date + timedelta(days=1)
            return start_date.strftime(output_format), end_date.strftime(output_format)
        except:
            pass
        
    raise Exception("Invalid date format")

def get_date_ticker(date):
    date_formats = ["%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%dT%H:%M:%S%z"]
    output_format = "%Y-%m-%d"

    for date_format in date_formats:
        try:
            date_ticker = datetime.strptime(date, date_format)
            return date_ticker.strftime(output_format)
        except:
            pass

    raise Exception("Invalid date format")

@contextlib.contextmanager
def nostdout():
    save_stdout = sys.stdout
    sys.stdout = io.BytesIO()
    yield
    sys.stdout = save_stdout

async def get_article_file_sentiment_score(article_id, source, article_file, tickers, date):
    ticker = {k.strftime('%Y-%m-%d'): v for k, v in tickers['Close'].to_dict().items()}
    
    if os.path.isfile(f"articles/{source}/{article_file}.txt") and get_date_ticker(date) in ticker:        
        with open(f"articles/{source}/{article_file}.txt", "r") as file:
            text = file.read()
            ticker_value = ticker[get_date_ticker(date)]
            relevance_score = calculate_text_relevance_tf_idf(reference_title, text, keywords_tesla)

            print(f"Article: {article_file} - Relevance Score: {relevance_score}")


            if(relevance_score > 0 and ticker_value is not None):
                pass
                # print(f"Article: {article_file} - Relevance Score: {relevance_score}")

                # sentiment_score = await get_sentiment_score_large_text(text)
                # 
                # 
                # preditction_input = {
                #     "articleId": article_id,
                #     "timestamp": date,
                #     "close": ticker_value,
                #     "relevance": relevance_score,
                #     "sentimentPositive": sentiment_score['positive'],
                #     "sentimentNegative": sentiment_score['negative'],
                #     "sentimentNeutral": sentiment_score['neutral']
                # }           
                # prediction_input_repo.insert(preditction_input) 
            else:
                # print("Text is Irrelevant to topic:  " + article_file + "  :  " + str(relevance_score))
                pass
    else:
        # print(f"File or Ticker does not exist: {source}/{article_file}.txt / {get_date_ticker(date)}")
        return {}

async def calculate_sentiment_score_all_article(source):
    news_meta_repo = NewsMetaRepository(f"meta/{source}")
    news_meta_list =  news_meta_repo.select_by_date_order()
    
    def get_file_name(id):
        title = news_meta_list['title'][id].replace(' ','-').replace("/","-")
        return f"{title}-{parser.parse(news_meta_list['timestamp'][id]).timestamp()}"

    for i in range(0, len(news_meta_list))[:150:10]:
        task_list = []
        start_date = get_date_ticker(news_meta_list['timestamp'][i+9])
        end_date = get_date_ticker(news_meta_list['timestamp'][i])
        tickers = yf.download(ticker, start=start_date, end=end_date, progress=False).drop(['Open', 'High', 'Low', 'Volume', 'Adj Close'], axis=1)
        
        print(start_date, end_date)

        # print(tickers)

        for idx in range(0,10 if len(news_meta_list) > i + 10 else len(news_meta_list) % 10):
            file_name = get_file_name(i+idx)
            
            task_list.append(
                asyncio.create_task(
                    get_article_file_sentiment_score(
                        news_meta_list['id'][i+idx],
                        source,
                        file_name,
                        tickers,
                        news_meta_list['timestamp'][i+idx])
                )
            )
            
        await asyncio.gather(*task_list)
        
    prediction_input_repo.export(f"prediction_input/{source}")
    
await calculate_sentiment_score_all_article("ft")

2024-06-16 2024-06-20
Article: Fisker-collapse-shows-difficult-tech-can’t-find-easy-money-now-1718732675.0 - Relevance Score: 0.0
Article: Electric-car-maker-Fisker-files-for-bankruptcy-1718711313.0 - Relevance Score: 0.0
Article: IMF-warns-of-massive-labour-disruption-from-AI-1718644525.0 - Relevance Score: 0.0
Article: Asset-Management:-Janus-Henderson’s-tentative-turnaround-1718600413.0 - Relevance Score: 0.008129006746050079
Article: Zhu-Min:-People-talk-about-‘overcapacity’ . . . but-EVs-are-just-evolving-fast-1718596813.0 - Relevance Score: 0.0
2024-06-14 2024-06-16
Article: Raspberry-Pi-offers-London-a-morsel-of-hope-1718385341.0 - Relevance Score: 0.012568925295997527
Article: FirstFT:-Musk-talks-of-AI-and-humanoid-robots-after-vote-win-1718361518.0 - Relevance Score: 0.0
Article: Dan-Davies:-‘finance-is-a-tool-of-control’-1718343022.0 - Relevance Score: 0.0
Article: Transcript:-The-target-painted-on-Chinese-electric-vehicles-1718337623.0 - Relevance Score: 0.0
Article: What-I-