In [2]:
# Load libraries
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Return series for the two stocks, one value per period
stock_a_returns = [0.02, -0.01, 0.03, 0.01, -0.02]
stock_b_returns = [0.03, 0.01, 0.02, 0.00, -0.01]

# Wrap both lists as NumPy arrays so the distance cells can use vectorized arithmetic
stock_a, stock_b = (
    np.asarray(series) for series in (stock_a_returns, stock_b_returns)
)

In [4]:
# Euclidean (L2) distance: square root of the summed squared element-wise differences
euclidean_distance = ((stock_a - stock_b) ** 2).sum()
euclidean_distance = np.sqrt(euclidean_distance)
print(f"Euclidean Distance: {euclidean_distance}")

Euclidean Distance: 0.0282842712474619


In [5]:
# Manhattan (L1) distance: sum of the absolute element-wise differences
manhattan_distance = np.abs(stock_a - stock_b).sum()
print(f"Manhattan Distance: {manhattan_distance}")

Manhattan Distance: 0.06


In [6]:
# Sample headlines
headlines = [
    "Acme Corp. announces record profits, stock surges.",
    "Acme Corp. faces regulatory scrutiny, shares decline.",
    "Market volatility impacts Acme Corp. earnings.",
    "Acme Corp. expands into new markets, analysts optimistic."
]

# Create a TfidfVectorizer that drops common English stop words
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and IDF weights and build the TF-IDF matrix in one pass.
# fit_transform() is equivalent to calling fit() and then transform() on the
# same documents, but it does not tokenize the corpus twice.
tfidf_matrix = vectorizer.fit_transform(headlines)

# Get the feature names (words); their order matches the columns of tfidf_matrix
feature_names = vectorizer.get_feature_names_out()

In [7]:
# Print the vocabulary terms returned by get_feature_names_out().
# A term's position in this array is the column index used for it in tfidf_matrix.
print(feature_names)

['acme' 'analysts' 'announces' 'corp' 'decline' 'earnings' 'expands'
 'faces' 'impacts' 'market' 'markets' 'new' 'optimistic' 'profits'
 'record' 'regulatory' 'scrutiny' 'shares' 'stock' 'surges' 'volatility']


In [8]:
# Look up the TF-IDF weight of a single term in a single headline
headline_index = 0  # row 0 is the first headline
word_index = list(feature_names).index('profits')  # column position of the term "profits"

# Read the (row, column) entry from the matrix and report it
tfidf_score = tfidf_matrix[headline_index, word_index]
print(f"TF-IDF score for 'profits' in the first headline: {tfidf_score}")

TF-IDF score for 'profits' in the first headline: 0.42468159315633897


In [9]:
# Print the TF-IDF matrix as a dense array via toarray():
# one row per headline, one column per entry in feature_names.
print(tfidf_matrix.toarray())

[[0.22161647 0.         0.42468159 0.22161647 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.42468159 0.42468159 0.         0.         0.
  0.42468159 0.42468159 0.        ]
 [0.22161647 0.         0.         0.22161647 0.42468159 0.
  0.         0.42468159 0.         0.         0.         0.
  0.         0.         0.         0.42468159 0.42468159 0.42468159
  0.         0.         0.        ]
 [0.24478737 0.         0.         0.24478737 0.         0.46908376
  0.         0.         0.46908376 0.46908376 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.46908376]
 [0.22161647 0.42468159 0.         0.22161647 0.         0.
  0.42468159 0.         0.         0.         0.42468159 0.42468159
  0.42468159 0.         0.         0.         0.         0.
  0.         0.         0.        ]]


In [10]:
# Create one generic label per row of the TF-IDF matrix ("Headline1", "Headline2", ...).
# Deriving the labels from the matrix keeps them in sync with the corpus; a
# hard-coded list of the wrong length makes the DataFrame constructor raise ValueError.
headline_labels = [f"Headline{i}" for i in range(1, tfidf_matrix.shape[0] + 1)]

# Create a Pandas DataFrame: rows are headlines, columns are vocabulary terms
pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names, index=headline_labels)

Unnamed: 0,acme,analysts,announces,corp,decline,earnings,expands,faces,impacts,market,...,new,optimistic,profits,record,regulatory,scrutiny,shares,stock,surges,volatility
Headline1,0.221616,0.0,0.424682,0.221616,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.424682,0.424682,0.0,0.0,0.0,0.424682,0.424682,0.0
Headline2,0.221616,0.0,0.0,0.221616,0.424682,0.0,0.0,0.424682,0.0,0.0,...,0.0,0.0,0.0,0.0,0.424682,0.424682,0.424682,0.0,0.0,0.0
Headline3,0.244787,0.0,0.0,0.244787,0.0,0.469084,0.0,0.0,0.469084,0.469084,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.469084
Headline4,0.221616,0.424682,0.0,0.221616,0.0,0.0,0.424682,0.0,0.0,0.0,...,0.424682,0.424682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
