In [1]:
import ast
import json
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px

from bertopic import BERTopic
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score
)

import umap.umap_ as umap


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv("../Data/raw/df_oilnews.csv")
df

Unnamed: 0,title,Date,excerpt
0,April Price Crash Dragged Saudi Arabia’s Oil R...,2025-06-25,Saudi Arabia’s revenues from oil exports crash...
1,Giant Leviathan Gas Field Offshore Israel Resu...,2025-06-25,The massive Leviathan gas field offshore Israe...
2,China and India Cut Imports of Lower-Quality C...,2025-06-25,The world’s biggest and second-biggest coal im...
3,Iran-Israel War Prompts China to Reconsider Ru...,2025-06-25,The war between Israel and Iran has spark worr...
4,EU Set to Change Subsidy Rules for Energy Costs,2025-06-25,National governments in the EU would soon be a...
...,...,...,...
23415,Australia's Desalinization Plant Workers in In...,2011-06-20,Victoria state’s troubled Wonthaggi desalinati...
23416,Chinese Energy Workers in Somalia Threatened,2011-06-19,The Ogaden National Liberation Front has warne...
23417,Argentina Now Receiving 40 Percent of Chinese ...,2011-06-19,In Argentina Mandarin Chinese is now the main ...
23418,Chinese Dam and Pipeline Projects Raise Burmes...,2011-06-18,Lucrative China-backed hydropower projects are...


# SentenceTransformer’s all-MiniLM-L6-v2  Embeddings

In [78]:
df['text'] = df['title'].astype(str) + " " + df['excerpt'].astype(str)

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['text'].tolist(), show_progress_bar=True)

reducer_2d = umap.UMAP(n_components=2, random_state=42)
embedding_2d = reducer_2d.fit_transform(embeddings)

reducer_3d = umap.UMAP(n_components=3, random_state=42)
embedding_3d = reducer_3d.fit_transform(embeddings)

df['embedding_2d_1'] = embedding_2d[:,0]
df['embedding_2d_2'] = embedding_2d[:,1]

df['embedding_3d_1'] = embedding_3d[:,0]
df['embedding_3d_2'] = embedding_3d[:,1]
df['embedding_3d_3'] = embedding_3d[:,2]

df['embedding_original'] = [json.dumps(vec.tolist()) for vec in embeddings]

df.to_csv("news_embeddings_SentenceTransformer.csv", index=False)


Batches: 100%|██████████| 2/2 [00:22<00:00, 11.13s/it]


In [4]:
df_embeddings = pd.read_csv("../data/raw/news_embeddings_all.csv")
df_embeddings.head()

Unnamed: 0,title,Date,excerpt,text,embedding_2d_1,embedding_2d_2,embedding_3d_1,embedding_3d_2,embedding_3d_3,embedding_original
0,April Price Crash Dragged Saudi Arabia’s Oil R...,2025-06-25,Saudi Arabia’s revenues from oil exports crash...,April Price Crash Dragged Saudi Arabia’s Oil R...,3.346497,2.808099,3.267687,3.479787,0.565045,"[0.012225907295942307, 0.009778764098882675, 0..."
1,Giant Leviathan Gas Field Offshore Israel Resu...,2025-06-25,The massive Leviathan gas field offshore Israe...,Giant Leviathan Gas Field Offshore Israel Resu...,-0.213392,1.394997,0.274615,3.825916,1.1388,"[-0.030076706781983376, 0.041850119829177856, ..."
2,China and India Cut Imports of Lower-Quality C...,2025-06-25,The world’s biggest and second-biggest coal im...,China and India Cut Imports of Lower-Quality C...,0.378566,7.226022,0.11828,6.579502,0.893696,"[-0.01591774821281433, 0.022667916491627693, 0..."
3,Iran-Israel War Prompts China to Reconsider Ru...,2025-06-25,The war between Israel and Iran has spark worr...,Iran-Israel War Prompts China to Reconsider Ru...,-2.652488,4.226569,-0.139149,4.655896,-2.405691,"[-0.050621889531612396, 0.09657188504934311, 0..."
4,EU Set to Change Subsidy Rules for Energy Costs,2025-06-25,National governments in the EU would soon be a...,EU Set to Change Subsidy Rules for Energy Cost...,-0.754677,8.381176,0.2958,8.213206,-0.175206,"[-0.06337245553731918, 0.02446914277970791, 0...."


#### Download original embeddings

In [5]:
df_embeddings['embedding_list'] = df_embeddings['embedding_original'].apply(ast.literal_eval)
embeddings_df = df_embeddings['embedding_list'].apply(pd.Series)
embeddings_df.columns = [f'emb_{i}' for i in range(embeddings_df.shape[1])]

df_emb_only = pd.concat([df_embeddings['Date'].reset_index(drop=True), embeddings_df], axis=1)

df_emb_only.to_csv("news_embeddings_only.csv", index=False)
df_emb_only.head()


Unnamed: 0,Date,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_374,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383
0,2025-06-25,0.012226,0.009779,0.089478,0.03942,0.069494,-0.012917,-0.039063,0.065813,0.01345,...,0.034876,-0.075225,-0.007969,0.041977,-0.094294,-0.054952,-0.008179,-0.04387,-0.040614,0.070509
1,2025-06-25,-0.030077,0.04185,0.033797,-0.011776,0.036505,-0.059832,-0.076533,-0.057696,-0.099415,...,0.052465,-0.005157,-0.074812,0.008974,-0.075472,0.069592,0.046688,-0.105226,-0.015004,-0.024379
2,2025-06-25,-0.015918,0.022668,0.012009,0.018269,0.043291,-0.007038,-0.003583,0.028159,-0.106706,...,-0.060737,0.052323,-0.055896,0.03946,-0.032319,-0.047179,0.048545,-0.04451,-0.035724,0.00882
3,2025-06-25,-0.050622,0.096572,0.027278,-0.001633,-0.048082,-0.019272,-0.006105,-0.013875,-0.04585,...,0.094076,0.086139,-0.038172,-0.025026,-0.001544,0.081518,0.062285,-0.032229,-0.024803,-0.051109
4,2025-06-25,-0.063372,0.024469,0.042302,0.00216,0.039314,0.008351,-0.033816,-0.011336,-0.07827,...,0.010018,-0.026637,-0.017536,0.017305,-0.022039,-0.008785,-0.000308,-0.025894,0.044644,0.010052


In [6]:
df_emb_daily = df_emb_only.groupby('Date').mean().reset_index()

In [7]:
df_emb_daily

Unnamed: 0,Date,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_374,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383
0,2011-06-18,-0.035144,0.060475,0.006938,0.004392,0.015173,0.005281,-0.001809,-0.050903,-0.021274,...,-0.004025,0.047493,0.002554,0.002386,-0.029448,-0.001158,0.020478,-0.012183,0.040391,-0.044397
1,2011-06-19,-0.022826,0.054531,0.038822,0.046872,0.007901,0.037586,0.030731,-0.040563,0.013673,...,0.003736,-0.003391,-0.061782,0.002008,-0.022446,0.049404,0.044936,-0.104144,-0.000230,0.031177
2,2011-06-20,-0.021295,0.020053,-0.009434,0.045761,0.033256,-0.036175,-0.019319,-0.018808,-0.035898,...,0.007613,0.000529,-0.010455,0.012782,-0.062081,-0.026064,0.030625,-0.070964,-0.017090,0.024548
3,2011-06-21,-0.038333,0.039417,-0.024327,0.030875,0.027279,-0.032010,0.017563,-0.019469,-0.053186,...,0.033335,0.050554,-0.003798,-0.027531,0.008136,0.005728,0.008879,-0.030951,-0.020833,-0.049090
4,2011-06-22,-0.036751,0.003933,-0.010863,-0.010877,-0.018912,-0.031680,-0.006836,-0.023823,-0.033432,...,0.025880,0.040041,-0.006550,-0.074165,-0.038537,0.003880,-0.015516,-0.070564,0.003571,-0.036808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3572,2025-06-20,-0.037182,0.011349,0.030959,0.011617,0.020311,0.006761,-0.015680,0.025290,-0.019018,...,0.013838,0.026125,-0.022492,0.013979,-0.013695,-0.011843,0.003809,-0.055833,-0.006721,0.060191
3573,2025-06-22,0.012624,-0.053553,-0.008667,0.066812,0.060872,-0.065935,-0.045099,0.036855,0.035143,...,0.070727,-0.005268,-0.081745,-0.024200,-0.086506,-0.020197,0.019313,-0.033416,-0.028837,0.054455
3574,2025-06-23,-0.000695,0.035022,0.023385,0.021639,0.010082,-0.031147,-0.006338,0.002231,-0.020901,...,0.038239,0.018452,-0.043777,-0.011292,-0.003047,-0.001206,-0.002574,-0.056079,-0.002287,0.025008
3575,2025-06-24,-0.040735,-0.023293,0.064951,0.042227,0.019268,-0.014399,-0.057177,0.017907,-0.029223,...,0.046521,0.000406,-0.035632,0.010374,-0.038029,-0.009822,-0.026992,-0.053681,-0.003189,0.013624


In [33]:
df_emb_daily.to_csv("news_embeddings_only_final.csv", index=False)

In [25]:
o = pd.read_csv('../Data/raw/news_embeddings_only.csv')

## Cluster Analysis

In [9]:
def evaluate_k(X, k):
    km = KMeans(n_clusters=k, random_state=42).fit(X)
    labels = km.labels_
    return {
        "inertia": km.inertia_,
        "silhouette": silhouette_score(X, labels),
        "ch": calinski_harabasz_score(X, labels),
        "db": davies_bouldin_score(X, labels)
    }
df_subset = df_embeddings
results = []
X = df_subset[['embedding_2d_1', 'embedding_2d_2']].values
for k in range(2, 25):
    scores = evaluate_k(X, k)
    scores["k"] = k
    results.append(scores)

df_scores = pd.DataFrame(results)
print(df_scores)


          inertia  silhouette            ch        db   k
0   250541.200316    0.407147  20271.105266  0.968333   2
1   167117.424472    0.398671  21040.585782  0.863620   3
2   130819.705878    0.378481  20082.947436  0.979217   4
3   102658.618667    0.376243  20798.943365  0.899909   5
4    85163.742097    0.374934  21018.392142  0.854643   6
5    70666.586071    0.396578  21908.877009  0.793519   7
6    60754.802108    0.380405  22387.377237  0.837069   8
7    54425.757396    0.377273  22205.993279  0.858078   9
8    49700.684768    0.375693  21861.836009  0.856645  10
9    47735.752382    0.376743  20581.021212  0.824407  11
10   38756.373215    0.411448  23536.817773  0.732525  12
11   32814.412058    0.431818  25834.713619  0.734784  13
12   30075.537918    0.425368  26181.836122  0.748353  14
13   28555.057495    0.430527  25694.077939  0.725953  15
14   26167.693308    0.433137  26309.907423  0.695006  16
15   23785.350660    0.438106  27281.015991  0.706038  17
16   22372.780

Based on the three key internal metrics:

| k  | Silhouette Score ↑ | Calinski–Harabasz Index ↑ | Davies–Bouldin Index ↓ |
|---:|-------------------:|--------------------------:|-----------------------:|
| 24 |          **0.4227** |               **14211.80** |             **0.8121** |
| 20 |            0.4053   |                13574.60   |              0.8472    |

- **k = 24** clearly optimizes all metrics (highest silhouette and CH, lowest DB), indicating the most compact and well-separated clusters.
- **k = 20** is an attractive trade-off:  
  - **Silhouette** remains high at 0.405 (only a ~4% drop),  
  - **Calinski–Harabasz** is strong at 13 574.6,  
  - **Davies–Bouldin** is still low at 0.847.  

Choosing **20 clusters** gives nearly the same clustering quality as **24**, while reducing complexity and making the results easier to interpret.
```


In [None]:
df_subset = df_embeddings

X = df_subset[['embedding_2d_1', 'embedding_2d_2']].values
k = 20
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)
df_subset['cluster'] = clusters.astype(str)

big_palette = px.colors.qualitative.Safe + px.colors.qualitative.Bold + px.colors.qualitative.Light24
colors_for_clusters = big_palette[:k]

fig = px.scatter(
    df_subset,
    x='embedding_2d_1',
    y='embedding_2d_2',
    color='cluster',
    color_discrete_sequence=colors_for_clusters,
    hover_name='title',
    title=f'Cluster analysis con {k} cluster',
    width=900,
    height=700
)

fig.update_traces(marker=dict(size=8, opacity=0.7))
fig.show()


In [11]:
def evaluate_k(X, k):
    km = KMeans(n_clusters=k, random_state=42).fit(X)
    labels = km.labels_
    return {
        "inertia": km.inertia_,
        "silhouette": silhouette_score(X, labels),
        "ch": calinski_harabasz_score(X, labels),
        "db": davies_bouldin_score(X, labels)
    }

results = []
X = df_subset[['embedding_3d_1', 'embedding_3d_2', 'embedding_3d_3']].values
for k in range(2, 25):
    scores = evaluate_k(X, k)
    scores["k"] = k
    results.append(scores)

df_scores = pd.DataFrame(results)
print(df_scores)


          inertia  silhouette            ch        db   k
0   188599.049498    0.332208  12970.904846  1.255682   2
1   161299.426418    0.293355   9564.567753  1.476265   3
2   109699.816632    0.345519  13046.507586  1.080721   4
3    94257.181119    0.326286  12346.737046  1.076788   5
4    81748.895703    0.325031  12104.628478  1.098057   6
5    71051.510503    0.340458  12192.903469  1.014121   7
6    63967.934693    0.347589  11978.267634  0.945744   8
7    55934.130000    0.364417  12406.116370  0.912172   9
8    51606.455064    0.358840  12170.248491  0.952095  10
9    47546.403323    0.371174  12087.806930  0.904346  11
10   45290.913198    0.356604  11641.613779  0.962959  12
11   40389.844861    0.375112  12202.580769  0.927777  13
12   38602.249743    0.373962  11868.354524  0.902655  14
13   35703.240487    0.371713  12050.683647  0.920867  15
14   32643.812775    0.384359  12447.108252  0.945362  16
15   31437.439216    0.376030  12172.571273  0.952919  17
16   27804.566

### Cluster Semantics


It extracts the top representative keywords for each cluster by computing TF-IDF scores on the combined titles within each cluster, excluding common stopwords.


In [5]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

def descrivi_cluster(df, n_top_words=20):
    descrizioni = {}
    for cluster_id in sorted(df['cluster'].unique()):
        testi = df[df['cluster'] == cluster_id]['title'].values
        testo_unito = " ".join(testi)
        vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=1000)
        tfidf_matrix = vectorizer.fit_transform([testo_unito])
        feature_names = vectorizer.get_feature_names_out()
        scores = tfidf_matrix.toarray()[0]
        top_indices = scores.argsort()[-n_top_words:][::-1]
        top_words = [feature_names[i] for i in top_indices]
        descrizioni[cluster_id] = ", ".join(top_words)
    return descrizioni

descrizioni = descrivi_cluster(df_subset)
for cluster, parole in descrizioni.items():
    print(f"Cluster {cluster}: {parole}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saraborello/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cluster 0: russia, oil, russian, gas, ukraine, gazprom, eu, sanctions, pipeline, energy, exports, rosneft, stream, nord, europe, price, could, deal, crude, new
Cluster 1: shell, oil, billion, energy, gas, eni, africa, glencore, south, gold, assets, major, refinery, project, court, bhp, coal, prices, production, chevron
Cluster 10: oil, saudi, opec, aramco, arabia, production, billion, prices, cuts, uae, ipo, energy, output, saudis, gas, deal, bpd, kuwait, cut, crude
Cluster 11: oil, china, india, crude, imports, demand, fuel, chinese, russian, refiners, russia, indian, high, record, prices, refinery, gas, exports, refining, set
Cluster 12: venezuela, oil, venezuelan, pdvsa, sanctions, maduro, citgo, exports, crude, chevron, production, cuba, russia, deal, refinery, could, trump, imports, new, us
Cluster 13: oil, prices, demand, gasoline, opec, crude, price, eia, production, sees, set, year, forecast, brent, fall, iea, supply, weekly, global, rise
Cluster 14: energy, gas, uk, power, nuc

It applies BERTopic to extract the main topic keywords for each cluster by modeling semantic topics from the titles within each cluster.


In [None]:
descrizioni_cluster = {}

for cluster_id in sorted(df_subset['cluster'].unique()):
    testi_cluster = df_subset[df_subset['cluster'] == cluster_id]['title'].tolist()

    topic_model = BERTopic(language="english")
    topics, probs = topic_model.fit_transform(testi_cluster)
    parole_chiave = topic_model.get_topic(0)

    descrizioni_cluster[cluster_id] = parole_chiave


for cluster_id, parole in descrizioni_cluster.items():
    print(f"Cluster {cluster_id}:")
    print(", ".join([word for word, _ in parole]))
    print("-" * 50)


Cluster 0:
gazprom, gas, neft, gazproms, to, in, with, europe, for, natural
--------------------------------------------------
Cluster 1:
energy, africas, south, africa, power, kenya, african, to, renewable, hydroelectric
--------------------------------------------------
Cluster 10:
aramco, ipo, saudi, to, in, for, billion, aramcos, listing, the
--------------------------------------------------
Cluster 11:
russian, india, russia, sanctions, indias, crude, oil, russias, of, indian
--------------------------------------------------
Cluster 12:
pdvsa, to, debt, venezuelas, in, for, bond, bonds, payment, caribbean
--------------------------------------------------
Cluster 13:
gasoline, us, prices, gas, the, to, in, fall, fuel, diesel
--------------------------------------------------
Cluster 14:
wind, offshore, turbine, farm, power, farms, in, the, turbines, worlds
--------------------------------------------------
Cluster 15:
oil, canadas, canadian, trade, us, as, canada, prices, defici

#### Results

| Cluster | Topic Definition                                                                                                                             |
| ------- | -------------------------------------------------------------------------------------------------------------------------------------------- |
| 0       | Russian oil and gas sector including Gazprom, Nord Stream pipeline, EU sanctions, export dynamics, pricing and deals amid Ukraine conflict.  |
| 1       | Major energy companies (Shell, Eni, Glencore, Chevron) operations in Africa, including oil, gas, coal production, assets, and legal matters. |
| 2       | Large oil and gas producers like BP, Exxon, Chevron active in Permian Basin shale production, earnings, and asset management.                |
| 3       | Electric vehicles and battery market developments focusing on Tesla, lithium, EV sales in China and global markets.                          |
| 4       | Oil exports and pipeline security in Iraq, Kurdistan, and surrounding regions with geopolitical tensions involving Iran, Turkey, and ISIS.   |
| 5       | Nigeria’s oil and gas production including offshore fields, pipelines, OPEC participation, and regional output dynamics.                     |
| 6       | LNG trade and energy projects involving Qatar, China, Australia, coal markets, and international supply contracts.                           |
| 7       | Mexico’s oil and gas sector including Pemex operations, US-Mexico energy relations, fracking, drilling bans, and regulatory policies.        |
| 8       | Global energy transition issues, solar power, coal use, climate change impacts, emissions, and fossil fuel demand worldwide.                 |
| 9       | US oil market inventory dynamics, crude oil price fluctuations, API reports, gasoline stock movements, and supply surprises.                 |
| 10      | Saudi Arabia and OPEC production, Aramco’s IPO and billion-dollar valuations, UAE and Kuwait quota deals and market impact.                  |
| 11      | China and India’s crude oil demand and imports, Russian energy relations, refinery operations, and trade under sanctions.                    |
| 12      | Venezuela’s PDVSA oil production, sanctions impact, Maduro government, Citgo operations, and international oil deals.                        |
| 13      | Global oil price trends including gasoline demand, OPEC production forecasts, EIA and IEA reports, and supply-demand analysis.               |
| 14      | European energy sector with emphasis on UK, Germany, nuclear and wind power, offshore wind farms, natural gas, and coal markets.             |
| 15      | Canadian oil production and pipeline infrastructure (Keystone, Trans Mountain), trade relations with US, and energy export issues.           |
| 16      | Iranian and Russian oil and gas exports, OPEC quotas, sanctions effects, nuclear talks, and production cuts with geopolitical context.       |
| 17      | Libyan oil production, export challenges due to protests, port closures, force majeure declarations, and field operations.                   |
| 18      | Petrobras and South American energy sector including Brazil, Argentina, shale plays like Vaca Muerta, and regional oil deals.                |
| 19      | North Sea oil and gas industry, UK and Norway energy production, tax policies, Arctic drilling, Equinor activities, and windfall taxes.      |


### News over years

In [6]:
df_subset['year'] = pd.to_datetime(df_subset['Date']).dt.year.astype(str)
X = df_subset[['embedding_2d_1', 'embedding_2d_2']].values

k = 20
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)

df_subset['cluster'] = clusters.astype(str)
big_palette = px.colors.qualitative.Safe + px.colors.qualitative.Bold + px.colors.qualitative.Light24
colors_for_clusters = big_palette[:k]

fig = px.scatter(
    df_subset,
    x='embedding_2d_1',
    y='embedding_2d_2',
    color='cluster',
    animation_frame='year',
    color_discrete_sequence=colors_for_clusters,
    hover_name='title',
    title=f'Cluster analysis con {k} cluster animata per anno',
    width=900,
    height=700
)

fig.update_traces(marker=dict(size=8, opacity=0.7))
fig.show()


In [8]:
df_subset.to_csv('../Data/raw/embeddings_CA_ST.csv')

In [10]:
df_subset.head()

Unnamed: 0,title,Date,excerpt,text,embedding_2d_1,embedding_2d_2,embedding_3d_1,embedding_3d_2,embedding_3d_3,embedding_original,cluster
0,April Price Crash Dragged Saudi Arabia’s Oil R...,2025-06-25,Saudi Arabia’s revenues from oil exports crash...,April Price Crash Dragged Saudi Arabia’s Oil R...,3.346497,2.808099,3.267687,3.479787,0.565045,"[0.012225907295942307, 0.009778764098882675, 0...",10
1,Giant Leviathan Gas Field Offshore Israel Resu...,2025-06-25,The massive Leviathan gas field offshore Israe...,Giant Leviathan Gas Field Offshore Israel Resu...,-0.213392,1.394997,0.274615,3.825916,1.1388,"[-0.030076706781983376, 0.041850119829177856, ...",4
2,China and India Cut Imports of Lower-Quality C...,2025-06-25,The world’s biggest and second-biggest coal im...,China and India Cut Imports of Lower-Quality C...,0.378566,7.226022,0.11828,6.579502,0.893696,"[-0.01591774821281433, 0.022667916491627693, 0...",6
3,Iran-Israel War Prompts China to Reconsider Ru...,2025-06-25,The war between Israel and Iran has spark worr...,Iran-Israel War Prompts China to Reconsider Ru...,-2.652488,4.226569,-0.139149,4.655896,-2.405691,"[-0.050621889531612396, 0.09657188504934311, 0...",0
4,EU Set to Change Subsidy Rules for Energy Costs,2025-06-25,National governments in the EU would soon be a...,EU Set to Change Subsidy Rules for Energy Cost...,-0.754677,8.381176,0.2958,8.213206,-0.175206,"[-0.06337245553731918, 0.02446914277970791, 0....",14


## Feature Creation

### Mean

In [None]:
import pandas as pd

df_embeddings['Date'] = pd.to_datetime(df_embeddings['Date'])

df_daily_mean = df_embeddings.groupby(df_embeddings['Date'].dt.date).agg({
    'embedding_2d_1': 'mean',
    'embedding_2d_2': 'mean',
    'embedding_3d_1': 'mean',
    'embedding_3d_2': 'mean',
    'embedding_3d_3': 'mean'
}).reset_index()

df_daily_mean.rename(columns={'Date': 'date'}, inplace=True)
df_daily_mean.rename(columns={
    col: f"{col}_mean" for col in df_daily_mean.columns if col not in ['date']
}, inplace=True)

df_daily_mean.head()


Unnamed: 0,date,embedding_2d_1_mean,embedding_2d_2_mean,embedding_3d_1_mean,embedding_3d_2_mean,embedding_3d_3_mean
0,2011-06-18,-0.935101,5.024511,0.21543,5.237488,-0.66889
1,2011-06-19,-1.985589,4.556999,-0.696016,5.859865,2.020602
2,2011-06-20,-1.999046,5.132574,-1.021147,5.45929,0.582733
3,2011-06-21,-0.993401,3.384487,0.281058,4.571217,0.721057
4,2011-06-22,-0.394963,4.760837,0.561471,5.280361,-0.026019


In [13]:
df_daily_mean

Unnamed: 0,date,embedding_2d_1_mean,embedding_2d_2_mean,embedding_3d_1_mean,embedding_3d_2_mean,embedding_3d_3_mean
0,2011-06-18,-0.935101,5.024511,0.215430,5.237488,-0.668890
1,2011-06-19,-1.985589,4.556999,-0.696016,5.859865,2.020602
2,2011-06-20,-1.999046,5.132574,-1.021147,5.459290,0.582733
3,2011-06-21,-0.993401,3.384487,0.281058,4.571217,0.721057
4,2011-06-22,-0.394963,4.760837,0.561471,5.280361,-0.026019
...,...,...,...,...,...,...
3572,2025-06-20,0.450951,6.634312,1.347050,6.824654,0.515441
3573,2025-06-22,5.550936,3.817274,4.889119,4.761684,-0.301483
3574,2025-06-23,0.913494,5.770122,1.171993,5.552404,0.877928
3575,2025-06-24,2.259547,7.123854,2.163190,7.152474,1.082229


#### Plots

In [14]:
import plotly.graph_objects as go

fig_2d = go.Figure()
fig_2d.add_trace(go.Scatter(x=df_daily_mean['date'], y=df_daily_mean['embedding_2d_1_mean'], mode='lines', name='embedding_2d_1'))
fig_2d.add_trace(go.Scatter(x=df_daily_mean['date'], y=df_daily_mean['embedding_2d_2_mean'], mode='lines', name='embedding_2d_2'))

fig_2d.update_layout(
    title='Average Daily Embeddings 2D',
    xaxis_title='Date',
    yaxis_title='Embedding Value',
    width=900,
    height=500
)
fig_2d.show()


### Daily Dispersion

In [None]:
df_embeddings['Date'] = pd.to_datetime(df_embeddings['Date'])
df_daily_var = df_embeddings.groupby(df_embeddings['Date'].dt.date).agg({
    'embedding_2d_1': 'var',
    'embedding_2d_2': 'var',
    'embedding_3d_1': 'var',
    'embedding_3d_2': 'var',
    'embedding_3d_3': 'var'
}).reset_index()

df_daily_var.rename(columns={'Date': 'date'}, inplace=True)
df_daily_var.rename(columns={
    col: f"{col}_var_daily" for col in df_daily_var.columns if col not in ['date']
}, inplace=True)
df_daily_var['semantic_dispersion_2d'] = df_daily_var[['embedding_2d_1_var_daily', 'embedding_2d_2_var_daily']].mean(axis=1, skipna=True)
df_daily_var['semantic_dispersion_3d'] = df_daily_var[['embedding_3d_1_var_daily', 'embedding_3d_2_var_daily', 'embedding_3d_3_var_daily']].mean(axis=1, skipna=True)
df_daily_var.head()


Unnamed: 0,date,embedding_2d_1_var_daily,embedding_2d_2_var_daily,embedding_3d_1_var_daily,embedding_3d_2_var_daily,embedding_3d_3_var_daily,semantic_dispersion_2d,semantic_dispersion_3d
0,2011-06-18,5.422055,0.747519,0.4585,0.192979,6.055534,3.084787,2.235671
1,2011-06-19,12.808491,11.313642,3.519935,0.322372,3.334505,12.061066,2.392271
2,2011-06-20,3.14386,6.098374,0.469789,1.499433,4.469719,4.621117,2.146314
3,2011-06-21,0.628568,8.030444,0.027787,0.557885,6.004919,4.329506,2.196864
4,2011-06-22,2.521376,12.938322,0.734488,4.668383,4.121398,7.729849,3.174756


In [18]:
df_daily_var.fillna(0)

Unnamed: 0,date,embedding_2d_1_var_daily,embedding_2d_2_var_daily,embedding_3d_1_var_daily,embedding_3d_2_var_daily,embedding_3d_3_var_daily,semantic_dispersion_2d,semantic_dispersion_3d
0,2011-06-18,5.422055,0.747519,0.458500,0.192979,6.055534,3.084787,2.235671
1,2011-06-19,12.808491,11.313642,3.519935,0.322372,3.334505,12.061066,2.392271
2,2011-06-20,3.143860,6.098374,0.469789,1.499433,4.469719,4.621117,2.146314
3,2011-06-21,0.628568,8.030444,0.027787,0.557885,6.004919,4.329506,2.196864
4,2011-06-22,2.521376,12.938322,0.734488,4.668383,4.121398,7.729849,3.174756
...,...,...,...,...,...,...,...,...
3572,2025-06-20,5.794191,5.538606,1.460968,4.354188,3.771403,5.666398,3.195520
3573,2025-06-22,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3574,2025-06-23,6.112049,7.353633,3.670140,4.178602,1.794649,6.732841,3.214463
3575,2025-06-24,10.430340,12.050490,6.055853,3.920213,3.069129,11.240415,4.348398


#### plots

In [None]:
fig_2d = px.line(df_daily_var,
                 x='date',
                 y=['embedding_2d_1_var_daily', 'embedding_2d_2_var_daily'],
                 title='Daily Variance of 2D Embedding Components',
                 labels={'value': 'Variance', 'date': 'Date', 'variable': 'Component'})
fig_2d.update_layout(template='plotly_white', width=900, height=400)
fig_2d.update_yaxes(range=[0, 90])
fig_2d.show()


In [None]:
fig_sem_2d = px.line(df_daily_var,
                     x='date',
                     y='semantic_dispersion_2d',
                     title='Daily Semantic Dispersion (2D)',
                     labels={'semantic_dispersion_2d': 'Dispersion', 'date': 'Date'})
fig_sem_2d.update_layout(template='plotly_white', width=900, height=400)
fig_sem_2d.show()


### Cosine, Drift, Drift Velocity



| Column                    | Description                                                                                                                                                                                  |
| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 **embedding\_mean**       | Mean embedding vector calculated by aggregating all original embeddings of the day's news items                                                                                              |
| **cosine\_sim**           | Cosine similarity between the mean embedding of the current day and the previous day (value between 0 and 1). Closer to 1 = little change; closer to 0 = large semantic change               |
| **drift**                 | Measure of semantic change: `1 - cosine_sim`. High drift means a large variation between days                                                                                                |
| **drift\_velocity**       | Average speed of semantic change calculated as a rolling mean of the drift (window of 3 days). Indicates the recent trend of drift                                                           |
| **drift\_velocity\_diff** | First derivative of velocity, i.e., daily change of the drift velocity. Can be positive or negative, indicating whether velocity is increasing or decreasing                                 |
| **drift\_acceleration**   | Second temporal derivative, i.e., change of the velocity change (acceleration). Detects sudden shifts in how the semantic focus changes (whether the change is accelerating or slowing down) |


In [None]:
import pandas as pd
import numpy as np
import ast
from sklearn.metrics.pairwise import cosine_similarity

df_embeddings['embedding_array'] = df_embeddings['embedding_original'].apply(ast.literal_eval).apply(np.array)

df_daily = df_embeddings.groupby(df_embeddings['Date'].dt.date)['embedding_array'].apply(lambda x: np.mean(np.stack(x), axis=0)).reset_index()
df_daily.rename(columns={'Date': 'date', 'embedding_array': 'embedding_mean'}, inplace=True)

def cosine_sim(v1, v2):
    return cosine_similarity(v1.reshape(1, -1), v2.reshape(1, -1))[0][0]

df_daily['cosine_sim'] = np.nan
for i in range(1, len(df_daily)):
    df_daily.loc[i, 'cosine_sim'] = cosine_sim(df_daily.loc[i, 'embedding_mean'], df_daily.loc[i-1, 'embedding_mean'])


df_daily['drift'] = 1 - df_daily['cosine_sim']

window_size = 3
df_daily['drift_velocity'] = df_daily['drift'].rolling(window=window_size).mean()

df_daily['drift_velocity_diff'] = df_daily['drift_velocity'].diff()
df_daily['drift_acceleration'] = df_daily['drift_velocity_diff'].diff()

print(df_daily[['date', 'cosine_sim', 'drift', 'drift_velocity', 'drift_acceleration']].head(10))


         date  cosine_sim     drift  drift_velocity  drift_acceleration
0  2011-06-18         NaN       NaN             NaN                 NaN
1  2011-06-19    0.402920  0.597080             NaN                 NaN
2  2011-06-20    0.457480  0.542520             NaN                 NaN
3  2011-06-21    0.599418  0.400582        0.513394                 NaN
4  2011-06-22    0.646195  0.353805        0.432302                 NaN
5  2011-06-23    0.618741  0.381259        0.378549            0.027338
6  2011-06-24    0.347412  0.652588        0.462551            0.137756
7  2011-06-25    0.278069  0.721931        0.585260            0.038707
8  2011-06-26    0.305124  0.694876        0.689798           -0.018170
9  2011-06-27    0.565291  0.434709        0.617172           -0.177165


In [27]:
df_daily

Unnamed: 0,date,embedding_mean,cosine_sim,drift,drift_velocity,drift_velocity_diff,drift_acceleration
0,2011-06-18,"[-0.035143853281624615, 0.060475101694464684, ...",,,,,
1,2011-06-19,"[-0.022825940512120724, 0.05453146621584892, 0...",0.402920,0.597080,,,
2,2011-06-20,"[-0.02129497081041336, 0.02005319930613041, -0...",0.457480,0.542520,,,
3,2011-06-21,"[-0.038333331048488614, 0.03941713785752654, -...",0.599418,0.400582,0.513394,,
4,2011-06-22,"[-0.03675077985972166, 0.00393333900719881, -0...",0.646195,0.353805,0.432302,-0.081092,
...,...,...,...,...,...,...,...
3572,2025-06-20,"[-0.03718245909031895, 0.011349399108439684, 0...",0.759766,0.240234,0.278692,0.029329,0.041500
3573,2025-06-22,"[0.012624431401491165, -0.0535525307059288, -0...",0.513037,0.486963,0.341294,0.062601,0.033272
3574,2025-06-23,"[-0.0006948095661672679, 0.0350219647992741, 0...",0.595818,0.404182,0.377126,0.035832,-0.026769
3575,2025-06-24,"[-0.04073533708772933, -0.02329262951388955, 0...",0.746879,0.253121,0.381422,0.004296,-0.031537


In [39]:
df_daily.dropna(inplace=True)

In [44]:
fig = go.Figure()

# Cosine similarity
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['cosine_sim'],
    mode='lines',
    name='Cosine Similarity',
    line=dict(color='green')
))

# Drift
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['drift'],
    mode='lines',
    name='Drift',
    line=dict(color='red')
))

# Drift Velocity
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['drift_velocity'],
    mode='lines',
    name='Drift Velocity',
    line=dict(color='orange')
))
# Drift Velocity dIFF
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['drift_velocity_diff'],
    mode='lines',
    name='Drift Velocity Diff',
    line=dict(color='lightblue')
))

# Drift Acceleration
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['drift_acceleration'],
    mode='lines',
    name='Drift Acceleration',
    line=dict(color='blue')
))

fig.update_layout(
    title='Temporal Dynamics of Semantic Change',
    xaxis_title='Date',
    yaxis_title='Value',
    template='plotly_white',
    width=1000,
    height=500,
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5)
)

fig.show()


In [18]:
df_daily['date'] = pd.to_datetime(df_daily['date'])
df_daily_var['date'] = pd.to_datetime(df_daily_var['date'])
df_daily_var = df_daily_var.dropna()
df_daily_mean['date'] = pd.to_datetime(df_daily_mean['date'])

df_merged = pd.merge(df_daily, df_daily_var, on='date', how='inner')
df_merged = pd.merge(df_merged, df_daily_mean, on='date', how='inner')


In [21]:
df_merged.to_csv('embeddings_feature_ST.csv',index=False)

In [None]:
sent = (
    pd.read_csv('../Data/raw/df_sentiment.csv', parse_dates=['Date'])
)

mapping = {'positive':  1,
           'neutral':   0,
           'negative': -1}

sent['score_num'] = sent['sentiment'].map(mapping)
