In [2]:
import ast
import json
import re

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

import nltk
from nltk.corpus import stopwords

from bertopic import BERTopic
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import umap


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("../Data/raw/df_oilnews.csv")

In [6]:
model_file = hf_hub_download(
    "fse/word2vec-google-news-300", filename="word2vec-google-news-300.model"
)
vectors_file = hf_hub_download(
    "fse/word2vec-google-news-300", filename="word2vec-google-news-300.model.vectors.npy"
)

w2v_model = KeyedVectors.load(model_file, mmap='r')
w2v_model.vectors = np.load(vectors_file)
w2v_model.fill_norms()

df['text'] = df['title'].astype(str) + " " + df['excerpt'].astype(str)
def simple_tokenize(text):
    return re.sub(r'[^\w\s]', '', text.lower()).split()
df['tokens'] = df['text'].apply(simple_tokenize)

vector_size = w2v_model.vector_size
def get_doc_embedding(tokens):
    vecs = [w2v_model[w] for w in tokens if w in w2v_model]
    return np.mean(vecs, axis=0) if vecs else np.zeros(vector_size)
embeddings = np.vstack(df['tokens'].map(get_doc_embedding).values)

embedding_2d = umap.UMAP(n_components=2, random_state=42).fit_transform(embeddings)
embedding_3d = umap.UMAP(n_components=3, random_state=42).fit_transform(embeddings)

df['embedding_2d_1'] = embedding_2d[:, 0]
df['embedding_2d_2'] = embedding_2d[:, 1]
df['embedding_3d_1'] = embedding_3d[:, 0]
df['embedding_3d_2'] = embedding_3d[:, 1]
df['embedding_3d_3'] = embedding_3d[:, 2]
df['embedding_original'] = [json.dumps(vec.tolist()) for vec in embeddings]

df.to_csv("news_embeddings_Word2Vec_HF_En.csv", index=False)


  warn(
  warn(


## Cluster Analysis

In [3]:
df_embeddings = pd.read_csv("../Data/raw/news_embeddings_Word2Vec_HF_En.csv")

In [4]:
from sklearn.cluster import KMeans
from sklearn.metrics import (silhouette_score,
                             calinski_harabasz_score,
                             davies_bouldin_score)

def evaluate_k(X, k):
    km = KMeans(n_clusters=k, random_state=42).fit(X)
    labels = km.labels_
    return {
        "inertia": km.inertia_,
        "silhouette": silhouette_score(X, labels),
        "ch": calinski_harabasz_score(X, labels),
        "db": davies_bouldin_score(X, labels)
    }
df_subset = df_embeddings
results = []
X = df_subset[['embedding_2d_1', 'embedding_2d_2']].values
for k in range(2, 25):
    scores = evaluate_k(X, k)
    scores["k"] = k
    results.append(scores)

df_scores = pd.DataFrame(results)
print(df_scores)


          inertia  silhouette            ch        db   k
0   131743.501313    0.369062  16656.476946  1.064499   2
1    83945.987460    0.394204  19736.969025  0.874986   3
2    68303.645858    0.350644  17958.125996  0.933648   4
3    52208.514526    0.358701  19424.266540  0.898384   5
4    43670.179004    0.371652  19492.567952  0.851559   6
5    36474.382207    0.374485  20217.316106  0.828762   7
6    32636.509936    0.375452  19759.563048  0.836791   8
7    27824.197217    0.395766  20785.032083  0.746169   9
8    24767.601302    0.398611  21076.499516  0.723848  10
9    22455.629619    0.395152  21161.510353  0.741947  11
10   20803.433528    0.392035  20933.867129  0.745416  12
11   18519.628927    0.386644  21795.904160  0.790218  13
12   16623.082617    0.393833  22618.673161  0.741243  14
13   15158.931185    0.397295  23192.133381  0.733000  15
14   13583.221112    0.405312  24337.182247  0.724113  16
15   12837.117495    0.399380  24225.761183  0.722537  17
16   12222.850

In [4]:
df_subset = df_embeddings

X = df_subset[['embedding_2d_1', 'embedding_2d_2']].values
k = 16  
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)
df_subset['cluster'] = clusters.astype(str)

big_palette = px.colors.qualitative.Safe + px.colors.qualitative.Bold + px.colors.qualitative.Light24
colors_for_clusters = big_palette[:k]

fig = px.scatter(
    df_subset,
    x='embedding_2d_1',
    y='embedding_2d_2',
    color='cluster',
    color_discrete_sequence=colors_for_clusters,
    hover_name='title',
    title=f'Cluster analysis con {k} cluster',
    width=900,
    height=700
)

fig.update_traces(marker=dict(size=8, opacity=0.7))
fig.show()


In [6]:
df_embeddings.to_csv('../Data/embeddings_CA_Word2vec.csv')


### Cluster Semantics

In [6]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

def descrivi_cluster(df, n_top_words=20):
    descrizioni = {}
    for cluster_id in sorted(df['cluster'].unique()):
        testi = df[df['cluster'] == cluster_id]['title'].values
        testo_unito = " ".join(testi)
        vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=1000)
        tfidf_matrix = vectorizer.fit_transform([testo_unito])
        feature_names = vectorizer.get_feature_names_out()
        scores = tfidf_matrix.toarray()[0]
        top_indices = scores.argsort()[-n_top_words:][::-1]
        top_words = [feature_names[i] for i in top_indices]
        descrizioni[cluster_id] = ", ".join(top_words)
    return descrizioni

descrizioni = descrivi_cluster(df_subset)
for cluster, parole in descrizioni.items():
    print(f"Cluster {cluster}: {parole}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saraborello/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cluster 0: oil, gas, production, prices, shale, energy, price, demand, industry, could, natural, set, opec, billion, china, sees, russia, output, low, record
Cluster 1: energy, power, solar, wind, coal, uk, renewable, china, new, emissions, billion, world, could, gas, renewables, carbon, oil, offshore, india, capacity
Cluster 10: oil, pipeline, gas, court, stream, new, nord, mountain, trans, energy, drilling, keystone, case, xl, biden, venezuela, could, trump, exxon, lease
Cluster 11: nuclear, power, plant, energy, plants, japan, iran, china, new, uk, coal, grid, russia, blackouts, reactor, reactors, tariffs, could, electricity, eu
Cluster 12: oil, crude, prices, build, api, draw, reports, inventory, inventories, surprise, gasoline, large, rise, fall, draws, small, higher, product, expected, rally
Cluster 13: oil, refinery, crude, exports, pipeline, production, strike, libya, iran, russian, largest, gulf, mexico, iraq, export, shut, could, hurricane, workers, shell
Cluster 14: energy, 

In [7]:
descrizioni_cluster = {}

for cluster_id in sorted(df_subset['cluster'].unique()):
    testi_cluster = df_subset[df_subset['cluster'] == cluster_id]['title'].tolist()
    
    topic_model = BERTopic(language="english")
    topics, probs = topic_model.fit_transform(testi_cluster)
    parole_chiave = topic_model.get_topic(0)
    
    descrizioni_cluster[cluster_id] = parole_chiave


for cluster_id, parole in descrizioni_cluster.items():
    print(f"Cluster {cluster_id}:")
    print(", ".join([word for word, _ in parole]))
    print("-" * 50)


Cluster 0:
shale, us, eia, bpd, boom, production, in, the, for, predicts
--------------------------------------------------
Cluster 1:
wind, offshore, turbine, farm, power, uk, turbines, farms, in, first
--------------------------------------------------
Cluster 10:
venezuela, citgo, pdvsa, venezuelas, maduro, venezuelan, for, us, crystallex, trump
--------------------------------------------------
Cluster 11:
blackouts, texas, grid, blackout, outages, power, california, wave, heatwave, demand
--------------------------------------------------
Cluster 12:
oil, crude, prices, inventories, build, draw, inventory, on, surprise, us
--------------------------------------------------
Cluster 13:
libyas, libya, libyan, ports, majeure, force, production, field, oilfield, oil
--------------------------------------------------
Cluster 14:
fracking, shale, ban, uk, earthquakes, in, permian, study, as, of
--------------------------------------------------
Cluster 15:
venezuela, venezuelan, sanctio

| Cluster | Topic                                                                                                    |
| ------- | -------------------------------------------------------------------------------------------------------- |
| 0       | Global oil & gas production and pricing trends (shale output, demand in China/Russia, industry records)  |
| 1       | Power generation and renewables transition (solar, wind, coal, emissions, capacity in UK/China/India)    |
| 2       | Oil infrastructure attacks and security incidents (tanker/pipeline strikes, Iran/Libya/ISIS/Houthis)     |
| 3       | Crude export metrics and trade flows (OPEC/Russia output, China/India imports, barrels-per-day records)  |
| 4       | Upstream LNG & offshore project deals (new field developments, Shell/Exxon exploration and production)   |
| 5       | Oil majors’ financial results (quarterly earnings, profit beats/misses, refining segment performance)    |
| 6       | Sanctions’ impact on oil & gas markets (Russia/Iran restrictions, EU LNG deals, pipeline shifts)         |
| 7       | Oil price dynamics and demand analysis (gasoline trends, supply/demand balance, OPEC signals)            |
| 8       | Electric vehicles and clean mobility (Tesla/EV sales, batteries, market growth in China/UK)              |
| 9       | Aramco asset transactions and investments (stakes, IPOs, Saudi fund deals, Shell/Exxon participations)   |
| 10      | Pipeline politics and legal battles (Nord Stream, Keystone XL, Trans Mountain, court rulings)            |
| 11      | Nuclear & power-plant developments (new reactors, grid resilience, Japan/Iran/UK energy tariffs)         |
| 12      | Crude inventory reports & price drivers (API builds/draws, surprise stock changes, rally expectations)   |
| 13      | Refinery operations and disruptions (exports, pipeline flows, strikes in Libya/Mexico, hurricane impact) |
| 14      | Energy policy & taxation debates (UK/EU climate bills, fracking, windfall taxes, natural gas levies)     |
| 15      | OPEC production decisions and cuts (Saudi/Russia output, India/Iran imports, export quotas)              |


In [20]:
df_subset['year'] = pd.to_datetime(df_subset['Date']).dt.year.astype(str)
X = df_subset[['embedding_2d_1', 'embedding_2d_2']].values

k = 16
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)

df_subset['cluster'] = clusters.astype(str)
big_palette = px.colors.qualitative.Safe + px.colors.qualitative.Bold + px.colors.qualitative.Light24
colors_for_clusters = big_palette[:k]

fig = px.scatter(
    df_subset,
    x='embedding_2d_1',
    y='embedding_2d_2',
    color='cluster',
    animation_frame='year',
    color_discrete_sequence=colors_for_clusters,
    hover_name='title',
    title=f'Cluster analysis con {k} cluster animata per anno',
    width=900,
    height=700
)

fig.update_traces(marker=dict(size=8, opacity=0.7))
fig.show()


In [26]:
df_subset['year'] = pd.to_datetime(df_subset['Date']).dt.year.astype(str)
df_subset['cluster'] = df_subset['cluster'].astype(str)

year_cluster_counts = df_subset.groupby(['year', 'cluster']).size().unstack(fill_value=0)
year_cluster_counts = year_cluster_counts[sorted(year_cluster_counts.columns, key=lambda x: int(x))]
year_cluster_counts['total'] = year_cluster_counts.sum(axis=1)
pd.set_option('display.max_columns', None)
year_cluster_counts


cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,total
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2011,12,57,13,8,70,3,44,2,1,26,10,81,1,13,46,11,398
2012,58,272,57,11,59,2,54,29,72,88,49,53,0,27,67,5,903
2013,80,228,49,5,115,7,56,23,107,81,71,56,1,25,93,11,1008
2014,60,151,16,15,38,3,54,23,59,16,45,36,0,24,73,15,628
2015,102,38,8,15,48,5,57,30,31,33,58,15,0,14,33,26,513
2016,159,41,167,91,180,47,79,74,49,189,113,25,42,89,65,95,1505
2017,165,53,120,173,271,42,122,80,45,221,143,30,57,145,75,188,1930
2018,141,69,87,149,205,42,146,58,113,143,172,23,55,131,93,166,1793
2019,118,94,146,130,210,25,145,47,106,154,126,30,56,115,105,163,1770
2020,231,118,74,137,135,53,103,112,101,129,100,22,56,92,93,135,1691


In [29]:
word2vec_cluster_theme = {
    0: "global_trade",
    1: "clean_energy_evs",
    2: "geopolitics_sanctions",
    3: "global_trade",
    4: "upstream_projects",
    5: "financial_results",
    6: "geopolitics_sanctions",
    7: "opec_production",
    8: "clean_energy_evs",
    9: "financial_results",
    10: "pipeline_politics",
    11: "nuclear",
    12: "inventories",
    13: "refinery_disruptions",
    14: "energy_policy",
    15: "opec_production"
}

semantic_colors = {
    "geopolitics_sanctions": "#D62728",       # Crimson Red
    "global_trade": "#1F77B4",                # Sky Blue
    "opec_production": "#E2B000",             # Goldenrod
    "financial_results": "#2CA02C",           # Forest Green
    "clean_energy_evs": "#17BECF",            # Teal
    "china_india_demand": "#8C564B",          # Burgundy (only for DistilBERT)
    "inventories": "#9467BD",                 # Violet
    "upstream_projects": "#FF7F0E",           # Orange
    "pipeline_politics": "#7F7F7F",           # Slate Gray
    "energy_policy": "#A65628",               # Brown
    "nuclear": "#393B79",                     # Indigo
    "refinery_disruptions": "#BCBD22"         # Mustard Yellow
}

cluster_palette = {
    str(cluster): semantic_colors.get(word2vec_cluster_theme[cluster], "#D3D3D3")
    for cluster in word2vec_cluster_theme
}

df_bar = year_cluster_counts.drop(columns='total').reset_index()
df_bar_long = df_bar.melt(id_vars='year', var_name='cluster', value_name='count')
df_bar_long['cluster'] = df_bar_long['cluster'].astype(str) 

fig = px.bar(
    df_bar_long,
    x="cluster",
    y="count",
    facet_col="year",
    facet_col_wrap=4,
    color="cluster",
    color_discrete_map=cluster_palette,
    category_orders={"cluster": sorted(df_bar_long['cluster'].unique(), key=int)},
    title="Word2Vec – Distribution of Semantic Topics by Year",
    width=1300,
    height=850
)

fig.update_layout(showlegend=True)
fig.show()



## Feature Creation

### Mean

In [30]:
df_embeddings['Date'] = pd.to_datetime(df_embeddings['Date'])

df_daily_mean = df_embeddings.groupby(df_embeddings['Date'].dt.date).agg({
    'embedding_2d_1': 'mean',
    'embedding_2d_2': 'mean',
    'embedding_3d_1': 'mean',
    'embedding_3d_2': 'mean',
    'embedding_3d_3': 'mean'
}).reset_index()

df_daily_mean.rename(columns={'Date': 'date'}, inplace=True)
df_daily_mean.rename(columns={
    col: f"{col}_mean" for col in df_daily_mean.columns if col not in ['date']
}, inplace=True)

df_daily_mean.head()


Unnamed: 0,date,embedding_2d_1_mean,embedding_2d_2_mean,embedding_3d_1_mean,embedding_3d_2_mean,embedding_3d_3_mean
0,2011-06-18,-0.374726,0.168819,1.308357,0.344546,3.693299
1,2011-06-19,-0.354561,0.54843,1.240495,0.852007,4.078292
2,2011-06-20,1.349736,-0.693626,1.392533,-0.107635,5.247386
3,2011-06-21,-1.341812,0.129342,1.286522,0.32761,3.710636
4,2011-06-22,0.400725,0.079165,0.873506,0.346109,4.371677


In [31]:
fig_2d = go.Figure()
fig_2d.add_trace(go.Scatter(x=df_daily_mean['date'], y=df_daily_mean['embedding_2d_1_mean'], mode='lines', name='embedding_2d_1'))
fig_2d.add_trace(go.Scatter(x=df_daily_mean['date'], y=df_daily_mean['embedding_2d_2_mean'], mode='lines', name='embedding_2d_2'))

fig_2d.update_layout(
    title='Average Daily Embeddings 2D',
    xaxis_title='Date',
    yaxis_title='Embedding Value',
    width=900,
    height=500
)
fig_2d.show()

### Variance

In [32]:
df_embeddings['Date'] = pd.to_datetime(df_embeddings['Date'])
df_daily_var = df_embeddings.groupby(df_embeddings['Date'].dt.date).agg({
    'embedding_2d_1': 'var',
    'embedding_2d_2': 'var',
    'embedding_3d_1': 'var',
    'embedding_3d_2': 'var',
    'embedding_3d_3': 'var'
}).reset_index()

df_daily_var.rename(columns={'Date': 'date'}, inplace=True)
df_daily_var.rename(columns={
    col: f"{col}_var_daily" for col in df_daily_var.columns if col not in ['date']
}, inplace=True)
df_daily_var['semantic_dispersion_2d'] = df_daily_var[['embedding_2d_1_var_daily', 'embedding_2d_2_var_daily']].mean(axis=1, skipna=True)
df_daily_var['semantic_dispersion_3d'] = df_daily_var[['embedding_3d_1_var_daily', 'embedding_3d_2_var_daily', 'embedding_3d_3_var_daily']].mean(axis=1, skipna=True)
df_daily_var.head()


Unnamed: 0,date,embedding_2d_1_var_daily,embedding_2d_2_var_daily,embedding_3d_1_var_daily,embedding_3d_2_var_daily,embedding_3d_3_var_daily,semantic_dispersion_2d,semantic_dispersion_3d
0,2011-06-18,9.006795,0.055639,0.079326,0.04215,4.261042,4.531217,1.46084
1,2011-06-19,9.132315,12.17186,0.348385,6.494428,4.306194,10.652088,3.716335
2,2011-06-20,8.230696,1.220364,1.355301,0.877787,3.047017,4.72553,1.760035
3,2011-06-21,5.131634,0.972484,1.489803,0.814485,2.598364,3.052059,1.634217
4,2011-06-22,3.308153,2.559762,0.528086,1.355651,1.93024,2.933957,1.271326


In [34]:
fig_2d = px.line(df_daily_var, 
                 x='date', 
                 y=['embedding_2d_1_var_daily', 'embedding_2d_2_var_daily'],
                 title='Daily Variance of 2D Embedding Components',
                 labels={'value': 'Variance', 'date': 'Date', 'variable': 'Component'})
fig_2d.update_layout(template='plotly_white', width=900, height=400)
fig_2d.update_yaxes(range=[0, 90])
fig_2d.show()

### Cosine similarity

In [36]:
df_embeddings['embedding_array'] = df_embeddings['embedding_original'].apply(ast.literal_eval).apply(np.array)

df_daily = df_embeddings.groupby(df_embeddings['Date'].dt.date)['embedding_array'].apply(lambda x: np.mean(np.stack(x), axis=0)).reset_index()
df_daily.rename(columns={'Date': 'date', 'embedding_array': 'embedding_mean'}, inplace=True)

def cosine_sim(v1, v2):
    return cosine_similarity(v1.reshape(1, -1), v2.reshape(1, -1))[0][0]

df_daily['cosine_sim'] = np.nan
for i in range(1, len(df_daily)):
    df_daily.loc[i, 'cosine_sim'] = cosine_sim(df_daily.loc[i, 'embedding_mean'], df_daily.loc[i-1, 'embedding_mean'])


df_daily['drift'] = 1 - df_daily['cosine_sim']

window_size = 3 
df_daily['drift_velocity'] = df_daily['drift'].rolling(window=window_size).mean()

df_daily['drift_velocity_diff'] = df_daily['drift_velocity'].diff()
df_daily['drift_acceleration'] = df_daily['drift_velocity_diff'].diff()

print(df_daily[['date', 'cosine_sim', 'drift', 'drift_velocity', 'drift_acceleration']].head(10))


         date  cosine_sim     drift  drift_velocity  drift_acceleration
0  2011-06-18         NaN       NaN             NaN                 NaN
1  2011-06-19    0.800558  0.199442             NaN                 NaN
2  2011-06-20    0.833325  0.166675             NaN                 NaN
3  2011-06-21    0.871855  0.128145        0.164754                 NaN
4  2011-06-22    0.918565  0.081435        0.125419                 NaN
5  2011-06-23    0.904230  0.095770        0.101783            0.015700
6  2011-06-24    0.735997  0.264003        0.147070            0.068921
7  2011-06-25    0.706760  0.293240        0.217671            0.025315
8  2011-06-26    0.785924  0.214076        0.257106           -0.031166
9  2011-06-27    0.885927  0.114073        0.207129           -0.089412


In [37]:
df_daily.dropna(inplace=True)

In [39]:
import plotly.graph_objects as go

fig = go.Figure()

# Cosine similarity
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['cosine_sim'],
    mode='lines',
    name='Cosine Similarity',
    line=dict(color='green')
))

# Drift
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['drift'],
    mode='lines',
    name='Drift',
    line=dict(color='red')
))

# Drift Velocity
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['drift_velocity'],
    mode='lines',
    name='Drift Velocity',
    line=dict(color='orange')
))
# Drift Velocity dIFF
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['drift_velocity_diff'],
    mode='lines',
    name='Drift Velocity Diff',
    line=dict(color='lightblue')
))

# Drift Acceleration
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['drift_acceleration'],
    mode='lines',
    name='Drift Acceleration',
    line=dict(color='blue')
))

fig.update_layout(
    title='Temporal Dynamics of Semantic Change',
    xaxis_title='Date',
    yaxis_title='Value',
    template='plotly_white',
    width=1000,
    height=500,
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5)
)

fig.show()


In [20]:
df_daily['date'] = pd.to_datetime(df_daily['date'])
df_daily_var['date'] = pd.to_datetime(df_daily_var['date'])
df_daily_var = df_daily_var.dropna()
df_daily_mean['date'] = pd.to_datetime(df_daily_mean['date'])

df_merged = pd.merge(df_daily, df_daily_var, on='date', how='inner')
df_merged = pd.merge(df_merged, df_daily_mean, on='date', how='inner')

In [21]:
df_merged.to_csv('embeddings_feature_news_embeddings_Word2Vec_HF_En.csv',index=False)