In [2]:
import ast
import json

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import torch

import umap

from bertopic import BERTopic
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("../Data/raw/df_oilnews.csv")
df

Unnamed: 0,title,Date,excerpt
0,April Price Crash Dragged Saudi Arabia’s Oil R...,2025-06-25,Saudi Arabia’s revenues from oil exports crash...
1,Giant Leviathan Gas Field Offshore Israel Resu...,2025-06-25,The massive Leviathan gas field offshore Israe...
2,China and India Cut Imports of Lower-Quality C...,2025-06-25,The world’s biggest and second-biggest coal im...
3,Iran-Israel War Prompts China to Reconsider Ru...,2025-06-25,The war between Israel and Iran has spark worr...
4,EU Set to Change Subsidy Rules for Energy Costs,2025-06-25,National governments in the EU would soon be a...
...,...,...,...
23415,Australia's Desalinization Plant Workers in In...,2011-06-20,Victoria state’s troubled Wonthaggi desalinati...
23416,Chinese Energy Workers in Somalia Threatened,2011-06-19,The Ogaden National Liberation Front has warne...
23417,Argentina Now Receiving 40 Percent of Chinese ...,2011-06-19,In Argentina Mandarin Chinese is now the main ...
23418,Chinese Dam and Pipeline Projects Raise Burmes...,2011-06-18,Lucrative China-backed hydropower projects are...


# Distilbert-base-uncased

## Embeddings creation

In [3]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model     = AutoModel.from_pretrained(model_name)

def embed_texts(texts, batch_size=16):
    embeddings = []
    model.eval()
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i : i + batch_size]
            enc = tokenizer(batch, padding=True, truncation=True,
                            return_tensors="pt", max_length=128)
            out = model(**enc)  # out.last_hidden_state shape: [B, L, D]
            mask = enc["attention_mask"].unsqueeze(-1)  # [B, L, 1]
            summed = (out.last_hidden_state * mask).sum(1)      # [B, D]
            counts = mask.sum(1)                               # [B, 1]
            mean_pooled = summed / counts                      # [B, D]
            embeddings.append(mean_pooled.cpu())
    return torch.cat(embeddings).numpy()

# Usage
texts = df['title'].tolist()
embeddings = embed_texts(texts)
df['embeddings'] = embeddings.tolist()



In [4]:
X = np.vstack(df['embeddings'].values)  

# Proiezione 2D
reducer_2d = umap.UMAP(n_components=2, random_state=42)
emb2d = reducer_2d.fit_transform(X)
df['umap2d_1'], df['umap2d_2'] = emb2d[:,0], emb2d[:,1]

# Proiezione 3D
reducer_3d = umap.UMAP(n_components=3, random_state=42)
emb3d = reducer_3d.fit_transform(X)
df['umap3d_1'], df['umap3d_2'], df['umap3d_3'] = emb3d[:,0], emb3d[:,1], emb3d[:,2]


  warn(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  warn(


In [6]:
df.to_csv("../Data/raw/embeddings_distilbert-base-uncased.csv", index = False)

In [16]:
df = df.rename(columns={
    'umap2d_1': 'embedding_2d_1',
    'umap2d_2': 'embedding_2d_2',
    'umap3d_1': 'embedding_3d_1',
    'umap3d_2': 'embedding_3d_2',
    'umap3d_3': 'embedding_3d_3'
})

In [1]:
df.to_csv("../Data/raw/embeddings_distilbert-base-uncased.csv", index = False)

NameError: name 'df' is not defined

In [3]:
df_embeddings = pd.read_csv('../Data/raw/embeddings_distilbert-base-uncased.csv')

## Cluster Analysis 2D

In [4]:
from sklearn.cluster import KMeans
from sklearn.metrics import (silhouette_score,
                             calinski_harabasz_score,
                             davies_bouldin_score)

def evaluate_k(X, k):
    km = KMeans(n_clusters=k, random_state=42).fit(X)
    labels = km.labels_
    return {
        "inertia": km.inertia_,
        "silhouette": silhouette_score(X, labels),
        "ch": calinski_harabasz_score(X, labels),
        "db": davies_bouldin_score(X, labels)
    }
df_subset = df_embeddings
results = []
X = df_subset[['embedding_2d_1', 'embedding_2d_2']].values
for k in range(2, 25):
    scores = evaluate_k(X, k)
    scores["k"] = k
    results.append(scores)

df_scores = pd.DataFrame(results)
print(df_scores)


          inertia  silhouette            ch        db   k
0   138164.293237    0.363133  15748.324473  1.109933   2
1    84819.593661    0.409944  20189.900270  0.857854   3
2    59926.859044    0.426533  22292.075012  0.766328   4
3    52390.225747    0.365590  19965.799980  0.932018   5
4    37861.347508    0.410633  23898.206963  0.788740   6
5    31040.999717    0.405922  25148.526739  0.796135   7
6    26502.966701    0.407774  25816.913447  0.767597   8
7    23184.506054    0.405486  26241.077591  0.755167   9
8    20085.413198    0.410688  27324.194466  0.747203  10
9    18472.174567    0.406198  26942.947418  0.764405  11
10   17158.584872    0.396298  26530.434264  0.796816  12
11   16494.980682    0.383503  25376.797500  0.809739  13
12   14851.993512    0.392893  26213.514048  0.822290  14
13   14419.627700    0.386295  25120.464945  0.828814  15
14   12957.950432    0.395096  26264.675219  0.773005  16
15   12303.386471    0.391642  26010.274329  0.792204  17
16   11445.597

In [5]:
df_subset = df_embeddings
X = df_subset[['embedding_2d_1', 'embedding_2d_2']].values

k = 18
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)
df_subset['cluster'] = clusters.astype(str)

big_palette  = px.colors.qualitative.Safe \
             + px.colors.qualitative.Bold \
             + px.colors.qualitative.Light24
colors_for_clusters = big_palette[:k]

fig = px.scatter(
    df_subset,
    x='embedding_2d_1',
    y='embedding_2d_2',
    color='cluster',
    color_discrete_sequence=colors_for_clusters,
    hover_name='title',
    title=f'Cluster analysis con {k} cluster',
    width=900, height=700
)
fig.update_traces(marker=dict(size=8, opacity=0.7))
fig.show()

### Cluster Semantics

In [10]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

def descrivi_cluster(df, n_top_words=20):
    descrizioni = {}
    for cluster_id in sorted(df['cluster'].unique()):
        testi = df[df['cluster'] == cluster_id]['title'].values
        testo_unito = " ".join(testi)
        vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=1000)
        tfidf_matrix = vectorizer.fit_transform([testo_unito])
        feature_names = vectorizer.get_feature_names_out()
        scores = tfidf_matrix.toarray()[0]
        top_indices = scores.argsort()[-n_top_words:][::-1]
        top_words = [feature_names[i] for i in top_indices]
        descrizioni[cluster_id] = ", ".join(top_words)
    return descrizioni

descrizioni = descrivi_cluster(df_subset)
for cluster, parole in descrizioni.items():
    print(f"Cluster {cluster}: {parole}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saraborello/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cluster 0: oil, russia, iran, gas, russian, ukraine, deal, eu, china, sanctions, energy, nuclear, exports, pipeline, new, turkey, south, iraq, sea, opec
Cluster 1: oil, energy, gas, world, prices, europe, could, climate, big, uk, new, coal, natural, shell, opec, canada, industry, shale, lng, largest
Cluster 10: oil, gas, gazprom, russia, offshore, field, rosneft, natural, deal, pipeline, russian, new, production, china, lng, project, exxon, exploration, major, giant
Cluster 11: oil, bpd, 000, million, production, output, barrels, court, opec, gulf, hurricane, mexico, nigeria, billion, libya, crude, iran, exports, pipeline, iraq
Cluster 12: oil, opec, prices, demand, production, gas, cuts, price, energy, global, sees, iea, output, year, forecast, growth, goldman, natural, could, sachs
Cluster 13: tesla, new, solar, energy, ev, wind, power, world, battery, could, car, musk, electric, model, first, china, sales, aramco, fuel, cars
Cluster 14: oil, china, exports, crude, imports, russia, p

In [11]:
descrizioni_cluster = {}

for cluster_id in sorted(df_subset['cluster'].unique()):
    testi_cluster = df_subset[df_subset['cluster'] == cluster_id]['title'].tolist()
    
    topic_model = BERTopic(language="english")
    topics, probs = topic_model.fit_transform(testi_cluster)
    parole_chiave = topic_model.get_topic(0)
    
    descrizioni_cluster[cluster_id] = parole_chiave


for cluster_id, parole in descrizioni_cluster.items():
    print(f"Cluster {cluster_id}:")
    print(", ".join([word for word, _ in parole]))
    print("-" * 50)

Cluster 0:
iran, oil, iranian, to, sanctions, india, us, iraq, exports, tanker
--------------------------------------------------
Cluster 1:
shale, fracking, uk, permian, boom, toxic, cuadrilla, dispersants, again, chemical
--------------------------------------------------
Cluster 10:
gazprom, gas, to, gazproms, europe, ukraine, stream, for, in, of
--------------------------------------------------
Cluster 11:
court, case, in, against, over, shell, us, for, supreme, uk
--------------------------------------------------
Cluster 12:
opec, production, oil, cuts, output, to, in, meeting, cut, compliance
--------------------------------------------------
Cluster 13:
tesla, in, ev, to, battery, model, car, electric, the, sales
--------------------------------------------------
Cluster 14:
russias, russian, russia, exports, oil, by, revenues, in, to, gas
--------------------------------------------------
Cluster 15:
drilling, lease, sale, alaska, gulf, offshore, mexico, biden, administration


| Cluster | Topic                                                                                                                     |
| ------- | ------------------------------------------------------------------------------------------------------------------------- |
| 0       | Geopolitical energy dynamics and sanctions (focus on Russia, Iran, Ukraine, OPEC, pipelines, nuclear energy)              |
| 1       | Global oil and energy markets (price movements, climate considerations, major producers like Shell and Canada, LNG/shale) |
| 2       | Large-scale energy investments and deals (billions in oil/gas projects, funds, Aramco plans, India and Shell)             |
| 3       | Oil and gas market fluctuations under sanctions (Iran, Venezuela, Russia) and export/production shifts                    |
| 4       | Inventory reports and crude price drivers (API builds/draws, gasoline stocks, weekly supply surprises)                    |
| 5       | China and India’s energy demand (coal, nuclear, LNG), global power mix and pricing                                        |
| 6       | European energy transition (UK/EU renewables like wind/solar, emissions, carbon, coal-to-gas shifts)                      |
| 7       | OPEC production decisions (Saudi cuts, Russian output, Iran/Venezuela export policies, ministerial actions)               |
| 8       | Q-series earnings and profit reports in oil majors (Shell, Exxon, record profits vs. estimates)                           |
| 9       | Attacks on oil infrastructure (Libya tankers, pipelines, Houthi/ISIS threats, Saudi/Nigerian fields)                      |
| 10      | Major upstream projects and partnerships (Gazprom, Rosneft, Exxon offshore fields, LNG pipelines)                         |
| 11      | Production/output metrics and OPEC court rulings (barrels per day, “000” figures, Gulf storms, Nigeria/Libya)             |
| 12      | OPEC price forecasts and demand outlooks (IEA, Goldman Sachs, growth projections, cuts)                                   |
| 13      | Clean energy and EV revolution (Tesla solar, batteries, wind power, Model 3/EV sales, Musk vs. Aramco)                    |
| 14      | China’s crude trade flows (exports/imports, records, India, Russia, Saudi pricing trends)                                 |
| 15      | North American pipeline politics (Keystone XL, Trump/Biden energy policies, Canadian courts, Alberta drilling)            |
| 16      | Global crude trade and import dependencies (Russia, China, India, Iran, Venezuela refiners)                               |
| 17      | Integrated oil-major strategies (Aramco, Exxon, Shell, Petrobras asset deals, refinery stakes, offshore projects)         |


In [8]:
# news over years
df_subset['year'] = pd.to_datetime(df_subset['Date']).dt.year.astype(str)
X = df_subset[['embedding_2d_1', 'embedding_2d_2']].values

k = 18
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)

df_subset['cluster'] = clusters.astype(str)
big_palette = px.colors.qualitative.Safe + px.colors.qualitative.Bold + px.colors.qualitative.Light24
colors_for_clusters = big_palette[:k]

fig = px.scatter(
    df_subset,
    x='embedding_2d_1',
    y='embedding_2d_2',
    color='cluster',
    animation_frame='year',
    color_discrete_sequence=colors_for_clusters,
    hover_name='title',
    title=f'Cluster analysis con {k} cluster animata per anno',
    width=900,
    height=700
)

fig.update_traces(marker=dict(size=8, opacity=0.7))
fig.show()


In [13]:
df_subset['year'] = pd.to_datetime(df_subset['Date']).dt.year.astype(str)
df_subset['cluster'] = df_subset['cluster'].astype(str)

year_cluster_counts = df_subset.groupby(['year', 'cluster']).size().unstack(fill_value=0)
year_cluster_counts = year_cluster_counts[sorted(year_cluster_counts.columns, key=lambda x: int(x))]
year_cluster_counts['total'] = year_cluster_counts.sum(axis=1)
pd.set_option('display.max_columns', None)
year_cluster_counts


cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,total
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2011,63,7,22,8,2,42,17,26,1,15,49,5,0,17,14,7,65,38,398
2012,49,95,88,12,29,53,109,42,21,50,41,23,29,137,16,31,31,47,903
2013,37,88,95,10,12,74,120,31,13,40,51,35,34,166,20,52,47,83,1008
2014,39,54,17,54,16,33,46,23,25,15,25,13,33,120,10,52,25,28,628
2015,56,71,17,26,21,23,28,34,36,9,26,8,40,29,7,26,34,22,513
2016,68,81,130,83,67,33,40,99,95,196,85,87,74,44,61,67,72,123,1505
2017,113,74,142,133,83,68,40,146,74,155,130,114,75,66,82,91,143,201,1930
2018,106,74,94,175,74,73,47,126,61,112,83,65,60,134,102,131,114,162,1793
2019,98,94,93,165,67,79,76,117,54,128,74,51,63,107,88,120,147,149,1770
2020,65,113,93,197,98,92,80,144,72,83,53,62,123,80,79,85,84,88,1691


In [17]:
# === 1. Mapping: DistilBERT cluster → semantic topic ===
distilbert_cluster_theme = {
    0: "geopolitics_sanctions",
    3: "geopolitics_sanctions",
    9: "geopolitics_sanctions",
    16: "global_trade",
    7: "opec_production",
    11: "opec_production",
    12: "opec_production",
    8: "financial_results",
    17: "financial_results",
    6: "clean_energy_evs",
    13: "clean_energy_evs",
    5: "china_india_demand",
    14: "china_india_demand",
    4: "inventories",
    10: "upstream_projects",
    15: "pipeline_politics"
}

# === 2. Semantic topic → color ===
semantic_colors = {
    "geopolitics_sanctions": "#D62728",       # Crimson Red
    "global_trade": "#1F77B4",                # Sky Blue
    "opec_production": "#E2B000",             # Goldenrod
    "financial_results": "#2CA02C",           # Forest Green
    "clean_energy_evs": "#17BECF",            # Teal
    "china_india_demand": "#8C564B",          # Burgundy
    "inventories": "#9467BD",                 # Violet
    "upstream_projects": "#FF7F0E",           # Orange
    "pipeline_politics": "#7F7F7F"            # Slate Gray
}

# === 3. Cluster → color mapping (default gray if unmapped) ===
distilbert_cluster_palette = {
    str(cluster): semantic_colors.get(distilbert_cluster_theme.get(cluster, ""), "#D3D3D3")
    for cluster in range(18)
}

# === 4. Prepare the data ===
df_bar = year_cluster_counts.drop(columns='total').reset_index()
df_bar_long = df_bar.melt(id_vars='year', var_name='cluster', value_name='count')
df_bar_long['cluster'] = df_bar_long['cluster'].astype(str)

# === 5. Plotly bar chart (no legend) ===
fig = px.bar(
    df_bar_long,
    x="cluster",
    y="count",
    facet_col="year",
    facet_col_wrap=4,
    color="cluster",
    color_discrete_map=distilbert_cluster_palette,
    category_orders={"cluster": sorted(df_bar_long['cluster'].unique(), key=int)},
    title="DistilBERT – Distribution of Semantic Topics by Year",
    width=1300,
    height=850
)

# === 6. Remove legend ===
fig.update_layout(showlegend=True)

fig.show()


In [7]:
df_embeddings.to_csv('../Data/embeddings_CA_dBert.csv')

## Cluster Analysis 3D

In [16]:
from sklearn.cluster import KMeans
from sklearn.metrics import (silhouette_score,
                             calinski_harabasz_score,
                             davies_bouldin_score)

def evaluate_k(X, k):
    km = KMeans(n_clusters=k, random_state=42).fit(X)
    labels = km.labels_
    return {
        "inertia": km.inertia_,
        "silhouette": silhouette_score(X, labels),
        "ch": calinski_harabasz_score(X, labels),
        "db": davies_bouldin_score(X, labels)
    }
df_subset = df_embeddings
results = []
X = df_subset[['embedding_3d_1', 'embedding_3d_2','embedding_3d_3']].values
for k in range(2, 25):
    scores = evaluate_k(X, k)
    scores["k"] = k
    results.append(scores)

df_scores = pd.DataFrame(results)
print(df_scores)


          inertia  silhouette            ch        db   k
0   106444.655504    0.306057  12114.905904  1.298052   2
1    72856.864171    0.345834  14247.354993  1.066067   3
2    52654.674960    0.360653  16136.710874  0.920532   4
3    42116.387363    0.357619  16594.939384  0.968540   5
4    38450.703241    0.325330  14987.365271  1.075875   6
5    32279.200548    0.340148  15622.690318  1.004733   7
6    29081.764180    0.336674  15230.252456  1.000378   8
7    25886.485147    0.338699  15332.023000  1.011169   9
8    23812.198494    0.342466  15041.693292  0.966351  10
9    21915.728950    0.349265  14910.956099  0.955359  11
10   20500.997501    0.345606  14637.005533  0.960652  12
11   18998.917218    0.353393  14631.707493  0.955068  13
12   18291.060133    0.336770  14097.957720  1.022283  14
13   16799.969344    0.348083  14400.498178  0.958761  15
14   16223.055074    0.342215  13973.282110  0.985727  16
15   15515.280499    0.342515  13763.693043  0.971903  17
16   14592.341

In [18]:
# 3D
df_subset = df_embeddings
X = df_subset[['embedding_3d_1', 'embedding_3d_2', 'embedding_3d_3']].values

k = 18  
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)

df_subset['cluster'] = clusters.astype(str)

big_palette = px.colors.qualitative.Safe + px.colors.qualitative.Bold + px.colors.qualitative.Light24
colors_for_clusters = big_palette[:k] 

fig = px.scatter_3d(
    df_subset,
    x='embedding_3d_1',
    y='embedding_3d_2',
    z='embedding_3d_3',
    color='cluster',
    color_discrete_sequence=colors_for_clusters,
    hover_name='title',
    title=f'Cluster analysis 3D (k={k})',
    width=900,
    height=700
)

fig.update_traces(marker=dict(size=5, opacity=0.7))
fig.show()


## Feature Creation

In [22]:
df_embeddings = pd.read_csv('../Data/raw/embeddings_distilbert-base-uncased.csv')


### Mean

In [18]:
df_embeddings['Date'] = pd.to_datetime(df_embeddings['Date'])

df_daily_mean = df_embeddings.groupby(df_embeddings['Date'].dt.date).agg({
    'embedding_2d_1': 'mean',
    'embedding_2d_2': 'mean',
    'embedding_3d_1': 'mean',
    'embedding_3d_2': 'mean',
    'embedding_3d_3': 'mean'
}).reset_index()

df_daily_mean.rename(columns={'Date': 'date'}, inplace=True)
df_daily_mean.rename(columns={
    col: f"{col}_mean" for col in df_daily_mean.columns if col not in ['date']
}, inplace=True)

df_daily_mean.head()


Unnamed: 0,date,embedding_2d_1_mean,embedding_2d_2_mean,embedding_3d_1_mean,embedding_3d_2_mean,embedding_3d_3_mean
0,2011-06-18,10.839672,5.758209,9.116658,6.028089,4.292948
1,2011-06-19,8.053342,5.083354,9.623685,5.302163,6.358195
2,2011-06-20,10.168943,4.220067,9.596725,4.739921,5.252258
3,2011-06-21,9.852678,5.405722,9.004225,5.665517,5.114022
4,2011-06-22,9.432877,4.629726,8.864992,4.703284,4.397264


In [19]:
fig_2d = go.Figure()
fig_2d.add_trace(go.Scatter(x=df_daily_mean['date'], y=df_daily_mean['embedding_2d_1_mean'], mode='lines', name='embedding_2d_1'))
fig_2d.add_trace(go.Scatter(x=df_daily_mean['date'], y=df_daily_mean['embedding_2d_2_mean'], mode='lines', name='embedding_2d_2'))

fig_2d.update_layout(
    title='Average Daily Embeddings 2D',
    xaxis_title='Date',
    yaxis_title='Embedding Value',
    width=900,
    height=500
)
fig_2d.show()

### Variance

In [20]:
df_embeddings['Date'] = pd.to_datetime(df_embeddings['Date'])
df_daily_var = df_embeddings.groupby(df_embeddings['Date'].dt.date).agg({
    'embedding_2d_1': 'var',
    'embedding_2d_2': 'var',
    'embedding_3d_1': 'var',
    'embedding_3d_2': 'var',
    'embedding_3d_3': 'var'
}).reset_index()

df_daily_var.rename(columns={'Date': 'date'}, inplace=True)
df_daily_var.rename(columns={
    col: f"{col}_var_daily" for col in df_daily_var.columns if col not in ['date']
}, inplace=True)
df_daily_var['semantic_dispersion_2d'] = df_daily_var[['embedding_2d_1_var_daily', 'embedding_2d_2_var_daily']].mean(axis=1, skipna=True)
df_daily_var['semantic_dispersion_3d'] = df_daily_var[['embedding_3d_1_var_daily', 'embedding_3d_2_var_daily', 'embedding_3d_3_var_daily']].mean(axis=1, skipna=True)
df_daily_var.head()


Unnamed: 0,date,embedding_2d_1_var_daily,embedding_2d_2_var_daily,embedding_3d_1_var_daily,embedding_3d_2_var_daily,embedding_3d_3_var_daily,semantic_dispersion_2d,semantic_dispersion_3d
0,2011-06-18,1.002504,0.545411,0.018353,0.243087,1.758421,0.773957,0.673287
1,2011-06-19,0.027666,3.631286,0.022455,2.477514,0.649123,1.829476,1.049697
2,2011-06-20,2.4605,0.904674,2.237977,0.326985,0.504592,1.682587,1.023184
3,2011-06-21,0.655416,1.826174,0.401945,0.592122,0.621849,1.240795,0.538639
4,2011-06-22,3.990852,2.880831,1.01644,2.608578,1.228204,3.435841,1.617741


In [24]:
fig_2d = px.line(
    df_daily_var, 
    x='date', 
    y=['embedding_2d_1_var_daily', 'embedding_2d_2_var_daily'],
    title='Daily Variance of 2D Embedding Components',
    labels={'value': 'Variance', 'date': 'Date', 'variable': 'Component'}
)

fig_2d.update_layout(template='plotly_white', width=900, height=400)
fig_2d.update_yaxes(range=[0, 90])
fig_2d.show()


### Cosine, Drift, Drift Velocity

In [25]:
df_embeddings['embedding_array'] = df_embeddings['embeddings'].apply(ast.literal_eval).apply(np.array)

df_daily = df_embeddings.groupby(df_embeddings['Date'].dt.date)['embedding_array'].apply(lambda x: np.mean(np.stack(x), axis=0)).reset_index()
df_daily.rename(columns={'Date': 'date', 'embedding_array': 'embedding_mean'}, inplace=True)

def cosine_sim(v1, v2):
    return cosine_similarity(v1.reshape(1, -1), v2.reshape(1, -1))[0][0]

df_daily['cosine_sim'] = np.nan
for i in range(1, len(df_daily)):
    df_daily.loc[i, 'cosine_sim'] = cosine_sim(df_daily.loc[i, 'embedding_mean'], df_daily.loc[i-1, 'embedding_mean'])


df_daily['drift'] = 1 - df_daily['cosine_sim']

window_size = 3 
df_daily['drift_velocity'] = df_daily['drift'].rolling(window=window_size).mean()

df_daily['drift_velocity_diff'] = df_daily['drift_velocity'].diff()
df_daily['drift_acceleration'] = df_daily['drift_velocity_diff'].diff()

print(df_daily[['date', 'cosine_sim', 'drift', 'drift_velocity', 'drift_acceleration']].head(10))


         date  cosine_sim     drift  drift_velocity  drift_acceleration
0  2011-06-18         NaN       NaN             NaN                 NaN
1  2011-06-19    0.881471  0.118529             NaN                 NaN
2  2011-06-20    0.922092  0.077908             NaN                 NaN
3  2011-06-21    0.946231  0.053769        0.083402                 NaN
4  2011-06-22    0.955424  0.044576        0.058751                 NaN
5  2011-06-23    0.956513  0.043487        0.047277            0.013177
6  2011-06-24    0.861742  0.138258        0.075441            0.039637
7  2011-06-25    0.786505  0.213495        0.131747            0.028143
8  2011-06-26    0.829726  0.170274        0.174009           -0.014044
9  2011-06-27    0.926989  0.073011        0.152260           -0.064012


In [27]:
df_daily.dropna(inplace=True)

In [28]:
import plotly.graph_objects as go

fig = go.Figure()

# Cosine similarity
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['cosine_sim'],
    mode='lines',
    name='Cosine Similarity',
    line=dict(color='green')
))

# Drift
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['drift'],
    mode='lines',
    name='Drift',
    line=dict(color='red')
))

# Drift Velocity
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['drift_velocity'],
    mode='lines',
    name='Drift Velocity',
    line=dict(color='orange')
))
# Drift Velocity dIFF
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['drift_velocity_diff'],
    mode='lines',
    name='Drift Velocity Diff',
    line=dict(color='lightblue')
))

# Drift Acceleration
fig.add_trace(go.Scatter(
    x=df_daily['date'],
    y=df_daily['drift_acceleration'],
    mode='lines',
    name='Drift Acceleration',
    line=dict(color='blue')
))

fig.update_layout(
    title='Temporal Dynamics of Semantic Change',
    xaxis_title='Date',
    yaxis_title='Value',
    template='plotly_white',
    width=1000,
    height=500,
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5)
)

fig.show()


In [31]:
df_daily['date'] = pd.to_datetime(df_daily['date'])
df_daily_var['date'] = pd.to_datetime(df_daily_var['date'])
df_daily_var = df_daily_var.dropna()
df_daily_mean['date'] = pd.to_datetime(df_daily_mean['date'])

df_merged = pd.merge(df_daily, df_daily_var, on='date', how='inner')
df_merged = pd.merge(df_merged, df_daily_mean, on='date', how='inner')


In [32]:
df_merged.to_csv('embeddings_feature_distilbert-base-uncased.csv',index=False)