In [2]:
import ast
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import plotly.express as px
import plotly.graph_objects as go

from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
import torch

In [3]:
# Load the article table from CSV; later cells read its 'translated_title' column.
only_muslim_data = pd.read_csv(
    filepath_or_buffer="path/to/data-files/only_muslim_df.csv",
)

In [4]:
# Run on the GPU when PyTorch reports that CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


## Generate embeddings using MPNet model

In [5]:
# Load the pretrained all-mpnet-base-v2 sentence encoder, then move it onto `device`.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Titles to embed. Missing titles become empty strings so that every row still
# gets an embedding and the result stays aligned with the DataFrame's rows.
titles = only_muslim_data['translated_title'].fillna("").tolist()

batch_size = 32

# Let SentenceTransformer.encode handle batching and progress reporting instead
# of slicing the list by hand; embeddings are returned in input order.
# list(...) keeps `embeddings` a list of 1-D arrays (one per title), as before.
embeddings = list(
    model.encode(
        titles,
        batch_size=batch_size,
        show_progress_bar=True,
        device=device,
    )
)

# Store each vector as a plain Python list, one per row.
only_muslim_data['mpnet_embedding'] = [e.tolist() for e in embeddings]



Encoding Titles with MPNet: 100%|██████████| 69/69 [00:05<00:00, 12.37it/s]


## Reduce dimensionality with t-SNE

In [7]:
# Stack the per-row embedding lists into a single NumPy array (one row per title).
embedding_matrix = np.array(only_muslim_data["mpnet_embedding"].to_list())

# Project the embeddings down to two dimensions with t-SNE.
# random_state is fixed so repeated runs use the same seed.
tsne = TSNE(
    n_components=2,
    random_state=42,
    perplexity=50,
    learning_rate=200,
    max_iter=2000,
)
tsne_result = tsne.fit_transform(embedding_matrix)


In [8]:
# New frame = the original columns plus the two t-SNE coordinates ("x", "y").
# .assign works on a copy, so `only_muslim_data` itself does not gain the columns.
muslim_tsne_df = only_muslim_data.assign(
    x=tsne_result[:, 0],
    y=tsne_result[:, 1],
)

# Semantic Representation Map

In [9]:
# Scatter plot of the 2-D t-SNE coordinates, one colour per language.
fig1 = px.scatter(
    data_frame=muslim_tsne_df,
    x="x",
    y="y",
    color="language",
    title="Semantic Map of Titles by Language",
    labels=dict(x="t-SNE Dimension 1", y="t-SNE Dimension 2"),
    hover_data=["translated_title", "language", "source_name"],
)
fig1.show()

# Change Over Time

In [10]:
# Step 1: Parse publication dates and sort chronologically so that animation
# frames are built in date order.
muslim_tsne_df['pubDate'] = pd.to_datetime(muslim_tsne_df['pubDate'])
muslim_tsne_df = muslim_tsne_df.sort_values('pubDate')

# Step 2: Format date into string for animation frames
muslim_tsne_df['pubDateStr'] = muslim_tsne_df['pubDate'].dt.strftime('%Y-%m-%d')


def _padded_range(values, pad_fraction=0.05):
    """Return ``[min, max]`` of ``values`` widened by ``pad_fraction`` of the span.

    Returns ``None`` (let Plotly auto-range) when the bounds are not finite,
    e.g. for an empty column.
    """
    low, high = float(values.min()), float(values.max())
    if not (np.isfinite(low) and np.isfinite(high)):
        return None
    pad = (high - low) * pad_fraction or 1.0  # avoid a zero-width axis
    return [low - pad, high + pad]


# Step 3: Create the animation.
# The axis ranges are pinned to the extent of the full dataset; otherwise the
# axes are not fixed across frames and points in later frames can fall outside
# the visible area.
# NOTE(review): Plotly Express appears to build its traces from the first frame,
# so a language absent on the earliest date may not show up in later frames —
# verify against the rendered animation.
fig = px.scatter(
    muslim_tsne_df,
    x='x',
    y='y',
    animation_frame='pubDateStr',
    animation_group='article_id',
    color='language',
    hover_name='title',
    hover_data={'pubDateStr': True, 'language': True, 'translated_title': True},
    title='t-SNE Cluster Animation Nagpur Violence News Articles Over Time',
    labels={'x': 't-SNE X', 'y': 't-SNE Y'},
    range_x=_padded_range(muslim_tsne_df['x']),
    range_y=_padded_range(muslim_tsne_df['y']),
    opacity=0.7,
    height=700
)

fig.update_layout(
    title_font_size=22,
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    legend_title_text='Language',
    showlegend=True
)

# Slow down playback. The play/pause menu may be absent (e.g. when there is
# only one distinct date), so guard the lookup instead of indexing blindly.
if fig.layout.updatemenus:
    fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 600  # ms
    fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 300  # ms

fig.show()