In [None]:
import os
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sentence_transformers import SentenceTransformer
import torch
from tqdm.notebook import tqdm
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Connection settings; every value except the password has a fallback default.
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost")
POSTGRES_PORT = os.getenv("POSTGRES_PORT", "5432")
POSTGRES_DB = os.getenv("POSTGRES_DB", "telegram_scraper")
POSTGRES_USER = os.getenv("POSTGRES_USER", "your_username")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")  # None when the variable is unset

# Message tables that are sampled below.
TABLES = [
    "russian_channels_messages",
    "russian_groups_messages",
    "ukrainian_channels_messages",
    "ukrainian_groups_messages"
]

# Connect to DB
# The URL is assembled from its components instead of an f-string so that
# (a) an unset password is omitted rather than sent as the literal text "None",
# (b) characters such as "@", ":" or "/" in the credentials are escaped correctly.
from sqlalchemy.engine import URL  # local import: only needed for this statement

engine = create_engine(
    URL.create(
        drivername="postgresql",
        username=POSTGRES_USER,
        password=POSTGRES_PASSWORD,
        host=POSTGRES_HOST,
        port=int(POSTGRES_PORT),
        database=POSTGRES_DB,
    )
)

def fetch_sample(table, n=2500):
    """Return a random sample of at most ``n`` rows from ``table``.

    The query selects ``id``, ``messagetext``, ``chat_id`` and ``chat_name``,
    orders the rows with ``RANDOM()`` and limits the result to ``n`` rows. A
    ``table`` column holding the source table name is added to the result.

    Args:
        table: Table name, optionally schema-qualified (``schema.table``).
            Only plain SQL identifiers are accepted because the name has to be
            interpolated into the statement text.
        n: Maximum number of rows to fetch; must convert to a non-negative int.

    Returns:
        pandas.DataFrame with the selected columns plus ``table``.

    Raises:
        ValueError: if ``table`` is not a plain identifier, or ``n`` is not a
            non-negative integer.
    """
    import re  # local import keeps the function self-contained

    # Identifiers cannot be bound as query parameters, so reject anything that
    # is not a bare (optionally schema-qualified) name before building the SQL.
    if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)?", str(table)):
        raise ValueError(f"Invalid table name: {table!r}")

    limit = int(n)  # raises ValueError for non-numeric strings
    if limit < 0:
        raise ValueError(f"n must be non-negative, got {n!r}")

    query = f"""
        SELECT id, messagetext, chat_id, chat_name
        FROM {table}
        ORDER BY RANDOM()
        LIMIT {limit}
    """
    df = pd.read_sql(query, engine)
    df['table'] = table
    return df

# Draw one default-sized sample per table, then stack them into a single frame
# with a fresh 0..N-1 index.
dfs = list(map(fetch_sample, TABLES))
df = pd.concat(dfs, ignore_index=True)
print("Data loaded")


Data loaded


In [None]:
# Messages to embed, one string per DataFrame row.
# Missing messages are replaced with "" first: a bare ``astype(str)`` would turn
# them into the literal text "None"/"nan", which would then be embedded as if it
# were real content. The row count is unchanged, so the list stays aligned with df.
texts = df['messagetext'].fillna("").astype(str).tolist()

# Model names
MODEL_NAMES = [
    # "deepvk/USER-bge-m3",
    "intfloat/multilingual-e5-large-instruct"
]

# Use the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")
batch_size = 16

def compute_embeddings(texts, model_name):
    """Encode ``texts`` with the SentenceTransformer model ``model_name``.

    The model is instantiated on the module-level ``DEVICE`` and the texts are
    encoded in batches of the module-level ``batch_size`` with a progress bar.

    Args:
        texts: Sequence of strings to encode.
        model_name: Model identifier passed to ``SentenceTransformer``.

    Returns:
        The result of ``SentenceTransformer.encode`` for ``texts``.
    """
    print(f"Computing embeddings with {model_name}")
    encoder = SentenceTransformer(model_name, device=DEVICE)
    vectors = encoder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
    )
    return vectors

# Embed all messages with the one active entry of MODEL_NAMES
# (the BGE-M3 run is currently disabled).
# embeddings_bge_m3 = compute_embeddings(texts, MODEL_NAMES[0])
embeddings_e5 = compute_embeddings(texts, MODEL_NAMES[0])

# Keep one embedding vector per row in the DataFrame for later use
# df['embedding_bge_m3'] = list(embeddings_bge_m3)
df['embedding_e5'] = [vector for vector in embeddings_e5]

# Preview the first rows, including the new embedding column
df.head()

Using device: cuda
Computing embeddings with intfloat/multilingual-e5-large-instruct


Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Unnamed: 0,id,messagetext,chat_id,chat_name,table,embedding_e5
0,258227,Подготовка визита премьер-министра Индии в Рос...,1050820672,https://t.me/tass_agency,russian_channels_messages,"[0.014172985, 0.0043434715, -0.011479417, -0.0..."
1,67015,❗️Совместный российско-азербайджанский патруль...,1260622817,https://t.me/readovkanews,russian_channels_messages,"[0.0194318, 0.018181017, -0.014406425, -0.0460..."
2,117104,"⚡️Литва, Латвия, Эстония и Польша попросили Me...",1394050290,https://t.me/bbbreaking,russian_channels_messages,"[0.015960189, 0.012944568, -0.0068293777, -0.0..."
3,12785,Наши рыбаки сегодня словили крутой кадр ✈️,1283524369,https://t.me/zhest_belgorod,russian_channels_messages,"[0.0050023296, 0.027736673, -0.011438828, -0.0..."
4,41597,Российские войска отвели с Харьковского направ...,1260622817,https://t.me/readovkanews,russian_channels_messages,"[0.03794567, 0.010649348, 0.00024101515, -0.03..."


In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch

# Load Russian narratives
# The JSON maps a narrative key to an object; its 'ru' entry is the text that gets embedded.
DESCRIPTIONS_FILE_RU = '../../../apollolytics-network/data/propaganda_narratives/narratives_ru.json'
with open(DESCRIPTIONS_FILE_RU, 'r', encoding='utf-8') as f:
    descriptions_ru = json.load(f)
narrative_keys_ru = list(descriptions_ru.keys())
narrative_texts_ru = [desc['ru'] for desc in descriptions_ru.values()]

# Load Ukrainian narratives
# Same layout as above, read through the 'uk' entry.
DESCRIPTIONS_FILE_UK = '../../../apollolytics-network/data/propaganda_narratives/narratives_uk.json'
with open(DESCRIPTIONS_FILE_UK, 'r', encoding='utf-8') as f:
    descriptions_uk = json.load(f)
narrative_keys_uk = list(descriptions_uk.keys())
narrative_texts_uk = [desc['uk'] for desc in descriptions_uk.values()]

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# # Process BGE-M3 model
# model_bge_m3 = SentenceTransformer("deepvk/USER-bge-m3", device=DEVICE)

# # Russian narratives
# narrative_embeddings_bge_m3_ru = model_bge_m3.encode(narrative_texts_ru)
# similarities_bge_m3_ru = cosine_similarity(np.vstack(df['embedding_bge_m3']), narrative_embeddings_bge_m3_ru)
# sim_df_bge_m3_ru = pd.DataFrame(similarities_bge_m3_ru, columns=[f"{k}_sim_bge_m3_ru" for k in narrative_keys_ru])

# # Ukrainian narratives
# narrative_embeddings_bge_m3_uk = model_bge_m3.encode(narrative_texts_uk)
# similarities_bge_m3_uk = cosine_similarity(np.vstack(df['embedding_bge_m3']), narrative_embeddings_bge_m3_uk)
# sim_df_bge_m3_uk = pd.DataFrame(similarities_bge_m3_uk, columns=[f"{k}_sim_bge_m3_uk" for k in narrative_keys_uk])

# del model_bge_m3

# Process E5 model
model_e5 = SentenceTransformer("intfloat/multilingual-e5-large-instruct", device=DEVICE)

# Stack the per-row message vectors into one 2-D matrix once; both similarity
# computations below reuse it.
message_embeddings_e5 = np.vstack(df['embedding_e5'])

# Russian narratives
# index=df.index keeps the similarity rows aligned with df even when df does not
# carry a default 0..N-1 index (concat(axis=1) aligns on index labels).
narrative_embeddings_e5_ru = model_e5.encode(narrative_texts_ru)
similarities_e5_ru = cosine_similarity(message_embeddings_e5, narrative_embeddings_e5_ru)
sim_df_e5_ru = pd.DataFrame(
    similarities_e5_ru,
    index=df.index,
    columns=[f"{k}_sim_e5_ru" for k in narrative_keys_ru],
)

# Ukrainian narratives
narrative_embeddings_e5_uk = model_e5.encode(narrative_texts_uk)
similarities_e5_uk = cosine_similarity(message_embeddings_e5, narrative_embeddings_e5_uk)
sim_df_e5_uk = pd.DataFrame(
    similarities_e5_uk,
    index=df.index,
    columns=[f"{k}_sim_e5_uk" for k in narrative_keys_uk],
)

# Drop the model reference and hand cached GPU blocks back to the driver;
# `del` alone leaves them held by PyTorch's caching allocator.
del model_e5
if DEVICE == "cuda":
    torch.cuda.empty_cache()

# Combine all results
# Similarity columns left over from a previous run of this cell are dropped first,
# so re-running the cell replaces them instead of adding duplicate column names.
similarity_columns = list(sim_df_e5_ru.columns) + list(sim_df_e5_uk.columns)
df = pd.concat(
    [df.drop(columns=similarity_columns, errors="ignore"), sim_df_e5_ru, sim_df_e5_uk],
    axis=1,
)

In [1]:
import os
import json
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from dotenv import load_dotenv

# --- ENVIRONMENT ---
load_dotenv()

# Connection settings, each with a fallback used when the variable is unset.
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost")
POSTGRES_PORT = os.getenv("POSTGRES_PORT", "5432")
POSTGRES_DB = os.getenv("POSTGRES_DB", "telegram_scraper")
POSTGRES_USER = os.getenv("POSTGRES_USER", "postgres")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")

# --- Load test_100k table ---
# The URL is built from its components rather than an f-string so that characters
# such as "@", ":" or "/" in the user name or password are escaped correctly.
from sqlalchemy.engine import URL  # local import: only needed for this statement

engine = create_engine(
    URL.create(
        drivername="postgresql",
        username=POSTGRES_USER,
        password=POSTGRES_PASSWORD,
        host=POSTGRES_HOST,
        port=int(POSTGRES_PORT),
        database=POSTGRES_DB,
    )
)
# Reads every column of every row of test_100k into memory.
df = pd.read_sql("SELECT * FROM test_100k", engine)

# Parse embeddings from JSONB to numpy arrays
def parse_embedding(x, dim=1024, dtype=np.float16):
    """Convert a stored embedding value into a 1-D numpy array.

    Args:
        x: The raw value. Supported forms are a JSON string encoding a list of
            numbers, a list/tuple of numbers, or an existing numpy array.
            Anything else (e.g. ``None``, or a JSON string that does not decode
            to a list) is treated as "no embedding".
        dim: Length of the zero vector returned when there is no embedding.
            Defaults to 1024; adjust if your embedding size is different.
        dtype: dtype of the returned array. Defaults to ``np.float16``.

    Returns:
        numpy.ndarray of ``dtype``. For array input the same object may be
        returned when it already has the requested dtype.

    Raises:
        json.JSONDecodeError: if ``x`` is a string that is not valid JSON.
    """
    if isinstance(x, str):
        # Decode first, then handle the decoded value like any other input so a
        # JSON "null" ends up in the fallback branch instead of becoming NaN.
        x = json.loads(x)
    if isinstance(x, np.ndarray):
        # Previously an already-parsed array silently became a zero vector.
        return x.astype(dtype, copy=False)
    if isinstance(x, (list, tuple)):
        return np.array(x, dtype=dtype)
    # Fallback: zero vector of fixed size so the results can still be stacked.
    return np.zeros(dim, dtype=dtype)

# embeddings = np.vstack(df['multilingual_e5_large_instruct'].apply(parse_embedding).values)

# # --- Load narratives ---
# DESCRIPTIONS_FILE_RU = '../../../apollolytics-network/data/propaganda_narratives/narratives_ru.json'
# DESCRIPTIONS_FILE_UK = '../../../apollolytics-network/data/propaganda_narratives/narratives_uk.json'

# with open(DESCRIPTIONS_FILE_RU, 'r', encoding='utf-8') as f:
#     descriptions_ru = json.load(f)
# narrative_keys_ru = list(descriptions_ru.keys())
# narrative_texts_ru = [desc['ru'] for desc in descriptions_ru.values()]

# with open(DESCRIPTIONS_FILE_UK, 'r', encoding='utf-8') as f:
#     descriptions_uk = json.load(f)
# narrative_keys_uk = list(descriptions_uk.keys())
# narrative_texts_uk = [desc['uk'] for desc in descriptions_uk.values()]

# # --- Load model and compute narrative embeddings in half precision ---
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# model = SentenceTransformer("intfloat/multilingual-e5-large-instruct", device=DEVICE)
# if DEVICE == "cuda":
#     model.half()

# with torch.cuda.amp.autocast(enabled=(DEVICE == "cuda")):
#     narrative_embeddings_ru = model.encode(narrative_texts_ru, convert_to_numpy=True)
#     narrative_embeddings_uk = model.encode(narrative_texts_uk, convert_to_numpy=True)

# # Ensure narrative embeddings are float16
# narrative_embeddings_ru = narrative_embeddings_ru.astype(np.float16)
# narrative_embeddings_uk = narrative_embeddings_uk.astype(np.float16)

# # --- Compute similarities ---
# similarities_ru = cosine_similarity(embeddings, narrative_embeddings_ru)
# similarities_uk = cosine_similarity(embeddings, narrative_embeddings_uk)

# # --- Create DataFrames for similarity scores ---
# sim_df_ru = pd.DataFrame(similarities_ru, columns=[f"{k}_sim_e5_ru" for k in narrative_keys_ru])
# sim_df_uk = pd.DataFrame(similarities_uk, columns=[f"{k}_sim_e5_uk" for k in narrative_keys_uk])

# # --- Concatenate with original DataFrame ---
# df = pd.concat([df, sim_df_ru, sim_df_uk], axis=1)

# # Show the result
# df.head()

In [2]:
# Show the DataFrame as this cell's output.
df

Unnamed: 0,chat_id,id,chat_name,peer_id,messagedatetime,messagedate,messagetext,out,mentioned,media_unread,...,ttl_period,quick_reply_shortcut_id,effect,factcheck,views,forwards,replies,reactions,embedding,multilingual_e5_large_instruct
0,2022231015,4495,https://t.me/mediyca,2022231015,2025-01-30 17:45:31,2025-01-30,"В импортных чипсах Lay’s обнаружили аллергены,...",False,False,False,...,,,,,195840,315,0,"{'❤': 100, '🌭': 5, '👍': 68, '👎': 14, '🔥': 7, '...",,"[0.01567930541932583, -0.0037688673473894596, ..."
1,1197865170,26788,https://t.me/ssigny,1197865170,2022-06-06 17:14:41,2022-06-06,"⚡️В ЛНР не исключили обращение к ОДКБ, но толь...",False,False,False,...,,,,,97405,31,0,"{'❤': 17, '👍': 1076, '👎': 20, '😁': 5, '🤔': 52}",,"[-0.004293631762266159, 0.010973948985338211, ..."
2,1498939244,37134,https://t.me/voenacher,1498939244,2023-01-04 04:54:00,2023-01-04,"«Все награды, они почетные, они боевые. Каждая...",False,False,False,...,,,,,287178,51,0,"{'❤': 267, '🌚': 5, '👍': 2988, '👎': 13, '👏': 12...",,"[-0.0009024208411574364, -0.002367566805332899..."
3,1117628569,37134,https://t.me/mash,1117628569,2022-08-04 10:14:27,2022-08-04,"На ""президентской"" дороге Севастополя фура про...",False,False,False,...,,,,,557730,1302,0,,,"[-0.0009024208411574364, -0.002367566805332899..."
4,1708761316,68470,https://t.me/novosti_efir,1708761316,2025-04-07 19:54:24,2025-04-07,Мошенники массово обманывают россиян через фей...,False,False,False,...,,,,,779230,1881,0,"{'❤': 85, '👍': 327, '👎': 35, '🔥': 29, '😁': 130...",,"[-0.0103005301207304, 0.014275731518864632, -0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1635619800,1059320,https://t.me/mostodaychat,1635619800,2024-07-20 09:47:28,2024-07-20,"Нет. Патриоты там сидеть не могут, или они не ...",False,False,False,...,,,,,0,0,0,,,"[0.01544005237519741, -0.013247917406260967, -..."
99996,1473047870,159692,https://t.me/gruppa_ekb_eburg,1473047870,2024-05-30 04:45:28,2024-05-30,Продавец-консультант сувенирной продукции.\nОт...,False,False,False,...,,,,,0,0,0,{'👍': 1},,"[-0.009056540206074715, 0.013310451991856098, ..."
99997,1473047870,163737,https://t.me/gruppa_ekb_eburg,1473047870,2024-06-16 05:28:37,2024-06-16,В бригаде строителей русских требуются 1 напар...,False,False,False,...,,,,,0,0,0,{'👍': 1},,"[0.021832171827554703, 0.03789491951465607, -0..."
99998,1635619800,1096355,https://t.me/mostodaychat,1635619800,2024-08-02 19:23:59,2024-08-02,А где логика? Там что другие дети? Они чем то ...,False,False,False,...,,,,,0,0,0,"{'🎉': 2, '💯': 1, '🔥': 5}",,"[0.0036467830650508404, 0.00692920433357358, -..."


In [8]:
# Choose your narrative similarity column, e.g.:
narrative_col = "HistoricalUnityOfRussiaAndUkraine_sim_e5_ru"  # <-- change as needed

# Fail early with an actionable message when the similarity columns have not
# been added to df in this kernel session.
if narrative_col not in df.columns:
    raise KeyError(
        f"Column {narrative_col!r} not found in df; compute the narrative "
        "similarity columns before running this cell."
    )


def _print_messages(rows, score_col):
    """Print score, chat name and message text for every row of ``rows``."""
    for _, row in rows.iterrows():
        print(f"Score: {row[score_col]:.4f}")
        print(f"Chat: {row.get('chat_name', 'N/A')}")
        print(f"Message: {row['messagetext']}\n{'-'*40}")


# Sort by similarity
sorted_df = df.sort_values(by=narrative_col, ascending=False)
# sort_values places missing scores last, where they would be reported as the
# "lowest" messages; rank only rows that actually have a score.
scored_df = sorted_df.dropna(subset=[narrative_col])

print(f"\nTop 5 messages for {narrative_col}:\n")
_print_messages(scored_df.head(5), narrative_col)

print(f"\nLowest 5 messages for {narrative_col}:\n")
_print_messages(scored_df.tail(5), narrative_col)


Top 5 messages for HistoricalUnityOfRussiaAndUkraine_sim_e5_ru:

Score: 0.8965
Chat: https://t.me/varlamov_news
Message: Зеленский в обращении по случаю Дня украинской государственности назвал Украину «единственной законной наследницей Киевской Руси».

Украина — это «страна, с которой началась история христианства в Восточной Европе», это «не колония, не анклав, не протекторат, не губерния, эялет или кронланд, не часть чужих империй, не „земли в составе“, не союзная республика, не автономия и не провинция, а свободная, самостоятельная, суверенная, неделимая и независимая страна», сказал Зеленский.

«Княже Володимир, тогда чур не обижаться, что часть Украины будут называть Русью. Сам сказал. Молодец», — написала представитель МИД России Захарова, комментируя слова президента Украины о Киевской Руси.
----------------------------------------
Score: 0.8948
Chat: https://t.me/meduzalive
Message: «Нацистские сатанисты укрепились в святом русском городе Киеве». Как выяснила «Медуза», именно 

In [5]:
# List the column labels of df.
df.columns

Index(['chat_id', 'id', 'chat_name', 'peer_id', 'messagedatetime',
       'messagedate', 'messagetext', 'out', 'mentioned', 'media_unread',
       'silent', 'post', 'from_scheduled', 'legacy', 'edit_hide', 'pinned',
       'noforwards', 'invert_media', 'offline', 'from_id',
       'from_boosts_applied', 'saved_peer_id', 'fwd_from', 'fwd_from_type',
       'via_bot_id', 'via_business_bot_id', 'reply_to', 'reply_markup',
       'entities', 'edit_date', 'post_author', 'grouped_id', 'ttl_period',
       'quick_reply_shortcut_id', 'effect', 'factcheck', 'views', 'forwards',
       'replies', 'reactions', 'embedding', 'multilingual_e5_large_instruct',
       'DenazificationOfUkraine_sim_e5_ru',
       'ProtectionOfRussianSpeakers_sim_e5_ru',
       'NATOExpansionThreat_sim_e5_ru', 'BiolabsConspiracy_sim_e5_ru',
       'UkraineasaFailedState_sim_e5_ru',
       'HistoricalUnityOfRussiaAndUkraine_sim_e5_ru',
       'WesternRussophobia_sim_e5_ru', 'SanctionsAsEconomicWarfare_sim_e5_ru',
       '