## 📦 Environment & Dependencies

This section imports all the necessary libraries. Make sure to install missing ones via `pip install package-name` if needed.

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
import emoji
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
import umap
import nltk
import seaborn as sns
import matplotlib.font_manager as fm

from sentence_transformers import SentenceTransformer
from pymorphy2 import MorphAnalyzer
from collections import Counter

# Enable tqdm's progress-bar variants of the pandas apply methods (e.g. `progress_apply`).
tqdm.pandas()

# Tokenizer data for nltk.word_tokenize. Recent NLTK releases (3.9+) load the
# "punkt_tab" resource, older ones load "punkt"; fetch both so tokenization
# works on a fresh kernel regardless of the installed NLTK version.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

## 📊 Load and Prepare Data

This section loads the merged and clustered dataset and drops rows that have no text.

In [None]:
# Read the clustered corpus and keep only the rows that actually carry text.
df = pd.read_csv("embedded_clustered.csv").dropna(subset=["text"])

# Preview the first rows to confirm the expected columns are present.
df.head()

## 🌐 UMAP Cluster Visualization

We visualize the clustered embeddings using UMAP.

In [None]:
# Scatter the UMAP coordinates stored in `df`, one colour per cluster label.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    data=df,
    x="umap_x",
    y="umap_y",
    hue="cluster",
    palette="tab10",
    legend="full",
    alpha=0.7,
    ax=ax,
)
ax.set_title("UMAP Projection of Text Clusters")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
# Anchor the legend just past the right edge of the axes so it does not cover points.
ax.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc="upper left")
fig.tight_layout()
plt.show()

## ☁️ WordClouds for Key Concepts

We visualize semantic associations around selected terms like 'русский', 'народ', and 'запад'.

In [None]:
def generate_context_wordcloud(df, keyword, window=10, stop_words=None):
    """Plot a word cloud of the tokens that occur near ``keyword``.

    Every text in ``df["text"]`` is lowercased and tokenized with
    ``nltk.word_tokenize``. For each token that contains ``keyword`` as a
    substring, up to ``window`` tokens on either side are collected, and the
    collected tokens are rendered as a frequency-weighted word cloud.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"text"`` column; missing values are ignored.
    keyword : str
        Substring to look for inside tokens (e.g. a word stem). Matching is
        case-insensitive because the texts are lowercased.
    window : int, default 10
        Number of tokens taken on each side of a matching token.
    stop_words : iterable of str, optional
        Lowercase tokens to leave out of the cloud. Nothing is excluded by
        default.

    Returns
    -------
    None
        The figure is shown as a side effect. If no context tokens are found,
        a message is printed and no figure is drawn.
    """
    # Texts are lowercased below, so a keyword with capitals could never match.
    needle = keyword.lower()
    excluded = frozenset(stop_words or ())
    context_words = []

    for text in df["text"].dropna().astype(str):
        lowered = text.lower()
        # Cheap pre-filter: tokenization only splits the text (and rewrites
        # double quotes), so for an ordinary word keyword a token can contain
        # it only if the raw text does. Avoids tokenizing unrelated documents.
        if needle not in lowered:
            continue

        tokens = nltk.word_tokenize(lowered)
        for i, token in enumerate(tokens):
            if needle not in token:
                continue
            neighbours = tokens[max(0, i - window):i] + tokens[i + 1:i + 1 + window]
            context_words.extend(
                t for t in neighbours
                # Skip pure punctuation ("," "." "``" ...), which would otherwise
                # dominate the frequency table and be drawn as words.
                if t not in excluded and any(ch.isalnum() for ch in t)
            )

    if not context_words:
        # WordCloud.generate_from_frequencies raises ValueError on an empty table.
        print(f"No context words found for '{keyword}'; skipping word cloud.")
        return

    freq = Counter(context_words)
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(freq)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"WordCloud for context around '{keyword}'")
    plt.show()

# Draw one context word cloud per keyword stem.
for keyword in ("русск", "народ", "запад"):
    generate_context_wordcloud(df, keyword)

## 🎯 Extracting Dominant Terms by Cluster

TF-IDF is used to surface the most characteristic terms of each cluster, which can then be inspected for potentially manipulative or propagandistic lexical fields.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download("stopwords")
from nltk.corpus import stopwords
russian_stopwords = stopwords.words("russian")

# Sorted so the report below comes out in a stable order. NaN labels are
# dropped because `df["cluster"] == NaN` never selects any row.
clusters = np.sort(df["cluster"].dropna().unique())
top_n = 15
tfidf_results = {}  # cluster label -> list of (term, summed TF-IDF score), best first

for cluster in clusters:
    texts = df.loc[df["cluster"] == cluster, "text"].astype(str).tolist()
    vectorizer = TfidfVectorizer(max_df=0.9, min_df=2, stop_words=russian_stopwords, max_features=1000)
    try:
        tfidf_matrix = vectorizer.fit_transform(texts)
    except ValueError as err:
        # Raised for clusters too small for min_df/max_df, or whose vocabulary
        # is empty after stop-word removal and pruning. Skip that cluster
        # instead of aborting the whole analysis.
        print(f"Cluster {cluster}: skipped ({err})")
        tfidf_results[cluster] = []
        continue

    # Sum each term's TF-IDF weight over the cluster's documents; np.asarray
    # flattens the result whether the sparse sum returns a matrix or an array.
    tfidf_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()

    top_indices = tfidf_scores.argsort()[::-1][:top_n]
    top_terms = [(terms[i], tfidf_scores[i]) for i in top_indices]
    tfidf_results[cluster] = top_terms

for cluster, cluster_terms in tfidf_results.items():
    print(f"\nCluster {cluster}:")
    if not cluster_terms:
        print("(no terms extracted)")
    for term, score in cluster_terms:
        print(f"{term} ({score:.2f})")

## 🧾 Summary

This notebook demonstrates how to use NLP and clustering techniques to explore rhetorical and semantic patterns in Russian pro-government Telegram discourse. The results highlight thematic fragmentation, key identity symbols, and potential propagandistic markers across the different narrative clusters.