<a href="https://colab.research.google.com/github/stepanjaburek/workingpaper_czech_psp_speeches/blob/main/Word2Vec_FastText_IPS_Paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word embeddings

# Setup

In [None]:
# Pin numpy and gensim to versions that work together (see the note on the numpy line).
!pip install numpy==1.24.4 --quiet # somehow new numpy doesnt work well with gensim
!pip install gensim==4.3.3 --quiet

import os
# Send signal 9 to this very process, which kills the running kernel.
# Presumably this forces the runtime to restart so the freshly pinned numpy is the one
# that gets imported -- continue from the next cell after the restart.
os.kill(os.getpid(), 9)

In [None]:
!pip install stanza --quiet
import stanza
import pandas as pd
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models import FastText
from gensim.models import KeyedVectors
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Choose which corpus to process: swap the commented line to switch between the
# "left" and "right" input files. Later cells read the 'text' column of this frame.
#df = pd.read_csv('/content/word2vec_left.csv')
df = pd.read_csv('/content/word2vec_right.csv')

# Lemmatization setup

In [None]:
stanza.download("cs")  # get the Czech model
# Build the pipeline with tokenization and lemmatization only; no other processors
# (e.g. a POS tagger) are requested here.
nlp = stanza.Pipeline("cs", processors="tokenize,lemma")

In [None]:
# Quick sanity check of the pipeline on a handful of inflected Czech forms.
doc = nlp("levicový poslanec zvolil levicovou vládu a levicová maláčová..babiš babiše babišem") # test lemma setup
lemmas = [
    token.lemma
    for sentence in doc.sentences
    for token in sentence.words
]
print(lemmas)

In [None]:
# Define the lemmatization function
def lemmatize_text(text):
    """Lemmatize one text with the module-level Stanza pipeline ``nlp``.

    Parameters:
    - text: the text to lemmatize; it is coerced with ``str()`` first.

    Returns:
    - a single string with the lemmas separated by spaces. Blank input
      (empty or whitespace-only) yields ``""`` without running the pipeline.
    """
    text = str(text)
    if not text.strip():
        # Nothing to lemmatize; skip the (expensive) pipeline call entirely.
        return ""

    doc = nlp(text)  # apply Stanza NLP
    lemmas = [
        # Fall back to the surface form so " ".join never sees a missing lemma.
        word.lemma or word.text
        for sent in doc.sentences
        for word in sent.words
        # NOTE(review): the pipeline in this notebook is built with
        # processors="tokenize,lemma" only, so upos is presumably never set and the
        # PUNCT test alone would let punctuation through -- confirm. Tokens without
        # any letter or digit are therefore dropped explicitly as well.
        if word.upos != 'PUNCT' and any(ch.isalnum() for ch in word.text)
    ]
    return " ".join(lemmas)
# Claude helping here

In [None]:
# Replace missing texts with empty strings and force str dtype before lemmatizing.
df['text'] = df['text'].fillna('').astype(str) # be sure with NAs, shouldnt be any though
# Run lemmatize_text over every row; progress_apply is the tqdm-enabled variant of apply.
df['text_lemma'] = df['text'].progress_apply(lemmatize_text) # lemmatization using stanza

# Show original and lemmatized text side by side for the first rows.
print(df[['text_lemma', 'text']].head()) # check it

# Or import already lemmatized data instead

In [None]:
# Alternative to running Stanza: load a previously lemmatized corpus.
# The FastText section reads the 'text_lemma' column of this frame.
#df = pd.read_csv('/content/lemmatized_data_left.csv')
df = pd.read_csv('/content/lemmatized_data_right.csv')

# FastText

In [None]:
# An empty lemma string is read back as NaN after a CSV round trip, and
# simple_preprocess only accepts text, so normalise to str first.
texts = df['text_lemma'].fillna('').astype(str)
# simple_preprocess tokenizes and lowercases; deacc=False keeps the Czech diacritics.
corpus = texts.progress_apply(lambda x: simple_preprocess(x, deacc=False)) # finalize using gensim's simple_preprocess
print(corpus.head())

In [None]:
# Train FastText model on the tokenized, lemmatized corpus built in the previous cell.
model = FastText(
    sentences=corpus,
    vector_size=300,    # dimensions of embedding space
    window=5,           # context window size
    min_count=5,        # minimum word frequency for a word to be kept (gensim's min_count)
    workers=8,          # number of training worker threads
    sg=0,               # 1 = skip-gram; 0 = CBOW
    epochs=10           # number of passes over the corpus
)

In [None]:
# get the already saved word embeddings
# NOTE(review): this requires the "fasttext_wordvectors_right" file to exist already; within
# this notebook it is only written by the save cell at the very end -- confirm the run order.
# Load back with memory-mapping = read-only, shared across processes.
wv = KeyedVectors.load("fasttext_wordvectors_right", mmap='r')
vector = wv['prezident']  # Get numpy vector of a word
# Nearest neighbours of 'prezident' among the loaded vectors.
wv.most_similar('prezident', topn=20)

In [None]:
# 20 nearest neighbours of 'levice' in the freshly trained model.
model.wv.most_similar('levice', topn=20)

In [None]:
# Get similarity between two words (gensim reports the cosine similarity of their vectors)
model.wv.similarity('senát', 'záchod')

In [None]:
# Find odd word out
# The training corpus was passed through simple_preprocess, which lowercases every
# token, so query words have to be lowercase to hit a trained vector.
model.wv.doesnt_match(['ministr', 'prezident', 'praha', 'premiér'])

In [None]:
# Word analogies (a is to b as c is to ?)
# Here: vectors of 'levice' and 'obchod' are added, 'pravice' is subtracted.
model.wv.most_similar(positive=['levice', 'obchod'], negative=['pravice'], topn=5)
# doesn't work very well, or I'm not using it correctly

# t-SNE down-projection

In [None]:
# adjustText is imported by the plotting cell below to keep word labels from overlapping.
!pip install adjustText

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import matplotlib.cm as cm
from adjustText import adjust_text

def plot_tsne(model, words, perplexity=35, figsize=(12, 10), colormap='hsv', adjust=True,
              save_path='word_embeddings_tsne.png'):
    """
    Create a t-SNE plot for the given words using their embeddings.

    Parameters:
    - model: trained model exposing word vectors as ``model.wv[word]`` (e.g. FastText)
    - words: list of words to visualize (at least two)
    - perplexity: requested t-SNE perplexity; lowered automatically to
      ``len(words) - 1`` when the word list is too short for the requested value
    - figsize: figure size
    - colormap: name of the matplotlib colormap used to colour the points
    - adjust: whether to use adjust_text to prevent label overlap
    - save_path: where the figure is written (300 dpi); pass None to skip saving

    Returns:
    - None; the figure is shown and, unless save_path is None, saved to disk.

    Raises:
    - ValueError: if fewer than two words are given.
    """
    n_words = len(words)
    if n_words < 2:
        raise ValueError("plot_tsne needs at least two words to project")

    # Get word vectors
    word_vectors = np.array([model.wv[word] for word in words])

    # scikit-learn rejects a perplexity that is not smaller than the number of
    # samples, so cap it for short word lists instead of failing.
    effective_perplexity = min(perplexity, n_words - 1)

    # Perform t-SNE dimensionality reduction (fixed random_state for repeatable layouts)
    tsne = TSNE(n_components=2, random_state=42, perplexity=effective_perplexity)
    coordinates = tsne.fit_transform(word_vectors)

    # Create a scatter plot
    plt.figure(figsize=figsize)

    # One colour per word. plt.get_cmap is used because matplotlib.cm.get_cmap
    # was removed in matplotlib 3.9.
    colors = plt.get_cmap(colormap, n_words)

    x = coordinates[:, 0]
    y = coordinates[:, 1]

    plt.scatter(x, y, c=range(n_words), cmap=colors, alpha=0.7, s=100)

    # Add labels for each point
    texts = [plt.text(x[i], y[i], word, fontsize=12) for i, word in enumerate(words)]

    if adjust:
        # Adjust text positions to prevent overlap
        adjust_text(texts, arrowprops=dict(arrowstyle='->', color='black'))

    plt.title('t-SNE Visualization of Word Embeddings', fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

# Example usage:
# 1. Political, economic and ideological terms.
# All entries are lowercase because the training corpus was lowercased by
# simple_preprocess; an uppercase query such as 'HDP' would not match a trained token.
political_terms = ['vláda', 'parlament', 'strana', 'volby', 'demokracie', 'ústava',
                  'premiér', 'prezident', 'ministr', 'opozice', 'koalice', 'poslanec',
                  'senátor', 'zákon', 'politika', 'justice', 'soud', 'ekonomika', 'inflace', 'rozpočet', 'daň', 'hdp', 'krize',
                 'reforma', 'nezaměstnanost', 'mzda', 'dotace', 'investice', 'dluh',
                 'deficit', 'banky', 'úrok', 'export', 'průmysl','levice', 'pravice', 'socialismus', 'kapitalismus', 'konzervativní',
                     'liberální', 'komunistický', 'progresivní', 'sociální', 'solidarita',
                     'svoboda', 'rovnost', 'trh', 'stát', 'soukromý', 'veřejný']

# Run the visualization
plot_tsne(model, political_terms)
# 2. Compare left/right political terms
political_spectrum = ['levice', 'pravice', 'socialismus', 'kapitalismus', 'konzervativní',
                     'liberální', 'komunistický', 'progresivní', 'sociální', 'solidarita',
                     'svoboda', 'rovnost', 'trh', 'stát', 'soukromý', 'veřejný']
plot_tsne(model, political_spectrum, perplexity=5)  # Lower perplexity for smaller set

# Position in space between two vectors

In [None]:
def get_left_right_position(model, left_terms, right_terms, target_word):
    """Get position of a word on left-right axis.

    The axis is the unit vector pointing from the mean of the left seed vectors
    to the mean of the right seed vectors; the result is the dot product of the
    target word's vector with that axis.

    Parameters:
    - model: trained model exposing word vectors as ``model.wv``
    - left_terms: seed words for the left pole (terms not in ``model.wv`` are skipped)
    - right_terms: seed words for the right pole (terms not in ``model.wv`` are skipped)
    - target_word: the word to place on the axis

    Returns:
    - the projection as a float (positive = towards the right seeds, negative =
      towards the left seeds), or None when target_word is not in ``model.wv``.

    Raises:
    - ValueError: if no usable seed term remains for one of the poles, or if both
      poles coincide so that no axis can be defined (previously this silently
      produced NaN).
    """
    wv = model.wv

    # No point in building the axis if the target cannot be looked up.
    if target_word not in wv:
        return None

    # Average left and right seed vectors
    left_vecs = [wv[w] for w in left_terms if w in wv]
    right_vecs = [wv[w] for w in right_terms if w in wv]
    if not left_vecs or not right_vecs:
        raise ValueError("left_terms and right_terms must each contain at least one known word")

    # Create left-right axis
    lr_axis = np.mean(right_vecs, axis=0) - np.mean(left_vecs, axis=0)
    axis_norm = np.linalg.norm(lr_axis)
    if axis_norm == 0:
        raise ValueError("left and right seed vectors coincide; the axis is undefined")
    lr_axis = lr_axis / axis_norm  # Normalize

    # Project target word onto this axis
    return np.dot(wv[target_word], lr_axis)
# Define seed words
left_terms = ['levice', "levicový", "levicová"]
right_terms = ['pravice', "pravicový", "pravicová"]

# Get position of the target word; the label is built from the same variable so the
# printed word always matches the word that was actually scored.
target_word = 'svoboda'
position = get_left_right_position(model, left_terms, right_terms, target_word)
print(f"Position of '{target_word}' on left-right axis: {position}")
# Positive means right-leaning, negative means left-leaning

In [None]:
# Keep only the trained vectors (model.wv) and write them to disk.
word_vectors = model.wv
word_vectors.save("fasttext_wordvectors_right") # Store just the words + their trained embeddings.

# Load back with memory-mapping = read-only, shared across processes.
wv = KeyedVectors.load("fasttext_wordvectors_right", mmap='r')
vector = wv['prezident']  # Get numpy vector of a word
# Check that the reloaded vectors answer queries.
wv.most_similar('prezident', topn=20)