## Word2Vec: Document Vector Representation
Representation of documents as the average of the embeddings of the
words they contain.

In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn import neighbors
import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords as sw
from unidecode import unidecode
import unicodedata
import re

from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import STOPWORDS, WordCloud

# Word2Vec
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import SnowballStemmer
nltk.download('punkt_tab' , quiet=True)
import warnings

warnings.filterwarnings("ignore", category=UserWarning)


In [2]:
# Load data
traindata = pd.read_csv('Datasets/EvaluationData/politicES_phase_2_train_public.csv', header=0)
ytrain = traindata.iloc[:, :]


In [3]:
# Get Spanish stopwords from nltk
spanish_sw = set(sw.words('spanish'))
extras = {'rt', 'https', 'http', 'jaja', 'jajaja', 'jajajaja', 'jajajajaja', 'mas', 'hace'}
stopwords = spanish_sw.union(extras)

In [27]:
# Get spanish 
stemmer = SnowballStemmer('spanish')


In [40]:
def preserve_letters(text: str, letters: list) -> str:
    placeholders = {letter: f"__PLACEHOLDER_{i}__" for i, letter in enumerate(letters)}
    for k, v in placeholders.items():
        text = text.replace(k, v)
    text = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in text if not unicodedata.combining(ch))
    for k, v in placeholders.items():
        text = text.replace(v, k)
    return text

In [74]:
def preprocess_and_stem(text, letters=['ñ','Ñ']):
    text = preserve_letters(text, letters)
    text = text.lower()
    text = re.sub(r"[^a-zA-ZñÑáéíóúüÁÉÍÓÚÜ\s]", "", text)    
    tokens = text.split()
    tokens_stemmed = [stemmer.stem(w) for w in tokens]
    tokens_stemmed = [w for w in tokens_stemmed if w not in stopwords]
    return tokens_stemmed

In [75]:
tokens = ytrain.iloc[:, -1].astype(str).apply(preprocess_and_stem)

In [76]:
try:
    model_w2v = Word2Vec(
        sentences=tokens.tolist(),
        vector_size=100,      
        window=5,
        min_count=2,
        workers=4,
        sg=1
    )
except Exception as e:
    # print(f"Error training Word2Vec model: {e}")
    pass

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


In [77]:
def doc_vector(tokens):
    words_vecs = [
        model_w2v.wv[word] for word in tokens if word in model_w2v.wv
    ]
    if len(words_vecs) == 0:
        return np.zeros(model_w2v.vector_size)
    return np.mean(words_vecs, axis=0)

In [81]:
word = "si"
try:
    print(model_w2v.wv.most_similar(word))
except KeyError:
    print(f"The word {word} is not in the vocabulary.")


[('porqu', 0.7838612198829651), ('obstant', 0.7752715945243835), ('dimitiri', 0.7621756792068481), ('despartidiz', 0.7608611583709717), ('deseari', 0.7568363547325134), ('ocurrir', 0.7545263767242432), ('digal', 0.7542862296104431), ('adivinais', 0.7540134191513062), ('explicari', 0.7535245418548584), ('abrumador', 0.7519144415855408)]


In [79]:
word1 = "gobierno"
word2 = "croquetas"

try:
    sim = model_w2v.wv.similarity(word1, word2)
    print(f"Similarity between '{word1}' and '{word2}': {sim:.4f}")
except KeyError as e:
    print(f"One of the words is not in the vocabulary: {e}")

One of the words is not in the vocabulary: "Key 'gobierno' not present"


In [80]:
print(len(model_w2v.wv.key_to_index))
print(list(model_w2v.wv.key_to_index.keys())[:20])  # primeras 20 palabras



29435
['user', 'par', 'tod', 'com', 'si', 'politici', 'hashtag', 'per', 'hac', 'politicalparty', 'hoy', 'gobiern', 'ser', 'tien', 'sobr', 'much', 'nuestr', 'españ', 'pued', 'sol']
