In [21]:
import collections
import re
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

import spacy
import nltk
from nltk.stem import WordNetLemmatizer

from utils import (
    drop_spam_rows,
    remove_digits,
    remove_prefixed_words,
    contract_spaces,
    remove_single_characters,
    remove_special_characters,
)

### Load spaCy Spanish trained pipeline.

In [22]:
try:
    sp = spacy.load("es_core_news_sm")
except OSError:
    !python3 -m spacy download es_core_news_sm
    sp = spacy.load("es_core_news_sm")

### Load texts

In [23]:
dataset_path = "data/balcones_2020.csv"
dataset = pd.read_csv(dataset_path)
texts = dataset["text"]

### Remove spam

In [24]:
spam_texts = [
    "El Magazin del Balcón Segoviano",
    "Viva María Auxiliadora",
    "Italianos cantan 'Bella Ciao' en sus balcones por los 75 años de la caída del fascismo",
]
texts = drop_spam_rows(text_series=texts, spam_messages=spam_texts)
texts = texts[texts.duplicated() == False]

### Preprocessing

In [25]:
PUNCTUATION_MARKS = string.punctuation + "¿" + "¡" + "..." + "…" + " "
STOP_WORDS = nltk.corpus.stopwords.words("spanish")
UNDESIRED_WORDS = [
    "balcón",
    "balcones",
    "balcon",
    "si",
    "haber",
    "ser",
    "quedateencasa",
    "yomequedoencasa",
    # "toca"
]
UNDESIRED_PREFIXES = ["@", "#", "http", "jaj", "xd", "xD", "XD"]


def tokenize(text):
    """Get tokenized text."""
    return " ".join(
        [
            token.lemma_ for token in sp(text)
            if token.text not in PUNCTUATION_MARKS
            and token.text not in STOP_WORDS + UNDESIRED_WORDS
        ]
    )

def preprocess_text(text):
    """Apply transformations to text."""
    text = text.lower()
    for prefix in UNDESIRED_PREFIXES:
            text = remove_prefixed_words(prefix, text)
    text = remove_special_characters(text)
    text = remove_single_characters(text)
    text = remove_digits(text)
    text = contract_spaces(text)
    return tokenize(text)

In [26]:
texts_preprocessed = texts.apply(lambda x: preprocess_text(x))

### TF-IDF Vectorizer

In [66]:
tf = TfidfVectorizer(
    stop_words=(STOP_WORDS),
    min_df=3,
    max_df=0.85,
    ngram_range=(1, 3)
)
X = tf.fit_transform(texts_preprocessed)

In [67]:
feature_names = tf.get_feature_names_out()
words = np.array(tf.get_feature_names_out())
matrix = pd.DataFrame(X.toarray(), columns=feature_names)

### NMF

In [77]:
model = NMF(n_components=20, random_state=42)
nmf_output = model.fit_transform(X)

In [78]:
components_df = pd.DataFrame(model.components_, columns=feature_names)

In [79]:
components_df.head()

Unnamed: 0,abajo,abascal,abeja,abierto,abrazar,abrazarno,abrazo,abrazo enorme,abrigo,abril,...,único dar,único hacer,único momento,único poder,único sitio,único vecino,útil,útil habitación,útil habitación baño,útil habitación baños
0,0.001748,0.0,0.001892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.008999,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.042629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.007739,0.0,0.005213,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000939,0.0,0.0,0.0,0.0,0.0,...,0.00301,0.0,0.00013,0.0,0.0,0.0,0.006086,0.0,0.0,0.0
3,0.0,0.0,0.0,0.019478,0.005007,0.000135,0.028219,0.007088,0.0,0.053762,...,0.000899,0.0,0.003944,0.0,0.000756,0.0,0.0,0.001281,0.000784,0.000584
4,0.005713,0.0,0.001285,0.008616,0.000682,0.001852,0.020038,0.002942,0.002132,0.0,...,0.0,0.014779,0.0,0.004799,0.0,0.0,0.0,0.0,0.0,0.0


### Topics

In [80]:
for topic in range(components_df.shape[0]):
    tmp = components_df.iloc[topic]
    print(f'For topic {topic} the words with the highest value are:')
    print(tmp.nlargest(10))
    print('\n')

For topic 0 the words with the highest value are:
salir             4.449236
salir aplaudir    0.692522
poder salir       0.357594
ir salir          0.348252
salir cantar      0.242864
hoy salir         0.191526
hora salir        0.182254
gente salir       0.165179
salir calle       0.164909
salir día         0.140047
Name: 0, dtype: float64


For topic 1 the words with the highest value are:
tirar            3.200566
ir tirar         0.198088
querer tirar     0.176858
querer           0.139755
poder tirar      0.137848
dos              0.093807
cuarentena       0.090272
gana tirar       0.085498
tirar petardo    0.082287
petardo          0.078579
Name: 1, dtype: float64


For topic 2 the words with the highest value are:
aplaudir              2.745884
salir aplaudir        0.932866
luego                 0.376544
sanitario             0.310074
sanidad               0.192477
aplaudir sanitario    0.183828
seguro                0.159897
público               0.142370
gente aplaudir      