In [1]:
import collections
import re
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

import spacy
import nltk
from nltk.stem import WordNetLemmatizer

from utils import (
    drop_spam_rows,
    remove_digits,
    remove_prefixed_words,
    contract_spaces,
    remove_single_characters,
    remove_special_characters,
)

### Load spaCy Spanish trained pipeline.

In [2]:
try:
    sp = spacy.load("es_core_news_sm")
except OSError:
    !python3 -m spacy download es_core_news_sm
    sp = spacy.load("es_core_news_sm")

### Load texts

In [3]:
dataset_path = "data/balcones_2020.csv"
dataset = pd.read_csv(dataset_path)
texts = dataset["text"]

### Remove spam

In [4]:
spam_texts = [
    "El Magazin del Balcón Segoviano",
    "Viva María Auxiliadora",
    "Italianos cantan 'Bella Ciao' en sus balcones por los 75 años de la caída del fascismo",
]
texts = drop_spam_rows(text_series=texts, spam_messages=spam_texts)
texts = texts[texts.duplicated() == False]

### Preprocessing

In [5]:
PUNCTUATION_MARKS = string.punctuation + "¿" + "¡" + "..." + "…" + " "
STOP_WORDS = nltk.corpus.stopwords.words("spanish")
UNDESIRED_WORDS = [
    "balcón",
    "balcones",
    "balcon",
    "si",
    "haber",
    "ser",
    "quedateencasa",
    "yomequedoencasa",
    # "toca"
]
UNDESIRED_PREFIXES = ["@", "#", "http", "jaj", "xd", "xD", "XD"]


def tokenize(text):
    """Get tokenized text."""
    return " ".join(
        [
            token.lemma_ for token in sp(text)
            if token.text not in PUNCTUATION_MARKS
            and token.text not in STOP_WORDS + UNDESIRED_WORDS
        ]
    )

def preprocess_text(text):
    """Apply transformations to text."""
    text = text.lower()
    for prefix in UNDESIRED_PREFIXES:
            text = remove_prefixed_words(prefix, text)
    text = remove_special_characters(text)
    text = remove_single_characters(text)
    text = remove_digits(text)
    text = contract_spaces(text)
    return tokenize(text)

In [6]:
texts_preprocessed = texts.apply(lambda x: preprocess_text(x))

### TF-IDF Vectorizer

In [7]:
tf = TfidfVectorizer(
    stop_words=(STOP_WORDS),
    min_df=3,
    max_df=0.85,
    ngram_range=(1, 3)
)
X = tf.fit_transform(texts_preprocessed)

In [8]:
feature_names = tf.get_feature_names_out()
words = np.array(tf.get_feature_names_out())
matrix = pd.DataFrame(X.toarray(), columns=feature_names)

### NMF

In [9]:
model = NMF(n_components=20, random_state=42)
nmf_output = model.fit_transform(X)



In [10]:
components_df = pd.DataFrame(model.components_, columns=feature_names)

In [11]:
components_df.head()

Unnamed: 0,abajo,abascal,abeja,abierto,abrazar,abrazarno,abrazo,abrazo enorme,abrigo,abril,...,único ciudadano español,único dar,único hacer,único momento,único poder,único sitio,único vecino,útil,útil habitación,útil habitación baño
0,0.015414,0.01877,0.0,0.032558,0.03039,0.0,0.001559,0.0,0.0,0.0,...,0.0,0.003406,0.0,0.002779,0.0,0.005598,0.01262,0.0,0.0,0.0
1,0.002357,0.0,0.000919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.004443,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.019322,0.004857,0.0,0.025627,0.006686,0.0,0.050855,...,0.0,0.001201,0.0,0.004228,0.0,0.00025,0.0,0.0,0.001353,0.001353
3,0.04499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.008579,0.0,0.006017,0.0,0.0,0.0,0.0,0.0
4,0.008066,0.0,0.001342,0.01049,0.001187,0.001779,0.02185,0.003197,0.002265,0.0,...,0.0,0.0,0.015203,9.6e-05,0.005508,0.0,0.0,0.0,0.0,0.0


### Topics

In [12]:
for topic in range(components_df.shape[0]):
    tmp = components_df.iloc[topic]
    print(f'For topic {topic} the words with the highest value are:')
    print(tmp.nlargest(10))
    print('\n')

For topic 0 the words with the highest value are:
ver            5.212463
pasar          0.309076
bien           0.266964
ver ventana    0.225645
ver gente      0.216513
asomar         0.210843
dos            0.207527
ir ver         0.205509
niño           0.204307
padre          0.204028
Name: 0, dtype: float64


For topic 1 the words with the highest value are:
salir             2.174047
salir aplaudir    0.335268
poder salir       0.173513
ir salir          0.161624
salir cantar      0.100104
hoy salir         0.090644
salir calle       0.078632
gente salir       0.077538
hora salir        0.072491
salir casa        0.065366
Name: 1, dtype: float64


For topic 2 the words with the highest value are:
día               2.246444
cada              0.527377
cada día          0.451089
buen              0.373161
buen día          0.333935
menos             0.154873
día cuarentena    0.147087
aplauso           0.146930
confinamiento     0.125069
cuarentena        0.124725
Name: 2, dtype: fl