In [1]:
import collections
import re
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

import spacy
import nltk
from nltk.stem import WordNetLemmatizer

from utils import (
    drop_spam_rows,
    remove_digits,
    remove_prefixed_words,
    contract_spaces,
    remove_word,
    remove_single_characters,
    remove_special_characters,
)



### Load spaCy Spanish trained pipeline.

In [2]:
try:
    sp = spacy.load("es_core_news_sm")
except OSError:
    !python3 -m spacy download es_core_news_sm
    sp = spacy.load("es_core_news_sm")

In [3]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to /home/robert/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/robert/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/robert/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Load texts

In [4]:
dataset_path = "data/balcones_2020.csv"
dataset = pd.read_csv(dataset_path)
texts = dataset["text"]

### Remove spam

In [5]:
spam_texts = ["El Magazin del Balcón Segoviano", "Viva María Auxiliadora"]
texts = drop_spam_rows(text_series=texts, spam_messages=spam_texts)

### Preprocessing

In [23]:
PUNCTUATION_MARKS = string.punctuation + "¿¡" + "..." + "…" + " "
STOP_WORDS = nltk.corpus.stopwords.words("spanish")
UNDESIRED_WORDS = [
    "balcón",
    "balcones",
    "balcon",
    "si",
    "haber",
    "ser",
    "quedateencasa",
    "yomequedoencasa",
    "parir"
]
UNDESIRED_PREFIXES = ["@", "#", "http", "jaj", "xd", "xD", "XD"]


def tokenize(text):
    return " ".join(
        [
            token.lemma_ for token in sp(text)
            if token.text not in PUNCTUATION_MARKS
            and token.text not in STOP_WORDS + UNDESIRED_WORDS
        ]
    )

def preprocess_text(text):

    text = text.lower()
    # Regex filters.
    for prefix in UNDESIRED_PREFIXES:
            text = remove_prefixed_words(prefix, text)
    text = remove_special_characters(text)
    text = remove_single_characters(text)
    text = remove_digits(text)
    text = contract_spaces(text)
    return tokenize(text)

In [24]:
texts_preprocessed = texts.apply(lambda x: preprocess_text(x))

### TF-IDF Vectorizer

In [26]:
tf = TfidfVectorizer(
    stop_words=(STOP_WORDS),
    min_df=3,
    max_df=0.85,
    ngram_range=(1, 2)
)
X = tf.fit_transform(texts_preprocessed)

In [30]:
feature_names = tf.get_feature_names_out()
words = np.array(tf.get_feature_names_out())
matrix = pd.DataFrame(X.toarray(), columns = feature_names)

### NMF

In [49]:
model = NMF(n_components=15, random_state=10)
nmf_output = model.fit_transform(X)

In [50]:
components_df = pd.DataFrame(model.components_, columns=feature_names)

In [51]:
components_df

Unnamed: 0,abajo,abascal,abeja,abierto,abrazar,abrazarno,abrazo,abrazo enorme,abrigo,abril,...,único,único ciudadano,único dar,único hacer,único momento,único poder,único sitio,único vecino,útil,útil habitación
0,0.011728,0.0,0.001336,0.0,0.003365,0.0,0.0,0.0,0.003362,0.0,...,0.040637,0.001673,0.0,0.0,0.008274,0.0,0.003038,0.000367,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000431,0.0,0.0,0.0,0.0,0.000957,...,0.006336,0.003297,0.002495,0.0,0.000205,0.0,0.0,0.0,0.004485,0.0
2,0.001088,0.0,0.0,0.000235,0.0,0.0,0.0,0.0,0.0,0.008966,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000204,0.0,0.0
3,0.03693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001602,0.0,0.0,0.006311,0.0,0.005141,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.017506,0.005191,0.000642,0.026576,0.006067,9.3e-05,0.053637,...,0.034915,0.0,0.001166,0.0,0.003433,0.0,0.000502,0.0,0.000878,0.001814
5,0.010059,0.0,0.001377,0.01028,0.002093,0.001901,0.025715,0.003151,0.003323,0.003744,...,0.018648,0.0,0.0,0.015921,0.0,0.007823,0.0,0.0,0.0,0.0
6,0.010673,0.003317,0.0,0.0,0.0,0.0,0.0,0.0,0.008766,0.0,...,0.005478,0.0,0.0,0.0,0.000108,0.0,0.0,0.010861,0.0,0.0
7,0.010589,0.007169,0.0,0.017317,0.015327,0.0,0.006277,0.0,0.0,0.0,...,0.033066,0.0,0.001921,0.0,0.000738,0.0,0.003339,0.006436,0.0,0.0
8,0.019568,0.0,0.0,0.000769,0.001403,0.00011,0.015017,0.0,0.0,0.020067,...,0.013603,0.0,0.0,0.0,0.0,0.003928,0.003319,0.0,0.000962,0.000237
9,0.001605,0.0,0.0,0.003581,0.012597,0.000391,0.017649,0.001861,0.0,0.020795,...,0.0,0.000703,0.0,0.0,0.0,0.000126,0.0,0.0,0.0,0.0


### Topics

In [52]:
for topic in range(components_df.shape[0]):
    tmp = components_df.iloc[topic]
    print(f'For topic {topic} the words with the highest value are:')
    print(tmp.nlargest(10))
    print('\n')

For topic 0 the words with the highest value are:
salir             4.504302
salir aplaudir    0.691029
poder salir       0.652369
poder             0.614710
ir salir          0.342918
salir cantar      0.277432
cantar            0.230656
salir calle       0.206934
hora salir        0.202030
hora              0.194266
Name: 0, dtype: float64


For topic 1 the words with the highest value are:
aplaudir              2.057521
salir aplaudir        0.770545
luego                 0.291644
sanitario             0.238871
sanidad               0.158732
aplaudir sanitario    0.145544
seguro                0.127300
público               0.118316
sanidad público       0.097566
salgo                 0.093803
Name: 1, dtype: float64


For topic 2 the words with the highest value are:
cantar             0.844968
bella              0.714873
bella ciao         0.714873
ciao               0.713700
cantar bella       0.703115
italiano cantar    0.699438
fascismo           0.698886
italiano           0.6

In [46]:
# column names
topicnames = ['Topic_' + str(i) for i in range(model.n_components)]

# index names
tweets = ['tweet_' + str(i) for i in range(len(texts))]

In [48]:
# create a dataframe
df_tweet_topic = pd.DataFrame(np.round(nmf_output, 3), columns=topicnames)

# dominant topic
df_tweet_topic['dominant_topic'] = np.argmax(df_tweet_topic.values, axis=1)
df_tweet_topic.head()

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,dominant_topic
0,0.001,0.0,0.0,0.0,0.0,0.099,0.044,0.015,0.0,0.0,5
1,0.0,0.021,0.0,0.0,0.0,0.035,0.0,0.138,0.0,0.0,7
2,0.002,0.0,0.0,0.071,0.003,0.004,0.002,0.018,0.009,0.001,3
3,0.002,0.008,0.0,0.0,0.008,0.004,0.001,0.036,0.001,0.001,7
4,0.003,0.004,0.0,0.006,0.005,0.004,0.008,0.015,0.003,0.005,7
