In [28]:
import os
import pandas as pd
import nltk
from enum import Enum
import re
import spacy
from spacy.tokens import Token
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import numpy as np

## Loading the data

In [29]:
TRAIN_PATH = os.path.join(os.getcwd(), 'data', 'train.csv')

Let's rename the columns to make them easier to work with.

In [30]:
data_df = pd.read_csv(TRAIN_PATH, index_col='index')
data_df = data_df.rename(columns={
    'tweet': 'text',
    'prejudice_woman': 'woman',
    'prejudice_lgbtiq': 'lgbtiq',
    'prejudice_inmigrant_race': 'race',
    'mean_prejudice': 'mean',
})

data_df = data_df.sort_index()

display(data_df.head(5))
display(data_df.describe())


Unnamed: 0_level_0,text,humor,woman,lgbtiq,race,gordofobia,mean
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9,MENTION Ya estará colocada en algún chiringuit...,0,1,0,0,0,3.4
22,"Sin querer, hoy le dije “Hola” a una feminista...",1,1,0,0,0,3.8
30,"En este capítulo, que se transmitió en el Cana...",0,0,1,0,0,2.2
40,MENTION MENTION MENTION Concuerdo contigo en e...,0,0,1,0,0,3.8
45,– ¡No es no! ¡En mi cuerpo mando yo!\r\n– ¡Así...,1,1,0,0,0,2.2


Unnamed: 0,humor,woman,lgbtiq,race,gordofobia,mean
count,2671.0,2671.0,2671.0,2671.0,2671.0,2671.0
mean,0.325346,0.483714,0.227256,0.248596,0.08012,3.053126
std,0.468592,0.499828,0.419138,0.43228,0.271529,0.809173
min,0.0,0.0,0.0,0.0,0.0,0.4
25%,0.0,0.0,0.0,0.0,0.0,2.4
50%,0.0,0.0,0.0,0.0,0.0,3.0
75%,1.0,1.0,0.0,0.0,0.0,3.6
max,1.0,1.0,1.0,1.0,1.0,5.0


Let's load the spacy model

In [31]:
SPACY_MODEL = "es_core_news_sm"
# TODO: Try other spanish spacy models: es_core_news_md, es_core_news_lg, es_dep_news_trf

try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)


By using the default pipeline of the spacy model, we can get tokens with information about their part of speech, lemma, whether they are a stop word, etc.

In [32]:
# Tokenize text
parsed_df = data_df.copy()
parsed_df['tokens'] = list(nlp.pipe(parsed_df['text']))
parsed_df['tokens'] = parsed_df['tokens'].apply(lambda x: [token for token in x])
display(parsed_df.head(5))


Unnamed: 0_level_0,text,humor,woman,lgbtiq,race,gordofobia,mean,tokens
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9,MENTION Ya estará colocada en algún chiringuit...,0,1,0,0,0,3.4,"[MENTION, Ya, estará, colocada, en, algún, chi..."
22,"Sin querer, hoy le dije “Hola” a una feminista...",1,1,0,0,0,3.8,"[Sin, querer, ,, hoy, le, dije, “, Hola, ”, a,..."
30,"En este capítulo, que se transmitió en el Cana...",0,0,1,0,0,2.2,"[En, este, capítulo, ,, que, se, transmitió, e..."
40,MENTION MENTION MENTION Concuerdo contigo en e...,0,0,1,0,0,3.8,"[MENTION, MENTION, MENTION, Concuerdo, contigo..."
45,– ¡No es no! ¡En mi cuerpo mando yo!\r\n– ¡Así...,1,1,0,0,0,2.2,"[–, ¡, No, es, no, !, ¡, En, mi, cuerpo, mando..."


In [33]:
# Show some data about a token
token: Token = parsed_df['tokens'].iloc[0][5]
print("Text: ", token.text)
print("Lemma: ", token.lemma_)
print("POS: ", token.pos_)
print("Tag: ", token.tag_)
print("Dep: ", token.dep_)
print("Shape: ", token.shape_)
print("Is alpha: ", token.is_alpha)
print("Is stop: ", token.is_stop)

Text:  algún
Lemma:  alguno
POS:  DET
Tag:  DET
Dep:  det
Shape:  xxxx
Is alpha:  True
Is stop:  True


## Token filtering utility
Let's create utility functions to filter the tokens based on their attributes. We can easily use these functions to filter the tokens, build and experiment with different representations of the data in the next steps of the project.

In [34]:
EMOJI_PATTERN = re.compile(
    "["
    + "\U0001F600-\U0001F64F"
    + "\U0001F300-\U0001F5FF"
    + "\U0001F680-\U0001F6FF"
    + "\U0001F1E0-\U0001F1FF"
    + "\U00002500-\U00002BEF"
    + "\U00002702-\U000027B0"
    + "\U00002702-\U000027B0"
    + "\U000024C2-\U0001F251"
    + "\U0001f926-\U0001f937"
    + "\U00010000-\U0010ffff"
    + "\u2640-\u2642"
    + "\u2600-\u2B55"
    + "\u200d"
    + "\u23cf"
    + "\u23e9"
    + "\u231a"
    + "\ufe0f"
    + "\u3030"
    + "]+",
    flags=re.UNICODE,
)

try:
    STOPWORDS = nltk.corpus.stopwords.words("spanish")
except LookupError:
    nltk.download("stopwords")
    STOPWORDS = nltk.corpus.stopwords.words("spanish")

In [35]:
FILTER_FUNC = {
    "punct": lambda token: token.is_punct,
    "stopwords": lambda token: token.text.lower() in STOPWORDS,
    "emoji": lambda token: EMOJI_PATTERN.match(token.text),
    "number": lambda token: token.like_num,
    "newline": lambda token: re.match(r"\n+", token.text),
    "space": lambda token: token.is_space and not re.match(r"\n+", token.text),
    "tags": lambda token: token.text in ["MENTION", "HASHTAG", "URL"]
}

class Filter(Enum):
    PUNCT = "punct"
    STOPWORDS = "stopwords"
    EMOJI = "emoji"
    NUMBER = "number"
    NEWLINE = "newline"
    SPACE = "space"
    TAGS = "tags"


def filter_tokens(series: pd.Series, filters: list[Filter]) -> pd.Series:
    '''
    Filters a series of tokens using the given filters.
    args:
        series: a series of lists of tokens
        filters: a list of filters to apply
    returns:
        a series of lists of tokens
    '''
    return series.apply(
        lambda tokens: [token for token in tokens if not any([FILTER_FUNC[filter.value](token) for filter in filters])]
    )

In [36]:
def test_filter_tokens(result, expected):
    error = False
    for token, expected_token in zip(result, expected):
        try:
            assert token.text == expected_token
        except AssertionError:
            print(f"Expected: '{expected_token}', got: '{token.text}'")
            error = True

    if not error:
        print("Test passed!")

# test Filter.PUNCT
result = filter_tokens(parsed_df['tokens'], [Filter.PUNCT]).loc[22]
expected = ['Sin', 'querer', 'hoy', 'le', 'dije', 'Hola', 'a', 'una', 'feminista', 'El', 'juicio', 'es', 'mañana', '\n\n', 'HASHTAG', '   ', 'feminismo', '  ']
test_filter_tokens(result, expected)

# test Filter.STOPWORDS
result = filter_tokens(parsed_df['tokens'], [Filter.STOPWORDS]).loc[9]
expected = ['MENTION', 'colocada', 'algún', 'chiringuito', 'feminazi']
test_filter_tokens(result, expected)

# test Filter.EMOJI
result = filter_tokens(parsed_df['tokens'], [Filter.EMOJI]).loc[75]
expected = ['Demasiadas', 'mujeres', ',', 'demasiadas', 'mujeres', '\n', 'URL']
test_filter_tokens(result, expected)

# test Filter.NUMBER
result = filter_tokens(parsed_df['tokens'], [Filter.NUMBER]).loc[5245]
expected = ['Acá', 'te', 'lo', 'aclaro', ':', '\n', 'de', 'octubre', ':', 'Día', 'de', 'brujas', '(', 'Mujeres', ')', '\n', 'de', 'noviembre', ':', 'Día', 'de', 'todos', 'los', 'santos', '(', 'Hombres', ')', '\n', 'noviembre', ':', 'Día', 'de', 'los', 'difuntos', '(', 'Hombres', 'que', 'se', 'animaron', 'a', 'decirle', 'brujas', 'a', 'las', 'mujeres', ')']
test_filter_tokens(result, expected)

# test Filter.NEWLINE
result = filter_tokens(parsed_df['tokens'], [Filter.NEWLINE]).loc[22]
expected = ['Sin', 'querer', ',', 'hoy', 'le', 'dije', '“', 'Hola', '”', 'a', 'una', 'feminista', '.', 'El', 'juicio', 'es', 'mañana', '.', 'HASHTAG', '   ', 'feminismo', '  ', '#']
test_filter_tokens(result, expected)

# test Filter.SPACE
result = filter_tokens(parsed_df['tokens'], [Filter.SPACE]).loc[22]
expected = ['Sin', 'querer', ',', 'hoy', 'le', 'dije', '“', 'Hola', '”', 'a', 'una', 'feminista', '.', 'El', 'juicio', 'es', 'mañana', '.', '\n\n', 'HASHTAG', 'feminismo', '#']

# test Filter.TAGS
result = filter_tokens(parsed_df['tokens'], [Filter.TAGS]).loc[14869]
# FIXME: Notice that when there is no space between `-` and the next word, both are recognized as a single token
expected = ['  ', '  ', '  \n', '-Te', 'cuento', 'un', 'chiste', 'machista', '?', '\n', '-Pero', 'que', 'dices', ',', 'si', 'soy', 'mujer', '.', '\n', '-Tranquila', ',', 'te', 'lo', 'explico', '.']
test_filter_tokens(result, expected)


# test all
result = filter_tokens(parsed_df['tokens'], [Filter.PUNCT, Filter.STOPWORDS, Filter.EMOJI, Filter.NUMBER, Filter.NEWLINE, Filter.SPACE]).loc[5245]
expected = ['Acá', 'aclaro', 'octubre', 'Día', 'brujas', 'Mujeres', 'noviembre', 'Día', 'santos', 'Hombres', 'noviembre', 'Día', 'difuntos', 'Hombres', 'animaron', 'decirle', 'brujas', 'mujeres']
test_filter_tokens(result, expected)

Expected: '

', got: '

'
Test passed!
Expected: '
', got: '
'
Expected: '
', got: '
'
Expected: '
', got: '
'
Expected: '
', got: '
'
Expected: 'HASHTAG', got: '

'
Expected: '   ', got: 'HASHTAG'
Expected: 'feminismo', got: '   '
Expected: '  ', got: 'feminismo'
Expected: '#', got: '  '
Expected: '  
', got: '  
'
Expected: '
', got: '
'
Expected: '
', got: '
'
Test passed!


## TF-IDF
TODO: Try different token filtering options for TF-IDF

In [37]:
# Remove stopwords, punctuations, emojis, numbers, newlines and spaces
tf_idf_df = parsed_df.copy()
tf_idf_df['processed'] = filter_tokens(tf_idf_df['tokens'], [Filter.PUNCT, Filter.STOPWORDS, Filter.EMOJI, Filter.NUMBER, Filter.NEWLINE, Filter.SPACE])

# Use lemmas instead of tokens
tf_idf_df['processed'] = tf_idf_df['processed'].apply(lambda tokens: [token.lemma_ for token in tokens])

# Concat all tokens into a single string
# This is needed for the TF-IDF vectorizer
tf_idf_df['processed'] = tf_idf_df['processed'].apply(lambda tokens: ' '.join(tokens))

# lowercase
tf_idf_df['processed'] = tf_idf_df['processed'].apply(lambda tokens: tokens.lower())

# TF-IDF vectorizer
tf_idf_vectorizer = TfidfVectorizer()
tf_idf_df["tf-idf"] = tf_idf_vectorizer.fit_transform(tf_idf_df['processed']).toarray().tolist()

display(tf_idf_df.head())

Unnamed: 0_level_0,text,humor,woman,lgbtiq,race,gordofobia,mean,tokens,processed,tf-idf
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
9,MENTION Ya estará colocada en algún chiringuit...,0,1,0,0,0,3.4,"[MENTION, Ya, estará, colocada, en, algún, chi...",mention colocado alguno chiringuito feminazi,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
22,"Sin querer, hoy le dije “Hola” a una feminista...",1,1,0,0,0,3.8,"[Sin, querer, ,, hoy, le, dije, “, Hola, ”, a,...",querer hoy decir hola feminista juicio mañana ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
30,"En este capítulo, que se transmitió en el Cana...",0,0,1,0,0,2.2,"[En, este, capítulo, ,, que, se, transmitió, e...",capítulo transmitir canal televisión británico...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
40,MENTION MENTION MENTION Concuerdo contigo en e...,0,0,1,0,0,3.8,"[MENTION, MENTION, MENTION, Concuerdo, contigo...",mention mention mention concuerdo contigo supe...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
45,– ¡No es no! ¡En mi cuerpo mando yo!\r\n– ¡Así...,1,1,0,0,0,2.2,"[–, ¡, No, es, no, !, ¡, En, mi, cuerpo, mando...",cuerpo mando así hablar empoderada poder abort...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [38]:
print(f"Length of tf-idf vector: {len(tf_idf_df['tf-idf'].iloc[0])}")

Length of tf-idf vector: 7875


## Word embeddings
### FastText Spanish Unannotated Corpora
Pre-trained word embeddings were downloaded from [dccuchile/spanish-word-embeddings](https://github.com/dccuchile/spanish-word-embeddings#fasttext-embeddings-from-suc).

According to [josecannete/spanish-corpora](https://github.com/josecannete/spanish-corpora) the corpus on which the FastText embeddings were trained was processed in the following way:

> - Lowercase
> - Removed urls
> - Removed listing
> - Replaced multiple spaces with single one

so in order to get the best results we will need to do the same. Urls in the training set are replaced with the string `URL` but we will remove them anyway. We will do the same for hashtags and mentions.

In [39]:
fast_text_suc_df = parsed_df.copy()
fast_text_suc_df['processed'] = filter_tokens(fast_text_suc_df['tokens'], [Filter.SPACE, Filter.TAGS, Filter.NEWLINE])
fast_text_suc_df['processed'] = fast_text_suc_df['processed'].apply(lambda tokens: [token.text for token in tokens])

fast_text_suc_df.head()

Unnamed: 0_level_0,text,humor,woman,lgbtiq,race,gordofobia,mean,tokens,processed
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9,MENTION Ya estará colocada en algún chiringuit...,0,1,0,0,0,3.4,"[MENTION, Ya, estará, colocada, en, algún, chi...","[Ya, estará, colocada, en, algún, chiringuito,..."
22,"Sin querer, hoy le dije “Hola” a una feminista...",1,1,0,0,0,3.8,"[Sin, querer, ,, hoy, le, dije, “, Hola, ”, a,...","[Sin, querer, ,, hoy, le, dije, “, Hola, ”, a,..."
30,"En este capítulo, que se transmitió en el Cana...",0,0,1,0,0,2.2,"[En, este, capítulo, ,, que, se, transmitió, e...","[En, este, capítulo, ,, que, se, transmitió, e..."
40,MENTION MENTION MENTION Concuerdo contigo en e...,0,0,1,0,0,3.8,"[MENTION, MENTION, MENTION, Concuerdo, contigo...","[Concuerdo, contigo, en, eso, ,, super, repugn..."
45,– ¡No es no! ¡En mi cuerpo mando yo!\r\n– ¡Así...,1,1,0,0,0,2.2,"[–, ¡, No, es, no, !, ¡, En, mi, cuerpo, mando...","[–, ¡, No, es, no, !, ¡, En, mi, cuerpo, mando..."


In [44]:
# Load FastText model
FAST_TEXT_SUC_PATH = os.path.join(os.getcwd(), 'embeddings', 'fasttext', 'embeddings-l-model.vec')
print(FAST_TEXT_SUC_PATH)
fast_text_suc_model = KeyedVectors.load_word2vec_format(FAST_TEXT_SUC_PATH)

c:\Users\sarae\Desktop\LNR\LNR-project\embeddings\fasttext\embeddings-l-model.vec


ValueError: invalid literal for int() with base 10: 'MENTION'

In [None]:
fast_text_suc_vec_len = len(fast_text_suc_model['hola'])
print(fast_text_suc_vec_len)

NameError: name 'fast_text_suc_model' is not defined

In [None]:
# Represent each sentence as the average of its word embeddings
def get_sentence_embedding(tokens: list[str], model):
    embeddings = []
    at_least_one = False
    for token in tokens:
        try:
            embeddings.append(model[token])
            at_least_one = True
        except KeyError:
            pass

    if not at_least_one:
        return np.zeros(fast_text_suc_vec_len)
    
    return np.mean(embeddings, axis=0)
    

In [33]:
fast_text_suc_df['embedding'] = fast_text_suc_df['processed'].apply(lambda tokens: get_sentence_embedding(tokens, fast_text_suc_model))
display(fast_text_suc_df.head())

NameError: name 'fast_text_suc_model' is not defined

In [166]:
print(f"Length of FastText embedding vector: {len(fast_text_suc_df['embedding'].iloc[0])}")

Length of FastText embedding vector: 300


### TODO: Word2Vec
Pre-trained word2vec embeddings were downloaded from [aitoralmeida/spanish_word2vec](https://github.com/aitoralmeida/spanish_word2vec).

Aitor Almeida, & Aritz Bilbao. (2018). Spanish 3B words Word2Vec Embeddings (Version 1.0) [Data set]. Zenodo. http://doi.org/10.5281/zenodo.1410403
Bilbao-Jayo, A., & Almeida, A. (2018). Automatic political discourse analysis with multi-scale convolutional neural networks and contextual data. International Journal of Distributed Sensor Networks, 14(11), 1550147718811827.

In [129]:
# Load pre-trained word embeddings
word2vec_path = os.path.join(os.getcwd(), 'embeddings', 'word2vec', 'complete.kv')


### TODO: GloVe