## Imports

In [62]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
import re
import tensorflow_addons as tfa
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

## Load Data

In [63]:
# Directory prefix of the input CSV (concatenated with 'public_test.csv' when loading).
PATH = 'data/'
# Directory prefix of the saved "Odio" model file.
PATH_ODIO = 'Odio/'
# Directory prefix of the saved "Categorias" model file.
PATH_CATEGORIAS = 'Categorias/'
# Vocabulary cap handed to the Keras Tokenizer as num_words.
N_MAX_WORDS = 5000
# NOTE(review): not referenced by any cell visible in this part of the notebook —
# confirm whether it is still needed.
MAX_TEXT_LENGTH = 25

def clean_text(text):
  """Lower-case ``text`` and reduce it to ASCII letters, digits and spaces.

  Three regex substitutions run in order, each followed by ``strip()``:
    1. mentions, URLs and every run of characters outside ``A-Za-z0-9``
       are replaced by one space;
    2. any remaining character that is neither a word character nor
       whitespace is replaced by a space;
    3. a single word character with whitespace on both sides is replaced,
       together with that whitespace, by one space.

  Args:
    text: Value to clean. It is converted with ``str()`` first, so a
      non-string value (e.g. ``None``) is cleaned as its string form.

  Returns:
    The cleaned, lower-cased, stripped string. Consecutive spaces may remain
    where adjacent matches were each replaced by a space.
  """
  # Raw strings keep the regex escapes (\S, \w, \s) away from Python's own
  # string-escape processing, which reports them as invalid escape sequences.
  # The patterns themselves are unchanged, so the output is identical.
  #
  # NOTE(review): alternatives are tried left to right at each position, so an
  # '@' that follows whitespace is consumed by the final character class and the
  # handle text itself survives; only a mention at the start of the string or
  # directly after an alphanumeric character is removed entirely.
  # NOTE(review): the final character class also drops accented letters and
  # 'ñ'. Both behaviours are preserved here because previously saved models
  # presumably depend on this exact preprocessing — confirm before changing.
  TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
  TEXT_CLEANING_RE_EXTRA = r"[^\w\s]"
  TEXT_CLEAN_CHAR = r"(\s\w\s)"
  text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
  # After the first pass the value is already a lower-case str, so the
  # str()/lower() round trip is not repeated.
  text = re.sub(TEXT_CLEANING_RE_EXTRA, ' ', text).strip()
  text = re.sub(TEXT_CLEAN_CHAR, ' ', text).strip()
  return text

In [64]:
# Read the public test set from PATH.
data_test = pd.read_csv(PATH+'public_test.csv')
# Tweet texts as an array; this is the input for both the TF-IDF and the BERT pipelines.
tweets = data_test['text'].values

### Pre-processing - Odio

In [65]:
# Spanish stop-word list from NLTK's stopwords corpus.
# NOTE(review): no nltk.download(...) call is visible in this section, so the corpus
# presumably has to be installed beforehand — confirm.
stop_words = stopwords.words("spanish")
# Spanish Snowball stemmer, used by preprocess() when stemming is enabled.
stemmer = SnowballStemmer("spanish")

In [66]:
def preprocess(text,cleaning=True,stopwords=True,stemming=True):
    """Normalise one text and return its surviving tokens joined by single spaces.

    Args:
        text: Text to process. With ``cleaning`` on it is passed to
            ``clean_text``; otherwise it must already provide ``.split()``.
        cleaning: When true, run ``clean_text`` on ``text`` first.
        stopwords: When true, drop every token found in the module-level
            ``stop_words`` collection.
        stemming: When true, replace each kept token by
            ``stemmer.stem(token)``.

    Returns:
        The kept (and possibly stemmed) tokens joined with ``" "``.
    """
    if cleaning:
        text = clean_text(text)

    # Lazily filter the whitespace-separated tokens; every token is kept when
    # stop-word filtering is switched off.
    kept = (
        word for word in text.split()
        if not stopwords or word not in stop_words
    )
    if stemming:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

In [67]:
# Clean each tweet and drop stop words; stemming is switched off for this pipeline.
cleaned = [preprocess(tweet, stemming=False) for tweet in tweets]
# Keras tokenizer whose vocabulary is capped through num_words=N_MAX_WORDS.
TF_tokenizer = Tokenizer(num_words=N_MAX_WORDS)
# NOTE(review): the vocabulary and document frequencies are fitted on the *test* tweets
# here. If the saved model was trained with a tokenizer fitted on the training corpus,
# the word-to-column mapping and IDF weights produced below will not match what the
# model saw during training — confirm, and load the training-time tokenizer if so.
TF_tokenizer.fit_on_texts(cleaned)
# One TF-IDF row per cleaned tweet.
test_vectors = TF_tokenizer.texts_to_matrix(cleaned, mode='tfidf')

### Pre-processing - Categorias

In [68]:
import tensorflow_hub as hub
from bert.tokenization.bert_tokenization import FullTokenizer

In [69]:
# TF Hub handle of the BERT module; the URL names the English uncased
# L-12 / H-768 / A-12 checkpoint.
# NOTE(review): the stop-word list and stemmer in this notebook are configured for
# Spanish while this checkpoint is the English one — confirm this is intended.
bert_path = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1'
# NOTE(review): in the visible cells this layer is only used to obtain the vocabulary
# file and casing flag; trainable=True looks unnecessary for that — confirm.
bert_layer = hub.KerasLayer(bert_path, trainable=True)

# Read the vocabulary file path and the lower-casing flag stored in the loaded module,
# then build the tokenizer from them so it agrees with the checkpoint.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
bert_tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [70]:
def prepare_input_for_bert(texts, bert_tokenizer, max_seq_len):
  """Encode texts as the three fixed-length integer inputs of a BERT model.

  Each text is tokenized, cut to at most ``max_seq_len - 2`` tokens, wrapped
  in ``[CLS]`` / ``[SEP]``, converted to ids and right-padded with zeros up to
  ``max_seq_len``.

  Args:
    texts: Iterable of texts accepted by ``bert_tokenizer.tokenize``.
    bert_tokenizer: Object providing ``tokenize(text)`` and
      ``convert_tokens_to_ids(tokens)``.
    max_seq_len: Length of every encoded row, the two marker tokens included.

  Returns:
    A list ``[input_ids, input_mask, input_type]`` of int32 tensors with one
    row per text:
      * input_ids  - token ids followed by zero padding;
      * input_mask - 1 for every real token (markers included), 0 for padding;
      * input_type - 1 at the ``[SEP]`` position only, 0 everywhere else.
  """
  id_rows, mask_rows, type_rows = [], [], []
  for text in texts:
    # Two positions are reserved for the [CLS] and [SEP] markers.
    pieces = ["[CLS]"] + bert_tokenizer.tokenize(text)[:max_seq_len - 2] + ["[SEP]"]
    ids = bert_tokenizer.convert_tokens_to_ids(pieces)
    used = len(ids)
    padding = [0] * (max_seq_len - used)
    id_rows.append(np.array(ids + padding))
    mask_rows.append(np.array([1] * used + padding))
    # NOTE(review): only the [SEP] slot gets segment id 1; kept as-is because
    # the saved model presumably expects exactly this encoding — confirm.
    type_rows.append(np.array([0] * (used - 1) + [1] + padding))
  return [tf.cast(np.array(rows), tf.int32) for rows in (id_rows, mask_rows, type_rows)]

In [71]:
# Encode the raw (uncleaned) tweets for the BERT model with a fixed sequence length of 50.
# NOTE(review): 50 is a bare literal and differs from MAX_TEXT_LENGTH (25) — confirm it
# matches the sequence length the saved model was trained with.
bert_input_test = prepare_input_for_bert(tweets, bert_tokenizer, 50)

# Predictions - Odio

In [72]:
# Metric objects for the single-output "Odio" model: binary accuracy and macro F1.
# NOTE(review): this list is not referenced by any cell visible in this section —
# confirm whether it is still needed for inference.
METRICS = [
      tf.metrics.BinaryAccuracy(name='accuracy'),
      tfa.metrics.F1Score(name='F1',average='macro', num_classes=1)
]

### Load Model

In [73]:
# Load the saved Keras model (HDF5 file) for the "Odio" task from PATH_ODIO.
model_odio = tf.keras.models.load_model(PATH_ODIO+'Modelo_denso.h5')

### Predictions

In [74]:
# Scores returned by the "Odio" model for every TF-IDF row.
y_pred = model_odio.predict(test_vectors)
# Binarise at 0.5 in one vectorised step instead of a Python loop over rows:
# scores strictly above 0.5 become 1, everything else 0. The array keeps the
# dtype of the model output, so the values stay floating point (0.0 / 1.0).
y_pred = (y_pred > 0.5).astype(y_pred.dtype)



In [89]:
# Wrap the 0/1 predictions in a single-column DataFrame. Note that this rebinds
# y_pred from the model's array output to a DataFrame.
y_pred = pd.DataFrame(y_pred, columns=['Odio'])
# Attach the tweet ids; this relies on the prediction rows being in the same order
# (and on the same default index) as data_test.
y_pred['tweet_id'] = data_test['tweet_id']

In [90]:
# Put tweet_id first so the exported file starts with the identifier column.
y_pred_csv = y_pred[['tweet_id', 'Odio']]

# Predictions - Categorias

In [77]:
# Metric objects for the four-output "Categorias" model: binary accuracy and macro F1.
# NOTE(review): this list is not referenced by any cell visible in this section (the
# model is loaded with compile=False) — confirm whether it is still needed.
METRICS_CATEGORIAS = [
      tf.metrics.BinaryAccuracy(name='accuracy'),
      tfa.metrics.F1Score(name='F1',average='macro', num_classes=4)
]

### Load model

In [85]:
# Load the saved BERT-based Keras model for the "Categorias" task.
# custom_objects maps the serialized "KerasLayer" name to hub.KerasLayer so the TF Hub
# layer can be rebuilt; compile=False skips restoring the training configuration
# (optimizer, loss, metrics), which is not needed for predict().
model_categorias = tf.keras.models.load_model(PATH_CATEGORIAS+'BERT_based_model_F1-5930.h5', custom_objects={"KerasLayer": hub.KerasLayer}, compile=False)

### Predictions

In [86]:
# Scores returned by the "Categorias" model, one column per category.
y_pred_categorias = model_categorias.predict(bert_input_test)
# Binarise every score at 0.5 in one vectorised step instead of nested Python
# loops with a hard-coded column count: scores strictly above 0.5 become 1,
# everything else 0. The array keeps the dtype of the model output.
y_pred_categorias = (y_pred_categorias > 0.5).astype(y_pred_categorias.dtype)



In [87]:
# Label the four prediction columns with their category names. Note that this rebinds
# y_pred_categorias from the model's array output to a DataFrame.
# NOTE(review): the column order is assumed to match the model's output order — confirm.
y_pred_categorias = pd.DataFrame(y_pred_categorias, columns=['Mujeres', 'Comunidad LGBTQ+', 'Comunidades Migrantes', 'Pueblos Originarios'])

In [91]:
# Place the category columns next to tweet_id / Odio; axis=1 aligns rows by index, so
# both frames must share the same row index.
y_pred_csv = pd.concat([y_pred_csv, y_pred_categorias], axis=1)
# Write the combined predictions without the index column.
y_pred_csv.to_csv('predicciones.csv', index=False)
# Display the result as the cell output.
y_pred_csv

Unnamed: 0,tweet_id,Odio,Mujeres,Comunidad LGBTQ+,Comunidades Migrantes,Pueblos Originarios
0,1533854540763742209,0.0,0.0,0.0,0.0,0.0
1,1277756504519725057,0.0,0.0,0.0,0.0,0.0
2,1529500412402757632,0.0,0.0,0.0,0.0,0.0
3,1167425893066838016,1.0,0.0,0.0,0.0,1.0
4,1399515878727749632,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...
2286,1469006079782645762,0.0,0.0,0.0,1.0,0.0
2287,1528569883868508161,0.0,0.0,0.0,0.0,0.0
2288,1502776153001455616,1.0,0.0,0.0,0.0,0.0
2289,1540938860363907073,1.0,0.0,1.0,0.0,0.0
