In [162]:
import tensorflow as tf
import numpy as np



from numpy.random import seed
seed(42)

from tensorflow.random import set_seed
set_seed(42)

In [163]:
# Download Tweets
!curl -O https://raw.githubusercontent.com/carlosep93/Text-classification-with-Keras-lab/master/data/airline/tweets.txt

#Download Labels
!curl -O https://raw.githubusercontent.com/carlosep93/Text-classification-with-Keras-lab/master/data/airline/labels.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1508k  100 1508k    0     0  6365k      0 --:--:-- --:--:-- --:--:-- 6365k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  125k  100  125k    0     0   682k      0 --:--:-- --:--:-- --:--:--  682k


In [164]:
def read_file(file_name):
  raw_text = []
  with open(file_name,'rb') as f:
    return [str(line.strip()) for line in f.readlines()]

In [165]:
tweets = read_file('tweets.txt')

labels = read_file('labels.txt')

In [166]:
from sklearn.model_selection import train_test_split

tweets_train, tweets_test, labels_train, labels_test = train_test_split(tweets, 
                                                                        labels, 
                                                                        test_size=0.33, 
                                                                        random_state=42)

tweets_train, tweets_valid, labels_train, labels_valid = train_test_split(tweets_train, 
                                                                        labels_train, 
                                                                        test_size=0.33, 
                                                                        random_state=42)

In [167]:
label_dict = {l:i for i,l in enumerate(set(labels_train))}
print(label_dict)

{"b'positive'": 0, "b'neutral'": 1, "b'negative'": 2}


In [168]:
def prepare_labels(split, label_dict=label_dict):
  prep_labels = []
  for label in split:
    idx = label_dict[label]
    classes = [0]*len(label_dict)
    classes[idx] = 1
    prep_labels.append(classes)
  return np.asarray(prep_labels)

In [169]:
prep_labels_train = prepare_labels(labels_train, label_dict)
prep_labels_valid = prepare_labels(labels_valid, label_dict)
prep_labels_test = prepare_labels(labels_test, label_dict)

In [170]:
prep_labels_train[0]

array([0, 1, 0])

In [171]:
import re
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer

stop_words = list(stopwords.words('english')) #About 150 stopwords
stemmer = PorterStemmer()
tknzr = TweetTokenizer()

def clean_data(split):
  clean_split = []
  for sentence in split:
    #Remove urls 
    sentence = re.sub(r'http\S+', '', sentence)
    #Tokenize
    sentence = tknzr.tokenize(sentence)
    #Lower Casing & Remove Stop Words
    sentence = [word.lower() for word in sentence 
                  if word not in stop_words 
                  or not word.isalnum()]
    #Stemming
    sentence = ' '.join([stemmer.stem(word) for word in sentence])
    
    clean_split.append(sentence)

  return clean_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [172]:
clean_tweets_train = clean_data(tweets_train)
clean_tweets_valid = clean_data(tweets_valid)
clean_tweets_test = clean_data(tweets_test)

In [173]:
clean_tweets_train[1]

"b ' @jetblu wish everyon felt like '"

In [174]:
raw_train_ds = tf.data.Dataset.from_tensor_slices((clean_tweets_train, prep_labels_train))
raw_test_ds = tf.data.Dataset.from_tensor_slices((clean_tweets_test, prep_labels_test))
raw_valid_ds = tf.data.Dataset.from_tensor_slices((clean_tweets_valid, prep_labels_valid))

In [175]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

max_features = 2000
sequence_length = 20
UNK = '<unk>'

vectorize_layer = TextVectorization(
    max_tokens=2000,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Let's make a text-only dataset (no labels):
text_ds = raw_train_ds.map(lambda x, y: x)
# Let's call `adapt`:
vectorize_layer.adapt(text_ds)

In [176]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    tensor_label = tf.convert_to_tensor(label)
    tensor_label = tf.reshape(tensor_label,[1,n_labels])
    return vectorize_layer(text), tensor_label

In [177]:
n_labels = len(label_dict)

train_ds = raw_train_ds.map(vectorize_text)
valid_ds = raw_valid_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [178]:
for batch in test_ds:
  print(batch)
  break

(<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[  2,   9,  72, 385,   1, 157,  36,   1,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]])>, <tf.Tensor: shape=(1, 3), dtype=int64, numpy=array([[1, 0, 0]])>)


In [179]:
from tensorflow.keras import layers
from tensorflow.keras.backend import mean
from tensorflow.keras.optimizers import Adam

embedding_dim = 10



# Text input
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = mean(x,axis=1)

# We project onto a single unit output layer:
predictions = layers.Dense(n_labels, activation="softmax", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Compile the model with crossentropy loss and an adam optimizer.
model.compile(loss="categorical_crossentropy", optimizer='Adam', metrics=["accuracy"])

In [180]:
epochs = 1
batch_size = 32

# Fit the model using the train and test datasets.
history = model.fit(train_ds,
          validation_data=valid_ds, 
          epochs=epochs, 
          batch_size=batch_size)



In [181]:
# Test it with `raw_test_ds`, which yields raw strings
model.evaluate(test_ds)



[0.6408934593200684, 0.7489652037620544]

In [182]:
embedding_dim = 10
kernel_size = 5

# Text input
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Conv1D(50, kernel_size, strides=3)(x)
x = layers.Conv1D(30, kernel_size, strides=1)(x)
x = mean(x,axis=1)


#lstm = tf.keras.layers.LSTM(embedding_dim)
#x = lstm(x)

# We project onto a single unit output layer:
predictions = layers.Dense(n_labels, activation="softmax", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Compile the model with crossentropy loss and an adam optimizer.
model.compile(loss="categorical_crossentropy", optimizer='Adam', metrics=["accuracy"])

In [183]:
epochs = 1
batch_size = 32

# Fit the model using the train and test datasets.
history = model.fit(train_ds,
          validation_data=valid_ds, 
          epochs=epochs, 
          batch_size=batch_size)



In [184]:
# Test it with `raw_test_ds`, which yields raw strings
model.evaluate(test_ds)




[0.5374395847320557, 0.7884933948516846]

In [188]:
embedding_dim = 10

# Text input
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Conv1D(50, 7, strides=3)(x)
x = layers.LSTM(10)(x)

# We project onto a single unit output layer:
predictions = layers.Dense(n_labels, activation="softmax", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Compile the model with crossentropy loss and an adam optimizer.

model.compile(loss="categorical_crossentropy", optimizer='Adam', metrics=["accuracy"])

In [189]:
epochs = 2
batch_size = 32

# Fit the model using the train and test datasets.
history = model.fit(train_ds,
          validation_data=valid_ds, 
          epochs=epochs, 
          batch_size=batch_size)

Epoch 1/2
Epoch 2/2


In [187]:
# Test it with `raw_test_ds`, which yields raw strings
model.evaluate(test_ds)



[0.5654414892196655, 0.779594361782074]

**Tip:** You can increase the number of epochs to 5 to see more differences.


**Pooling model**
 1. Try different values of embedding_dim. What happens when the embedding size is smaller (e.g. 20)? And bigger (e.g. 200)? 
        
      Smaller (5) -> Tal y como vamos haciendo más pequeño la variable embedding_dim el accuracy va disminuyendo poco a poco y el loss va aumentando.

      Bigger (200) -> En cada modelo, el accuracy aumenta y el loss disminuye.

**Conv model**

  2. Kernel size defines how many words we use to compute local context. Try different values of this parameter to see how the local context affects the performance of the model.
        
        Al variar este parámetro, podemos observar cómo cambia la capacidad del modelo para comprender y contextualizar el texto. Tal y como vamos aumentando el tamaño del kernel, el modelo tiene una ventana de contexto aún más amplia, lo que le permite capturar relaciones más extensas en el texto y puede entender conexiones más largas.

  3. Add a new Conv1 layer to the model, after the current one. What paramenters would you use?
        
        Se agrega una nueva capa Conv1D con 30 filtros y el mismo kernel_size que la capa Conv1D anterior, que es 5. El parámetro strides se establece en 1, lo que indica que la operación de convolución avanzará un paso a la vez a lo largo de la secuencia..

**LSTM model**

  4. Like the embeddings, the LSTM units define how much information our representations can encode. Try different values of units and the LSTM layer. How it affects the accuracy? And the training time?

    1. Número de unidades LSTM:
      - Aumentar el número de unidades LSTM mejora la capacidad del modelo para capturar patrones y dependencias más complejos. Esto tiene como resultado una mayor precisión en el modelo.
      - Sin embargo, también implica más parámetros que entrenar, lo cual aumenta la complejidad del modelo y puede llegar a requerir más tiempo de entrenamiento.

    2. Capa LSTM:
      - Agregar múltiples capas LSTM proporciona al modelo una mayor profundidad y la capacidad de aprender representaciones más abstractas de los datos. Esto puede mejorar la precisión del modelo.
      - No obstante, agregar más capas LSTM también aumenta la complejidad del modelo y puede incrementar significativamente el tiempo de entrenamiento. 


  5. This model is by far the one with the most parameters, and that leads to overfitting. Dropout is regularization technique that drops to 0 a percentaje of the values that a layer gets, reducing the amount of information the layer gets. (https://keras.io/api/layers/regularization_layers/dropout/). How would this technique affect hour network? Keep in mind these considerations:

        El Dropout es una técnica de regularización que ayuda a reducir el sobreajuste en el modelo al "apagar" un porcentaje de las activaciones de una capa durante el entrenamiento. Esto impide que ciertas unidades dependan demasiado de otras y promueve una mayor independencia entre las unidades, lo que puede ayudar a mejorar la generalización del modelo.

  * Where would you place the new layer and why?

      La capa Dropout se suele colocar después de la capa LSTM. Esto se debe a que esta capa suele tener más parámetros y, por lo tanto, son más propensas al sobreajuste. Colocar la capa Dropout después de esta capa ayuda a regularizarla.

  * How does the performance change with different amounts of droput?

      Es recomendable experimentar con diferentes valores para encontrar el equilibrio adecuado entre regularización y rendimiento del modelo.

  * You should augment the number of epochs (epochs parameter) to see the results.

      Esto se debe a que el Dropout introduce un efecto de "ruido" durante el entrenamiento, lo que puede hacer que el modelo tarde más tiempo en converger. Aumentar el número de épocas permite al modelo ajustarse mejor y alcanzar una mayor precisión.
