**HW#2 - YCBS 273 Intro to Prac ML**


# Coding report

## Importing data and libraries

In [None]:
# import the necessary libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import zipfile

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_hub as hub
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
import zipfile

# Unpack the course data archive into ./data; later cells read their inputs from that folder.
with zipfile.ZipFile('data_v2.zip', mode='r') as archive:
    archive.extractall(path='data')

## Doing text vectorization

In [None]:
# Settings shared by both subsets. The seed must be identical for the
# 'training' and 'validation' calls, otherwise the two subsets can overlap.
batch_size = 512
seed = 1337

split_options = dict(
    batch_size=batch_size,
    label_mode='int',
    validation_split=0.1,
    seed=seed,
)

train_ds = keras.preprocessing.text_dataset_from_directory(
    'data/train', subset='training', **split_options)

val_ds = keras.preprocessing.text_dataset_from_directory(
    'data/train', subset='validation', **split_options)

# Text-only view of the training split (labels dropped); used to fit the vectorizer.
text_only_train_ds = train_ds.map(lambda text, label: text)

Found 120000 files belonging to 4 classes.
Using 114000 files for training.
Found 120000 files belonging to 4 classes.
Using 6000 files for validation.


In [None]:
# Vocabulary size cap and fixed sequence length produced by the vectorizer.
max_tokens = 20000
max_length = 100

text_vectorization = TextVectorization(
    output_mode="int",
    max_tokens=max_tokens,
    output_sequence_length=max_length,
)

# Build the vocabulary from the training texts only.
text_vectorization.adapt(text_only_train_ds)

In [None]:
def _vectorize_pair(text, label):
    """Turn a (raw text, label) batch into (token-id sequences, label)."""
    return (text_vectorization(text), label)


seq_train_ds = train_ds.map(_vectorize_pair)
seq_val_ds = val_ds.map(_vectorize_pair)

## Loading the GloVe embeddings

In [None]:
# Extract the GloVe archive quietly (-q); the next cell reads glove.6B.100d.txt.
!unzip -q glove.6B.100d.txt.zip

In [None]:
path_to_glove_file = "glove.6B.100d.txt"

# Map each GloVe token to its vector. Each line of the file is
# "<token> <v1> <v2> ... <vN>", so a single split separates the token
# from the whitespace-separated coefficients.
embeddings_index = {}
# Explicit UTF-8: without it open() falls back to the platform default
# encoding, which can fail on non-ASCII tokens (e.g. cp1252 on Windows).
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print(f"Found {len(embeddings_index)} word vectors.")

Found 400000 word vectors.


In [None]:
embedding_dim = 100

# Token -> integer index, in the order used by the adapted vectorizer.
vocabulary = text_vectorization.get_vocabulary()
word_index = {word: i for i, word in enumerate(vocabulary)}

# Row i holds the GloVe vector of token i; tokens without a GloVe entry keep
# an all-zero row.
embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
    # Skip indices outside the matrix. The lookup and the assignment must both
    # sit behind this guard; otherwise a stale vector from the previous
    # iteration would be written to an out-of-range row.
    if i >= max_tokens:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Embedding layer initialised from the GloVe matrix and kept frozen during training.
glove_initializer = keras.initializers.Constant(embedding_matrix)

embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=glove_initializer,
    trainable=False,
    mask_zero=True,
)

## Building a sequential model

### RNN model

In [None]:
# RNN: frozen embeddings -> bidirectional LSTM -> two dense layers -> 4-way softmax
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
# Two ReLU dense layers, each followed by dropout.
for units in (64, 32):
    x = layers.Dense(units, activation="relu")(x)
    x = layers.Dropout(0.5)(x)
outputs = layers.Dense(4, activation="softmax")(x)
model = keras.Model(inputs, outputs)

model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

model.summary()

# Stop once val_loss has not improved for 2 epochs; keep only the best checkpoint on disk.
stop_early = keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
keep_best = keras.callbacks.ModelCheckpoint(
    "sequential.keras", monitor="val_loss", save_best_only=True)
callbacks = [stop_early, keep_best]

model.fit(
    seq_train_ds.cache(),
    validation_data=seq_val_ds,
    epochs=10,
    callbacks=callbacks,
)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         2000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                34048     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                2080

<tensorflow.python.keras.callbacks.History at 0x7fd387342f90>

In [None]:
# RNN
# Reload the checkpoint written during training and put the adapted vectorizer
# in front of it, so the combined model accepts raw strings.
model2 = keras.models.load_model("sequential.keras")

prediction_model = tf.keras.Sequential(layers=[text_vectorization, model2])

eval_loss = keras.losses.SparseCategoricalCrossentropy()
prediction_model.compile(loss=eval_loss, optimizer='adam', metrics=['accuracy'])

# `val_ds` yields raw strings, which is what `prediction_model` expects.
loss, accuracy = prediction_model.evaluate(val_ds)
print("Accuracy: {:2.2%}".format(accuracy))

Accuracy: 90.53%


### Transformer model

In [None]:
# define a class
class TransformerEncoder(layers.Layer):
    """Encoder block: multi-head self-attention followed by a two-layer dense projection.

    Both sub-blocks are wrapped in a residual connection followed by layer
    normalization.

    Args:
        embed_dim: Used as the attention `key_dim` and as the output width of
            the dense projection.
        dense_dim: Width of the hidden ReLU layer inside the projection.
        num_heads: Number of attention heads.
        **kwargs: Forwarded unchanged to `layers.Layer`.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)

        # Hyperparameters are kept so that get_config() can report them.
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads

        # Sub-layers.
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim,
        )
        self.dense_proj = keras.Sequential([
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Apply self-attention and the dense projection to `inputs`.

        Args:
            inputs: Tensor used as both query and value of the attention layer.
            mask: Optional mask; when given, an extra axis is inserted at
                position 1 before it is passed as `attention_mask`
                (assumes a rank-2 mask -- TODO confirm).

        Returns:
            The output of the second layer normalization.
        """
        attention_mask = None if mask is None else mask[:, tf.newaxis, :]
        attended = self.attention(inputs, inputs, attention_mask=attention_mask)
        normed = self.layernorm_1(inputs + attended)
        projected = self.dense_proj(normed)
        return self.layernorm_2(normed + projected)

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        }

In [None]:
# transformer: frozen embeddings -> one encoder block -> max pooling -> 4-way softmax
embed_dim = 100
num_heads = 2
dense_dim = 32

inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
encoder = TransformerEncoder(embed_dim, dense_dim, num_heads)
x = encoder(embedded)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.5)(x)

outputs = layers.Dense(4, activation="softmax")(x)
model = keras.Model(inputs, outputs)

model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 100)         2000000   
_________________________________________________________________
transformer_encoder_3 (Trans (None, None, 100)         87632     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 100)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 4)                 404       
Total params: 2,088,036
Trainable params: 88,036
Non-trainable params: 2,000,000
____________________________________________

In [None]:
# transformer
# Stop once val_loss has not improved for 2 epochs; keep only the best checkpoint on disk.
stop_early = keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
keep_best = keras.callbacks.ModelCheckpoint(
    "transformer.keras", monitor="val_loss", save_best_only=True)
callbacks = [stop_early, keep_best]

model.fit(
    seq_train_ds.cache(),
    validation_data=seq_val_ds,
    epochs=25,
    callbacks=callbacks,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25


<keras.callbacks.History at 0x7faf477f0610>

In [None]:
# transformer
# Wrap the trained network so it can score raw text: the adapted
# `text_vectorization` layer goes in front of the reloaded checkpoint,
# and the pair forms `prediction_model`.

# TransformerEncoder is a custom layer, so it is supplied through custom_objects.
model2 = keras.models.load_model(
    "transformer.keras",
    custom_objects={"TransformerEncoder": TransformerEncoder},
)

prediction_model = tf.keras.Sequential(layers=[text_vectorization, model2])

eval_loss = keras.losses.SparseCategoricalCrossentropy()
prediction_model.compile(loss=eval_loss, optimizer='adam', metrics=['accuracy'])

# Evaluated on `val_ds` (the held-out validation split), which yields raw strings.
loss, accuracy = prediction_model.evaluate(val_ds)
print("Accuracy: {:2.2%}".format(accuracy))

Accuracy: 90.95%


## Recording the results

In [None]:
# Load the test set; the raw text to classify is in the 'data' column.
df_test_data = pd.read_csv('data/data_test_df.csv')
inputs = df_test_data.loc[:, 'data']

In [None]:
# Use `prediction_model` here, not the trained `model` on its own:
# the test data is still raw text, and only `prediction_model` includes the
# vectorization step. Calling `model` directly would raise an error.

predicted_scores = prediction_model.predict(inputs)
predicted_scores[:5]

array([[5.8026876e-02, 3.4643817e-04, 9.3418193e-01, 7.4447407e-03],
       [1.2452911e-02, 4.2287694e-04, 5.8042642e-04, 9.8654383e-01],
       [6.0894764e-03, 7.3628509e-03, 6.8372384e-02, 9.1817528e-01],
       [1.1799880e-02, 7.2705969e-02, 1.2470035e-01, 7.9079384e-01],
       [4.1254425e-01, 1.8902628e-03, 4.2133275e-02, 5.4343218e-01]],
      dtype=float32)

In [None]:
# Build the Kaggle submission frame: one probability column per class
# (solution_1 ... solution_4) and an index named 'Id'.

solution_columns = ['solution_' + str(k) for k in range(1, 5)]
df_predictions = pd.DataFrame(predicted_scores, columns=solution_columns)
df_predictions.index.name = 'Id'

df_predictions.head(10)

Unnamed: 0_level_0,solution_1,solution_2,solution_3,solution_4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.058027,0.000346,0.934182,0.007445
1,0.012453,0.000423,0.00058,0.986544
2,0.006089,0.007363,0.068372,0.918175
3,0.0118,0.072706,0.1247,0.790794
4,0.412544,0.00189,0.042133,0.543432
5,0.123073,0.000524,0.027899,0.848504
6,0.005756,0.000131,0.003011,0.991102
7,0.021702,0.001688,0.026682,0.949927
8,0.086772,0.000145,0.032083,0.881
9,0.044639,0.006712,0.045626,0.903023


In [None]:
# Write the submission file; the index is written as the first column (pandas default).
df_predictions.to_csv(path_or_buf='df_predictions.csv')

## [failed] Attempt to do NLP data augmentation

In [None]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.7-py3-none-any.whl (405 kB)
[?25l[K     |▉                               | 10 kB 25.8 MB/s eta 0:00:01[K     |█▋                              | 20 kB 28.6 MB/s eta 0:00:01[K     |██▍                             | 30 kB 17.4 MB/s eta 0:00:01[K     |███▎                            | 40 kB 15.2 MB/s eta 0:00:01[K     |████                            | 51 kB 10.8 MB/s eta 0:00:01[K     |████▉                           | 61 kB 12.2 MB/s eta 0:00:01[K     |█████▋                          | 71 kB 13.0 MB/s eta 0:00:01[K     |██████▌                         | 81 kB 14.3 MB/s eta 0:00:01[K     |███████▎                        | 92 kB 12.8 MB/s eta 0:00:01[K     |████████                        | 102 kB 11.0 MB/s eta 0:00:01[K     |█████████                       | 112 kB 11.0 MB/s eta 0:00:01[K     |█████████▊                      | 122 kB 11.0 MB/s eta 0:00:01[K     |██████████▌                     | 133 kB 11.0 MB/s eta 0:0

In [None]:
import nlpaug.augmenter.word as naw

In [None]:
# swap: randomly exchange neighbouring words in a sample sentence
text = 'I like machine learning.'
aug = naw.RandomWordAug(action="swap")
augmented_text = aug.augment(text)
print("Original:", text, sep="\n")
print("Augmented Text:", augmented_text, sep="\n")

Original:
I like machine learning.
Augmented Text:
I like learning machine.


In [None]:
# delete: RandomWordAug with its default action, applied to the same sample sentence
aug = naw.RandomWordAug()
augmented_text = aug.augment(text)
print("Original:", text, sep="\n")
print("Augmented Text:", augmented_text, sep="\n")

Original:
I like machine learning.
Augmented Text:
Like machine learning.


In [None]:
import os

path = 'data/train'
# Entries directly under the training folder.
list_catalog = os.listdir(path)

# Print the contents of every file under the training folder.
for root, dirs, files in os.walk(path):
  for file_name in files:
    # Separate names for the file name and the open handle, and a variable that
    # does not shadow the builtin `str` (shadowing it would break later
    # `str(...)` calls in this kernel).
    with open(os.path.join(root, file_name), "r", encoding="utf-8") as handle:
      contents = handle.read()
      print(contents)

In [None]:
# Show the entries that os.listdir returned for data/train.
list_catalog

['3', '4', '1', '2']

In [None]:
# Show an original/augmented pair for the first two texts of every training batch.
count = 0
# The augmenter does not depend on the loop variables, so build it once.
aug = naw.RandomWordAug()
for text_batch, label_batch in train_ds:
  print(len(text_batch))
  print(len(label_batch))
  # Convert the batch to numpy once instead of once per sample.
  raw_texts = text_batch.numpy()
  # Guard against a batch with fewer than two items.
  for i in range(min(2, len(raw_texts))):
    text = bytes.decode(raw_texts[i])
    augmented_text = aug.augment(text)
    print("Original:")
    print(text)
    print("Augmented Text:")
    print(augmented_text)
    count = count+1

# Kaggle submission journal

## Submission 1&2

We swapped the test dataset and the training dataset in our first submission, and Kaggle rejected it because the number of rows was wrong.
Submission 2 was the version one of benchmark code.<br>
**Score: 0.56234**

## Submission 3-5

In submission 3, Xiaotong Xu added one more dense layer to the binary unigram model.<br>
**Score: 0.56233**<br>
In submission 4, Xiaotong Xu increased the max_tokens to 30000.<br>
**Score: 0.56233**<br>
In submission 5, Xiaotong Xu used a binary bigram model with "tf-idf" text vectorization.<br>
**Score: 0.56232**<br>
From submission 3 to submission 4, we learned that a max_tokens of 20000 was enough for this dataset. The performance improved only slightly with the "tf-idf" text vectorization; we discussed this at length but could not figure out the reason.

## Submission 6

In class, we learned that it was essential to apply text vectorization to the test dataset, and the teacher gave us an example of how to label the training dataset. Then we downloaded the newest data and tried the RNN model based on benchmark code v2.<br>
**Score: 0.17201**

## Submission 7

We added one more dense layer to the RNN model and used 128 nodes in the embedding layer. The result indicated that a more complex model did not help. Jiehao Wan said that this was because there was too little data to train the embedding. Therefore, we turned to a pretrained embedding dataset for the next submission.<br>
**Score: 0.18624**

## Submission 8

In the breakout room exercise, we took an arduous journey to fix a bug in importing the pretrained embedding layer. With the help of the teachers, Jiehao Wan made a submission and the score improved a lot. <br>
**Score: 0.13760**

## Submission 9

We read the relevant literature and found that in the field of natural language processing, the transformer model is the most popular. Since the RNN model had little room to improve, we tried to adopt the transformer model. The score improved little, which disappointed us. However, after the competition, we noticed that the transformer model performed well on the private dataset.<br>
**Score: 0.13753**

## Submission 10

Weihang Fu tried an advanced BoW (bag-of-words) model to make a comparison.<br>
It seemed that transformer model was better.<br>
**Score: 0.13991**

## Submission 11

Xiaotong Xu used positional embeddings instead of GloVe embeddings.<br>
We also attempted to use the BERT model but failed.<br>
**Score: 0.15036**

## Submission 12

We no longer changed the methods of embedding. <br>
In the next stage, we wanted to do some data augmentation. We knew that data augmentation is useful in image processing, but had no idea how it applies to NLP. Then we read the paper *EDA: Easy Data Augmentation Techniques for Boosting Performance on Text Classification Tasks* and found some methods to do it. At first, we doubted whether changing the sentences would distort their meanings. Xiaotong Xu said that it may be feasible, since it is a classification problem rather than a translation task. Unfortunately, we didn't know how to generate the new data in txt format. Our code kept failing with an error.<br>
In another way, we "increased" the training data by adjusting the ratio of the training dataset and the validation dataset to 9:1.<br>
**Score: 0.13557**

## Submission 13-15

In this stage, we focused on the model itself.<br>
We wanted to obtain the best hyperparameters through Bayesian optimization.<br>
In the end, we adopted dense_dim=64 and num_heads=2.<br>
It seemed that the score improved.<br>
**Score: 0.13339 (best)**

## Submission 16

In the process of randomly printing the sentences of the dataset, Xiaotong Xu found that the average length of the sentences exceeded 50, which means that setting max_length of 50 will lose some important information. Therefore we increased the max_length to 100.<br>
**Score: 0.12732**

## Final score

**Public leaderboard: 0.12897 (ranked 4th)**<br>
**Private leaderboard: 0.12732 (ranked 3rd)**<br> 
For both assignments, we worked hard from the bottom of the leaderboard and finally got a satisfactory model result. This was a very interesting and challenging process and we enjoyed the exploration of machine learning a lot.