<a href="https://colab.research.google.com/github/poolGolez/tf-playground/blob/main/NLP_Fundamentals_Follow_Through.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup


In [1]:
# Check for GPU
!nvidia-smi

Sun Jun 18 12:25:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split

In [3]:
# Helper functions
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

--2023-06-18 12:25:35--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py.4’


2023-06-18 12:25:35 (92.9 MB/s) - ‘helper_functions.py.4’ saved [10246/10246]



In [4]:
# The dataset (disaster or not disaster from tweets)
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"
unzip_data('nlp_getting_started.zip')

--2023-06-18 12:25:35--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.24.128, 142.251.10.128, 142.251.12.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.24.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip.4’


2023-06-18 12:25:37 (604 KB/s) - ‘nlp_getting_started.zip.4’ saved [607343/607343]



# Exploratory Data Analysis

In [5]:
train_df = pd.read_csv('train.csv')

train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# Class balance of the training labels: 0 = not a disaster, 1 = disaster
tally = train_df['target'].value_counts()
total = tally.sum()
no_count, yes_count = tally[0], tally[1]

print("Train dataset target occurences")
print(f"No(%): {no_count} ({no_count/total * 100:.2f}%)")
print(f"Yes(%): {yes_count} ({yes_count/total * 100:.2f}%)")

Train dataset target occurences
No(%): 4342 (57.03%)
Yes(%): 3271 (42.97%)


# Split train and validation set

In [7]:
# Hold out 10% of the labelled tweets for validation; the fixed random_state makes
# the shuffled split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(train_df['text'], train_df['target'],
                                                  shuffle=True,
                                                  test_size=0.1,
                                                  random_state = 42)
# Sanity check: texts and labels must have the same length within each split
len(X_train), len(y_train), len(X_val), len(y_val)

(6851, 6851, 762, 762)

In [8]:
pd.concat([X_train, y_train], axis=1).head(15)

Unnamed: 0,text,target
4620,'McFadden Reportedly to Test Hamstring Thursda...,0
2858,w--=-=-=-[ NEMA warns Nigerians to prepare for...,1
3098,When I was cooking earlier I got electrocuted ...,0
3751,I'm On Fire. http://t.co/WATsmxYTVa,0
5285,More than 40 families affected by the fatal ou...,1
5863,Why do u ruin everything? @9tarbox u ruined t...,0
4827,http://t.co/c1H7JECFrV @RoyalCarribean do your...,1
5190,WACKOES like #MicheleBachman predict the WORLD...,0
5784,@Reuters people like you should be charged aft...,0
4369,Remove the http://t.co/2nS5TfnxpA and Linkury ...,0


In [9]:
pd.concat([X_val, y_val], axis=1).head(15)

Unnamed: 0,text,target
2644,So you have a new weapon that can cause un-ima...,1
2227,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,Aftershock back to school kick off was great. ...,0
6845,in response to trauma Children of Addicts deve...,0
5559,@Calum5SOS you look like you got caught in a r...,0
1765,my favorite lady came to our volunteer meeting...,1
1817,@brianroemmele UX fail of EMV - people want to...,1
6810,Can't find my ariana grande shirt this is a f...,0
4398,The Murderous Story Of AmericaÛªs First Hijac...,1


# Play with TextVectorization

In [10]:
# Average length of a training tweet.
# NOTE(review): len(stmt) counts characters, not tokens. The result (101) is reused further
# down as TextVectorization's output_sequence_length, which is measured in tokens -- confirm
# whether an average word count (e.g. len(stmt.split())) was intended.
round(sum([len(stmt) for stmt in X_train])/len(X_train))

101

In [11]:
vectorizer = tf.keras.layers.TextVectorization(
  max_tokens = 1000,
  output_mode = 'int',
  output_sequence_length= 101
)

In [12]:
vocab = vectorizer.get_vocabulary()
len(vocab)

2

In [13]:
# fit vectorizer to X_train
vectorizer.adapt(X_train)

In [14]:
vocab = vectorizer.get_vocabulary()
len(vocab)

1000

In [15]:
vocab[:15], vocab[-15:]

(['',
  '[UNK]',
  'the',
  'a',
  'in',
  'to',
  'of',
  'and',
  'i',
  'is',
  'for',
  'on',
  'you',
  'my',
  'with'],
 ['ppl',
  'point',
  'patience',
  'party',
  'parole',
  'pamela',
  'pakistani',
  'outrage',
  'offensive',
  'nws',
  'needs',
  'nearly',
  'muslims',
  'morning',
  'moment'])

In [16]:
vectorizer([X_train[72]])

<tf.Tensor: shape=(1, 101), dtype=int64, numpy=
array([[980,   1, 921, 132,   4,   1,  11,   1, 794, 218,   1,   1,  43,
          1, 921, 158, 499,   1,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>

In [17]:
# Round-trip one training tweet through the vectorizer to see which words survive the
# limited vocabulary; anything outside it comes back as [UNK], padding as empty strings.
original_stmt = X_train[72]
token_ids = vectorizer([original_stmt])[0]
reconstructed_stmt = " ".join(vocab[token_id] for token_id in token_ids)

print(f"Original      > {original_stmt}")
print(f"Reconstructed > {reconstructed_stmt}")

Original      > Reported motor vehicle accident in Curry on Herman Rd near Stephenson involving an overturned vehicle. Please use... http://t.co/YbJezKuRW1
Reconstructed > reported [UNK] vehicle accident in [UNK] on [UNK] rd near [UNK] [UNK] an [UNK] vehicle please use [UNK]                                                                                   


# Play with Embedding

In [18]:
embedding = tf.keras.layers.Embedding(
    input_dim=len(vocab),
    output_dim=32,
    input_length=101
)

In [19]:
X_train[72]

'Reported motor vehicle accident in Curry on Herman Rd near Stephenson involving an overturned vehicle. Please use... http://t.co/YbJezKuRW1'

In [20]:
embedding(vectorizer([X_train[72]]))

<tf.Tensor: shape=(1, 101, 32), dtype=float32, numpy=
array([[[ 0.04763884, -0.03281065, -0.03078926, ...,  0.02598828,
          0.01624611, -0.04201897],
        [-0.01897745,  0.0395453 , -0.0160863 , ..., -0.02003567,
         -0.00786115,  0.04372832],
        [ 0.01840964, -0.03848721,  0.01074309, ..., -0.03013144,
         -0.04779107, -0.03588898],
        ...,
        [-0.01387496,  0.01828134, -0.04817946, ...,  0.0090137 ,
          0.0144227 ,  0.01020113],
        [-0.01387496,  0.01828134, -0.04817946, ...,  0.0090137 ,
          0.0144227 ,  0.01020113],
        [-0.01387496,  0.01828134, -0.04817946, ...,  0.0090137 ,
          0.0144227 ,  0.01020113]]], dtype=float32)>

# Modelling


## Model 0 (Baseline): Naive Bayes with TF-IDF encoder

In [21]:
def build_model_0(X, y):
  """Fit and return the baseline classifier: TF-IDF features fed into multinomial Naive Bayes.

  Args:
    X: Training texts.
    y: Training labels aligned with X.

  Returns:
    The fitted sklearn Pipeline with steps 'vectorizer' and 'clf'.
  """
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.naive_bayes import MultinomialNB
  from sklearn.pipeline import Pipeline

  steps = [
      ('vectorizer', TfidfVectorizer()),
      ('clf', MultinomialNB()),
  ]

  # Pipeline.fit returns the fitted pipeline itself, so it can be returned directly
  return Pipeline(steps).fit(X, y)


model_0 = build_model_0(X_train, y_train)

In [22]:
model_0_preds = model_0.predict(X_val)

model_0_preds.shape, model_0_preds[:15]

((762,), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]))

In [23]:
def score_predictions(y_true, y_pred):
  """Summarise classification quality as a dict of four metrics.

  Precision, recall and F1 are computed with average='weighted'.

  Args:
    y_true: Ground-truth labels.
    y_pred: Predicted labels, same length as y_true.

  Returns:
    dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
  """
  from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  scores = {'accuracy': accuracy_score(y_true, y_pred)}

  prf_support = precision_recall_fscore_support(y_true, y_pred, average='weighted')
  # zip stops after the three names, which drops the trailing "support" element
  scores.update(zip(('precision', 'recall', 'f1'), prf_support))

  return scores

score_predictions(y_val, model_0_preds)

{'accuracy': 0.7782152230971129,
 'precision': 0.792992256322435,
 'recall': 0.7782152230971129,
 'f1': 0.7703527809038113}

## Model 1: Feed-forward neural network

In [24]:
VOCAB_SIZE = 1000
VECTOR_OUTPUT_LENGTH = 101

def build_model_1(X, y, vocab_size=None, sequence_length=None, embedding_dim=128, mask_zero=False):
  """Build an uncompiled text classifier: vectorize -> embed -> average-pool -> sigmoid.

  The TextVectorization layer is adapted to X here; the model itself is neither
  compiled nor trained by this function.

  Args:
    X: Training texts, used only to adapt the vectorizer's vocabulary.
    y: Unused; kept so the signature matches build_model_0(X, y).
    vocab_size: Maximum vocabulary size, shared by the vectorizer (max_tokens) and the
      embedding (input_dim). Defaults to the module-level VOCAB_SIZE.
    sequence_length: Number of token ids each text is padded/truncated to.
      Defaults to the module-level VECTOR_OUTPUT_LENGTH.
    embedding_dim: Size of each token embedding vector.
    mask_zero: When True, the embedding marks token id 0 (padding) as masked so that
      GlobalAveragePooling1D averages over real tokens only. Defaults to False, which
      keeps the original behaviour of averaging over every position.

  Returns:
    A tf.keras.Sequential model that takes raw strings and outputs one sigmoid unit.
  """
  from tensorflow.keras.layers import Input, TextVectorization, Embedding, Dense, GlobalAveragePooling1D

  # Resolve defaults at call time so later edits to the notebook constants are honoured
  if vocab_size is None:
    vocab_size = VOCAB_SIZE
  if sequence_length is None:
    sequence_length = VECTOR_OUTPUT_LENGTH

  model = tf.keras.Sequential()
  # One raw string per example
  model.add(Input(shape=(1, ), dtype=tf.string))

  vectorizer = TextVectorization(
      max_tokens=vocab_size,
      standardize="lower_and_strip_punctuation",
      output_mode="int",
      output_sequence_length=sequence_length
  )
  # The vocabulary is learned from the training texts only
  vectorizer.adapt(X)
  model.add(vectorizer)

  model.add(Embedding(
      input_dim=vocab_size,
      output_dim=embedding_dim,
      input_length=sequence_length,
      mask_zero=mask_zero
  ))

  # Collapse the sequence axis into a single vector per text
  model.add(GlobalAveragePooling1D())

  # Single sigmoid unit for the binary target
  model.add(Dense(1, activation='sigmoid'))

  return model

model_1 = build_model_1(X_train, y_train)
model_1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, 101)              0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 101, 128)          128000    
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 128,129
Trainable params: 128,129
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Binary cross-entropy pairs with the single sigmoid output unit of model_1
model_1.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy']
  )

# Train for 10 epochs; the held-out validation split is passed in for evaluation
model_1.fit(X_train, y_train,
            epochs=10,
            validation_data=(X_val, y_val)
            )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe2e3b56f50>

In [26]:
model_1_preds = model_1.predict(X_val)

model_1_preds.shape, model_1_preds[:15]



((762, 1),
 array([[0.05634722],
        [0.16854191],
        [0.6611411 ],
        [0.03421649],
        [0.46054524],
        [0.19526057],
        [0.13259429],
        [0.30689484],
        [0.1654063 ],
        [0.46561748],
        [0.6832941 ],
        [0.29427433],
        [0.06743946],
        [0.46495488],
        [0.3309948 ]], dtype=float32))

In [27]:
# Turn the sigmoid probabilities into hard 0/1 labels (rounding acts as a 0.5 threshold)
# and squeeze the (762, 1) column into a flat (762,) vector.
# NOTE: this overwrites model_1_preds, so the raw probabilities are no longer available
# under that name after this cell runs.
model_1_preds = tf.squeeze(tf.round(model_1_preds))
model_1_preds[:15]

<tf.Tensor: shape=(15,), dtype=float32, numpy=
array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
      dtype=float32)>

In [28]:
score_predictions(y_val, model_1_preds)

{'accuracy': 0.7834645669291339,
 'precision': 0.7885887093531081,
 'recall': 0.7834645669291339,
 'f1': 0.7791869858943502}

## Model 2: LSTM (RNN)

## Model 3: GRU (RNN)

## Model 4: Bidirectional-LSTM (RNN)

## Model 5: 1D Convolutional Network

## Model 6: TensorFlow Hub Pretrained Feature Extractor

## Model 7: TensorFlow Hub Pretrained Feature Extractor with 10% of data