In [1]:
import tensorflow as tf

In [2]:
# NLP
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

--2024-03-27 09:00:03--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.126.207, 142.251.171.207, 74.125.201.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.126.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2024-03-27 09:00:03 (57.2 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [4]:
import zipfile

# Extract the downloaded archive into the current working directory.
# The `with` block closes the archive even if extraction raises, and the handle
# is deliberately not called `zip` so the built-in zip() stays usable later on.
with zipfile.ZipFile("nlp_getting_started.zip") as zip_ref:
  zip_ref.extractall()

In [5]:
import pandas as pd

# Read the two CSV files from the working directory into DataFrames
# (presumably the files extracted from nlp_getting_started.zip).
train_df, test_df = pd.read_csv("train.csv"), pd.read_csv("test.csv")

In [6]:
# Preview the first rows of the training DataFrame.
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
# Preview the last rows of the training DataFrame.
train_df.tail()

Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


In [8]:
# Shuffle all rows (frac=1 samples 100% of them); the fixed random_state keeps
# the shuffled order reproducible between runs.
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [9]:
import random

# Print a window of five consecutive (text, target) rows from the shuffled frame.
# The start index is capped at len - window_size so the slice can never run off
# the end of the frame; picking a start among the last four rows would print
# fewer than five examples.
window_size = 5
random_index = random.randint(0, max(0, len(train_df_shuffled) - window_size))
sample_rows = train_df_shuffled[["text", "target"]][random_index:random_index + window_size]
for _, text, target in sample_rows.itertuples():  # first tuple field is the row index (unused)
  print(f"Target: {target}; {'real disaster' if target!=0 else 'not a real disaster'}")
  print("Text:", text)
  print("--"*20)

Target: 1; real disaster
Text: A demolished Palestinian village comes back to life http://t.co/9Lpf4V4hMq
----------------------------------------
Target: 1; real disaster
Text: Iraq - Hashd Shaabi Theft ISIS Suicide Car bomb http://t.co/2AG9auABr3 #ISIS http://t.co/Qna4TUBnWh
----------------------------------------
Target: 0; not a real disaster
Text: Had lunch with Stewart &amp; Julian only a couple of hours earlier. Good to finally find out what happened to them. http://t.co/AnP9g6NjFd
----------------------------------------
Target: 0; not a real disaster
Text: #hot  Funtenna: hijacking computers to send data as sound waves [Black Hat 2015] http://t.co/gexHzU1VK8 #prebreak #best
----------------------------------------
Target: 0; not a real disaster
Text: I rate Hazard very highly but his fanboys are among the worst accounts on Twitter.
----------------------------------------


In [10]:
from sklearn.model_selection import train_test_split

# Split the shuffled tweets into training and validation arrays, holding out
# 20% for validation. The fixed random_state makes the split reproducible.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffled["text"].to_numpy(),
    train_df_shuffled["target"].to_numpy(),
    test_size=0.2,
    random_state=42,
)

In [11]:
# Number of samples in the training and validation splits.
len(train_sentences), len(val_sentences)

(6090, 1523)

In [12]:
# Vocabulary size limit used by the text vectorizer and the embedding layers.
max_vocab_length = 10000

# Despite its name, max_length is the rounded *average* number of
# whitespace-separated tokens per training sentence, not the maximum.
token_counts = [len(sentence.split()) for sentence in train_sentences]
max_length = round(sum(token_counts) / len(token_counts))
max_length

15

In [13]:
# Convert text to numbers: the vocabulary is limited to max_vocab_length tokens
# and every output sequence has exactly max_length entries.
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_sequence_length=max_length,
)

In [14]:
# Fit the vectorizer on the training sentences only (validation text is not used here).
text_vectorizer.adapt(train_sentences)

In [15]:
# Pick one training sentence, print it together with its length in characters
# (len() of a string counts characters, not tokens), then vectorize it as a
# batch of one.
random_sentence = random.choice(train_sentences)
print(f"{random_sentence} \n {len(random_sentence)}")
text_vectorizer([random_sentence])

West Nyack Pineview Road railroad crossing off Western Highway. Units on scene of a CSX Train vs. truck no injuries. 
 116


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 644, 9476, 9011,  312, 8660,    1,  102, 1457, 1696, 1463,   11,
         957,    6,    3, 3657]])>

In [16]:
# Fetch the vocabulary held by the vectorizer and show its first five entries.
words = text_vectorizer.get_vocabulary()
words[:5]

['', '[UNK]', 'the', 'a', 'in']

In [17]:
# Trainable lookup table mapping each of the max_vocab_length token ids to a
# 128-dimensional vector. The deprecated `input_length` argument is omitted:
# the sequence length is taken from the tensor the layer is called on.
embedding = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                      output_dim=128)

In [18]:
# Vectorize the sample sentence as a batch of one and embed the resulting token ids.
sample = embedding(text_vectorizer([random_sentence]))

In [19]:
# Embedding vector of the first token position of the (single) sample sentence.
sample[0][0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([-0.00218471, -0.04161106,  0.0335702 ,  0.00891335,  0.03860814,
        0.03319863, -0.02146928, -0.04943568, -0.04939151, -0.01179548,
        0.00916985,  0.02762618, -0.04668466, -0.03956477, -0.04003911,
        0.00471673,  0.01953084, -0.01678883, -0.03066017,  0.02394   ,
        0.00282335,  0.02344644,  0.04968877, -0.03588381, -0.00655208,
        0.02835547, -0.04641449,  0.02016164,  0.04458631, -0.04315803,
       -0.03062146, -0.0163205 , -0.01473306,  0.0007421 ,  0.02533353,
       -0.04683844,  0.01131252, -0.0256521 , -0.04086021,  0.02751298,
        0.02159179, -0.03878351, -0.0429445 ,  0.03571949, -0.02989376,
        0.02588533, -0.03062025,  0.00514002,  0.04133013, -0.04499562,
       -0.00105796, -0.03247186, -0.04546119,  0.02780583, -0.04137027,
        0.01202095,  0.04766427, -0.03712791,  0.04364704, -0.04783354,
       -0.01085689, -0.01384977,  0.0054137 , -0.0483057 ,  0.00038118,
        0.047976

In [20]:
# Baseline model: vectorize -> embed -> average the token vectors -> sigmoid unit.
inputs1 = tf.keras.layers.Input(shape=(1,), dtype='string')
token_ids1 = text_vectorizer(inputs1)
token_vectors1 = embedding(token_ids1)
pooled1 = tf.keras.layers.GlobalAveragePooling1D()(token_vectors1)
outputs1 = tf.keras.layers.Dense(1, activation='sigmoid')(pooled1)

model1 = tf.keras.Model(inputs1, outputs1)
model1.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)
model1.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 15)                0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1280129 (4.88 MB)
Trainable params: 1280129 (4.

In [21]:
# Train the baseline for five epochs. Training and validation sentences are
# passed in the same form (1-D arrays of strings), matching every other fit()
# call in this notebook; wrapping only the training data in tf.expand_dims
# made the two inputs inconsistent.
history = model1.fit(train_sentences, train_labels,
                     epochs=5,
                     validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
# Display the vectorizer layer object (only its repr is shown).
text_vectorizer

<keras.src.layers.preprocessing.text_vectorization.TextVectorization at 0x7f6475eeee90>

In [23]:
# Display the embedding layer object (only its repr is shown).
embedding

<keras.src.layers.core.embedding.Embedding at 0x7f64682d4bb0>

In [28]:
# LSTM model
#
# Two stacked LSTM layers (the first returns the full sequence so the second
# can consume it) followed by a sigmoid unit.
#
# The model gets its own Embedding layer. A Keras layer instance shares its
# weights with every model it is called in, so reusing the notebook-wide
# `embedding` layer would start this model from weights already trained by
# other models and make the architectures impossible to compare fairly.
lstm_embedding = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                           output_dim=128)

inputs = tf.keras.layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = lstm_embedding(x)
x = tf.keras.layers.LSTM(32, return_sequences=True)(x)
x = tf.keras.layers.LSTM(32)(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.models.Model(inputs, outputs)

model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])
history = model.fit(train_sentences, train_labels,
                    epochs=5,
                    validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [31]:

# Rebuild the stacked-LSTM architecture only to print its summary.
# Note: `model` is rebound to this new model, which is neither compiled nor fitted.
inputs = tf.keras.layers.Input(shape=(1,), dtype='string')
token_ids = text_vectorizer(inputs)
token_vectors = embedding(token_ids)
lstm_sequence = tf.keras.layers.LSTM(32, return_sequences=True)(token_vectors)
lstm_state = tf.keras.layers.LSTM(32)(lstm_sequence)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(lstm_state)
model = tf.keras.models.Model(inputs, outputs)
model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 15)                0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm_9 (LSTM)               (None, 15, 32)            20608     
                                                                 
 lstm_10 (LSTM)              (None, 32)                8320      
                                                                 
 dense_6 (Dense)             (None, 1)                 33        
                                                           

In [33]:

# Single-LSTM variant, built only to print its summary: `model` is rebound to a
# new model that is neither compiled nor fitted in this cell.
inputs = tf.keras.layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = embedding(x)
# Only one LSTM layer here (no preceding return_sequences=True layer).
x = tf.keras.layers.LSTM(32)(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model=tf.keras.models.Model(inputs, outputs)
model.summary()

Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 15)                0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm_12 (LSTM)              (None, 32)                20608     
                                                                 
 dense_8 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1300641 (4.96 MB)
Trainable params: 1300641 (4.96 MB)
Non-trainable params: 0 (0.00 Byte)
_____________________

In [34]:
# GRU model: two stacked GRU layers (the first returns the full sequence for
# the second) followed by a sigmoid unit.
#
# The model gets its own Embedding layer. A Keras layer instance shares its
# weights with every model it is called in, so reusing the notebook-wide
# `embedding` layer would start this model from weights already trained by
# other models and make the architectures impossible to compare fairly.
gru_embedding = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                          output_dim=128)

inputs = tf.keras.layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = gru_embedding(x)
x = tf.keras.layers.GRU(32, return_sequences=True)(x)
x = tf.keras.layers.GRU(32)(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.models.Model(inputs, outputs)
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])
history = model.fit(train_sentences, train_labels,
                    epochs=5,
                    validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [35]:
#bidirectional
# A bidirectional GRU (returning the full sequence) feeds a second,
# unidirectional GRU and a sigmoid unit.
#
# The model gets its own Embedding layer. A Keras layer instance shares its
# weights with every model it is called in, so reusing the notebook-wide
# `embedding` layer would start this model from weights already trained by
# other models and make the architectures impossible to compare fairly.
bidirectional_embedding = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                                    output_dim=128)

inputs = tf.keras.layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = bidirectional_embedding(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32, return_sequences=True))(x)
x = tf.keras.layers.GRU(32)(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.models.Model(inputs, outputs)
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])
history = model.fit(train_sentences, train_labels,
                    epochs=5,
                    validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [38]:
# Run whichever model `model` currently refers to on the validation sentences.
model.predict(val_sentences)



array([[0.01108829],
       [0.7381666 ],
       [0.99944085],
       ...,
       [0.9981576 ],
       [0.9996232 ],
       [0.99930644]], dtype=float32)

In [43]:
# LSTM + Conv1D model: an LSTM returning the full sequence, a width-3
# convolution over that sequence, average pooling over time, and a sigmoid unit.
#
# The model gets its own Embedding layer. A Keras layer instance shares its
# weights with every model it is called in, so reusing the notebook-wide
# `embedding` layer would start this model from weights already trained by
# other models and make the architectures impossible to compare fairly.
conv_embedding = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                           output_dim=128)

inputs = tf.keras.layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = conv_embedding(x)
x = tf.keras.layers.LSTM(32, return_sequences=True)(x)
# ReLU is required for the convolution to contribute: without a non-linearity,
# Conv1D -> average pooling -> Dense is just one linear map of the LSTM outputs.
x = tf.keras.layers.Conv1D(32, kernel_size=3, activation='relu')(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.models.Model(inputs, outputs)
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])
history = model.fit(train_sentences, train_labels,
                    epochs=5,
                    validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [44]:
# Print the layer-by-layer summary of the model `model` currently refers to.
model.summary()

Model: "model_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_16 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 15)                0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm_14 (LSTM)              (None, 15, 32)            20608     
                                                                 
 conv1d_2 (Conv1D)           (None, 13, 32)            3104      
                                                                 
 global_average_pooling1d_2  (None, 32)                0         
  (GlobalAveragePooling1D)                                

In [51]:
# Shape walk-through: embed one sentence, convolve it, pool over the time axis.
# The sentence is passed as a batch of one (a one-element list), the same way
# `sample` is built, instead of adding the batch axis afterwards.
embed_test = embedding(text_vectorizer([random_sentence]))

# A freshly initialised (untrained) convolution with a window of five tokens.
conv_output = tf.keras.layers.Conv1D(32, kernel_size=5)(embed_test)

# The pooling layer averages over time, so the result is named accordingly.
avg_pool_output = tf.keras.layers.GlobalAveragePooling1D()(conv_output)
# Backward-compatible alias: the old name suggested max pooling, but the values
# have always been average-pooled.
max_pool_output = avg_pool_output

embed_test.shape, conv_output.shape, max_pool_output.shape

(TensorShape([1, 15, 128]), TensorShape([1, 11, 32]), TensorShape([1, 32]))

In [54]:
# Display the stored `sample` tensor (it is not recomputed in this cell).
sample

<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.00218471, -0.04161106,  0.0335702 , ...,  0.02089611,
          0.01493256, -0.00632071],
        [-0.02844492,  0.00417246,  0.04647546, ...,  0.01357153,
          0.02590873, -0.04069691],
        [-0.03118578, -0.003856  ,  0.00860796, ..., -0.01314294,
          0.00821984, -0.04735507],
        ...,
        [-0.02048516, -0.01886226,  0.01735269, ..., -0.03261473,
         -0.00904393,  0.02740797],
        [ 0.04087671,  0.0290784 , -0.01900009, ...,  0.01491563,
         -0.04077481, -0.02359316],
        [-0.02953429,  0.04129423, -0.03373753, ...,  0.00120276,
         -0.00522272,  0.00801835]]], dtype=float32)>

In [52]:
# Display the stored `embed_test` tensor for comparison with `sample`.
embed_test

<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.07021279, -0.1683552 , -0.12314296, ..., -0.07314578,
          0.12345617, -0.16574727],
        [ 0.01850186, -0.0542043 , -0.03069371, ..., -0.04426027,
          0.08655186, -0.10371312],
        [ 0.01352757, -0.06369113, -0.06815638, ..., -0.06861822,
          0.06723383, -0.11211368],
        ...,
        [-0.01679607,  0.00811965,  0.01976355, ..., -0.06633724,
         -0.00765045,  0.01948671],
        [-0.00100783,  0.03024441,  0.02894142, ...,  0.08202777,
         -0.06691855, -0.04328826],
        [ 0.02182009, -0.07452771, -0.13433129, ..., -0.05593798,
          0.05702537, -0.11859339]]], dtype=float32)>

In [55]:
import tensorflow_hub as hub

# Load the Universal Sentence Encoder from its hosted model URL.
USE_MODEL_URL = "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2"
embed = hub.load(USE_MODEL_URL)


In [57]:
# Encode the sample sentence (as a batch of one) with the loaded encoder and show the result.
embed_sample = embed([random_sentence])
embed_sample

<tf.Tensor: shape=(1, 512), dtype=float32, numpy=
array([[ 0.07001627,  0.04336466,  0.04114123,  0.0165662 ,  0.05908086,
        -0.03386001,  0.00343576,  0.01385107, -0.03554146, -0.00288789,
         0.04043909,  0.06820885, -0.04659358,  0.02944117,  0.0693081 ,
         0.02766102,  0.00744253,  0.02970405, -0.04040539,  0.00042897,
         0.0411342 , -0.01125658,  0.03643789, -0.05253817,  0.03712482,
         0.00503884,  0.05375282,  0.00158123, -0.05218032, -0.00938291,
        -0.06017241,  0.01996459,  0.03556127, -0.04047453, -0.06997656,
         0.03447177,  0.0272627 ,  0.04226906,  0.0358908 ,  0.05641507,
         0.02857709,  0.05450268,  0.0250902 , -0.03590032, -0.07848429,
         0.06709968,  0.00743552,  0.04487915,  0.07002385, -0.00025081,
        -0.0711809 , -0.05344303, -0.0169886 ,  0.03775658,  0.07756229,
        -0.06554384, -0.01913133, -0.03679187,  0.06128075,  0.02603814,
        -0.06406588,  0.07743259,  0.06713787,  0.00547378, -0.07326891,
 

In [59]:
# Wrap the loaded encoder as a Keras layer that takes scalar strings
# (input_shape=[]) and is frozen (trainable=False), so fitting a model that
# contains it leaves the encoder weights unchanged.
sentence_encoder = hub.KerasLayer(
    embed,
    input_shape=[],
    dtype=tf.string,
    trainable=False,
)

In [60]:
# Sentence-encoder model: the frozen encoder layer followed by one sigmoid unit.
model = tf.keras.models.Sequential()
model.add(sentence_encoder)
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)
history = model.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [62]:
# Run the current model on a single one-word input.
model.predict(['disaster'])



array([[0.5982488]], dtype=float32)

In [65]:
# Sentence-encoder model with a hidden layer: the frozen encoder layer, a
# 32-unit hidden Dense layer, and a sigmoid output unit.
model = tf.keras.models.Sequential([
    sentence_encoder,
    # The hidden layer needs a non-linearity: two stacked Dense layers without
    # one collapse into a single linear map, so the extra layer would add
    # parameters but no modelling capacity over the one-layer model.
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])
history = model.fit(train_sentences, train_labels,
                    epochs=5,
                    validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
