In [1]:
import os
import tensorflow as tf
import pandas as pd

tf.__version__


'2.13.0'

In [2]:
train_dir = r'https://raw.githubusercontent.com/sedeba19/NLP--Random-Models/main/train.csv'
test_dir = r'https://raw.githubusercontent.com/sedeba19/NLP--Random-Models/main/test.csv'

In [3]:
train_dir

'https://raw.githubusercontent.com/sedeba19/NLP--Random-Models/main/train.csv'

In [4]:
# Make a train_df
train_df = pd.read_csv(train_dir)
test_df = pd.read_csv(test_dir)
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Shuffle training dataframe
train_df_shuffled = train_df.sample(frac = 1, random_state = 1)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
3228,4632,emergency%20services,"Sydney, New South Wales",Goulburn man Henry Van Bilsen missing: Emergen...,1
3706,5271,fear,,The things we fear most in organizations--fluc...,0
6957,9982,tsunami,Land Of The Kings,@tsunami_esh ?? hey Esh,0
2887,4149,drown,,@POTUS you until you drown by water entering t...,0
7464,10680,wounds,"cody, austin follows ?*?",Crawling in my skin\nThese wounds they will no...,1


In [6]:
train_df_shuffled.shape

(7613, 5)

In [7]:
# How many examples of each class?
train_df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [8]:
# How many total samples?
len(train_df), len(test_df)

(7613, 3263)

In [9]:
# Visualize some random training examples
import random
random_index = random.randint(0, len(train_df)-5)
for row in train_df_shuffled[["text", "target"]][random_index: random_index +5].itertuples():
    _, text, target = row
    if target > 0:
        print(f"Target:{target}", "(real disaster)")
    else:
        print(f"Target:{target}", "(not a real disaster)")
    
    print(f"Text: \n{text}")
    print("-----\n")

Target:1 (real disaster)
Text: 
Slip Sliding Away - Flash Floods Info for Writers w/Tony Nester @SonoranRattler #writingtips http://t.co/sLTtOrRLHs
-----

Target:0 (not a real disaster)
Text: 
13 reasons why we love women in the military   - lulgzimbestpicts http://t.co/uZ1yiZ7n6m http://t.co/IjwAr15H16
-----

Target:0 (not a real disaster)
Text: 
'Trust us to get rescued by the dopey ones!' Val is hilarious shame she's probably going to die #emmerdale
-----

Target:0 (not a real disaster)
Text: 
One day this heart gone get me zipped up in a body bag.
-----

Target:0 (not a real disaster)
Text: 
Fire waves and darkness
-----


### Split data into training and validation sets

In [10]:
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df['text'],
                                                                            train_df['target'],
                                                                            test_size = 0.1,
                                                                            random_state = 42)

In [11]:
train_df['text'].shape

(7613,)

In [12]:
train_df["text"]

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [13]:
train_df['text'].to_numpy()

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)

In [14]:
train_df['text'].to_numpy()

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)

In [15]:
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

(6851, 762, 6851, 762)

### Converting text into numbers: Text Vectorization also known as Tokenization

In [16]:
# Get the total words
total_words = 0
for i in train_sentences:
    total_words += len(i.split())

total_words

101905

In [17]:
train_sentences.shape

(6851,)

In [18]:
# Get the average words per sentence or line
avg_words_per_sentence = round(total_words/len(train_sentences))
avg_words_per_sentence

15

In [19]:
# Normally, total number of words is equal or less than the max vocabulary length
max_vocab_length = 10000

# max length of a statement is equal or more than the average word per sentence or line
max_length = 15

In [20]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

text_vectorizer = TextVectorization(max_tokens = max_vocab_length,
                                    output_mode = "int",
                                    output_sequence_length = max_length)

# Fit the text vectorizer to the training set
text_vectorizer.adapt(train_sentences)

In [21]:
# Create a sample sentence and tokenize it
samp_sentence = "There's a flood in my street!"
print(samp_sentence, f"Sentence length is {len(samp_sentence)}.")
text_vectorizer([samp_sentence])
#len(tf.squeeze(text_vectorizer([samp_sentence])))

There's a flood in my street! Sentence length is 29.


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[282,   3, 206,   4,  13, 674,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [22]:
# Choose random sentence from the training dataset and tokenize it
rand_train_sentence = random.choice(train_sentences)
print(f"Original text: \n\n {rand_train_sentence} \n\nVectorized text:")
text_vectorizer([rand_train_sentence])

Original text: 

 @dicehateme @PuppyShogun This makes sense. Paper beats rock paper comes from wood so wood should be able to support and obliterate rock. 

Vectorized text:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[   1, 9461,   19,  802, 1166, 2352, 3149,  979, 2352, 1019,   20,
        3237,   31, 3237,  149]], dtype=int64)>

In [23]:
# Get unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
words_in_vocab

['',
 '[UNK]',
 'the',
 'a',
 'in',
 'to',
 'of',
 'and',
 'i',
 'is',
 'for',
 'on',
 'you',
 'my',
 'with',
 'it',
 'that',
 'at',
 'by',
 'this',
 'from',
 'are',
 'be',
 'was',
 'have',
 'like',
 'as',
 'me',
 'but',
 'up',
 'just',
 'so',
 'im',
 'not',
 'amp',
 'your',
 'out',
 'all',
 'after',
 'its',
 'has',
 'no',
 'will',
 'an',
 'fire',
 'when',
 'if',
 'we',
 'get',
 'now',
 'more',
 'via',
 'new',
 'about',
 'what',
 'people',
 'or',
 'news',
 'he',
 'they',
 'over',
 'one',
 'been',
 'how',
 'dont',
 'who',
 'video',
 'into',
 'were',
 'do',
 'us',
 'can',
 'emergency',
 '2',
 'disaster',
 'there',
 'her',
 'some',
 'than',
 'still',
 'would',
 'his',
 'police',
 'crash',
 'burning',
 'suicide',
 'body',
 'california',
 'back',
 'time',
 'buildings',
 'had',
 'why',
 'off',
 'them',
 'got',
 'man',
 'see',
 'storm',
 'know',
 'going',
 'cant',
 'world',
 'first',
 'day',
 'rt',
 'nuclear',
 'love',
 'youtube',
 'our',
 'attack',
 'go',
 'fires',
 'two',
 'their',
 'bomb',

In [24]:
len(words_in_vocab)

10000

In [25]:
# Top 5 words in words_in_vocab
words_in_vocab[:5]

['', '[UNK]', 'the', 'a', 'in']

In [26]:
# Least 5 words in words_in_vocab
words_in_vocab[-5:]

['pakthey', 'pakistan\x89Ûªs', 'pakistans', 'pajamas', 'paints']

### Embedding layer

To make our embedding, we are going to use TensorFlow’s embedding layer: https://www.tensorflow.org/api/docs/python/tf/keras/layers/

The parameters are below mostly used.
* input_dim = size of the vocabulary
* output_dim = size of the output embedding vector.  For example, a value of 100 would mean each token gets represented by a vector 100 long
* input_length = length of the sequences being passed to the embedding layer

In [27]:
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim = max_vocab_length,
                             output_dim = 64,
                             embeddings_initializer = "uniform",
                             input_length = max_length)

In [28]:
# Get a random sentence from the training set and return Original text, Vectorized text, Embedded text
rand_sentence = random.choice(train_sentences)
print(f"Original text: {rand_sentence}\n")
print(f"Vectorized text: {text_vectorizer([rand_sentence])}\n")
print(f"Vectorized text shape: {text_vectorizer([rand_sentence]).shape}")
print(f"Embbedded text: {embedding(text_vectorizer([rand_sentence]))}")
print(f"Embedded text shape: {embedding(text_vectorizer([rand_sentence])).shape}")

Original text: There is no greater tragedy than becoming comfortable with where you are in life.

Vectorized text: [[  75    9   41    1  454   78 4101    1   14  208   12   21    4  122
     0]]

Vectorized text shape: (1, 15)
Embbedded text: [[[ 1.26075782e-02  2.85288356e-02  4.39691804e-02  2.18316168e-03
    4.58868407e-02  1.05042085e-02 -1.80365555e-02 -3.05623934e-03
   -3.41365710e-02 -2.00509783e-02 -1.34960040e-02  2.74082161e-02
    2.83643119e-02  3.57388519e-02 -4.94100899e-03  2.17640139e-02
    1.65640227e-02  3.22681330e-02 -4.29302454e-02  2.52962969e-02
    2.96481363e-02 -3.22423130e-03 -3.28739658e-02 -1.99830532e-03
   -4.73808534e-02 -2.99576763e-02 -2.39703897e-02 -1.91247351e-02
   -9.51079279e-03 -1.97924133e-02 -3.11579704e-02 -4.25532945e-02
   -6.57411665e-03 -3.39848883e-02 -4.89907637e-02  3.96311283e-03
    4.21259142e-02 -7.21549988e-03  7.58897141e-03  4.22671475e-02
    1.32018663e-02 -3.43545303e-02  1.08594783e-02  5.91820478e-03
   -4.08825502e-02 

### Model_4: Bidirectional-LSTM Model (RNN)

In [29]:
# Build a Bidirectional RNN using tf
from tensorflow.keras import layers
inputs = layers.Input(shape= (1,), dtype = tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Bidirectional(layers.LSTM(64, return_sequences = True))(x)
x = layers.Bidirectional(layers.GRU(64))(x)
outputs = layers.Dense(1, activation = 'sigmoid')(x)
model_4 = tf.keras.Model(inputs, outputs, name = "model_4_bidirectional")

# Compile the model
model_4.compile(loss = 'binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics = ['accuracy'])

In [30]:
model_4.summary()

Model: "model_4_bidirectional"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 15)                0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 15, 64)            640000    
                                                                 
 bidirectional (Bidirection  (None, 15, 128)           66048     
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               74496     
 onal)                                                           
                                             

In [31]:
# Fit the model
model_4_history = model_4.fit(train_sentences,
                              train_labels,
                              epochs = 5,
                              validation_data = (val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
# Make predictions with our bidirectional model
model_4_preds_probs = model_4.predict(val_sentences)
model_4_preds_probs



array([[8.37476517e-04],
       [7.45571917e-03],
       [1.31698372e-02],
       [3.82663653e-04],
       [8.09200943e-01],
       [8.69984552e-03],
       [2.58309417e-04],
       [5.36024235e-02],
       [1.69344917e-01],
       [9.97609019e-01],
       [9.36134875e-01],
       [7.80503526e-02],
       [2.32805338e-04],
       [1.34736355e-02],
       [1.78580526e-02],
       [8.58788267e-02],
       [7.79334068e-01],
       [3.95308511e-04],
       [9.99916434e-01],
       [8.62272049e-04],
       [2.49181733e-01],
       [9.98743594e-01],
       [1.75687715e-01],
       [9.75726306e-01],
       [3.30374553e-03],
       [5.42663224e-02],
       [9.99594212e-01],
       [7.69283712e-01],
       [9.99810159e-01],
       [3.26174166e-04],
       [2.79099704e-03],
       [2.00613591e-04],
       [7.77914166e-01],
       [5.05566120e-01],
       [2.39427574e-02],
       [2.61462517e-02],
       [1.10328034e-01],
       [2.98398472e-02],
       [4.58436349e-04],
       [4.39229757e-02],


In [34]:
model_4_preds_probs.shape

(762, 1)

In [36]:
tf.squeeze(model_4_preds_probs)

<tf.Tensor: shape=(762,), dtype=float32, numpy=
array([8.37476517e-04, 7.45571917e-03, 1.31698372e-02, 3.82663653e-04,
       8.09200943e-01, 8.69984552e-03, 2.58309417e-04, 5.36024235e-02,
       1.69344917e-01, 9.97609019e-01, 9.36134875e-01, 7.80503526e-02,
       2.32805338e-04, 1.34736355e-02, 1.78580526e-02, 8.58788267e-02,
       7.79334068e-01, 3.95308511e-04, 9.99916434e-01, 8.62272049e-04,
       2.49181733e-01, 9.98743594e-01, 1.75687715e-01, 9.75726306e-01,
       3.30374553e-03, 5.42663224e-02, 9.99594212e-01, 7.69283712e-01,
       9.99810159e-01, 3.26174166e-04, 2.79099704e-03, 2.00613591e-04,
       7.77914166e-01, 5.05566120e-01, 2.39427574e-02, 2.61462517e-02,
       1.10328034e-01, 2.98398472e-02, 4.58436349e-04, 4.39229757e-02,
       1.25538651e-03, 9.99832690e-01, 4.87010879e-03, 9.74357426e-01,
       1.49467727e-03, 3.55872617e-04, 9.98797178e-01, 1.55281872e-01,
       9.92724240e-01, 9.75733936e-01, 2.75566266e-03, 9.99928296e-01,
       9.07589030e-03, 5.7129

In [37]:
model_4_preds = tf.squeeze(tf.round(model_4_preds_probs))
model_4_preds

<tf.Tensor: shape=(762,), dtype=float32, numpy=
array([0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1.,
       0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0.,
       1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0.,
       0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 1., 0.,
       0., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 0., 1., 0., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1.,
       1., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1

In [38]:
val_labels.shape

(762,)

In [39]:
# Evaluate the model by comparing the model_4_preds to val_labels
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model_4_accuracy_score = accuracy_score(val_labels, model_4_preds)
model_4_precision, model_4_recall, model_4_fscore, _ =precision_recall_fscore_support(val_labels,
                                                                                      model_4_preds, average="weighted")

In [40]:
dict_result = {"model_4_accuracy_score": model_4_accuracy_score,
               "model_4_precision": model_4_precision,
               "model_4_recall": {model_4_recall},
               "model_4_f1score": {model_4_fscore}}
dict_result

{'model_4_accuracy_score': 0.7598425196850394,
 'model_4_precision': 0.7615361193455306,
 'model_4_recall': {0.7598425196850394},
 'model_4_f1score': {0.7562429605741445}}