In [1]:
import os
import tensorflow as tf
import pandas as pd

tf.__version__


'2.13.0'

In [2]:
train_dir = r'https://raw.githubusercontent.com/sedeba19/NLP--Random-Models/main/train.csv'
test_dir = r'https://raw.githubusercontent.com/sedeba19/NLP--Random-Models/main/test.csv'

In [3]:
train_dir

'https://raw.githubusercontent.com/sedeba19/NLP--Random-Models/main/train.csv'

In [4]:
# Make a train_df
train_df = pd.read_csv(train_dir)
test_df = pd.read_csv(test_dir)
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Shuffle training dataframe
train_df_shuffled = train_df.sample(frac = 1, random_state = 1)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
3228,4632,emergency%20services,"Sydney, New South Wales",Goulburn man Henry Van Bilsen missing: Emergen...,1
3706,5271,fear,,The things we fear most in organizations--fluc...,0
6957,9982,tsunami,Land Of The Kings,@tsunami_esh ?? hey Esh,0
2887,4149,drown,,@POTUS you until you drown by water entering t...,0
7464,10680,wounds,"cody, austin follows ?*?",Crawling in my skin\nThese wounds they will no...,1


In [6]:
train_df_shuffled.shape

(7613, 5)

In [7]:
# How many examples of each class?
train_df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [8]:
# How many total samples?
len(train_df), len(test_df)

(7613, 3263)

In [9]:
# Visualize some random training examples
import random
random_index = random.randint(0, len(train_df)-5)
for row in train_df_shuffled[["text", "target"]][random_index: random_index +5].itertuples():
    _, text, target = row
    if target > 0:
        print(f"Target:{target}", "(real disaster)")
    else:
        print(f"Target:{target}", "(not a real disaster)")
    
    print(f"Text: \n{text}")
    print("-----\n")

Target:0 (not a real disaster)
Text: 
Calgary news weather and traffic for August 5 * ~ 45 http://t.co/zAGBMlSf4H http://t.co/HVYXehXBmq
-----

Target:0 (not a real disaster)
Text: 
'There is no victory at bargain basement prices.' Dwight David Eisenhower
-----

Target:0 (not a real disaster)
Text: 
Broadcast journalism: hostages to fortune otherwise quot-television blind else quot-operations since-3g superv...
-----

Target:1 (real disaster)
Text: 
The ol' meltdown victory for the Mets.
-----

Target:0 (not a real disaster)
Text: 
Photo: postapocalypticflimflam: Prodding around the rubble. http://t.co/Bgy4i47j70
-----


### Split data into training and validation sets

In [10]:
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df['text'],
                                                                            train_df['target'],
                                                                            test_size = 0.1,
                                                                            random_state = 42)

In [11]:
train_df['text'].shape

(7613,)

In [12]:
train_df["text"]

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [13]:
train_df['text'].to_numpy()

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)

In [14]:
train_df['text'].to_numpy()

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)

In [15]:
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

(6851, 762, 6851, 762)

### Converting text into numbers: Text Vectorization also known as Tokenization

In [16]:
# Get the total words
total_words = 0
for i in train_sentences:
    total_words += len(i.split())

total_words

101905

In [17]:
train_sentences.shape

(6851,)

In [18]:
# Get the average words per sentence or line
avg_words_per_sentence = round(total_words/len(train_sentences))
avg_words_per_sentence

15

In [19]:
# Normally, total number of words is equal or less than the max vocabulary length
max_vocab_length = 10000

# max length of a statement is equal or more than the average word per sentence or line
max_length = 15

In [20]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

text_vectorizer = TextVectorization(max_tokens = max_vocab_length,
                                    output_mode = "int",
                                    output_sequence_length = max_length)

# Fit the text vectorizer to the training set
text_vectorizer.adapt(train_sentences)

In [21]:
# Create a sample sentence and tokenize it
samp_sentence = "There's a flood in my street!"
print(samp_sentence, f"Sentence length is {len(samp_sentence)}.")
text_vectorizer([samp_sentence])
#len(tf.squeeze(text_vectorizer([samp_sentence])))

There's a flood in my street! Sentence length is 29.


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[282,   3, 206,   4,  13, 674,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [22]:
# Choose random sentence from the training dataset and tokenize it
rand_train_sentence = random.choice(train_sentences)
print(f"Original text: \n\n {rand_train_sentence} \n\nVectorized text:")
text_vectorizer([rand_train_sentence])

Original text: 

 Families to sue over Legionnaires: More than 40 families affected by the fatal outbreak of Legionnaires' disease in Edinburgh are to ... 

Vectorized text:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[137,   5, 643,  60, 196,  50,  78, 426, 137, 531,  18,   2, 180,
        325,   6]], dtype=int64)>

In [23]:
# Get unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
words_in_vocab

['',
 '[UNK]',
 'the',
 'a',
 'in',
 'to',
 'of',
 'and',
 'i',
 'is',
 'for',
 'on',
 'you',
 'my',
 'with',
 'it',
 'that',
 'at',
 'by',
 'this',
 'from',
 'are',
 'be',
 'was',
 'have',
 'like',
 'as',
 'me',
 'but',
 'up',
 'just',
 'so',
 'im',
 'not',
 'amp',
 'your',
 'out',
 'all',
 'after',
 'its',
 'has',
 'no',
 'will',
 'an',
 'fire',
 'when',
 'if',
 'we',
 'get',
 'now',
 'more',
 'via',
 'new',
 'about',
 'what',
 'people',
 'or',
 'news',
 'he',
 'they',
 'over',
 'one',
 'been',
 'how',
 'dont',
 'who',
 'video',
 'into',
 'were',
 'do',
 'us',
 'can',
 'emergency',
 '2',
 'disaster',
 'there',
 'her',
 'some',
 'than',
 'still',
 'would',
 'his',
 'police',
 'crash',
 'burning',
 'suicide',
 'body',
 'california',
 'back',
 'time',
 'buildings',
 'had',
 'why',
 'off',
 'them',
 'got',
 'man',
 'see',
 'storm',
 'know',
 'going',
 'cant',
 'world',
 'first',
 'day',
 'rt',
 'nuclear',
 'love',
 'youtube',
 'our',
 'attack',
 'go',
 'fires',
 'two',
 'their',
 'bomb',

In [24]:
len(words_in_vocab)

10000

In [25]:
# Top 5 words in words_in_vocab
words_in_vocab[:5]

['', '[UNK]', 'the', 'a', 'in']

In [26]:
# Least 5 words in words_in_vocab
words_in_vocab[-5:]

['pakthey', 'pakistan\x89Ûªs', 'pakistans', 'pajamas', 'paints']

### Embedding layer

To make our embedding, we are going to use TensorFlow’s embedding layer: https://www.tensorflow.org/api/docs/python/tf/keras/layers/

The parameters are below mostly used.
* input_dim = size of the vocabulary
* output_dim = size of the output embedding vector.  For example, a value of 100 would mean each token gets represented by a vector 100 long
* input_length = length of the sequences being passed to the embedding layer

In [27]:
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim = max_vocab_length,
                             output_dim = 64,
                             embeddings_initializer = "uniform",
                             input_length = max_length)

In [28]:
# Get a random sentence from the training set and return Original text, Vectorized text, Embedded text
rand_sentence = random.choice(train_sentences)
print(f"Original text: {rand_sentence}\n")
print(f"Vectorized text: {text_vectorizer([rand_sentence])}\n")
print(f"Vectorized text shape: {text_vectorizer([rand_sentence]).shape}")
print(f"Embbedded text: {embedding(text_vectorizer([rand_sentence]))}")
print(f"Embedded text shape: {embedding(text_vectorizer([rand_sentence])).shape}")

Original text: Beat:G3 MOTOR VEHICLE COLLISION HIT AND RUN at RAINIER AV S / S CHARLES ST reported on 8/5/2015 6:08 PM Call# 15000270653

Vectorized text: [[   1 2381  921  524  241    7  320   17 4787 2579  251  251 3105  536
   980]]

Vectorized text shape: (1, 15)
Embbedded text: [[[-3.63407135e-02  2.10540779e-02  2.09156312e-02 -2.23741289e-02
    1.80211402e-02 -1.65765882e-02  2.63943411e-02  2.43734568e-04
   -4.00209427e-02  2.97746770e-02  3.07333581e-02 -1.18108988e-02
   -2.77475249e-02  4.79103960e-02  4.69737165e-02 -1.66717172e-02
    4.30268906e-02  4.07264270e-02 -4.52147126e-02  4.93646301e-02
   -4.91703041e-02 -1.60159692e-02 -8.79808515e-03  2.92730965e-02
   -3.66511829e-02 -4.43222038e-02  2.03654803e-02  4.76327874e-02
    4.29905988e-02  3.77776735e-02 -4.69477549e-02  2.63527371e-02
    2.76186131e-02 -3.38257551e-02 -4.77507375e-02  3.84902023e-02
   -4.35551666e-02 -3.05009838e-02 -3.33156139e-02 -1.67606026e-03
    2.97838338e-02  3.71469967e-02  3.15144993

### Model_3: GRU Model

Another popular and effective RNN component is the Gated Recurrent Unit (GRU) Model. The GRU cell has an LSTM cell but has less parameters.

In [29]:
# Build a RNN Model using GRU Cell

from tensorflow.keras import layers
inputs = layers.Input(shape = (1,), dtype = tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GRU(64, return_sequences = True)(x)
x = layers.LSTM(32, return_sequences = True)(x)
x = layers.Dense(64, activation = 'relu')(x)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(1, activation = 'sigmoid')(x)
model_3 = tf.keras.Model(inputs, outputs, name = 'model_3_gru')

# Compile the model
model_3.compile(loss = 'binary_crossentropy',
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ['accuracy'])

In [30]:
model_3.summary()

Model: "model_3_gru"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 15)                0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 15, 64)            640000    
                                                                 
 gru (GRU)                   (None, 15, 64)            24960     
                                                                 
 lstm (LSTM)                 (None, 15, 32)            12416     
                                                                 
 dense (Dense)               (None, 15, 64)            2112      
                                                       

In [31]:
# Fit the model
model_3_history = model_3.fit(train_sentences,
                              train_labels,
                              epochs = 5,
                              validation_data = (val_sentences, val_labels))



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
# Make predictions 
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs[:5]



array([[0.00449536],
       [0.16224536],
       [0.00897315],
       [0.01469627],
       [0.9833816 ]], dtype=float32)

In [33]:
# Convert model_3 pred_prob to labels
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:5]

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 0., 0., 0., 1.], dtype=float32)>

In [34]:
# Evaluate the model by comparing the model_3_preds to val_labels
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model_3_accuracy_score = accuracy_score(val_labels, model_3_preds)
model_3_precision, model_3_recall, model_3_fscore, _ =precision_recall_fscore_support(val_labels,
                                                                                      model_3_preds, average="weighted")

In [35]:
dict_result = {"model_3_accuracy_score": model_3_accuracy_score,
               "model_3_precision": model_3_precision,
               "model_3_recall": {model_3_recall},
               "model_3_f1score": {model_3_fscore}}
dict_result

{'model_3_accuracy_score': 0.7519685039370079,
 'model_3_precision': 0.7512931005930606,
 'model_3_recall': {0.7519685039370079},
 'model_3_f1score': {0.7501882508181344}}