In [1]:
import collections
import os
import tensorflow as tf
import pandas as pd

tf.__version__


'2.13.0'

In [2]:
train_dir = r'https://raw.githubusercontent.com/sedeba19/NLP--Random-Models/main/train.csv'
test_dir = r'https://raw.githubusercontent.com/sedeba19/NLP--Random-Models/main/test.csv'

In [3]:
train_dir

'https://raw.githubusercontent.com/sedeba19/NLP--Random-Models/main/train.csv'

In [4]:
# Make a train_df
train_df = pd.read_csv(train_dir)
test_df = pd.read_csv(test_dir)
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Shuffle training dataframe
train_df_shuffled = train_df.sample(frac = 1, random_state = 1)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
3228,4632,emergency%20services,"Sydney, New South Wales",Goulburn man Henry Van Bilsen missing: Emergen...,1
3706,5271,fear,,The things we fear most in organizations--fluc...,0
6957,9982,tsunami,Land Of The Kings,@tsunami_esh ?? hey Esh,0
2887,4149,drown,,@POTUS you until you drown by water entering t...,0
7464,10680,wounds,"cody, austin follows ?*?",Crawling in my skin\nThese wounds they will no...,1


In [6]:
train_df_shuffled.shape

(7613, 5)

In [7]:
# How many examples of each class?
train_df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [8]:
# How many total samples?
len(train_df), len(test_df)

(7613, 3263)

In [9]:
# Visualize some random training examples
import random
random_index = random.randint(0, len(train_df)-5)
for row in train_df_shuffled[["text", "target"]][random_index: random_index +5].itertuples():
    _, text, target = row
    if target > 0:
        print(f"Target:{target}", "(real disaster)")
    else:
        print(f"Target:{target}", "(not a real disaster)")
    
    print(f"Text: \n{text}")
    print("-----\n")

Target:1 (real disaster)
Text: 
Large sinkhole swallows entire pond in Lowndes County Georgia: Large sinkholeÛ_ http://t.co/bCLDQmMEHg #Occasion2B
-----

Target:0 (not a real disaster)
Text: 
Heard theres two more deaths and a murder chrissie kills adam?  val and finn die? #emmerdale
-----

Target:1 (real disaster)
Text: 
There's a fire in the Catalinas. Looks kinda cool. This picture doesn't do it justice. https://t.co/N0tAwGeZJx
-----

Target:1 (real disaster)
Text: 
Here I'm the UK there isn't a deluge of Canadian themed tops around...The timing was perfect. I can't quite believe it. Mad.
-----

Target:1 (real disaster)
Text: 
BMX issues Areal Flood Advisory for Shelby [AL] till Aug 5 9:00 PM CDT http://t.co/62OddEkVLi
-----


### Split data into training and validation sets

In [10]:
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df['text'],
                                                                            train_df['target'],
                                                                            test_size = 0.1,
                                                                            random_state = 42)

In [11]:
train_df['text'].shape

(7613,)

In [12]:
train_df["text"]

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [13]:
train_df['text'].to_numpy()

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)

In [14]:
train_df['text'].to_numpy()

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)

In [15]:
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

(6851, 762, 6851, 762)

### Converting text into numbers: Text Vectorization also known as Tokenization

In [16]:
# Get the total words
total_words = 0
for i in train_sentences:
    total_words += len(i.split())

total_words

101905

In [17]:
train_sentences.shape

(6851,)

In [18]:
# Get the average words per sentence or line
avg_words_per_sentence = round(total_words/len(train_sentences))
avg_words_per_sentence

15

In [19]:
# Normally, total number of words is equal or less than the max vocabulary length
max_vocab_length = 10000

# max length of a statement is equal or more than the average word per sentence or line
max_length = 15

In [20]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

text_vectorizer = TextVectorization(max_tokens = max_vocab_length,
                                    output_mode = "int",
                                    output_sequence_length = max_length)

# Fit the text vectorizer to the training set
text_vectorizer.adapt(train_sentences)

In [21]:
# Create a sample sentence and tokenize it
samp_sentence = "There's a flood in my street!"
print(samp_sentence, f"Sentence length is {len(samp_sentence)}.")
text_vectorizer([samp_sentence])
#len(tf.squeeze(text_vectorizer([samp_sentence])))

There's a flood in my street! Sentence length is 29.


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[282,   3, 206,   4,  13, 674,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [22]:
# Choose random sentence from the training dataset and tokenize it
rand_train_sentence = random.choice(train_sentences)
print(f"Original text: \n\n {rand_train_sentence} \n\nVectorized text:")
text_vectorizer([rand_train_sentence])

Original text: 

 @minhazmerchant Govt should pass the bills in the Pandemonium. UPA used to do it why cant NDA? 

Vectorized text:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[5077, 1071,  149, 1374,    2, 6054,    4,    2,  510, 7117,  560,
           5,   69,   15,   92]], dtype=int64)>

In [23]:
# Get unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
words_in_vocab

['',
 '[UNK]',
 'the',
 'a',
 'in',
 'to',
 'of',
 'and',
 'i',
 'is',
 'for',
 'on',
 'you',
 'my',
 'with',
 'it',
 'that',
 'at',
 'by',
 'this',
 'from',
 'are',
 'be',
 'was',
 'have',
 'like',
 'as',
 'me',
 'but',
 'up',
 'just',
 'so',
 'im',
 'not',
 'amp',
 'your',
 'out',
 'all',
 'after',
 'its',
 'has',
 'no',
 'will',
 'an',
 'fire',
 'when',
 'if',
 'we',
 'get',
 'now',
 'more',
 'via',
 'new',
 'about',
 'what',
 'people',
 'or',
 'news',
 'he',
 'they',
 'over',
 'one',
 'been',
 'how',
 'dont',
 'who',
 'video',
 'into',
 'were',
 'do',
 'us',
 'can',
 'emergency',
 '2',
 'disaster',
 'there',
 'her',
 'some',
 'than',
 'still',
 'would',
 'his',
 'police',
 'crash',
 'burning',
 'suicide',
 'body',
 'california',
 'back',
 'time',
 'buildings',
 'had',
 'why',
 'off',
 'them',
 'got',
 'man',
 'see',
 'storm',
 'know',
 'going',
 'cant',
 'world',
 'first',
 'day',
 'rt',
 'nuclear',
 'love',
 'youtube',
 'our',
 'attack',
 'go',
 'fires',
 'two',
 'their',
 'bomb',

In [24]:
len(words_in_vocab)

10000

In [25]:
# Top 5 words in words_in_vocab
words_in_vocab[:5]

['', '[UNK]', 'the', 'a', 'in']

In [26]:
# Least 5 words in words_in_vocab
words_in_vocab[-5:]

['pakthey', 'pakistan\x89Ûªs', 'pakistans', 'pajamas', 'paints']

### Embedding layer

To make our embedding, we are going to use TensorFlow’s embedding layer: https://www.tensorflow.org/api/docs/python/tf/keras/layers/

The parameters are below mostly used.
* input_dim = size of the vocabulary
* output_dim = size of the output embedding vector.  For example, a value of 100 would mean each token gets represented by a vector 100 long
* input_length = length of the sequences being passed to the embedding layer

In [27]:
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim = max_vocab_length,
                             output_dim = 64,
                             embeddings_initializer = "uniform",
                             input_length = max_length)

In [28]:
# Get a random sentence from the training set and return Original text, Vectorized text, Embedded text
rand_sentence = random.choice(train_sentences)
print(f"Original text: {rand_sentence}\n")
print(f"Vectorized text: {text_vectorizer([rand_sentence])}\n")
print(f"Vectorized text shape: {text_vectorizer([rand_sentence]).shape}")
print(f"Embbedded text: {embedding(text_vectorizer([rand_sentence]))}")
print(f"Embedded text shape: {embedding(text_vectorizer([rand_sentence])).shape}")

Original text: Lets collide untill we fill the space.. ??

Vectorized text: [[ 626  491 7119   47 2468    2  727    0    0    0    0    0    0    0
     0]]

Vectorized text shape: (1, 15)
Embbedded text: [[[-0.02228637 -0.00947329 -0.0163114  -0.03736948  0.02488824
    0.01472025 -0.00469341 -0.03942858 -0.02593633  0.01290803
    0.00384271 -0.04834062 -0.04896084 -0.01508756  0.0196147
    0.04230246  0.00835015 -0.04076647  0.02260426  0.00933318
    0.01028373  0.04660355  0.00659395 -0.03558195 -0.02024691
   -0.02573478  0.036111   -0.0085351   0.00478745 -0.04428923
    0.02805993 -0.04240767  0.04449243  0.0447212   0.04168984
    0.04138031 -0.03626964 -0.00903473 -0.03090723 -0.00973574
    0.01069944  0.00698551 -0.04655854 -0.0372542   0.03168334
   -0.00199701  0.03509508 -0.03886942  0.02392096 -0.04088578
    0.02755543 -0.0294963  -0.03834801  0.00479911 -0.02095217
   -0.03763348 -0.0410137   0.00165931 -0.04156733  0.04389076
   -0.04110622 -0.02194468  0.01512226  

### Model_6: TensorFlow Hub Pretrained Feature Extractor: Universal Sentence Encoder

https://tfhub.dev/google/universal-sentence-encoder/4
See how the USE was created here: https://arvix.org/abs/1803.11175

In [29]:
# Create a Keras Layer using the USE Pretrained Layer from TensorFlow Hub
import tensorflow_hub as hub
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape = [],
                                        dtype = tf.string,
                                        trainable = False,
                                        name = 'USE')

In [30]:
# Create model using the Sequential API
model_6 = tf.keras.Sequential([
    sentence_encoder_layer,
    layers.Dense(64, activation = 'relu'),
    layers.Dense(1, activation = 'sigmoid')
],  name = "Model_6_USE")

# Compile the model
model_6.compile(loss = 'binary_crossentropy',
                optimizer= tf.keras.optimizers.Adam(),
                metrics = ['accuracy'])

In [31]:
model_6.summary()

Model: "Model_6_USE"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense (Dense)               (None, 64)                32832     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 256830721 (979.73 MB)
Trainable params: 32897 (128.50 KB)
Non-trainable params: 256797824 (979.61 MB)
_________________________________________________________________


In [32]:
# Make predictions with USE TF Hub Model
model_6_pred_probs = model_6.predict(val_sentences)
model_6_pred_probs



array([[0.4924408 ],
       [0.48849168],
       [0.50193673],
       [0.52213603],
       [0.4912856 ],
       [0.51232547],
       [0.5129209 ],
       [0.49586514],
       [0.49343225],
       [0.51077175],
       [0.49312463],
       [0.4901473 ],
       [0.4940431 ],
       [0.5176412 ],
       [0.5052407 ],
       [0.50227845],
       [0.53047943],
       [0.500984  ],
       [0.5200668 ],
       [0.49704862],
       [0.49331227],
       [0.5361298 ],
       [0.47368258],
       [0.503551  ],
       [0.4929105 ],
       [0.4973081 ],
       [0.5297302 ],
       [0.50074947],
       [0.50397336],
       [0.50943464],
       [0.49893928],
       [0.50273246],
       [0.49815634],
       [0.527015  ],
       [0.5055498 ],
       [0.5033864 ],
       [0.48147637],
       [0.4970862 ],
       [0.49884704],
       [0.5139729 ],
       [0.49933702],
       [0.5059863 ],
       [0.49705234],
       [0.49817935],
       [0.50032777],
       [0.5048313 ],
       [0.5057713 ],
       [0.509

In [33]:
val_labels.shape

(762,)

In [34]:
model_6_preds = tf.squeeze(tf.round(model_6_pred_probs))
model_6_preds.shape

TensorShape([762])

In [35]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model_6_accuracy_score = accuracy_score(val_labels, model_6_preds)
model_6_precision, model_6_recall, model_6_f1score, _, =precision_recall_fscore_support(val_labels, model_6_preds, average = "weighted")

In [36]:
dict_result = {"model_6_accuracy_score": model_6_accuracy_score,
               "model_6_precision": model_6_precision,
               "model_6_recall": {model_6_recall},
               "model_6_f1score": {model_6_f1score}}
dict_result

{'model_6_accuracy_score': 0.5498687664041995,
 'model_6_precision': 0.5858114524864652,
 'model_6_recall': {0.5498687664041995},
 'model_6_f1score': {0.5401814795219433}}