In [21]:
#NLP Notebook
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt

In [22]:
import zipfile 
zip = zipfile.ZipFile("nlp_getting_started.zip")
zip.extractall()
zip.close()

In [28]:
train_df = pd.read_csv("nlp_getting_started/train.csv")
test_df = pd.read_csv("nlp_getting_started/test.csv")

In [30]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [31]:
train_df.tail()

Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


In [33]:
train_df_shuffled = train_df.sample(frac = 1, random_state = 42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [38]:
import random
random_index = random.randint(0, len(train_df) - 1)
for row in train_df_shuffled[['text', 'target']][random_index : random_index + 5].itertuples():
    index, text, target = row
    print(f"Target: {target}; {'real disaster' if target != 0 else 'not a real disaster'}")
    print("Text: ", text, end = "\n")
    print("--"*20)


Target: 0; not a real disaster
Text:  Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...
----------------------------------------
Target: 1; real disaster
Text:  S61.231A Puncture wound without foreign body of left index finger without damage to nail initial encounter #icd10
----------------------------------------
Target: 1; real disaster
Text:  Udhampur terror attack: Militants attack police post 2 SPOs injured http://t.co/zMWeCBWVaO
----------------------------------------
Target: 0; not a real disaster
Text:  The once desolate valley was transformed into a thriving hub of hiÛÓtech business.
----------------------------------------
Target: 0; not a real disaster
Text:  if firefighters acted like cops they'd drive around shooting a flamethrower at burning buildings
----------------------------------------


In [34]:
from sklearn.model_selection import train_test_split
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled['text'].to_numpy(),
                                                                            train_df_shuffled['target'].to_numpy(),
                                                                            test_size = 0.2,
                                                                            random_state = 42)

                                                                            

In [35]:
len(train_sentences), len(val_sentences)

(6090, 1523)

In [37]:
max_vocab_length = 10000 #setting limit on number of words model can learn
max_length = round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))
max_length

15

In [41]:
#convert text to numbers - text vectorisation
text_vectorizer = tf.keras.layers.TextVectorization(max_tokens = max_vocab_length,
                                                 output_sequence_length = max_length)

In [42]:
text_vectorizer.adapt(train_sentences)

In [43]:
random_sentence = random.choice(train_sentences)
print(random_sentence, '\n', len(random_sentence))
text_vectorizer([random_sentence])

A young heavyweight rapping off of detonate I been a leader not a lemon better get it straight ?? 
 97


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[   3, 1003,    1, 4387,  102,    6,  507,    8,   61,    3, 1908,
          34,    3, 4793,  453]])>

In [44]:
words = text_vectorizer.get_vocabulary() #retrieves all words used in dataset as a vocabulary
words 
# '[UNK]' means UNKNOWN 

['',
 '[UNK]',
 'the',
 'a',
 'in',
 'to',
 'of',
 'and',
 'i',
 'is',
 'for',
 'on',
 'you',
 'my',
 'with',
 'it',
 'that',
 'at',
 'by',
 'this',
 'from',
 'be',
 'are',
 'was',
 'have',
 'like',
 'as',
 'so',
 'up',
 'im',
 'but',
 'just',
 'me',
 'your',
 'not',
 'amp',
 'out',
 'its',
 'no',
 'has',
 'will',
 'all',
 'an',
 'after',
 'fire',
 'when',
 'if',
 'via',
 'we',
 'now',
 'get',
 'new',
 'more',
 'or',
 'what',
 'people',
 'over',
 'news',
 'about',
 'he',
 'dont',
 'been',
 'how',
 'who',
 'they',
 'one',
 'into',
 'do',
 'were',
 'us',
 'video',
 '2',
 'emergency',
 'disaster',
 'can',
 'there',
 'his',
 'than',
 'her',
 'still',
 'would',
 'storm',
 'some',
 'police',
 'body',
 'them',
 'crash',
 'back',
 'suicide',
 'man',
 'burning',
 'why',
 'time',
 'day',
 'california',
 'rt',
 'first',
 'see',
 'had',
 'going',
 'world',
 'nuclear',
 'off',
 'know',
 'buildings',
 'bomb',
 'got',
 'cant',
 'our',
 'youtube',
 'love',
 'car',
 'attack',
 'killed',
 'fires',
 'tra

In [45]:
embedding = tf.keras.layers.Embedding(input_dim = max_vocab_length,
                                      output_dim = 120,
                                      input_length = max_length)



In [46]:
sample = embedding(text_vectorizer([random_sentence]))
sample

<tf.Tensor: shape=(1, 15, 120), dtype=float32, numpy=
array([[[-0.04244869, -0.03680097, -0.02805817, ...,  0.04411365,
         -0.02866998,  0.01786036],
        [ 0.01437977, -0.03324739, -0.00604177, ..., -0.01516079,
         -0.04671365,  0.01019373],
        [ 0.00918283, -0.04909357, -0.03701273, ...,  0.04926488,
          0.01956513,  0.00249987],
        ...,
        [-0.04244869, -0.03680097, -0.02805817, ...,  0.04411365,
         -0.02866998,  0.01786036],
        [ 0.0381521 ,  0.01924082, -0.04228556, ..., -0.00395843,
          0.02309265, -0.01705937],
        [-0.01951817,  0.02419404, -0.01492586, ..., -0.01188663,
         -0.00550442, -0.00846777]]], dtype=float32)>

In [48]:
sample[0][0] #for each word there's a vector assigned

<tf.Tensor: shape=(120,), dtype=float32, numpy=
array([-0.04244869, -0.03680097, -0.02805817, -0.04663133,  0.02923149,
        0.04409691, -0.01188319, -0.00391092,  0.02141175, -0.02238894,
       -0.02937688, -0.0340545 ,  0.01052625, -0.04886233,  0.01378752,
       -0.04016966,  0.04714585,  0.04233185,  0.00229738,  0.02793683,
       -0.0152971 ,  0.02105315, -0.00784013, -0.02254069,  0.02612816,
        0.03484404, -0.0366744 , -0.03547295, -0.03939867,  0.01134398,
       -0.04432642, -0.03725815, -0.02055209,  0.00555314,  0.02360162,
       -0.00315198,  0.04300695,  0.01287955, -0.00634867,  0.02777685,
        0.02099793,  0.04763869,  0.04546868,  0.00012308, -0.03878796,
       -0.04406376,  0.02074586,  0.03911132, -0.00931199, -0.02762206,
       -0.00795136,  0.02071022, -0.04422308,  0.000994  , -0.01855528,
       -0.01222867,  0.02672818, -0.03936536,  0.04385774,  0.0022599 ,
        0.03657756,  0.0411781 ,  0.04997564,  0.04583656,  0.0093592 ,
       -0.034374

In [55]:
inputs = tf.keras.layers.Input(shape = (1, ), dtype = 'string') #shape = (1, ) because only 1 sentence is getting processed
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x) #1D because we're only having input_dim = 1
outputs = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)

model = tf.keras.Model(inputs, outputs)
model.compile(loss = tf.keras.losses.BinaryCrossentropy(),
              optimizer = tf.keras.optimizers.legacy.Adam(),
              metrics =  ['accuracy'])



In [56]:
model.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_15 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 15)                0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 15, 120)           1200000   
                                                                 
 global_average_pooling1d_4  (None, 120)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_7 (Dense)             (None, 1)                 121       
                                                                 
Total params: 1200121 (4.58 MB)
Trainable params: 1200121 (

In [60]:
history = model.fit(tf.expand_dims(train_sentences, axis = 1), train_labels,
          epochs = 10,
          validation_data = (val_sentences, val_labels))


Epoch 1/10
  1/191 [..............................] - ETA: 0s - loss: 0.0145 - accuracy: 1.0000

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
