In [18]:
import pandas as  pd 
import numpy as np
import tensorflow as tf 
from tensorflow.keras.layers import  Dense,TextVectorization,Embedding,Input
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [6]:

def create_model(model_url, num_classes=10):
  """Takes a TensorFlow Hub URL and creates a Keras Sequential model with it.
  
  Args:
    model_url (str): A TensorFlow Hub feature extraction URL.
    num_classes (int): Number of output neurons in output layer,
      should be equal to number of target classes, default 10.

  Returns:
    An uncompiled Keras Sequential model with model_url as feature
    extractor layer and Dense output layer with num_classes outputs.
  """
  # Download the pretrained model and save it as a Keras layer
  feature_extractor_layer = hub.KerasLayer(model_url,
                                           trainable=False, # freeze the underlying patterns
                                           name='feature_extraction_layer',
                                           input_shape=IMAGE_SHAPE+(3,)) # define the input image shape
  
  # Create our own model
  model = tf.keras.Sequential([
    feature_extractor_layer, # use the feature extraction layer as the base
    layers.Dense(num_classes, activation='softmax', name='output_layer') # create our own output layer      
  ])

  return model

In [8]:
df=pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [10]:
X=df["text"]
Y=df["target"]
x_train,x_val,y_train,y_val= train_test_split(X,Y, random_state=1,test_size=0.30, shuffle=True,stratify=Y)


In [12]:
# Basic model is build 

model_0 = Pipeline([
    ("tfid",TfidfVectorizer()),
    ("MultinomialNB",MultinomialNB())
])
model_0.fit(x_train,y_train)

In [15]:
def model_performance(model,predictor,target):
    pre=model.predict(predictor)
    recall=metrics.recall_score(target,pre)
    accuracy=metrics.accuracy_score(target,pre)
    f1_score=metrics.f1_score(target,pre)
    precision=metrics.precision_score(target,pre)
    final_matrics=pd.DataFrame([
        {
            "recall":recall,
            "precision":precision,
            "accuracy":accuracy,
            "f1_score":f1_score
        }
    ],index=[0])
    return final_matrics

In [16]:
model_performance(model_0,x_train,y_train)

Unnamed: 0,recall,precision,accuracy,f1_score
0,0.768996,0.974004,0.891912,0.859444


In [79]:
base_line=model_performance(model_0,x_val,y_val)
base_line


Unnamed: 0,recall,precision,accuracy,f1_score
0,0.590214,0.874622,0.787653,0.704808


### we are now building the NLP model 

# first create the input 

In [46]:
## first we need to adapt the textvector 
text_vector=TextVectorization(max_tokens=1000,output_mode="int",output_sequence_length=15)
text_vector.adapt(x_train)

In [71]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, GlobalAveragePooling1D, Dense

# Define input layer
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)

# Text vectorization layer
x = text_vector(inputs)

# Embedding layer
x = Embedding(input_dim=1000,output_dim=128)(x)  # Adjust output_dim as needed

# Global average pooling
x = GlobalAveragePooling1D()(x)

# Output layer
outputs = Dense(1, activation="sigmoid")(x)

# Create the model
model_2 = tf.keras.Model(inputs, outputs)
model_2.summary()


In [72]:
## compile the modmode
# Compile model
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [73]:
model_2_history=model_2.fit(
    x_train,
    y_train,
    epochs=5,
    validation_data=(x_val,y_val))

Epoch 1/5


[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6096 - loss: 0.6603 - val_accuracy: 0.7294 - val_loss: 0.5790
Epoch 2/5
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7583 - loss: 0.5434 - val_accuracy: 0.7631 - val_loss: 0.5091
Epoch 3/5
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7953 - loss: 0.4623 - val_accuracy: 0.7798 - val_loss: 0.4854
Epoch 4/5
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8195 - loss: 0.4170 - val_accuracy: 0.7802 - val_loss: 0.4749
Epoch 5/5
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8238 - loss: 0.4025 - val_accuracy: 0.7837 - val_loss: 0.4731


In [70]:
prc=model_2.predict(x_train)
prc[:5]

[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 401us/step


array([[0.43593556],
       [0.6119642 ],
       [0.60438526],
       [0.6693997 ],
       [0.95964515]], dtype=float32)

In [74]:
def model_performance(model,predictor,target):
    pre=model.predict(predictor)
    pre=tf.squeeze(np.round(pre))
    recall=metrics.recall_score(target,pre)
    accuracy=metrics.accuracy_score(target,pre)
    f1_score=metrics.f1_score(target,pre)
    precision=metrics.precision_score(target,pre)
    final_matrics=pd.DataFrame([
        {
            "recall":recall,
            "precision":precision,
            "accuracy":accuracy,
            "f1_score":f1_score
        }
    ],index=[0])
    return final_matrics

In [75]:
model_performance(model_2,x_train,y_train)

[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 752us/step


Unnamed: 0,recall,precision,accuracy,f1_score
0,0.761572,0.838059,0.834303,0.797987


In [80]:
nlp_model=model_performance(model_2,x_val,y_val)
nlp_model


[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 477us/step


Unnamed: 0,recall,precision,accuracy,f1_score
0,0.698267,0.775764,0.783713,0.734979


In [81]:
base_line,nlp_model

(     recall  precision  accuracy  f1_score
 0  0.590214   0.874622  0.787653  0.704808,
      recall  precision  accuracy  f1_score
 0  0.698267   0.775764  0.783713  0.734979)

In [84]:
words_vocab=text_vector.get_vocabulary()
words_vocab[:5]

['', '[UNK]', 'the', 'a', 'in']

### Get the weight matrix of embedding layer 
## (these are the numerical patterns between the text in the training dataset the model has learned)
- embed_weights = model_1.get_layer("embedding_1").get_weights()[0]
print(embed_weights.shape) # same size as vocab size and embedding_dim (each word is a embedding_dim size vector)
(10000, 128)
Now we've got these two objects, we can use the Embedding Projector tool to visualize our embedding.

To use the Embedding Projector tool, we need two files:

The embedding vectors (same as embedding weights).
The meta data of the embedding vectors (the words they represent - our vocabulary).
Right now, we've got of these files as Python objects. To download them to file, we're going to use the code example available on the TensorFlow word embeddings tutorial page.

<!-- ## Code below is adapted from: https://www.tensorflow.org/tutorials/text/word_embeddings#retrieve_the_trained_word_embeddings_and_save_them_to_disk
## import io

## # Create output writers
## out_v = io.open("embedding_vectors.tsv", "w", encoding="utf-8")
## out_m = io.open("embedding_metadata.tsv", "w", encoding="utf-8")

# # Write embedding vectors and words to file
# for num, word in enumerate(words_in_vocab):
#   if num == 0: 
#      continue # skip padding token
#   vec = embed_weights[num]
#   out_m.write(word + "\n") # write words to file
#   out_v.write("\t".join([str(x) for x in vec]) + "\n") # write corresponding word vector to file
# out_v.close()
# out_m.close()

# # Download files locally to upload to Embedding Projector
# try:
#   from google.colab import files
# except ImportError:
#   pass
# else:
#   files.download("embedding_vectors.tsv")
#   files.download("embedding_metadata.tsv") -->
Once you've downloaded the embedding vectors and metadata, you can visualize them using Embedding Vector tool:

Go to http://projector.tensorflow.org/
Click on "Load data"
Upload the two files you downloaded (embedding_vectors.tsv and embedding_metadata.tsv)
Explore
Optional: You can share the data you've created by clicking "Publish"
What do you find?

Are words with similar meanings close together?

Remember, they might not be. The embeddings we downloaded are how our model interprets words, not necessarily how we interpret them.

Also, since the embedding has been learned purely from Tweets, it may contain some strange values as Tweets are a very unique style of natural language.

🤔 Question: Do you have to visualize embeddings every time?

No. Although helpful for gaining an intuition of what natural language embeddings are, it's not completely necessary. Especially as the dimensions of your vocabulary and embeddings grow, trying to comprehend them would become an increasingly difficult task.