In [1]:
def get_data(url, name):
    """Download ``url`` to the local file ``name`` and unpack it if it is an archive.

    Nothing happens when a directory called ``name`` already exists. The
    download is skipped when a file called ``name`` is already on disk. A file
    whose name ends in ``.zip`` or ``.rar`` is then extracted into the current
    working directory, provided it really is a ZIP archive (``zipfile`` cannot
    read RAR files).

    Problems (failed download, file that is not a ZIP archive) are reported
    through ``warnings.warn`` instead of being silently ignored, so the
    function stays best-effort but no longer hides the cause.

    Args:
        url: Address to fetch the file from.
        name: Local file name the download is saved under.

    Returns:
        None.
    """
    import os
    # `import urllib` alone does not guarantee that the `request` submodule is loaded.
    import urllib.request
    import warnings
    import zipfile

    if os.path.isdir(name):  # a directory with this name already exists
        return

    if not os.path.isfile(name):  # file has not been downloaded yet
        try:
            urllib.request.urlretrieve(url, name)
        except Exception as err:
            # Remove any partial file so that a later call retries the download
            # instead of treating the truncated file as complete.
            if os.path.isfile(name):
                try:
                    os.remove(name)
                except OSError:
                    pass
            warnings.warn(f"Could not download {url!r} to {name!r}: {err}")
            return

    if name.endswith((".zip", ".rar")):
        if not zipfile.is_zipfile(name):
            warnings.warn(f"{name!r} is not a ZIP archive and was not extracted")
            return
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(name, "r") as archive:
            archive.extractall()

In [2]:
# Fetch the dataset archive; get_data also extracts it into the working directory.
url = "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"
get_data(url=url, name="nlp_getting_started.zip")

In [3]:
# Fetch the helper module next to the notebook so that it can be imported below.
url = "https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py"
get_data(url=url, name="helper_functions.py")

from helper_functions import (
    create_tensorboard_callback,
    plot_loss_curves,
    compare_historys,
)

In [4]:
# Visualizing a text dataset: load both CSV files, then preview the training frame
import pandas as pd

train_df, test_df = map(pd.read_csv, ("train.csv", "test.csv"))
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Shuffle the training dataframe with a fixed seed.
# Drawing every row without replacement is the same as sampling with frac=1.
# NOTE: the frame is reassigned in place, so re-running this cell reshuffles already-shuffled rows.
train_df = train_df.sample(n=len(train_df), random_state=42)

In [6]:
# Preview the first five rows of the shuffled frame
train_df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [7]:
# Split the data into training and validation sets (10% held out, fixed seed)
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df["text"].to_numpy(),    # inputs: tweet text
    train_df["target"].to_numpy(),  # labels: the "target" column
    test_size=0.1,
    random_state=42,
)

In [9]:
# Text vectorization: turn raw strings into fixed-length sequences of integer token ids
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization as tv

text_vectorizer = tv(
    max_tokens=10000,                           # upper bound on the vocabulary size
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",
    output_sequence_length=15,                  # every output row has 15 entries
    # NOTE(review): pad_to_max_tokens is documented for the multi_hot/count/tf_idf
    # output modes; confirm whether it has any effect with output_mode="int".
    pad_to_max_tokens=True,
)

In [10]:
# Fit the vectorizer on the training sentences only (the validation split is not used here)
text_vectorizer.adapt(train_sentences)

In [13]:
# Vectorize one hand-written sentence to inspect the integer encoding.
# The vectorizer is called on a list, i.e. a batch containing a single string.
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [16]:
# Create an embedding using an embedding layer
from tensorflow.keras import layers

# input_dim and input_length repeat the literals used for the text vectorizer
# (max_tokens=10000, output_sequence_length=15); keep them in sync by hand.
embedding = layers.Embedding(input_dim=10000, output_dim=128, input_length=15)
embedding

<keras.layers.core.embedding.Embedding at 0x2120b4bd8b0>

In [22]:
import random
# Pick one training sentence at random; no seed is set, so the choice differs between runs.
sample_sentence = random.choice(train_sentences)
# Vectorize the sentence (as a batch of one) and pass the token ids through the embedding layer.
sample_emb = embedding(text_vectorizer([sample_sentence]))
print(f"sentence: {sample_sentence}\nembedding output: \n{sample_emb}")
print("shape of this embedding layer:",sample_emb.shape)

sentence: @muttatek m believe my 'blue' not isis kwwwkwwwk 
Without weapon 'blue' will hug me jiahahahha 
Yeyeulala....
embedding output: 
[[[-0.03759124 -0.04377259  0.02773656 ...  0.01505175 -0.04070671
   -0.03210203]
  [ 0.01040085  0.02860156 -0.04083972 ... -0.02524467  0.00990736
    0.0077399 ]
  [-0.01052104  0.0018298  -0.04811005 ... -0.02443733  0.01460924
   -0.03601595]
  ...
  [ 0.04510109  0.03563361  0.04587003 ... -0.02516408 -0.02001232
    0.01042796]
  [-0.02791005  0.03573992 -0.02323746 ... -0.03033364 -0.04694866
    0.03226638]
  [-0.03759124 -0.04377259  0.02773656 ...  0.01505175 -0.04070671
   -0.03210203]]]
shape of this embedding layer: (1, 15, 128)


In [26]:
from sklearn.naive_bayes import MultinomialNB as mnb
from sklearn.feature_extraction.text import TfidfVectorizer as tfidfv
from sklearn.pipeline import Pipeline as pipeline

In [28]:
# Baseline model: TF-IDF features followed by a multinomial Naive Bayes classifier.
# NOTE(review): the first step is named "tdidf" (probably meant "tfidf"); it is kept
# unchanged because any code addressing the step by name depends on this spelling.
model_0 = pipeline(steps=[("tdidf", tfidfv()), ("clf", mnb())])
model_0.fit(train_sentences, train_labels)

In [32]:
# Score the baseline pipeline on the held-out validation split.
baseline_score = model_0.score(val_sentences, val_labels)
# The score is multiplied by 100 for display; the message itself prints no "%" sign.
print(f"Our baseline model achieves an accuracy of {baseline_score*100:.2f}")

Our baseline model achieves an accuracy of 79.27
