In [None]:
# List the GPUs visible to this runtime (one line per device).
!nvidia-smi -L

In [None]:
import tensorflow as tf

In [None]:
import zipfile
import matplotlib.pyplot as plt

In [None]:
def unzip_data(filename):
  """Extract every member of a zip archive into the current working directory.

  Args:
    filename: Path to the zip archive to extract.

  Raises:
    FileNotFoundError: If `filename` does not exist.
    zipfile.BadZipFile: If `filename` is not a valid zip archive.
  """
  # The context manager closes the archive even if extraction fails part-way.
  with zipfile.ZipFile(filename) as zip_ref:
    zip_ref.extractall()

In [None]:
def plot_loss_curves(history):
  """Plot training/validation loss and accuracy curves on two separate figures.

  Args:
    history: Object exposing a `history` dict with the keys "loss",
      "val_loss", "accuracy" and "val_accuracy", each a per-epoch sequence
      (e.g. the object returned by a Keras `fit` call).
  """
  loss = history.history["loss"]
  val_loss = history.history["val_loss"]
  accuracy = history.history["accuracy"]
  val_accuracy = history.history["val_accuracy"]
  epochs = range(len(loss))

  # Loss curves go onto the current figure.
  plt.plot(epochs, loss, label="training loss")
  plt.plot(epochs, val_loss, label="validation loss")
  plt.title("LOSS")
  plt.xlabel("epochs")
  plt.legend()

  # Open a fresh figure so the accuracy curves are not drawn over the loss chart.
  plt.figure()
  plt.plot(epochs, accuracy, label="training accuracy")
  plt.plot(epochs, val_accuracy, label="validation accuracy")
  plt.title("ACCURACY")
  plt.xlabel("epochs")
  plt.legend()


In [None]:
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

In [None]:
# wget saves the archive into the working directory, so a relative path works
# on Colab (where the working directory is /content) and on a local machine alike.
unzip_data("nlp_getting_started.zip")

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training CSV from the working directory.
train = pd.read_csv("train.csv")

In [None]:
# Preview the first five rows.
train.head(5)

In [None]:
# Load the test CSV from the working directory and preview its first five rows.
test = pd.read_csv("test.csv")
test.head(5)

In [None]:
# Column dtypes, non-null counts and memory usage.
train.info()

In [None]:
# Types of a single cell, of a single row (.loc[0]) and of the whole frame.
type(train.loc[0]["id"]), type(train.loc[0]), type(train)

In [None]:
# .loc slices by label and includes the end label: rows labelled 0 through 3.
train.loc[0:3]

In [None]:
# .iloc slices by position and excludes the end: positions 0, 1 and 2.
train.iloc[0:3]

In [None]:
# Boolean-mask selection: rows whose id equals 1 (the inner parentheses are optional).
train.loc[(train.id==1)]

In [None]:
# The same boolean-mask selection written without the extra parentheses.
train.loc[train.id==1]

In [None]:
# A list of positions returns those rows in the order given (0, 3, 2).
train.iloc[[0,3, 2]]

In [None]:
# A list of labels selects the rows labelled 0 and 2.
train.loc[[0,2]]

In [None]:
# Text and target of the row labelled 0.
train.loc[0]["text"], train.loc[0]["target"]

In [None]:
# All rows whose target is 0.
train.loc[(train.target==0)]

In [None]:
# Bracket-style column access builds the same boolean mask as attribute access.
# NOTE: train.iloc[(train.id==23)] raises, because .iloc rejects a boolean Series
# as a mask; it accepts a plain boolean array, e.g. (train.id==23).to_numpy().
train.loc[(train["id"]==23)]

### From the output above: target 0 ==> normal (non-disaster) tweet, target 1 ==> disaster tweet

In [None]:
# frac=1 returns every row in random order (original index labels are kept);
# random_state makes the shuffle repeatable.
train_shuffled = train.sample(frac=1, random_state = 42)

In [None]:
# First five rows of the shuffled frame.
train_shuffled.head(5)

In [None]:
# Class balance: number of tweets per target value.
train["target"].value_counts()

In [None]:
# Dataset size, followed by the fraction of rows with target 0 and with target 1.
(
    len(train),
    len(train[train["target"] == 0]) / len(train),
    len(train[train["target"] == 1]) / len(train),
)

In [None]:
# Row counts of the two splits and their sum.
n_train_samples, n_test_samples = len(train), len(test)
print(f"Total Train samples:{n_train_samples}")
print(f"Total test samples: {n_test_samples}")
print(f"Total samples: {n_train_samples + n_test_samples}")

In [None]:
# Ten random rows; drawing them directly avoids sampling a quarter of the
# frame only to discard all but the first ten.
train.sample(n=10)

In [None]:
# Pick the start of a random five-row window for visualising training samples.
import random

# randint is inclusive on both ends; capping at len(train) - 5 keeps the window inside the frame.
last_start = len(train) - 5
random_idx = random.randint(0, last_start)

In [None]:
# Show the start position that was drawn.
random_idx

In [None]:
# Print five consecutive rows of the shuffled frame, starting at random_idx.
sample_rows = train_shuffled[["text", "target"]][random_idx:random_idx + 5]
for row in sample_rows.itertuples():
  text, target = row.text, row.target
  if target > 0:
    print(f"{target} (real disaster)")
  else:
    print(f"{target} (not real disaster)")
  print(f"Text: {text}")
  print("\n")

In [None]:
# Selecting with a list of columns yields a DataFrame; its length is the number of rows.
type(train_shuffled[['text','target']]), len(train_shuffled[['text','target']])

In [None]:
# An integer slice on a DataFrame is positional: the first two shuffled rows (compare train[0:2]).
train_shuffled[['text','target']][0:2]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the shuffled tweets for validation (fixed seed for a repeatable split).
# `Series.to_numpy()` is interchangeable with `np.array(series)` here.
tweet_texts = train_shuffled["text"].to_numpy()
tweet_targets = train_shuffled["target"].to_numpy()

train_data, val_data, train_labels, val_labels = train_test_split(
    tweet_texts,
    tweet_targets,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Preview a few training texts instead of echoing the whole array.
train_data[:5]

In [None]:
# Sizes of the train/validation texts and labels (test_size=0.1 gives a 90/10 split).
len(train_data),len(val_data), len(train_labels), len(val_labels)

In [None]:
# Container type of the training texts and their total whitespace-separated word count.
type(train_data), sum(len(sent.split()) for sent in train_data)

In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
# Reference only (not executed): TextVectorization with its default arguments written out.
# text_vectorizer = TextVectorization(max_tokens=None,
#                                     standardize="lower_and_strip_punctuation",
#                                     split="whitespace",
#                                     ngrams=None,
#                                     output_mode="int",
#                                     output_sequence_length=None)

In [None]:
# Vocabulary size cap (used as max_tokens).
max_vocab_length = 10000

# Sequence length (used as output_sequence_length): the average number of
# whitespace-separated words per training tweet, rounded to an integer.
total_words = sum(len(sent.split()) for sent in train_data)
max_length = round(total_words / len(train_data))

In [None]:
# Maps each tweet to a fixed-length sequence of integer token ids.
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,        # vocabulary size limit
    output_mode="int",                  # one integer id per token
    output_sequence_length=max_length,  # pad/truncate every tweet to this length
)

In [None]:
# Fit the vocabulary on the training texts only.
text_vectorizer.adapt(train_data)
# During adapt(), the layer builds a vocabulary of all string tokens seen in the dataset,
# sorted by occurrence count, with ties broken by sort order of the tokens (high to low).

In [None]:
# Quick check of the text vectorizer on a hand-written sentence, passed as a batch (list) of one string.
text_vectorizer(["I am enough!"])

In [None]:
# Vectorize a random training tweet. It is wrapped in a list so the layer
# receives a batch of one, matching the other text_vectorizer calls in this notebook.
random_sent = random.choice(train_data)
text_vectorizer([random_sent])

In [None]:
words_in_vocab = text_vectorizer.get_vocabulary()

# With output_mode="int" the vocabulary starts with two special tokens
# ('' for padding and '[UNK]' for out-of-vocabulary words), so they head the
# "most common" slice below.
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]
print(f"Total words in vocabulary: {len(words_in_vocab)}")
print(f"five most common words: {top_5_words}")
print(f"five most uncommon words: {bottom_5_words}")

In [None]:
from tensorflow.keras import layers

# Seed TensorFlow's global RNG before the layer is created.
tf.random.set_seed(42)

# Lookup table with one 128-dimensional vector per vocabulary index.
# NOTE(review): `input_length` is deprecated in Keras 3 — confirm against the installed TF/Keras version.
embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
    input_length=max_length,
    name="embedding_1",
)

In [None]:
# Show the layer object's repr.
embedding

In [None]:
# Walk one random training tweet through both layers: text -> token ids -> embedding vectors.
random_sentence = random.choice(train_data)
encoded_sentence = text_vectorizer([random_sentence])  # batch of one

print("Original sentence: ", random_sentence)
print("Numerical encoded sentence: ", encoded_sentence)
print("Embedded version: ")
sample_embed = embedding(encoded_sentence)
print(sample_embed)

In [None]:
# Embedding vector of the first token of the first (only) sentence in the batch.
sample_embed[0, 0]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer