Checking GPU runtime

In [None]:
# Query the NVIDIA driver to confirm which GPU (if any) is attached to this runtime
!nvidia-smi

Wed Jan 25 02:26:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P0    28W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Helper Functions

In [None]:
### We create a bunch of helpful functions throughout the course.
### Storing them here so they're easily accessible.

import tensorflow as tf

# Create a function to import an image and resize it to be able to be used with our model
def load_and_prep_image(filename, img_shape=224, scale=True):
  """
  Reads in an image from filename, decodes it into a 3-channel tensor and
  resizes it to (img_shape, img_shape, 3).

  Parameters
  ----------
  filename (str): string filename of target image
  img_shape (int): size to resize target image to, default 224
  scale (bool): whether to scale pixel values to range(0, 1), default True

  Returns
  -------
  The resized image tensor, divided by 255 when scale is True and left
  unscaled otherwise.
  """
  # Read in the raw file contents
  img = tf.io.read_file(filename)
  # Decode into a tensor. channels=3 forces RGB output so that grayscale
  # files do not produce a 1-channel tensor the model cannot consume.
  img = tf.image.decode_jpeg(img, channels=3)
  # Resize the image to a square of side img_shape
  img = tf.image.resize(img, [img_shape, img_shape])
  if scale:
    # Rescale the image (get all values between 0 and 1)
    return img/255.
  return img

# Note: The following confusion matrix code is a remix of Scikit-Learn's 
# plot_confusion_matrix function - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.plot_confusion_matrix.html
import itertools
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix

# Our function needs a different name to sklearn's plot_confusion_matrix
def make_confusion_matrix(y_true, y_pred, classes=None, figsize=(10, 10), text_size=15, norm=False, savefig=False):
  """Makes a labelled confusion matrix comparing predictions and ground truth labels.

  If classes is passed, confusion matrix will be labelled, if not, integer class values
  will be used.

  Args:
    y_true: Array of truth labels (must be same shape as y_pred).
    y_pred: Array of predicted labels (must be same shape as y_true).
    classes: Sequence or array of class labels (e.g. string form). If `None` or
      empty, integer labels are used.
    figsize: Size of output figure (default=(10, 10)).
    text_size: Size of output figure text (default=15).
    norm: If True, each cell also shows its share of the row (true class) total
      as a percentage (default=False).
    savefig: If True, save the figure as "confusion_matrix.png" in the current
      working directory (default=False).

  Returns:
    None. The confusion matrix is drawn on a newly created matplotlib figure.

  Example usage:
    make_confusion_matrix(y_true=test_labels, # ground truth test labels
                          y_pred=y_preds, # predicted labels
                          classes=class_names, # array of class label names
                          figsize=(15, 15),
                          text_size=10)
  """
  # Create the confusion matrix
  cm = confusion_matrix(y_true, y_pred)
  n_classes = cm.shape[0] # find the number of classes we're dealing with

  # Row-normalise only when requested. A class that never occurs in y_true has a
  # row total of 0; divide by 1 there so the cell reads 0.0% instead of nan.
  cm_norm = None
  if norm:
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    safe_totals = row_totals.copy()
    safe_totals[safe_totals == 0] = 1
    cm_norm = cm.astype("float") / safe_totals

  # Plot the figure and make it pretty
  fig, ax = plt.subplots(figsize=figsize)
  cax = ax.matshow(cm, cmap=plt.cm.Blues) # darker cells hold larger counts
  fig.colorbar(cax)

  # Is there a list of classes? `is not None` (rather than truthiness) is required
  # because `if classes:` raises ValueError for a multi-element NumPy array.
  if classes is not None and len(classes) > 0:
    labels = classes
  else:
    labels = np.arange(n_classes)

  # Label the axes
  ax.set(title="Confusion Matrix",
         xlabel="Predicted label",
         ylabel="True label",
         xticks=np.arange(n_classes), # create enough axis slots for each class
         yticks=np.arange(n_classes),
         xticklabels=labels, # axes will be labelled with class names (if they exist) or ints
         yticklabels=labels)

  # Make x-axis labels appear on bottom
  ax.xaxis.set_label_position("bottom")
  ax.xaxis.tick_bottom()

  # Cells above the midpoint of the count range get white text for contrast
  threshold = (cm.max() + cm.min()) / 2.

  # Plot the text on each cell (x = predicted/column index, y = true/row index)
  for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    if norm:
      cell_text = f"{cm[i, j]} ({cm_norm[i, j]*100:.1f}%)"
    else:
      cell_text = f"{cm[i, j]}"
    ax.text(j, i, cell_text,
            horizontalalignment="center",
            color="white" if cm[i, j] > threshold else "black",
            size=text_size)

  # Save the figure to the current working directory
  if savefig:
    fig.savefig("confusion_matrix.png")
  
# Make a function to predict on images and plot them (works with multi-class)
def pred_and_plot(model, filename, class_names):
  """
  Loads the image at filename, runs it through a trained model and shows the
  image with the predicted class name as the plot title.

  Args:
    model: trained model exposing a `predict` method
    filename: path of the image to predict on
    class_names: indexable collection mapping class index -> class name
  """
  # Load and preprocess with the helper's default settings
  image = load_and_prep_image(filename)

  # The model expects a batch, so add a leading batch axis before predicting
  batch = tf.expand_dims(image, axis=0)
  probs = model.predict(batch)

  # More than one output value -> multi-class (take the arg-max);
  # a single output value -> binary (round it to 0 or 1)
  is_multiclass = len(probs[0]) > 1
  if is_multiclass:
    label_idx = probs.argmax()
  else:
    label_idx = int(tf.round(probs)[0][0])
  pred_class = class_names[label_idx]

  # Show the image with the prediction as its title, hiding the axes
  plt.imshow(image)
  plt.title(f"Prediction: {pred_class}")
  plt.axis(False)
  
import datetime

def create_tensorboard_callback(dir_name, experiment_name):
  """
  Builds a TensorBoard callback instance that stores its log files under
  a timestamped directory.

  Log files are written to:
    "dir_name/experiment_name/current_datetime/"

  Args:
    dir_name: target directory to store TensorBoard log files
    experiment_name: name of experiment directory (e.g. efficientnet_model_1)

  Returns:
    The configured tf.keras.callbacks.TensorBoard instance.
  """
  # Timestamp (YYYYMMDD-HHMMSS) keeps repeated runs of one experiment apart
  timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  log_dir = "/".join((dir_name, experiment_name, timestamp))

  callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
  print(f"Saving TensorBoard log files to: {log_dir}")
  return callback

# Plot the validation and training data separately
import matplotlib.pyplot as plt

def plot_loss_curves(history):
  """
  Plots training vs. validation loss and training vs. validation accuracy
  as two separate charts.

  Args:
    history: TensorFlow model History object (see: https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/History)
      whose `.history` dict holds 'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
  """
  record = history.history
  # Pull out all four series up front so a missing key fails before anything is drawn
  train_loss, val_loss, train_acc, val_acc = (
      record[key] for key in ('loss', 'val_loss', 'accuracy', 'val_accuracy'))

  # One x position per recorded epoch
  epochs = range(len(record['loss']))

  panels = (
      ('Loss', train_loss, 'training_loss', val_loss, 'val_loss'),
      ('Accuracy', train_acc, 'training_accuracy', val_acc, 'val_accuracy'),
  )
  for position, (title, train_vals, train_label, val_vals, val_label) in enumerate(panels):
    if position:
      # The loss chart uses the current figure; accuracy gets a new one
      plt.figure()
    plt.plot(epochs, train_vals, label=train_label)
    plt.plot(epochs, val_vals, label=val_label)
    plt.title(title)
    plt.xlabel('Epochs')
    plt.legend()

def compare_histories(original_history, new_history, initial_epochs=5):
    """
    Plots the metrics of two consecutive training runs as one continuous curve.

    Args:
      original_history: History object from original model (before new_history)
      new_history: History object from continued model training (after original_history)
      initial_epochs: Number of epochs in original_history; a vertical marker is
        drawn at x = initial_epochs - 1 to show where the second run starts
    """
    # Metrics recorded during the first run
    first = original_history.history
    first_acc, first_loss = first["accuracy"], first["loss"]
    first_val_acc, first_val_loss = first["val_accuracy"], first["val_loss"]

    # Append the second run's metrics to the first run's
    second = new_history.history
    total_acc = first_acc + second["accuracy"]
    total_loss = first_loss + second["loss"]
    total_val_acc = first_val_acc + second["val_accuracy"]
    total_val_loss = first_val_loss + second["val_loss"]

    # x coordinates of the vertical "fine tuning starts here" marker
    marker_x = [initial_epochs-1, initial_epochs-1]

    # (train series, train label, val series, val label, legend position, title)
    panels = (
        (total_acc, 'Training Accuracy', total_val_acc, 'Validation Accuracy',
         'lower right', 'Training and Validation Accuracy'),
        (total_loss, 'Training Loss', total_val_loss, 'Validation Loss',
         'upper right', 'Training and Validation Loss'),
    )

    plt.figure(figsize=(8, 8))
    for row, (train_vals, train_label, val_vals, val_label, legend_loc, title) in enumerate(panels, start=1):
        plt.subplot(2, 1, row)
        plt.plot(train_vals, label=train_label)
        plt.plot(val_vals, label=val_label)
        # Span the marker over the current y-limits of this subplot
        plt.plot(marker_x, plt.ylim(), label='Start Fine Tuning')
        plt.legend(loc=legend_loc)
        plt.title(title)
    plt.xlabel('epoch')
    plt.show()
  
# Create function to unzip a zipfile into current working directory 
# (since we're going to be downloading and unzipping a few files)
import zipfile

def unzip_data(filename):
  """
  Unzips filename into the current working directory.

  Args:
    filename (str): a filepath to a target zip folder to be unzipped.
  """
  # The context manager closes the archive even if extraction raises,
  # so the file handle is never leaked.
  with zipfile.ZipFile(filename, "r") as zip_ref:
    zip_ref.extractall()

# Walk through an image classification directory and find out how many files (images)
# are in each subdirectory.
import os

def walk_through_dir(dir_path):
  """
  Recursively walks dir_path and prints a one-line summary for every directory found.

  Args:
    dir_path (str): target directory

  Returns:
    None. For dir_path and each directory beneath it, prints:
      the number of immediate subdirectories
      the number of files (reported as images)
      the directory's path
  """
  for current_dir, subdirs, files in os.walk(dir_path):
    n_dirs, n_files = len(subdirs), len(files)
    print(f"There are {n_dirs} directories and {n_files} images in '{current_dir}'.")
    
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Computes accuracy, precision, recall and f1 score for a set of predictions.

  Args:
      y_true: true labels in the form of a 1D array
      y_pred: predicted labels in the form of a 1D array

  Returns a dictionary with keys "accuracy", "precision", "recall" and "f1".
  Accuracy is multiplied by 100; the other three are the "weighted" averages
  exactly as returned by scikit-learn.
  """
  # Accuracy, multiplied by 100
  accuracy_pct = accuracy_score(y_true, y_pred) * 100
  # Weighted-average precision / recall / f1 (the fourth value, support, is unused)
  precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  return dict(accuracy=accuracy_pct,
              precision=precision,
              recall=recall,
              f1=f1)

## Get a text dataset

In [None]:
# Download the dataset
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

# Unzip data using helper function
unzip_data('nlp_getting_started.zip')

--2023-01-25 02:26:49--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.16.128, 142.251.163.128, 142.251.167.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.16.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2023-01-25 02:26:49 (43.4 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



## Visualize the data

In [None]:
import pandas as pd

# Read the train and test CSV files into dataframes
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Peek at the first training rows
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Text of the training row labelled 1 (single .loc lookup instead of chained indexing)
train_df.loc[1, 'text']

'Forest fire near La Ronge Sask. Canada'

In [None]:
# Text of the test row labelled 1 (single .loc lookup instead of chained indexing)
test_df.loc[1, 'text']

'Heard about #earthquake is different cities, stay safe everyone.'

In [None]:
# Shuffle the training dataframe (frac=1 keeps every row; fixed seed for reproducibility)
train_df_shuffled = train_df.sample(frac=1, random_state=42)
# Display the shuffled frame, not the original, to confirm the row order changed
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Shuffle the test dataframe (sample from test_df, not train_df)
test_df_shuffled = test_df.sample(frac=1, random_state=42)
# Keep the original, misspelled name as an alias in case other cells refer to it
test_df_shiffled = test_df_shuffled
test_df_shuffled.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
# How many examples of each class are in the training data?
# (The test dataframe has no `target` column, so there is nothing to count for it.)
print(f"Training class counts:\n{train_df.target.value_counts()}")

Training class: 
{} Test Class:{} 0    4342
1    3271
Name: target, dtype: int64


In [None]:
# Sample counts: (training, test, combined)
(len(train_df), len(test_df), len(train_df) + len(test_df))

(7613, 3263, 10876)

In [None]:
# Visualize 5 consecutive rows of the shuffled training data, starting at a random offset
import random

random_idx = random.randint(0, len(train_df) - 5)
sample_rows = train_df_shuffled[['text', 'target']].iloc[random_idx:random_idx + 5]
for _, text, target in sample_rows.itertuples():
    verdict = '(real disaster)' if target > 0 else '(not real disaster)'
    print(f'Target : {target}', verdict)
    print(f'text:\n{text}\n')

Target : 0 (not real disaster)
text:
@UABStephenLong @courtlizcamp Total tweet fail! You are so beautiful inside and out Blaze On!

Target : 1 (real disaster)
text:
Road closures remain in effect due to hazard trees falling tree torching and uphill runs of the fire. Forest Service Road #1 remains close

Target : 0 (not real disaster)
text:
#socialmedia news - New Facebook Page Features Seek to Help Personalize the Customer Experience http://t.co/nbizaTlsmV

Target : 1 (real disaster)
text:
News786-UK Islamist Cleric Anjem Choudary Charged Under Terrorism Act: http://t.co/u7bBeNXWYK

Target : 0 (not real disaster)
text:
http://t.co/kG5pLkeDhr WRAPUP 2-U.S. cable TV companies' shares crushed after Disney disappoints http://t.co/QeIhvn3DNQ



In [None]:
## Split into training and validation sets
from sklearn.model_selection import train_test_split

# Hold out 10% of the shuffled training data for validation; the fixed
# random_state makes the split reproducible.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffled["text"].to_numpy(),
    train_df_shuffled["target"].to_numpy(),
    test_size=0.1,
    random_state=42,
)

 ## Text Vectorization (Tokenization)

In [None]:
import tensorflow as tf
# TextVectorization is available under tf.keras.layers; the older
# `layers.experimental.preprocessing` path is deprecated.
from tensorflow.keras.layers import TextVectorization

# Spell out the default TextVectorization params for reference
text_vectorizer = TextVectorization(
    max_tokens=None,  # no cap on vocabulary size
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,  # create groups of n-words?
    output_mode='int',  # map each token to an integer index
    output_sequence_length=None,  # no padding/truncation to a fixed length
)


In [None]:
# Average number of whitespace-separated tokens (words) per training tweet
round(sum(len(sentence.split()) for sentence in train_sentences) / len(train_sentences))

15

In [None]:
# Text vectorization settings
max_vocab_length = 10_000  # maximum number of words to keep in the vocabulary
max_length = 15  # every sequence is padded/truncated to this many tokens

text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode='int',
    output_sequence_length=max_length,
)

In [None]:
# Fit the text vectorizer to the training sentences
text_vectorizer.adapt(train_sentences)

In [None]:
# Peek at a few training sentences rather than dumping the whole array
train_sentences[:5]

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       ...,
       'Near them on the sand half sunk a shattered visage lies... http://t.co/0kCCG1BT06',
       "kesabaran membuahkan hasil indah pada saat tepat! life isn't about waiting for the storm to pass it's about learning to dance in the rain.",
       "@ScottDPierce @billharris_tv @HarrisGle @Beezersun I'm forfeiting this years fantasy football pool out of fear I may win n get my ass kicked"],
      dtype=object)

In [None]:
# Create a sample sentence and tokenize it
sample_sentence='There  is a flood in my street!'
text_vectorizer([sample_sentence])
# Output shape is (1, 15): one sentence, and output_sequence_length was set to max_length (15)

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 74,   9,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [None]:
# Choose a random sentence from the training dataset and tokenize it
random_sentence = random.choice(train_sentences)
print(f'Original text:\n{random_sentence}\n\nVectorized version:\n')
text_vectorizer([random_sentence])

Orifinal text:
news@@ Refugio oil spill may have been costlier bigger than projected http://t.co/jhpdSSVhvE

Vecotrized  version:



<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[  58,  877,  254,  385,  133,   24,   59,  903,  825,   76,  837,
        5439,    0,    0,    0]])>

In [None]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
print(f"Number of words in the vocab: {len(words_in_vocab)}")
# The head of the list holds the most frequent entries, the tail the least frequent
print(f"Most common words: {words_in_vocab[:5]}")
print(f"Least common words: {words_in_vocab[-5:]}")

Number of words in the vocab" 10000
Most common words: ['', '[UNK]', 'the', 'a', 'in']
Least common words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


## Creating an Embedding using an Embedding Layer

In [None]:
# Params for an embedding layer:
#   input_dim: the size of our vocabulary
#   output_dim: the size of the output embedding vector (e.g. a value of 100
#     means each token is represented by a vector 100 long)
#   input_length: the length of the sequences being passed to the embedding layer

In [None]:
from tensorflow.keras import layers

# Embedding layer: vocabulary size in, 128-dimensional vector per token out
embedding = layers.Embedding(max_vocab_length,  # input_dim
                             128,  # output_dim
                             input_length=max_length)  # how long is each input

In [None]:
# Pick a random training sentence, turn it into token ids, then embed those ids
random_sentence = random.choice(train_sentences)
print(f'original Text:\n{random_sentence}\n Embedded version:')

sample_tokens = text_vectorizer([random_sentence])
sample_embed = embedding(sample_tokens)
sample_embed

original Text:
Nashville Theater Attack: Will Gun Grabbers Now Demand ÛÏHatchet Control?Û  http://t.co/OyoGII97yH
 Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.00476965, -0.0128742 ,  0.03243409, ...,  0.01102564,
          0.03371737, -0.04248101],
        [-0.0028124 ,  0.03591036, -0.00859927, ...,  0.04084356,
         -0.01129714,  0.00333037],
        [-0.00798839,  0.01555112,  0.01221825, ..., -0.03843876,
          0.04191059,  0.00211458],
        ...,
        [ 0.01794226, -0.04114918, -0.01369365, ..., -0.03610616,
          0.0139318 ,  0.00177097],
        [ 0.01794226, -0.04114918, -0.01369365, ..., -0.03610616,
          0.0139318 ,  0.00177097],
        [ 0.01794226, -0.04114918, -0.01369365, ..., -0.03610616,
          0.0139318 ,  0.00177097]]], dtype=float32)>

In [None]:
# Check out a single token's embedding alongside the first word of the sentence.
# random_sentence[0] would be only the first *character*; split() yields the first word.
sample_embed[0][0], sample_embed[0][0].shape, random_sentence.split()[0]

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.00476965, -0.0128742 ,  0.03243409,  0.00718433,  0.03764597,
        -0.02010239, -0.00484956,  0.03666592,  0.02182647, -0.02685745,
         0.04862895, -0.02692365, -0.03207918,  0.01872952,  0.0383732 ,
         0.04621769,  0.04401717, -0.01301666,  0.04486955, -0.03482618,
         0.01940661,  0.00434028,  0.02986857,  0.02839965,  0.01059791,
        -0.03210064,  0.03562161, -0.04418817, -0.03115938,  0.0250031 ,
        -0.00558246,  0.04843855,  0.03641251,  0.00373591,  0.04452701,
         0.04140153, -0.00603806,  0.01064005,  0.03125178, -0.00580448,
        -0.0054499 , -0.00314232,  0.04623863, -0.00624144, -0.01791961,
        -0.01857027,  0.01496209,  0.02829106, -0.01422935, -0.00768279,
         0.04601935, -0.0343016 , -0.03138343, -0.0055765 , -0.0160082 ,
         0.02667941, -0.01788199, -0.03065494,  0.02700232, -0.02376973,
        -0.02889676,  0.04013726,  0.02477289, -0.01236875, -0.02292258,
  