In [3]:
import os
import shutil
import random as rnd

import trax
import trax.fastmath.numpy as np
from trax import layers as tl
from trax import fastmath

In [4]:
from utils import Layer, load_tweets, process_tweet

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/pallavisingh/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pallavisingh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Create a scalar array through Trax's numpy-compatible interface
# (trax.fastmath.numpy, imported as np) and render it with rich display.
a = np.array(5.0)
display(a)

Array(5., dtype=float32, weak_type=True)

In [6]:
# Inspect the concrete class of the object returned by np.array.
print(type(a))

<class 'jaxlib.xla_extension.ArrayImpl'>


In [7]:
def f(x):
    """Return the cube of ``x``.

    Used below as a simple example function whose gradient is computed
    with ``trax.fastmath.grad``.
    """
    cubed = x ** 3
    return cubed

In [8]:
# Evaluate f (the cube function) at a and report both the input and the result.
print(f"f(a) for a={a} is {f(a)}")

f(a) for a=5.0 is 125.0


In [9]:
# Directly use trax.fastmath.grad to build the gradient (derivative) of the function f.
# Nothing is differentiated yet: grad returns a new callable that is evaluated later.
grad_f = trax.fastmath.grad(fun=f)  # df / dx - gradient of f(x) with respect to x

# View the type of the returned object (it's a function)
type(grad_f)

function

In [12]:
# Call the newly created gradient function and pass in a value for x (the array stored in 'a').
# For f(x) = x**3 the derivative is 3 * x**2, so at x = 5.0 the expected value is 75.0.
grad_calculation = grad_f(a)

# View the result of calling the grad_f function
display(grad_calculation)

Array(75., dtype=float32, weak_type=True)

In [13]:
def train_val_split(n_train=4000):
    """Load the tweets and split them into training and validation sets.

    The first ``n_train`` positive and the first ``n_train`` negative tweets
    returned by ``load_tweets`` form the training set; whatever remains of
    each class forms the validation set. Labels are 1 for positive tweets and
    0 for negative tweets, in the same order as the combined tweet lists
    (all positives first, then all negatives).

    Args:
        n_train: Number of tweets *per class* assigned to the training set.
            Defaults to 4000. If a class has fewer than ``n_train`` tweets,
            all of them go to training and its validation part is empty.

    Returns:
        A tuple ``(train_pos, train_neg, train_x, train_y,
        val_pos, val_neg, val_x, val_y)`` where the ``*_pos`` / ``*_neg``
        entries are the per-class splits, ``*_x`` are the combined tweets and
        ``*_y`` are the matching label arrays.

    Raises:
        ValueError: If ``n_train`` is negative.
    """
    # A negative index would silently slice from the end of each list,
    # which is almost certainly not what the caller intended.
    if n_train < 0:
        raise ValueError(f"n_train must be non-negative, got {n_train}")

    # Load positive and negative tweets
    all_positive_tweets, all_negative_tweets = load_tweets()

    # View the total number of positive and negative tweets.
    print(f"The number of positive tweets: {len(all_positive_tweets)}")
    print(f"The number of negative tweets: {len(all_negative_tweets)}")

    # Split positive set into training and validation
    train_pos = all_positive_tweets[:n_train]
    val_pos = all_positive_tweets[n_train:]

    # Split negative set into training and validation
    train_neg = all_negative_tweets[:n_train]
    val_neg = all_negative_tweets[n_train:]

    # Combine each split into one set (positives first, then negatives)
    train_x = train_pos + train_neg
    val_x = val_pos + val_neg

    # Build the labels in the same order (1 for positive, 0 for negative)
    train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
    val_y = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))

    return train_pos, train_neg, train_x, train_y, val_pos, val_neg, val_x, val_y


In [15]:
# Run the split once and keep every piece: per-class splits, combined tweets and labels.
train_pos, train_neg, train_x, train_y, val_pos, val_neg, val_x, val_y = train_val_split()

# Sanity-check the sizes of the combined training and validation sets.
print(f"length of train_x {len(train_x)}")
print(f"length of val_x {len(val_x)}")

The number of positive tweets: 5000
The number of negative tweets: 5000
length of train_x 8000
length of val_x 2000


In [17]:
def get_vocab(train_x):
    """Build a word-to-index vocabulary from the training tweets.

    Each tweet is passed through ``process_tweet`` and every word not seen
    before is assigned the next free integer ID, in order of first
    appearance.

    Args:
        train_x: Iterable of raw training tweets.

    Returns:
        dict mapping each word to a unique integer ID. IDs 0, 1 and 2 are
        reserved for the padding, end-of-line and unknown-word tokens.
    """
    # Special tokens come first: pad, end of line and unk.
    word_to_id = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2}

    # The vocabulary is built from training data only.
    for tweet in train_x:
        for token in process_tweet(tweet):
            # setdefault leaves known tokens untouched; an unseen token
            # receives the current size of the dict as its ID.
            word_to_id.setdefault(token, len(word_to_id))

    return word_to_id

# Build the vocabulary from the training tweets only.
Vocab = get_vocab(train_x)

# The count includes the three special tokens (__PAD__, __</e>__, __UNK__).
print("Total words in vocab are",len(Vocab))


Total words in vocab are 9088


# Converting a tweet to a tensor

Instructions: Write a function `tweet_to_tensor` that takes in a tweet and converts it to a list of integer word IDs. You can use the `Vocab` dictionary you just built to map each processed word to its ID.

- Use the `vocab_dict` parameter and not a global variable.
- Do not hard-code the integer value for the `__UNK__` token.

In [18]:
def tweet_to_tensor(tweet, vocab_dict, unk_token = '__UNK__', verbose = False):
    """Convert a raw tweet into a list of integer word IDs.

    The tweet is tokenized with ``process_tweet`` and each resulting word is
    looked up in ``vocab_dict``. Words missing from the dictionary are
    replaced by the ID stored under ``unk_token``.

    Args:
        tweet: The raw tweet text.
        vocab_dict: dict mapping words to integer IDs; must contain
            ``unk_token`` as a key.
        unk_token: Key in ``vocab_dict`` whose ID stands in for unknown words.
        verbose: If True, print the processed words and the unknown-token ID.

    Returns:
        list of integer IDs, one per processed word, in order.

    Raises:
        KeyError: If ``unk_token`` is not present in ``vocab_dict``.
    """
    tokens = process_tweet(tweet)

    if verbose:
        print("List of words from the processed tweet:")
        print(tokens)

    # Look the unknown-word ID up rather than hard-coding it.
    unk_id = vocab_dict[unk_token]

    if verbose:
        print(f"The unique integer ID for the unk_token is {unk_id}")

    # Map every word to its ID, falling back to the unknown-word ID.
    return [vocab_dict.get(token, unk_id) for token in tokens]

In [20]:
# Show one validation tweet next to its encoding; words that are not in Vocab
# are mapped to the ID of the __UNK__ token.
print("Actual tweet is\n", val_pos[1])
print("\nTensor of tweet:\n", tweet_to_tensor(val_pos[1], vocab_dict=Vocab))

Actual tweet is
 @heyclaireee is back! thnx God!!! i'm so happy :)

Tensor of tweet:
 [443, 2, 303, 566, 56, 9]
