In [12]:
import os
import random as rnd
import trax
trax.supervised.trainer_lib.init_random_number_generators(31)
import trax.fastmath.numpy as np
from trax import layers as tl
from utils import load_tweets,process_tweet,Layer

# Importing and loading the data

In [3]:
import numpy as np

all_positive_tweets, all_negative_tweets = load_tweets()

print(f"The number of positive tweets: {len(all_positive_tweets)}")
print(f"The number of negative tweets: {len(all_negative_tweets)}")

# Split positive set into validation and training
val_pos   = all_positive_tweets[4000:] # generating validation set for positive tweets
train_pos  = all_positive_tweets[:4000]# generating training set for positive tweets

# Split negative set into validation and training
val_neg   = all_negative_tweets[4000:] # generating validation set for negative tweets
train_neg  = all_negative_tweets[:4000] # generating training set for nagative tweets

# Combine training data into one set
train_x = train_pos + train_neg 

# Combine validation data into one set
val_x  = val_pos + val_neg

# Set the labels for the training set (1 for positive, 0 for negative)
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))

# Set the labels for the validation set (1 for positive, 0 for negative)
val_y  = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))

print(f"length of train_x {len(train_x)}")
print(f"length of val_x {len(val_x)}")

The number of positive tweets: 5000
The number of negative tweets: 5000
length of train_x 8000
length of val_x 2000


In [4]:
# Try out function that processes tweets
print("original tweet at training position 0")
print(train_pos[0])

print("Tweet at training position 0 after processing:")
process_tweet(train_pos[0])

original tweet at training position 0
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
Tweet at training position 0 after processing:


['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']

# Build Vocabulary based on training data

1. Map each word in each tweet to an integer (index)
2. Augment vocabulary by adding special tokens
"__PAD__: padding"
"__UNK__ : unknown word in vocabulary"

In [5]:
vocab = {'__PAD__':0,'__</e>__':1,'__UNK__':2}

for tweet in train_x:
    processed_tweet = process_tweet(tweet)
    for word in processed_tweet:
        if word not in vocab:
            vocab[word] = len(vocab)

print("Total words in vocab are ",len(vocab))
display(vocab)

Total words in vocab are  9088


{'__PAD__': 0,
 '__</e>__': 1,
 '__UNK__': 2,
 'followfriday': 3,
 'top': 4,
 'engag': 5,
 'member': 6,
 'commun': 7,
 'week': 8,
 ':)': 9,
 'hey': 10,
 'jame': 11,
 'odd': 12,
 ':/': 13,
 'pleas': 14,
 'call': 15,
 'contact': 16,
 'centr': 17,
 '02392441234': 18,
 'abl': 19,
 'assist': 20,
 'mani': 21,
 'thank': 22,
 'listen': 23,
 'last': 24,
 'night': 25,
 'bleed': 26,
 'amaz': 27,
 'track': 28,
 'scotland': 29,
 'congrat': 30,
 'yeaaah': 31,
 'yipppi': 32,
 'accnt': 33,
 'verifi': 34,
 'rqst': 35,
 'succeed': 36,
 'got': 37,
 'blue': 38,
 'tick': 39,
 'mark': 40,
 'fb': 41,
 'profil': 42,
 '15': 43,
 'day': 44,
 'one': 45,
 'irresist': 46,
 'flipkartfashionfriday': 47,
 'like': 48,
 'keep': 49,
 'love': 50,
 'custom': 51,
 'wait': 52,
 'long': 53,
 'hope': 54,
 'enjoy': 55,
 'happi': 56,
 'friday': 57,
 'lwwf': 58,
 'second': 59,
 'thought': 60,
 '’': 61,
 'enough': 62,
 'time': 63,
 'dd': 64,
 'new': 65,
 'short': 66,
 'enter': 67,
 'system': 68,
 'sheep': 69,
 'must': 70,
 'buy':

Each unique word has a unique integer associated with it

# Defining a function to convert a tweet to tensor with indices of each word

In [6]:
def tweet_to_tensor(tweet,vocab,unk_token='__UNK__',verbose=False):
    
    word_list = process_tweet(tweet)
    
    if verbose:
        print("List of words from processed tweet : \n")
        print(word_list)
        
    tensor_list = []
    unkId = vocab[unk_token]
    
    for word in word_list:
        wordId = vocab.get(word,unkId)
        tensor_list.append(wordId)
    
    return tensor_list

In [7]:
print("Actual tweet is \n",val_pos[0])
print("\nTensor of tweet:\n",tweet_to_tensor(val_pos[0],vocab))

Actual tweet is 
 Bro:U wan cut hair anot,ur hair long Liao bo
Me:since ord liao,take it easy lor treat as save $ leave it longer :)
Bro:LOL Sibei xialan

Tensor of tweet:
 [1065, 136, 479, 2351, 745, 8148, 1123, 745, 53, 2, 2672, 791, 2, 2, 349, 601, 2, 3489, 1017, 597, 4559, 9, 1065, 157, 2, 2]


# Define a data generator

In [90]:
def data_generator(pos_data,neg_data,batch_size,loop,vocab,shuffle=False):
    
    assert batch_size % 2 == 0
    
    n_to_take = batch_size//2
    
    pos_index=0
    neg_index=0
    
    len_data_pos = len(pos_data)
    len_data_neg = len(neg_data)
    
    #get arrays with data indexes
    pos_index_array = list(range(len_data_pos))
    neg_index_array = list(range(len_data_neg))
    
    if shuffle:
        rnd.shuffle(pos_index_array)
        rnd.shuffle(neg_index_array)
    
    stop = False
    
    while not stop:
        
        #create a batch with postive and negative examples
        batch = []
        
        #First part: Pack n_to_take postive examples.
        for i in range(n_to_take):
            
            if pos_index >= len_data_pos:
                if not loop:
                    stop = True
                    break;
                pos_index = 0
                if shuffle:
                    rnd.shuffle(pos_index_array)
            
            tweet = pos_data[pos_index_array[pos_index]]
            tensor = tweet_to_tensor(tweet,vocab)
            batch.append(tensor)
            pos_index += 1
            
        for i in range(n_to_take):
            
            if neg_index>=len_data_neg:
                if not loop:
                    stop=True
                    break;
                neg_index = 0
                if shuffle:
                    rnd.shuffle(neg_index_array)
            
            tweet = neg_data[neg_index_array[neg_index]]
            tensor = tweet_to_tensor(tweet,vocab)
            batch.append(tensor)
            neg_index+=1
        
        if stop:
            break
        
        pos_index += n_to_take
        neg_index += n_to_take
        
        #get max tweet length to pad all other shorter tweets
        max_len = max([len(t) for t in batch])
        
        tensor_pad_list = []
        
        for tensor in batch:
            
            n_pad = max_len - len(tensor)
            pad_l = [0]*n_pad
            tensor_pad = tensor + pad_l
            tensor_pad_list.append(tensor_pad)
        
        inputs = np.array(tensor_pad_list)
        targets = np.array([1]*n_to_take+[0]*n_to_take)
        #Weightage for examples
        example_weights = np.ones(len(targets))
            
        yield inputs,targets,example_weights 
    

In [75]:
rnd.seed(30)

def train_generator(batch_size,shuffle=False):
    return data_generator(train_pos,train_neg,batch_size,True,vocab,shuffle)

def val_generator(batch_size,shuffle=False):
    return data_generator(val_pos,val_neg,batch_size,True,vocab,shuffle)

def test_generator(batch_size,shuffle=False):
    return data_generator(val_pos,val_neg,batch_size,False,vocab,shuffle)

inputs,targets,example_weights = next(train_generator(4,shuffle=True))

print("Inputs : ",inputs)
print("Targets : ",targets)
print(example_weights)

Length of targets =  4
Inputs :  [[2005 4451 3201    9    0    0    0    0    0    0    0]
 [4954  567 2000 1454 5174 3499  141 3499  130  459    9]
 [3761  109  136  583 2930 3969    0    0    0    0    0]
 [ 250 3761    0    0    0    0    0    0    0    0    0]]
Targets :  [1 1 0 0]
[1. 1. 1. 1.]


In [55]:
# Test the train_generator

# Create a data generator for training data,
# which produces batches of size 4 (for tensors and their respective targets)
tmp_data_gen = train_generator(batch_size = 4)

# Call the data generator to get one batch and its targets
tmp_inputs, tmp_targets, tmp_example_weights = next(tmp_data_gen)

print(f"The inputs shape is {tmp_inputs.shape}")
for i,t in enumerate(tmp_inputs):
    print(f"input tensor: {t}; target {tmp_targets[i]}; example weights {tmp_example_weights[i]}")

The inputs shape is (4, 14)
input tensor: [3 4 5 6 7 8 9 0 0 0 0 0 0 0]; target 1; example weights 1.0
input tensor: [10 11 12 13 14 15 16 17 18 19 20  9 21 22]; target 1; example weights 1.0
input tensor: [5738 2901 3761    0    0    0    0    0    0    0    0    0    0    0]; target 0; example weights 1.0
input tensor: [ 858  256 3652 5739  307 4458  567 1230 2767  328 1202 3761    0    0]; target 0; example weights 1.0


# Defining Classes

 
ReLU Class

In [56]:
class Relu(Layer):
    def forward(self,x):
        activation = np.maximum(x,0)
        return activation

In [57]:
x = np.array([[-2.0,-1.0,0.0],[0.0,1.0,2.0]])
relu_layer = Relu()
print("Test data : \n",x)
print("\nOutput of Relu :\n",relu_layer(x))

Test data : 
 [[-2. -1.  0.]
 [ 0.  1.  2.]]

Output of Relu :
 [[0. 0. 0.]
 [0. 1. 2.]]



Dense Class

In [58]:
from trax.fastmath import numpy as np
from trax.fastmath import random as random

In [59]:
tmp_key = random.get_prng(seed=1)
print("The random seed generated by random.get_prng")
print(tmp_key)
print()
print("choose a matrix with 2 rows and 3 columns")
tmp_shape=(2,3)
print(tmp_shape)

tmp_weight = trax.fastmath.random.normal(key=tmp_key,shape=tmp_shape)
print("Weight matrix generated with a normal distribution with mean 0 and stdev of 1")
print(tmp_weight)

The random seed generated by random.get_prng
[0 1]

choose a matrix with 2 rows and 3 columns
(2, 3)
Weight matrix generated with a normal distribution with mean 0 and stdev of 1
[[ 0.957307   -0.9699291   1.0070664 ]
 [ 0.36619022  0.17294823  0.29092228]]


In [60]:
class Dense(Layer):
    
    def __init__(self,n_units,init_stdev=0.1):
        #set nuber of units in the layer
        self._n_units = n_units
        self._init_stdev = init_stdev
    
    def forward(self,x):
        #matrix multiply x and weight matrix
        dense = np.dot(x,self.weights)
        return dense
    
    def init_weights_and_state(self,input_signature,random_key):
        
        input_shape = (input_signature.shape[-1],self._n_units)
        w = trax.fastmath.random.normal(key=random_key,shape=input_shape)
        self.weights = w*self._init_stdev
        return self.weights
        

In [61]:
dense_layer = Dense(n_units=10)
random_key = random.get_prng(seed=0)
z = np.array([[2.0,7.0,25.0]])

dense_layer.init(z,random_key)
print("Weights are \n",dense_layer.weights)
print("\nOuput of forward function :\n",dense_layer(z))

Weights are 
 [[-0.02837108  0.09368162 -0.10050076  0.14165013  0.10543301  0.09108126
  -0.04265672  0.0986188  -0.05575325  0.00153249]
 [-0.20785688  0.0554837   0.09142365  0.05744595  0.07227863  0.01210617
  -0.03237354  0.16234995  0.02450038 -0.13809784]
 [-0.06111237  0.01403724  0.08410042 -0.1094358  -0.10775021 -0.11396459
  -0.05933381 -0.01557652 -0.03832145 -0.11144515]]

Ouput of forward function :
 [[-3.0395496   0.9266802   2.5414743  -2.050473   -1.9769388  -2.582209
  -1.7952735   0.94427425 -0.8980402  -3.7497487 ]]


# Model

In [62]:
def classifier(vocab_size=len(vocab),embedding_dim=256,output_dim=2,mode='train'):
    
    embed_layer = tl.Embedding(vocab_size=len(vocab),d_feature=embedding_dim)
    mean_layer = tl.Mean(axis=1)
    dense_output_layer = tl.Dense(n_units = output_dim)
    log_softmax_layer = tl.LogSoftmax()
    
    model = tl.Serial(
            embed_layer,
            mean_layer,
            dense_output_layer,
            log_softmax_layer
            )
    
    return model

In [63]:
tmp_model = classifier()
display(tmp_model)

Serial[
  Embedding_9088_256
  Mean
  Dense_2
  LogSoftmax
]

# Training the model

In [64]:
from trax.supervised import training

batch_size=16
rnd.seed(271)

train_task = training.TrainTask(
    labeled_data = train_generator(batch_size=batch_size,shuffle=True),
    loss_layer = tl.CrossEntropyLoss(),
    optimizer = trax.optimizers.Adam(0.01),
    n_steps_per_checkpoint=10)

eval_task = training.EvalTask(
    labeled_data = val_generator(batch_size=batch_size,shuffle=True),
    metrics = [tl.CrossEntropyLoss(),tl.Accuracy()])

model = classifier()

In [65]:
output_dir = '/model/'
output_dir_expand = os.getcwd()+output_dir
print(output_dir_expand)

/home/siddarthathentu/Desktop/NLP/SentimentAnalysis-Trax_DeepNet/model/


In [66]:
def train_model(classifier,train_task,eval_task,n_steps,output_dir):
    
    training_loop = training.Loop(
                            classifier,
                            train_task,
                            eval_task=eval_task,
                            output_dir=output_dir)
    
    training_loop.run(n_steps=n_steps)
    
    return training_loop

In [67]:
training_loop = train_model(model,train_task,eval_task,100,output_dir_expand)

Step      1: train CrossEntropyLoss |  0.88939184
Step      1: eval  CrossEntropyLoss |  0.68833977
Step      1: eval          Accuracy |  0.50000000
Step     10: train CrossEntropyLoss |  0.61036736
Step     10: eval  CrossEntropyLoss |  0.52182281
Step     10: eval          Accuracy |  0.68750000
Step     20: train CrossEntropyLoss |  0.34137666
Step     20: eval  CrossEntropyLoss |  0.20654777
Step     20: eval          Accuracy |  1.00000000
Step     30: train CrossEntropyLoss |  0.20208919
Step     30: eval  CrossEntropyLoss |  0.21594885
Step     30: eval          Accuracy |  0.93750000
Step     40: train CrossEntropyLoss |  0.19611199
Step     40: eval  CrossEntropyLoss |  0.17582780
Step     40: eval          Accuracy |  1.00000000
Step     50: train CrossEntropyLoss |  0.11203776
Step     50: eval  CrossEntropyLoss |  0.07589274
Step     50: eval          Accuracy |  1.00000000
Step     60: train CrossEntropyLoss |  0.09375445
Step     60: eval  CrossEntropyLoss |  0.09290722


# Prediction

In [81]:
tmp_train_generator = train_generator(batch_size=16)
tmp_batch = next(tmp_train_generator)
tmp_inputs,tmp_targets,tmp_example_weights = tmp_batch
print(f"The batch is a tuple of length {len(tmp_batch)} because position 0 contains the tweets, and position 1 contains the targets.") 
print(f"The shape of the tweet tensors is {tmp_inputs.shape} (num of examples, length of tweet tensors)")
print(f"The shape of the labels is {tmp_targets.shape}, which is the batch size.")
print(f"The shape of the example_weights is {tmp_example_weights.shape}, which is the same as inputs/targets size.")

Length of targets =  16
The batch is a tuple of length 3 because position 0 contains the tweets, and position 1 contains the targets.
The shape of the tweet tensors is (16, 15) (num of examples, length of tweet tensors)
The shape of the labels is (16,), which is the batch size.
The shape of the example_weights is (16,), which is the same as inputs/targets size.


In [82]:
tmp_pred = training_loop.eval_model(tmp_inputs)
print(f"The prediction shape is {tmp_pred.shape}, num of tensor_tweets as rows")
print("Column 0 is the probability of a negative sentiment (class 0)")
print("Column 1 is the probability of a positive sentiment (class 1)")
print()
print("View the prediction array")
tmp_pred

The prediction shape is (16, 2), num of tensor_tweets as rows
Column 0 is the probability of a negative sentiment (class 0)
Column 1 is the probability of a positive sentiment (class 1)

View the prediction array


DeviceArray([[-4.9417334e+00, -7.1678162e-03],
             [-6.5846405e+00, -1.3823509e-03],
             [-5.4463038e+00, -4.3215752e-03],
             [-4.3487496e+00, -1.3007164e-02],
             [-4.9131699e+00, -7.3764324e-03],
             [-4.7097702e+00, -9.0477467e-03],
             [-5.2801600e+00, -5.1045418e-03],
             [-4.1103230e+00, -1.6538382e-02],
             [-1.8327236e-03, -6.3028107e+00],
             [-4.7376156e-03, -5.3545637e+00],
             [-3.4697056e-03, -5.6654320e+00],
             [-1.1444092e-05, -1.1379559e+01],
             [-1.0051131e-02, -4.6050968e+00],
             [-1.0130405e-03, -6.8951969e+00],
             [-6.1047077e-03, -5.1017342e+00],
             [-7.4422359e-03, -4.9043016e+00]], dtype=float32)

In [83]:
tmp_is_positive = tmp_pred[:,1] > tmp_pred[:,0]
for i, p in enumerate(tmp_is_positive):
    print(f"Neg log prob {tmp_pred[i,0]:.4f}\tPos log prob {tmp_pred[i,1]:.4f}\t is positive? {p}\t actual {tmp_targets[i]}")

Neg log prob -4.9417	Pos log prob -0.0072	 is positive? True	 actual 1
Neg log prob -6.5846	Pos log prob -0.0014	 is positive? True	 actual 1
Neg log prob -5.4463	Pos log prob -0.0043	 is positive? True	 actual 1
Neg log prob -4.3487	Pos log prob -0.0130	 is positive? True	 actual 1
Neg log prob -4.9132	Pos log prob -0.0074	 is positive? True	 actual 1
Neg log prob -4.7098	Pos log prob -0.0090	 is positive? True	 actual 1
Neg log prob -5.2802	Pos log prob -0.0051	 is positive? True	 actual 1
Neg log prob -4.1103	Pos log prob -0.0165	 is positive? True	 actual 1
Neg log prob -0.0018	Pos log prob -6.3028	 is positive? False	 actual 0
Neg log prob -0.0047	Pos log prob -5.3546	 is positive? False	 actual 0
Neg log prob -0.0035	Pos log prob -5.6654	 is positive? False	 actual 0
Neg log prob -0.0000	Pos log prob -11.3796	 is positive? False	 actual 0
Neg log prob -0.0101	Pos log prob -4.6051	 is positive? False	 actual 0
Neg log prob -0.0010	Pos log prob -6.8952	 is positive? False	 actual 0

# Evaluation

In [84]:
def compute_accuracy(preds,y,y_weights):
    
    is_pos = preds[:,1]>preds[:,0]
    is_pos_int = is_pos.astype(np.int32)
    correct = (is_pos_int==y)
    sum_weights = np.sum(y_weights)
    correct_float = correct.astype(np.float32)
    weighted_correct_float = correct_float * y_weights
    weighted_num_correct = np.sum(weighted_correct_float)
    accuracy = weighted_num_correct/sum_weights
    
    return accuracy,weighted_num_correct,sum_weights
    

In [85]:
tmp_val_generator = val_generator(64)

# get one batch
tmp_batch = next(tmp_val_generator)

# Position 0 has the model inputs (tweets as tensors)
# position 1 has the targets (the actual labels)
tmp_inputs, tmp_targets, tmp_example_weights = tmp_batch

# feed the tweet tensors into the model to get a prediction
tmp_pred = training_loop.eval_model(tmp_inputs)

tmp_acc, tmp_num_correct, tmp_num_predictions = compute_accuracy(preds=tmp_pred, y=tmp_targets, y_weights=tmp_example_weights)

print(f"Model's prediction accuracy on a single training batch is: {100 * tmp_acc}%")
print(f"Weighted number of correct predictions {tmp_num_correct}; weighted number of total observations predicted {tmp_num_predictions}")

Length of targets =  64
Model's prediction accuracy on a single training batch is: 100.0%
Weighted number of correct predictions 64.0; weighted number of total observations predicted 64.0


In [87]:
def test_model(generator,model):
    
    accuracy = 0.
    total_num_correct = 0
    total_num_pred = 0
    
    for batch in generator:
        
        inputs = batch[0]
        targets = batch[1]
        example_weights = batch[2]
        
        pred = model(inputs)
        
        batch_accuracy,batch_num_correct,batch_num_pred = compute_accuracy(
                                                          pred,
                                                          targets,
                                                          example_weights)
        
        total_num_correct += batch_num_correct
        total_num_pred += batch_num_pred
        
    accuracy = total_num_correct/total_num_pred
    
    return accuracy


In [93]:
model = training_loop.eval_model
accuracy = test_model(test_generator(16),model)

print("Accuracy of the model = ",accuracy)

Accuracy of the model =  0.9930556


# Testing Own Input

In [99]:
def predict(sentence):
    
    inputs = np.array(tweet_to_tensor(sentence,vocab))
    #Adding a dimension for batch
    inputs = inputs[None,:]
    
    pred_probs = model(inputs)
    
    preds = int(pred_probs[0,1]>pred_probs[0,0])
    
    sentiment = "negative"
    
    if preds==1:
        sentiment = "positive"
        
    return preds,sentiment

In [112]:
sentence = "It's such a nice day, think i'll be taking Sid to Ramsgate fish and chips for lunch at Peter's fish factory and then the beach maybe"
tmp_pred,tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")
print("\n")
sentence = "This movie was almost good"
tmp_pred, tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")

The sentiment of the sentence 
***
"It's such a nice day, think i'll be taking Sid to Ramsgate fish and chips for lunch at Peter's fish factory and then the beach maybe"
***
is positive.


The sentiment of the sentence 
***
"This movie was almost good"
***
is negative.
