In [1]:
import pandas as pd
from utils import *
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import string
import re
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras.utils.np_utils import to_categorical

Load and clean the dataset

In [2]:
# Read both splits and discard any row that contains a missing value.
train_data = pd.read_csv("sentiment_data/train.csv").dropna()
test_data = pd.read_csv("sentiment_data/test.csv").dropna()

In [3]:
# Keep only the text column and the label column of each split.
# NOTE(review): the training split uses `selected_text` while the test split
# uses the full `text` column -- confirm this mismatch is intentional.
train_columns = ['selected_text', 'sentiment']
test_columns = ['text', 'sentiment']
train_data = train_data[train_columns]
test_data = test_data[test_columns]

In [4]:
# Separate the input sentences (x) from the sentiment labels (y).
train_x = train_data['selected_text']
train_y = train_data['sentiment']
test_x = test_data['text']
test_y = test_data['sentiment']

In [5]:
def encode_sentiment(labels):
    """
    Map sentiment strings to integer class ids.

    Arguments:
    labels -- iterable of sentiment labels (strings)

    Returns:
    class_ids -- integer numpy array of the same length, where
                 'positive' -> 0, 'negative' -> 1 and any other value -> 2
    """
    class_ids = {'positive': 0, 'negative': 1}
    # Build a fresh array instead of overwriting the input element by element:
    # the array returned by `Series.to_numpy()` may share memory with the DataFrame.
    return np.array([class_ids.get(label, 2) for label in labels], dtype=int)


train_y = encode_sentiment(train_y.to_numpy())
test_y = encode_sentiment(test_y.to_numpy())

In [6]:
# Keep the integer-encoded labels around before they are replaced by one-hot matrices.
y_train_temp = train_y.copy()
y_test_temp = test_y.copy()
# Pin the number of classes: by default `to_categorical` infers it from the largest
# label present, so a split that happened to lack class 2 would yield only 2 columns
# and no longer match the 3-unit softmax output of the model.
train_y = to_categorical(train_y, num_classes=3)
test_y = to_categorical(test_y, num_classes=3)

In [7]:
# Show the one-hot label matrices of the training split, then of the test split.
for one_hot_labels in (train_y, test_y):
    print(one_hot_labels)

[[0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]
[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 ...
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [8]:
# Convert the sentence Series into plain numpy arrays so they can be indexed by position.
train_x, test_x = train_x.to_numpy(), test_x.to_numpy()

In [10]:
# Run every sentence of both splits through `depure_data` (provided by `utils`),
# writing the result back into the same array slot.
for corpus in (train_x, test_x):
    for position, sentence in enumerate(corpus):
        corpus[position] = depure_data(sentence)

In [12]:
# Display the punctuation characters that will be stripped from the sentences below.
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [13]:
# One list entry per ASCII punctuation character.
regular_punct = [symbol for symbol in string.punctuation]

In [14]:
# Apply `remove_punctuation` (provided by `utils`) to every sentence of both splits,
# writing the result back into the same array slot.
for corpus in (train_x, test_x):
    for position, sentence in enumerate(corpus):
        corpus[position] = remove_punctuation(sentence, regular_punct)

In [15]:
# Sanity check after cleaning: number of training sentences, then the cleaned test sentences.
print(train_x.shape)
print(test_x)

(27480,)
['Last session of the day'
 'Shanghai is also really exciting precisely  skyscrapers galore Good tweeps in China SH BJ'
 'Recession hit Veronique Branquinho she has to quit her company such a shame'
 ...
 'I know what you mean My little dog is sinking into depression he wants to move someplace tropical'
 'sutra what is your next youtube video gonna be about I love your videos'
 'omgssh ang cute ng bby']


In [16]:
# Longest test sentence, measured in characters (not in words).
print(max(test_x, key=len))

went 2 see the hannah montana movie wiv jodie on friday and it was WELL GOOD but i feel nasty for laughing at the lil girl that cryed LOL


In [17]:
# Fixed sequence length used for the model input shape and passed as `max_len`
# to `sentences_to_indices`.
# The commented-out lines below would instead derive it from the word count of the
# longest (by character count) training sentence; a constant is used in its place.
# print(max(train_x, key=len))
# maxLen = len(max(train_x, key=len).split())
# print(maxLen)
maxLen = 40

In [18]:
# Load the 50-dimensional GloVe vectors via `read_glove_vecs` (provided by `utils`).
# The path uses a forward slash: the previous backslash formed the invalid escape
# sequence "\g" and only resolved on Windows; "/" is accepted on every platform.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B.50d.txt/glove.6B.50d.txt')

In [19]:
# Sanity-check the two vocabulary lookups in both directions:
# word -> index for the "unk" token, and index -> word for an arbitrary index.
word = "unk"
idx = 98670
print("the index of", word, "in the vocabulary is", word_to_index[word])
print("the", str(idx) + "th word in the vocabulary is", index_to_word[idx])

the index of unk in the vocabulary is 372306
the 98670th word in the vocabulary is chiaramonte


Building model

In [20]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into a zero-padded array of vocabulary indices.
    The output shape is such that it can be given to `Embedding()`.

    Each sentence is lower-cased and split on whitespace. Every word is replaced by its
    index in `word_to_index`; words missing from the vocabulary use the index of "unk".
    Sentences shorter than `max_len` are right-padded with zeros, and sentences longer
    than `max_len` are truncated to their first `max_len` words.

    Arguments:
    X -- array of sentences (strings), of shape (m,)
    word_to_index -- a dictionary containing each word mapped to its index
    max_len -- number of index slots per sentence (number of columns of the output)

    Returns:
    X_indices -- float array of indices corresponding to words in the sentences from X,
                 of shape (m, max_len)

    Raises:
    KeyError -- if an out-of-vocabulary word is met and "unk" is not in `word_to_index`
    """

    m = X.shape[0]                                   # number of examples

    # Zero everywhere, so unused trailing slots stay at index 0
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        sentence_words = X[i].lower().split()

        # Truncate to max_len words: writing past the last column would raise IndexError
        for j, w in enumerate(sentence_words[:max_len]):
            if w in word_to_index:
                X_indices[i, j] = word_to_index[w]
            else:
                # Out-of-vocabulary word: fall back to the "unk" token
                X_indices[i, j] = word_to_index["unk"]

    return X_indices

In [21]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a non-trainable Keras Embedding() layer initialised with pre-trained word vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation (numpy arrays
                       that all share the same length)
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    embedding_layer -- Keras Embedding instance whose weights are the pre-trained vectors

    Raises:
    ValueError -- if `word_to_vec_map` is empty
    KeyError -- if a word of `word_to_index` has no vector in `word_to_vec_map`
    """

    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")

    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)

    # Read the dimensionality from whichever vector comes first instead of relying on
    # one specific word being present in the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Rows that no word maps to stay all-zero.
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Row `index` holds the vector of the word whose vocabulary index is `index`.
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Frozen layer: the pre-trained vectors are not updated during training.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # The layer must be built before its weights can be assigned.
    embedding_layer.build((None,))

    # Load the pre-trained matrix as the layer's weights.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [22]:
# Build the pretrained layer once and spot-check a single weight value.
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
embedding_matrix = embedding_layer.get_weights()[0]
print("weights[0][1][3] =", embedding_matrix[1][3])

weights[0][1][3] = -0.3403


In [23]:
def sentinial_classifier(input_shape, word_to_vec_map, word_to_index):
    """
    Builds the sentiment classifier graph:
    pretrained embeddings -> LSTM(128) -> Dropout -> LSTM(128) -> Dropout -> Dense(3) -> softmax.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    model -- a (not yet compiled) Keras Model instance
    """

    # Integer word indices, one row per sentence.
    token_ids = Input(input_shape, dtype='int32')

    # Look the indices up in the pretrained, non-trainable embedding layer.
    glove_lookup = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    word_vectors = glove_lookup(token_ids)

    # First recurrent layer returns the full sequence so that a second LSTM can consume it.
    hidden = LSTM(128, return_sequences=True)(word_vectors)
    hidden = Dropout(0.5)(hidden)

    # Second recurrent layer keeps only its final hidden state.
    hidden = LSTM(128, return_sequences=False)(hidden)
    hidden = Dropout(0.5)(hidden)

    # Three output units, one per sentiment class, turned into probabilities by softmax.
    class_scores = Dense(3)(hidden)
    class_probabilities = Activation('softmax')(class_scores)

    return Model(inputs=token_ids, outputs=class_probabilities)

In [24]:
# Instantiate the classifier for sequences of `maxLen` word indices and list its layers.
model = sentinial_classifier((maxLen,), word_to_vec_map, word_to_index)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 40)]              0         
                                                                 
 embedding_1 (Embedding)     (None, 40, 50)            20000050  
                                                                 
 lstm (LSTM)                 (None, 40, 128)           91648     
                                                                 
 dropout (Dropout)           (None, 40, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 3)                 387   

In [25]:
# Categorical cross-entropy matches the one-hot labels produced by `to_categorical`.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
# Turn the cleaned training sentences into a (num_sentences, maxLen) matrix of word indices.
X_train_indices = sentences_to_indices(train_x, word_to_index, maxLen)

In [27]:
# Inspect the index matrix; trailing zeros are the unused slots of short sentences.
print(X_train_indices)

[[185457. 390139. 174642. ...      0.      0.      0.]
 [337670. 315103.      0. ...      0.      0.      0.]
 [ 86656. 239105.      0. ...      0.      0.      0.]
 ...
 [393223. 164328. 151349. ...      0.      0.      0.]
 [ 87775. 193716. 383514. ...      0.      0.      0.]
 [ 51582. 358160. 150110. ...      0.      0.      0.]]


In [28]:
# Train for 35 epochs with mini-batches of 32 sentences.
# NOTE(review): no validation data is passed here, so overfitting is not monitored during training.
model.fit(X_train_indices, train_y, epochs = 35, batch_size = 32, shuffle=True)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x225db613d00>

In [30]:
# Index the test sentences the same way as the training ones, then score the model on them.
X_test_indices = sentences_to_indices(test_x, word_to_index, max_len = maxLen)
test_metrics = model.evaluate(X_test_indices, test_y)
loss, acc = test_metrics
print()
print("Test accuracy = ", acc)


Test accuracy =  0.5950763821601868


In [39]:
# Change the sentence below to see your prediction. Words that are not in the GloVe
# vocabulary are mapped to the "unk" token by `sentences_to_indices`.
# Note: this reuses the name `X_test_indices`, overwriting the test-set matrix built earlier.
x_test = np.array(['hi how are you'])
X_test_indices = sentences_to_indices(x_test, word_to_index, maxLen)
predicted_class = np.argmax(model.predict(X_test_indices))
print(x_test[0] +' '+ output_to_sentiment(predicted_class))

hi how are you 2
