<a href="https://colab.research.google.com/github/sahug/ds-tensorflow-colab/blob/master/Tensorflow%20-%20Word%20Embedding%20in%20NLP%20On%20Twitter%20Sentiment%20Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Tensorflow - Word Embedding in NLP On Twitter Sentiment Data**

In [1]:
import numpy as np
import pandas as pd
from numpy import array
from tensorflow import keras
from keras.layers import (
    Activation,
    Conv1D,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    GlobalMaxPooling1D,
    MaxPooling1D,
)
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
# Load the labelled tweets and preview the first rows.
df = pd.read_csv("data/twitter4000.csv")
df.head()

Unnamed: 0,twitts,sentiment
0,is bored and wants to watch a movie any sugge...,0
1,back in miami. waiting to unboard ship,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0
3,ughhh i am so tired blahhhhhhhhh,0
4,@mandagoforth me bad! It's funny though. Zacha...,0


In [4]:
# Label encoding: 1 = positive sentiment, 0 = negative sentiment.
# Count the rows per label to check how balanced the classes are.
df["sentiment"].value_counts()

0    2000
1    2000
Name: sentiment, dtype: int64

In [5]:
# Pull the tweet column out as a plain Python list of strings.
# NOTE(review): the bare `text` below echoes every tweet; `text[:5]` would keep the output short.
text = df["twitts"].tolist()
text

['is bored and wants to watch a movie  any suggestions?',
 'back in miami.  waiting to unboard ship ',
 "@misskpey awwww dnt dis brng bak memoriessss,  I thnk I'm sad. LoL",
 'ughhh i am so tired  blahhhhhhhhh',
 "@mandagoforth me bad! It's funny though. Zachary Quinto is only there for a few though.  &amp; to reply just put the @ symbol before the name!",
 "brr, i'm so cold. at the moment doing my assignment on Huntington's Disease, which is really depressing ",
 "@kevinmarquis haha yep but i really need to sleep, i feel like crap lol cant sleep when he's away  god i'm pathetic!",
 "eating some ice-cream while I try to see @peterfacinelli's followers numbre raise...not working sadly ",
 '@phatty84 just hella bored at work  lol',
 'Food poisoning blowssss ',
 "@StaciG She sent home the best guy, I'm over the show already ",
 '@shika Yeah I did. Trying to recover it now. I guess its time to retire that hardware ',
 "can't fall asleep ",
 'Padres come back from being down 6-0 &amp; we lo

In [6]:
# Target labels (1 = positive, 0 = negative) taken from the "sentiment" column.
y = df["sentiment"]

In [7]:
# Fit a Keras Tokenizer on the tweets so it learns the word <-> index vocabulary.
token = Tokenizer()
token.fit_on_texts(text)
token

<keras_preprocessing.text.Tokenizer at 0x1dec133c0d0>

In [8]:
# Lookup table built by fit_on_texts: integer index -> word.
# Indices start at 1 (see the output below); no word is assigned index 0.
vocab = token.index_word
vocab

{1: 'i',
 2: 'to',
 3: 'the',
 4: 'a',
 5: 'my',
 6: 'and',
 7: 'you',
 8: 'is',
 9: 'it',
 10: 'in',
 11: 'for',
 12: 'of',
 13: 'me',
 14: 'on',
 15: 'so',
 16: 'that',
 17: "i'm",
 18: 'have',
 19: 'at',
 20: 'but',
 21: 'just',
 22: 'was',
 23: 'with',
 24: 'not',
 25: 'be',
 26: 'this',
 27: 'day',
 28: 'up',
 29: 'now',
 30: 'good',
 31: 'all',
 32: 'get',
 33: 'out',
 34: 'go',
 35: 'no',
 36: 'http',
 37: 'today',
 38: 'like',
 39: 'are',
 40: 'love',
 41: 'your',
 42: 'quot',
 43: 'too',
 44: 'lol',
 45: 'work',
 46: 'got',
 47: "it's",
 48: 'amp',
 49: 'do',
 50: 'com',
 51: 'u',
 52: 'back',
 53: 'going',
 54: 'what',
 55: 'time',
 56: 'from',
 57: 'had',
 58: 'will',
 59: 'know',
 60: 'about',
 61: 'im',
 62: 'am',
 63: "don't",
 64: 'can',
 65: 'one',
 66: 'really',
 67: "can't",
 68: 'we',
 69: 'oh',
 70: 'well',
 71: 'still',
 72: '2',
 73: 'some',
 74: 'its',
 75: 'miss',
 76: 'want',
 77: 'see',
 78: 'when',
 79: 'home',
 80: 'think',
 81: 'an',
 82: 'as',
 83: 'if',
 

In [9]:
# How does it work?
# x = ["i to the a and"]   # before tokenization: a list of texts
# x = [[1, 2, 3, 4, 6]]    # after tokenization: one list of word indices per text

x = ["i to the a and"]
token.texts_to_sequences(x)

[[1, 2, 3, 4, 6]]

In [10]:
# Convert every tweet into its list of word indices.
encoded_text = token.texts_to_sequences(text)
encoded_text

[[8, 304, 6, 345, 2, 191, 4, 236, 254, 3079],
 [52, 10, 1019, 206, 2, 3080, 3081],
 [3082, 1197, 668, 1955, 3083, 1956, 3084, 1, 3085, 17, 115, 44],
 [1957, 1, 62, 15, 192, 3086],
 [3087,
  13,
  113,
  47,
  328,
  136,
  3088,
  3089,
  8,
  101,
  88,
  11,
  4,
  285,
  136,
  48,
  2,
  448,
  21,
  277,
  3,
  3090,
  218,
  3,
  449],
 [3091,
  17,
  15,
  315,
  19,
  3,
  892,
  164,
  5,
  1459,
  14,
  3092,
  3093,
  386,
  8,
  66,
  1460],
 [3094,
  110,
  366,
  20,
  1,
  66,
  85,
  2,
  108,
  1,
  117,
  38,
  536,
  44,
  182,
  108,
  78,
  346,
  207,
  305,
  17,
  3095],
 [450, 73, 537, 569, 295, 1, 316, 2, 77, 3096, 367, 3097, 1461, 24, 187, 893],
 [3098, 21, 1958, 304, 19, 45, 44],
 [409, 3099, 3100],
 [3101, 132, 609, 79, 3, 193, 368, 17, 131, 3, 158, 199],
 [3102, 127, 1, 139, 226, 2, 1020, 9, 29, 1, 222, 74, 55, 2, 3103, 16, 3104],
 [67, 894, 423],
 [1959,
  119,
  52,
  56,
  211,
  159,
  387,
  669,
  48,
  68,
  255,
  1462,
  3,
  3105,
  71,
  570,
  

In [11]:
# +1 because Tokenizer indices start at 1, while index 0 is used as the padding value;
# the embedding layer therefore needs len(index_word) + 1 rows.
vocab_size = len(token.index_word) + 1
vocab_size

10135

In [12]:
# The encoded tweets all have different lengths, so bring them to one common length:
# shorter sequences are filled with zeros at the end (padding="post").
max_length = 120
x = pad_sequences(encoded_text, maxlen=max_length, padding="post")
x

# Now every row has the same length (max_length).

array([[    8,   304,     6, ...,     0,     0,     0],
       [   52,    10,  1019, ...,     0,     0,     0],
       [ 3082,  1197,   668, ...,     0,     0,     0],
       ...,
       [ 1033,    21,  1021, ...,     0,     0,     0],
       [10134,   134,     7, ...,     0,     0,     0],
       [   94,    11,   226, ...,     0,     0,     0]])

In [13]:
x.shape

(4000, 120)

In [14]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same label proportions, and fix random_state so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2, stratify=y
)

In [15]:
# Convert all four splits to NumPy arrays in a single unpacking assignment.
x_train, x_test, y_train, y_test = (
    np.asarray(split) for split in (x_train, x_test, y_train, y_test)
)

In [16]:
vec_size = 300  # size of each learned word vector

# Layer stack, in order: embedding -> 1-D convolution with pooling and dropout ->
# two Dense layers applied at every remaining timestep -> global max pool over
# time -> single sigmoid unit.
model = Sequential(
    [
        Embedding(input_dim=vocab_size, output_dim=vec_size, input_length=max_length),
        Conv1D(filters=64, kernel_size=8, activation="relu"),
        MaxPooling1D(2),
        Dropout(0.2),
        Dense(units=32, activation="relu"),
        Dropout(0.5),
        Dense(units=16, activation="relu"),
        GlobalMaxPooling1D(),
        # One sigmoid output: the label is 0 or 1 (negative or positive).
        Dense(units=1, activation="sigmoid"),
    ]
)

In [17]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is reported during training.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 300)          3040500   
                                                                 
 conv1d (Conv1D)             (None, 113, 64)           153664    
                                                                 
 max_pooling1d (MaxPooling1D  (None, 56, 64)           0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 56, 64)            0         
                                                                 
 dense (Dense)               (None, 56, 32)            2080      
                                                                 
 dropout_1 (Dropout)         (None, 56, 32)            0         
                                                        

In [19]:
%%time
# NOTE(review): the held-out test split is also passed as validation data, so the
# per-epoch validation metrics are measured on the same rows used for testing.
model.fit(x_train, y_train, epochs=5, validation_data=(x_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: total: 2min 52s
Wall time: 34.1 s


<keras.callbacks.History at 0x1debe603820>

In [20]:
# Making predictions
# New text must go through the same tokenizer and padding as the training data.
def get_encoded_data(x):
    """Encode raw text into padded index sequences ready for ``model.predict``.

    Uses the notebook-level ``token`` (the fitted Tokenizer) and ``max_length``
    so that new text is encoded exactly like the training data.

    Args:
        x: A single string, or a list of strings.

    Returns:
        The array produced by ``pad_sequences``: one row per input text,
        each row ``max_length`` entries long.
    """
    # A bare string would be iterated character by character by
    # texts_to_sequences, giving one sequence per character; wrap it in a list
    # so it is treated as one text.
    if isinstance(x, str):
        x = [x]
    sequences = token.texts_to_sequences(x)
    return pad_sequences(sequences, maxlen=max_length, padding="post")

In [22]:
# The model ends in a sigmoid, so a score close to 0 points to label 0 (negative sentiment).
# NOTE(review): reusing the name `x` here overwrites the padded feature matrix built earlier.
x = ["worst services. will not come again!"]
model.predict(get_encoded_data(x))

array([[0.00035208]], dtype=float32)

In [24]:
# A score close to 1 points to label 1 (positive sentiment).
x = ["Loved it!"]
model.predict(get_encoded_data(x))

array([[0.81922317]], dtype=float32)