# TensorFlow 2.0 - Word Embedding in NLP on Twitter Sentiment Data

In [None]:
import numpy as np
import pandas as pd
from numpy import array
from tensorflow.keras.layers import (
    Activation,
    Conv1D,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    GlobalMaxPooling1D,
    MaxPooling1D,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Load the labelled tweets and preview the first rows.
df = pd.read_csv("data/twitter4000.csv")
df.head()

In [None]:
# Count how many tweets carry each label:
# 1 is Positive Sentiments
# 0 is Negative Sentiments
df["sentiment"].value_counts()

In [None]:
# Raw tweet texts as a plain Python list; the tokenizer below is fitted on it.
text = df["twitts"].tolist()
text

In [None]:
# Target labels, taken from the "sentiment" column.
y = df["sentiment"]

In [None]:
# Build the word vocabulary from the tweets. This only learns the word index;
# no text is converted to integers yet.
token = Tokenizer()
token.fit_on_texts(text)
token

In [None]:
# Lookup table from integer index to word, as learned by fit_on_texts
vocab = token.index_word
vocab

In [None]:
# How does it work?
# x = ["i to the a and"]    # Before tokenization: a list of raw strings
# x = [[1, 2, 3, 4, 6]]     # After tokenization: one list of word indices per string
# (illustrative values; the actual indices depend on the fitted vocabulary)

x = ["i to the a and"]
token.texts_to_sequences(x)

In [None]:
# Convert every tweet into its sequence of word indices.
encoded_text = token.texts_to_sequences(text)
encoded_text

In [None]:
# +1 because Tokenizer word indices start at 1: index 0 is reserved and never
# assigned to a word (it is also the default padding value), so the embedding
# table needs len(index_word) + 1 rows.
vocab_size = len(token.index_word) + 1
vocab_size

In [None]:
# The encoded texts all have different lengths, so bring them to one common
# length: padding="post" appends the padding at the end of shorter sequences.
max_length = 120
x = pad_sequences(encoded_text, maxlen=max_length, padding="post")
x

# Every row of x now has the same length.

In [None]:
# Expected to be (number of tweets, max_length).
x.shape

In [None]:
# Hold out 20% of the data for testing. stratify=y keeps the label proportions
# the same in both splits, and random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2, stratify=y
)

In [None]:
# Make sure every split is a NumPy array before it is handed to Keras
# (the label splits originate from a pandas Series).
x_train, x_test, y_train, y_test = map(
    np.asarray, (x_train, x_test, y_train, y_test)
)

In [None]:
# Dimensionality of each learned word vector.
vec_size = 300

# The network is declared as a single ordered layer list.
model = Sequential(
    [
        # Map every token id to a trainable vec_size-dimensional vector.
        Embedding(input_dim=vocab_size, output_dim=vec_size, input_length=max_length),
        # Convolve over windows of 8 consecutive tokens, then downsample by 2.
        Conv1D(filters=64, kernel_size=8, activation="relu"),
        MaxPooling1D(2),
        Dropout(0.2),
        # These Dense layers run before the time axis is pooled away, so they
        # are applied to the last (feature) axis at every time step.
        Dense(units=32, activation="relu"),
        Dropout(0.5),
        Dense(units=16, activation="relu"),
        # Collapse the time axis by keeping the maximum of each feature.
        GlobalMaxPooling1D(),
        # Output is 1, i.e., 0 or 1. Positive or Negative
        Dense(units=1, activation="sigmoid"),
    ]
)

In [None]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is
# reported during training and validation.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

In [None]:
%%time
# Train for 5 epochs. Note that the held-out test split doubles as the
# validation data here, so the reported validation metrics are test metrics.
model.fit(x_train, y_train, epochs=5, validation_data=(x_test, y_test))

In [None]:
# Making predictions
# New text has to go through the same tokenize-and-pad steps that were applied
# to the training and test data before the model can consume it.
def get_encoded_data(x):
    """Encode raw texts as padded integer sequences for the model.

    ``x`` is a list of strings. Each string is converted to word indices with
    the fitted ``token`` and then padded at the end (or truncated) to
    ``max_length``. Returns the resulting 2D array.
    """
    return pad_sequences(
        token.texts_to_sequences(x), maxlen=max_length, padding="post"
    )

In [None]:
# Classify a clearly negative review.
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is to threshold the predicted probability at
# 0.5, which yields 1 (positive) or 0 (negative).
x = ["worst services. will not come again!"]
(model.predict(get_encoded_data(x)) > 0.5).astype("int32")

In [None]:
# Classify a clearly positive review.
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is to threshold the predicted probability at
# 0.5, which yields 1 (positive) or 0 (negative).
x = ["Loved it!"]
(model.predict(get_encoded_data(x)) > 0.5).astype("int32")