In [3]:
import gensim
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Bidirectional, Conv1D, Dense, Embedding, GlobalMaxPool1D, LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import re
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

In [2]:
# Read the tweet dump into a DataFrame. Column names are supplied explicitly
# (pandas then treats the first line of the file as data, not as a header),
# and at most 1,600,000 rows are read.
column_names = ["sentiment", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(
    "data.csv",
    names=column_names,
    encoding="ISO-8859-1",
    nrows=1600000,
)

# Preview the first five rows.
df.head()

Unnamed: 0,sentiment,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Keep only the label and the tweet text. `.copy()` makes the result an
# independent frame so the column assignment below is not a write into a slice
# of the original (avoids SettingWithCopyWarning).
df = df[["sentiment", "text"]].copy()

# Remap the positive label from 4 to 1 so labels are 0/1.
# The result is assigned back instead of calling `.replace(..., inplace=True)`
# on `df["sentiment"]`: that chained in-place form mutates a temporary Series
# under pandas Copy-on-Write and would leave `df` unchanged.
df["sentiment"] = df["sentiment"].replace(to_replace=4, value=1)

In [None]:
# Bar chart of the per-label row counts; the two tick labels are renamed to
# "Negative" and "Positive" and drawn horizontally.
label_counts = df.groupby(by="sentiment").count()
ax = label_counts.plot.bar(title="Label Distribution", legend=False)
ax.set_xticklabels(["Negative", "Positive"], rotation=0)

In [5]:
def preprocess(text):
    """Normalise a raw tweet into a lowercase, space-separated token string.

    Steps, in order:
      1. Drop every non-ASCII character and lowercase the text.
      2. Turn the HTML entity ``&amp;`` (or ``&amp,``) back into ``&``.
      3. Replace hashtags, mentions and URLs with the placeholder tokens
         ``[#]``, ``[@]`` and ``[/]``. Each placeholder is padded with spaces
         so it always ends up as its own token.
      4. Remove every character that is not alphanumeric or whitespace, while
         keeping the three placeholder tokens intact.
      5. Collapse any run of three or more identical characters to two
         (e.g. ``"soooo"`` -> ``"soo"``).
      6. Collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned text as a ``str`` (possibly empty).
    """
    # Round-trip through ASCII to discard characters that cannot be encoded.
    text = text.encode(encoding="ascii", errors="ignore").decode(encoding="ascii", errors="ignore")
    text = text.lower()
    text = re.sub(pattern=r"&amp[,;]", repl="&", string=text)  # Clean ampersands

    # Placeholders are surrounded by spaces so they never stay glued to a
    # neighbouring word; the extra whitespace is collapsed in the last step.
    text = re.sub(pattern=r"#\S+", repl=" [#] ", string=text)  # Clean hashtags
    text = re.sub(pattern=r"@\S+", repl=" [@] ", string=text)  # Clean mentions
    text = re.sub(pattern=r"http\S+", repl=" [/] ", string=text)  # Clean URLs

    # Strip punctuation one character at a time. The first alternative matches
    # a whole placeholder and puts it back unchanged, so the tokens inserted
    # above survive this filter instead of being erased by it.
    text = re.sub(
        pattern=r"(\[[#@/]\])|[^A-Za-z0-9\s]",
        repl=lambda match: match.group(1) or "",
        string=text,
    )

    # Replace three or more consecutive identical characters with two.
    text = re.sub(pattern=r"(.)\1\1+", repl=r"\1\1", string=text)

    # Whitespace is normalised last, because the removals above can leave
    # double or trailing spaces behind.
    text = re.sub(pattern=r"\s+", repl=" ", string=text).strip()
    return text

# Clean every tweet and store the result next to the raw text.
df["preprocessed_text"] = df["text"].map(preprocess)

In [None]:
# Word cloud of the tweets labelled positive (sentiment == 1).
positive_corpus = " ".join(df.loc[df["sentiment"] == 1, "preprocessed_text"])

plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1000, height=500).generate(positive_corpus)
plt.imshow(wc, interpolation="bilinear")

In [None]:
# Word cloud of the tweets labelled negative (sentiment == 0).
negative_corpus = " ".join(df.loc[df["sentiment"] == 0, "preprocessed_text"])

plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1000, height=500).generate(negative_corpus)
plt.imshow(wc, interpolation="bilinear")

In [6]:
# Hold out 10% of the rows for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df["preprocessed_text"],
    df["sentiment"],
    test_size=0.1,
    random_state=0,
)

In [None]:
W2V_SIZE = 300

# gensim 4 renamed the embedding-dimension keyword from `size` to
# `vector_size`; choose the name that matches the installed major version so
# this cell runs on both gensim 3.x and 4.x.
_w2v_dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"

# NOTE: x_train must still hold the preprocessed *text* here. The padding cell
# later rebinds x_train to integer sequences, so this cell has to run first.
w2v_model = gensim.models.word2vec.Word2Vec(
    [text.split() for text in x_train],
    **{_w2v_dim_kwarg: W2V_SIZE},
)

# Number of words that received a vector. `wv.vocab` was removed in gensim 4,
# while the `wv.vectors` array (one row per word) is available in 3.x and 4.x.
len(w2v_model.wv.vectors)

In [None]:
VOCAB_SIZE = 60000

# An empty `filters` string disables the tokenizer's own character filtering,
# so the text is tokenised as preprocessed; words that cannot be mapped are
# represented by the "[OOV]" token.
tokenizer = Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token="[OOV]",
    filters="",
    lower=True,
)
tokenizer.fit_on_texts(x_train)

In [9]:
SEQUENCE_LENGTH = 300


def _encode(texts):
    """Map texts to integer id sequences and pad/truncate them to SEQUENCE_LENGTH."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=SEQUENCE_LENGTH)


# NOTE: x_train / x_test are rebound from text to padded integer arrays, so
# this cell is not re-runnable without re-running the train/test split first.
x_train = _encode(x_train)
x_test = _encode(x_test)

In [None]:
model = Sequential()

# Row i of the matrix holds the Word2Vec vector of the word whose tokenizer
# index is i. Rows for words without a Word2Vec vector stay all-zero.
embedding_matrix = np.zeros((VOCAB_SIZE, W2V_SIZE))
for word, i in tokenizer.word_index.items():
    # `word_index` is not truncated to `num_words` (see the Keras Tokenizer
    # docs), so it can contain indices >= VOCAB_SIZE. Skip them: they would
    # raise IndexError here, and the matrix only has VOCAB_SIZE rows anyway.
    if i >= VOCAB_SIZE:
        continue
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# Frozen embedding layer initialised with the pretrained vectors.
embedding_layer = Embedding(VOCAB_SIZE, W2V_SIZE, weights=[embedding_matrix], input_length=SEQUENCE_LENGTH, trainable=False)

# Two stacked bidirectional LSTMs, then a 1-D convolution with global max
# pooling, then a small dense head with a sigmoid output for the binary label.
model.add(embedding_layer)
model.add(Bidirectional(LSTM(100, dropout=0.4, return_sequences=True)))
model.add(Bidirectional(LSTM(100, dropout=0.4, return_sequences=True)))
model.add(Conv1D(100, 7, activation='relu'))
model.add(GlobalMaxPool1D())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

In [None]:
# Binary classification set-up: Adam optimizer, cross-entropy loss, accuracy
# reported during training.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Stop once validation accuracy has not improved by at least 1e-4 for 5 epochs.
early_stopping = EarlyStopping(monitor="val_accuracy", min_delta=1e-4, patience=5)

# Write a checkpoint only when validation accuracy reaches a new maximum.
best_checkpoint = ModelCheckpoint(
    filepath="./tmp/checkpoint",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
)

callbacks = [early_stopping, best_checkpoint]

In [None]:
# Train for up to 10 epochs in batches of 1024, holding out 10% of the
# training data for validation; the callbacks list is passed through to fit.
fit_options = dict(
    batch_size=1024,
    epochs=10,
    validation_split=0.1,
    verbose=1,
    callbacks=callbacks,
)
history = model.fit(x_train, y_train, **fit_options)

In [None]:
# Pull the per-epoch training and validation curves out of the fit history.
recorded = history.history
accuracy, val_accuracy = recorded['accuracy'], recorded['val_accuracy']
loss, val_loss = recorded['loss'], recorded['val_loss']

epochs = range(len(accuracy))

# One chart per metric: training curve in blue, validation curve in red.
charts = [
    ("Accuracy", accuracy, val_accuracy),
    ("Loss", loss, val_loss),
]
for position, (metric_name, train_curve, val_curve) in enumerate(charts):
    if position:
        # Every chart after the first gets a figure of its own.
        plt.figure()
    plt.plot(epochs, train_curve, 'b', label=f'Training {metric_name}')
    plt.plot(epochs, val_curve, 'r', label=f'Validation {metric_name}')
    plt.title(f'Training and Validation {metric_name}')
    plt.legend()
plt.show()

In [None]:
# Persist the fitted tokenizer with pickle.
# NOTE: only unpickle this file from a trusted source; loading a pickle can
# execute arbitrary code.
with open("tokenizer.pickle", mode="wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Persist the trained network.
model.save("model.hdf5")