In [1]:
#Import the libraries

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from tqdm import tqdm
import multiprocessing
from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec
from sklearn import utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding

In [None]:
# Load the cleaned tweets CSV produced by the cleanTweets microservice.
# TODO: volume mapping for the microservice is still to be decided.

csv = "./resources/clean_tweets.csv"
df = pd.read_csv(filepath_or_buffer=csv)

In [None]:
# Display the (rows, columns) dimensions of the loaded tweets DataFrame.
df.shape

In [None]:
# Preview the first 10 rows of the loaded tweets DataFrame.
df.head(10)

In [None]:
# Count the null entries in each column.
df.isnull().sum()  # NOTE(review): the author reports no missing values; no output is saved here, so re-verify on a fresh run

In [8]:
# Model input is the tweet text column; the target is the sentiment column.
x = df["tweets"]
y = df["sentiment"]

In [9]:
# Draw and display one random integer in [0, 1000).
# NOTE(review): the value is only displayed, not assigned, so later cells
# cannot reuse it as a seed.
np.random.randint(0, high=1000)

863

In [10]:
#Split into train, test and validation
# 98% of the rows go to training; the remaining 2% is halved into the test
# and validation sets. A fixed seed replaces the freshly drawn random_state
# so the same partition is produced on every Restart & Run All.

SPLIT_SEED = 42

x_train, x_val_test, y_train, y_val_test = \
    train_test_split(x, y, test_size=0.02, random_state=SPLIT_SEED)

x_test, x_val, y_test, y_val = \
    train_test_split(x_val_test, y_val_test, test_size=0.5, random_state=SPLIT_SEED)

In [11]:
# Sanity check: the three feature partitions together account for every row of x.
sum(part.shape[0] for part in (x_train, x_test, x_val)) == x.shape[0]

True

In [12]:
# Sanity check: the three label partitions together account for every row of y.
sum(part.shape[0] for part in (y_train, y_test, y_val)) == y.shape[0]

True

In [13]:
#Word2Vec offers two training architectures: CBOW and Skip-gram
#CBOW: predicts a word from its surrounding context
#Skip-gram: predicts the surrounding context from a word

In [14]:
#Word2Vec unigram - CBOW model (sg=0)

cores = multiprocessing.cpu_count()  #no. of CPUs reported by the OS
# Use half of the CPUs as a whole number, and never fewer than one:
# `cores/2` is a float under Python 3 and evaluates to 0.5 on a single-core host.
w2v_workers = max(1, cores // 2)
model_ug_cbow = Word2Vec(sg=0, workers=w2v_workers, min_count=2, size=200)
# Build the vocabulary from whitespace-split tweets. The loop variable is
# named `tweet` so it does not shadow the feature Series `x`.
model_ug_cbow.build_vocab([tweet.split() for tweet in tqdm(df.tweets)])

100%|██████████| 1596251/1596251 [00:06<00:00, 250250.14it/s]


In [15]:
# Train the CBOW model for one epoch on the whitespace-split tweets in shuffled order.
model_ug_cbow.train(
    utils.shuffle([tweet.split() for tweet in tqdm(df.tweets)]),
    total_examples=df.tweets.shape[0],
    epochs=1,
)

100%|██████████| 1596251/1596251 [00:06<00:00, 264914.22it/s]


(15791624, 20613984)

In [16]:
#Word2Vec Unigram - Skip Gram Model (sg=1)
# NOTE(review): size and min_count are left at gensim's defaults here, unlike
# the CBOW model (size=200, min_count=2) - confirm this is intentional.
# workers is an integer clamped to at least 1: `cores/2` is a float under
# Python 3 and evaluates to 0.5 on a single-core host.
model_ug_sg = Word2Vec(sg=1, workers=max(1, cores // 2))
# The loop variable is named `tweet` so it does not shadow the feature Series `x`.
model_ug_sg.build_vocab([tweet.split() for tweet in tqdm(df.tweets)])

100%|██████████| 1596251/1596251 [00:05<00:00, 266357.61it/s]


In [17]:
# Train the skip-gram model for one epoch on the whitespace-split tweets in shuffled order.
model_ug_sg.train(
    utils.shuffle([tweet.split() for tweet in tqdm(df.tweets)]),
    total_examples=df.tweets.shape[0],
    epochs=1,
)

100%|██████████| 1596251/1596251 [00:05<00:00, 269881.64it/s]


(15633668, 20613984)

In [18]:
#Map every word in the CBOW vocabulary to its trained vector,
#for use when filling the Keras embedding matrix

cbow_vectors = model_ug_cbow.wv
embeddings_index = {word: cbow_vectors[word] for word in cbow_vectors.vocab}
print('Found %s word vectors.' % len(embeddings_index))

Found 106460 word vectors.


In [19]:
# Fit the Keras tokenizer on the training tweets only, so the word index is
# built without looking at the validation or test text.
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(texts=x_train)

Using TensorFlow backend.


In [20]:
# Convert each training tweet to a list of integer word ids, then pad or
# truncate every list to a fixed length of 45.
sequences = tokenizer.texts_to_sequences(texts=x_train)
x_train_seq = pad_sequences(sequences=sequences, maxlen=45)
print('Shape of data tensor:', x_train_seq.shape)

Shape of data tensor: (1564325, 45)


In [21]:
# Encode the validation tweets with the tokenizer fitted on the training set
# and pad or truncate them to the same length of 45.
sequences_val = tokenizer.texts_to_sequences(texts=x_val)
x_val_seq = pad_sequences(sequences=sequences_val, maxlen=45)

In [22]:
# Build the (num_words, 200) weight matrix for the embedding layer: row i
# holds the CBOW vector of the word whose tokenizer index is i. Rows for
# words without a trained vector, and unused row indices, stay all-zero.
num_words = 100000
embedding_matrix = np.zeros((num_words, 200))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Skip indices beyond the matrix and words with no trained vector.
    if i < num_words and embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [23]:
#Build, compile and fit a 1D convolutional network on the frozen Word2Vec embeddings

# Embedding layer initialised from the pre-computed matrix; trainable=False
# keeps those weights fixed during fitting.
embedd = Embedding(
    input_dim=100000,
    output_dim=200,
    weights=[embedding_matrix],
    input_length=45,
    trainable=False,
)

# Layers: embedding -> bigram convolution -> global max pool -> dense -> sigmoid output.
model = Sequential([
    embedd,
    Conv1D(filters=100, kernel_size=2, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Kept as the cell's last expression so the returned History object is displayed.
model.fit(
    x_train_seq,
    y_train,
    validation_data=(x_val_seq, y_val),
    epochs=5,
    batch_size=32,
    verbose=2,
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1564325 samples, validate on 15963 samples
Epoch 1/5
 - 317s - loss: 0.4466 - acc: 0.7893 - val_loss: 0.4344 - val_acc: 0.7969
Epoch 2/5
 - 317s - loss: 0.4256 - acc: 0.8022 - val_loss: 0.4218 - val_acc: 0.8034
Epoch 3/5
 - 313s - loss: 0.4189 - acc: 0.8061 - val_loss: 0.4231 - val_acc: 0.8044
Epoch 4/5
 - 318s - loss: 0.4147 - acc: 0.8087 - val_loss: 0.4148 - val_acc: 0.8102
Epoch 5/5
 - 318s - loss: 0.4120 - acc: 0.8102 - val_loss: 0.4139 - val_acc: 0.8119


<keras.callbacks.History at 0x7fdd2a827e10>

In [25]:
# Persist the trained Keras model under a fixed name in the working directory.
model_name = "keras-word2vec-twitter-sentiment"
model.save(filepath=model_name)