In [1]:
import os
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np 
from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec 
LabeledSentence = gensim.models.doc2vec.LabeledSentence

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.tokenize import word_tokenize

In [2]:
DATA_PATH = '/Users/lia/work/study/epfl/CS433-MachineLearning/ML-GroupProject2/twitter-datasets'

TRAIN_NEG_PATH = os.path.join(DATA_PATH, 'train_neg.txt') # 100'000 negative tweets
TRAIN_POS_PATH = os.path.join(DATA_PATH, 'train_pos.txt') # 100'000 positive tweets
# TRAIN_NEG_PATH = os.path.join(DATA_PATH, 'train_neg_full.txt') # 2'500'000 negative tweets
# TRAIN_POS_PATH = os.path.join(DATA_PATH, 'train_pos_full.txt') # 2'500'000 positive tweets
# TEST_PATH = os.path.join(DATA_PATH, 'test_data.txt')

In [3]:
def load_data_and_labels(train_positive_path, train_negative_path):
    """
    Loads tweet data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    positive_examples = list(open(train_positive_path).readlines())
    negative_examples = list(open(train_negative_path).readlines())
    x_text_train = positive_examples + negative_examples
    
    x_tokens_train = [word_tokenize(x) for x in x_text_train]
    
    positive_labels = [1 for _ in positive_examples]
    negative_labels = [0 for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text_train, x_tokens_train, y]

In [4]:
x_text_train, x_tokens_train, y_train_full = load_data_and_labels(TRAIN_POS_PATH, TRAIN_NEG_PATH)

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x_tokens_train,
                                                    y_train_full, 
                                                    test_size=0.01, 
                                                    random_state=42)

In [6]:
def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

198000it [00:01, 161588.48it/s]
2000it [00:00, 310493.69it/s]


In [7]:
x_train[0]

LabeledSentence(words=['<', 'user', '>', 'hey', 'thankx', 'for', 'following', 'dear'], tags=['TRAIN_0'])

In [8]:
n_dim = 200

In [9]:
tweet_w2v = Word2Vec(size=n_dim, min_count=1)
tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
tweet_w2v.train([x.words for x in tqdm(x_train)], 
                epochs=tweet_w2v.iter,
                total_examples=tweet_w2v.corpus_count)
print('Number of words: {}'.format(len(tweet_w2v.wv.vocab)))

100%|██████████| 198000/198000 [00:00<00:00, 1950467.94it/s]
100%|██████████| 198000/198000 [00:00<00:00, 1927739.70it/s]


Number of words: 107666


In [10]:
tweet_w2v.most_similar('good')

[('nice', 0.7896394729614258),
 ('great', 0.7598593235015869),
 ('bad', 0.7344122529029846),
 ('lovely', 0.6982096433639526),
 ('terrible', 0.6812501549720764),
 ('ah.bad', 0.6438660621643066),
 ('busy', 0.6360503435134888),
 ('awesome', 0.6346652507781982),
 ('goood', 0.6319184303283691),
 ('horrible', 0.6308711767196655)]

In [11]:
# # importing bokeh library for interactive dataviz
# import bokeh.plotting as bp
# from bokeh.models import HoverTool, BoxSelectTool
# from bokeh.plotting import figure, show, output_notebook

# # defining the chart
# output_notebook()
# plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
#     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
#     x_axis_type=None, y_axis_type=None, min_border=1)

# # getting a list of word vectors. limit to 10000. each is of 200 dimensions
# word_vectors = [tweet_w2v[w] for w in list(tweet_w2v.wv.vocab.keys())[:5000]]

# # dimensionality reduction. converting the vectors to 2d vectors
# from sklearn.manifold import TSNE
# tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
# tsne_w2v = tsne_model.fit_transform(word_vectors)

# # putting everything in a dataframe
# tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
# tsne_df['words'] = list(tweet_w2v.wv.vocab.keys())[:5000]

# # plotting. the corresponding word appears when you hover on the data point.
# plot_tfidf.scatter(x='x', y='y', source=tsne_df)
# hover = plot_tfidf.select(dict(type=HoverTool))
# hover.tooltips={"word": "@words"}
# show(plot_tfidf)

In [12]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in x_train])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print('vocab size : {}'.format(len(tfidf)))

vocab size : 11884


In [13]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [14]:
from sklearn.preprocessing import scale
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)

198000it [00:25, 7744.36it/s]
2000it [00:00, 7909.98it/s]


In [19]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, Convolution1D, Flatten, Dropout

model = Sequential()
model.add(Dense(64, activation='relu', input_dim=200))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=10, batch_size=128, verbose=2)

Epoch 1/10
 - 2s - loss: 0.4707 - acc: 0.7639
Epoch 2/10
 - 2s - loss: 0.4418 - acc: 0.7840
Epoch 3/10
 - 2s - loss: 0.4363 - acc: 0.7878
Epoch 4/10
 - 2s - loss: 0.4324 - acc: 0.7908
Epoch 5/10
 - 2s - loss: 0.4289 - acc: 0.7932
Epoch 6/10
 - 2s - loss: 0.4274 - acc: 0.7951
Epoch 7/10
 - 2s - loss: 0.4259 - acc: 0.7963
Epoch 8/10
 - 2s - loss: 0.4237 - acc: 0.7969
Epoch 9/10
 - 2s - loss: 0.4238 - acc: 0.7976
Epoch 10/10
 - 2s - loss: 0.4222 - acc: 0.7984


<keras.callbacks.History at 0x146dbd898>

In [18]:
score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
print(score[1])

0.805499999523
