In [21]:
# load up the tweets
import sys
sys.path.append('../src')
from data import raw_tweets

all_data = raw_tweets.load()
all_data = raw_tweets.label(all_data)

all_data.groupby('isTrump').count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['isTrump'] = [True if x == 'Twitter for Android' else False for x in data['source']]


Unnamed: 0_level_0,created_at,favorite_count,id_str,in_reply_to_user_id_str,is_retweet,retweet_count,source,text
isTrump,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,2327,2327,2327,2,2327,2327,2327,2327
True,1835,1835,1835,4,1835,1835,1835,1835


In [14]:
tweets_only = all_data[all_data.is_retweet ==False]


In [15]:
# clean the tweets a bit. Can remove stops and/or lemmatize as needed.

import pandas as pd
from features import build_features
    
cleaned = pd.Series([build_features.clean_text(text, remove_stopwords = False, lemmatize = True)
           for text in tweets_only['text']])



In [16]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
import multiprocessing
from sklearn import utils

def labelize_tweets_ug(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(LabeledSentence(t.split(), [prefix + '_%s' % i]))
    return result
  
all_x_w2v = labelize_tweets_ug(cleaned, 'all')

In [17]:
def get_vectors(model, corpus, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.index:
        prefix = 'all_' + str(i)
        vecs[n] = model.docvecs[prefix]
        n += 1
    return vecs

In [18]:
# shape up the training set
import numpy as np

dimensions = 300

cores = multiprocessing.cpu_count()
model_ug_dmm = Doc2Vec(dm=1, dm_mean=1, size=dimensions, window=4, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_dmm.build_vocab([x for x in tqdm(all_x_w2v)])

for epoch in range(50):
    model_ug_dmm.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dmm.alpha -= 0.002
    model_ug_dmm.min_alpha = model_ug_dmm.alpha
    
train_vecs_dmm = get_vectors(model_ug_dmm, cleaned, dimensions)
# reshape a few things
# https://stackoverflow.com/questions/46197493/using-gensim-doc2vec-with-keras-conv1d-valueerror
X = np.array(train_vecs_dmm)

X = X.reshape((X.shape[0],X.shape[1],1))

X.shape

100%|██████████| 3974/3974 [00:00<00:00, 3788219.11it/s]
100%|██████████| 3974/3974 [00:00<00:00, 4339537.65it/s]
100%|██████████| 3974/3974 [00:00<00:00, 3947930.86it/s]
100%|██████████| 3974/3974 [00:00<00:00, 4652013.42it/s]
100%|██████████| 3974/3974 [00:00<00:00, 4613386.13it/s]
100%|██████████| 3974/3974 [00:00<00:00, 4569123.93it/s]
100%|██████████| 3974/3974 [00:00<00:00, 4608284.24it/s]
100%|██████████| 3974/3974 [00:00<00:00, 4276081.09it/s]
100%|██████████| 3974/3974 [00:00<00:00, 4513448.17it/s]
100%|██████████| 3974/3974 [00:00<00:00, 4205946.02it/s]
100%|██████████| 3974/3974 [00:00<00:00, 4593046.04it/s]
100%|██████████| 3974/3974 [00:00<00:00, 4258600.94it/s]
100%|██████████| 3974/3974 [00:00<00:00, 3889886.60it/s]
100%|██████████| 3974/3974 [00:00<00:00, 4436562.18it/s]
100%|██████████| 3974/3974 [00:00<00:00, 4393295.76it/s]
100%|██████████| 3974/3974 [00:00<00:00, 4241263.13it/s]
100%|██████████| 3974/3974 [00:00<00:00, 4792456.61it/s]
100%|██████████| 3974/3974 [00:

(3974, 300, 1)

In [19]:
# shape up the labels
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit([True,False])
Y = le.transform(tweets_only['isTrump'])

In [20]:
#now that we have the vectors, we can try to classify. 
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D

# set parameters:
batch_size = 128
filters = 64
kernel_size = 3
hidden_dims = 250
epochs = 20



model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
#model.add(Embedding(max_features,
#                    embedding_dims,
#                    input_length=maxlen))


#model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(input_shape = (X.shape[1],X.shape[2]),
                 filters=filters,
                 kernel_size=kernel_size,
                 padding='valid',
                 activation='relu'))
model.add(Conv1D(filters=filters,
                 kernel_size=kernel_size, activation='relu'))
model.add(Conv1D(filters=filters,
                 kernel_size=kernel_size, activation='relu'))
model.add(Conv1D(filters=filters,
                 kernel_size=kernel_size, activation='relu'))
model.add(Dropout(0.25))

# Use max pooling:
model.add(MaxPooling1D(3))

model.add(Conv1D(filters=filters,
                 kernel_size=2, activation='relu'))
model.add(Conv1D(filters=filters,
                 kernel_size=2, activation='relu'))
model.add(Conv1D(filters=filters,
                 kernel_size=2, activation='relu'))
model.add(Conv1D(filters=filters,
                 kernel_size=2, activation='relu'))
model.add(Dropout(0.25))

# Use max pooling:
model.add(GlobalMaxPooling1D())

# Add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(X, Y,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.25)



Train on 2980 samples, validate on 994 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f0cecb89dd8>

In [None]:
model_ug_dmm.most_similar("maga")