In [1]:
import sys
sys.path.append('../src')
from data import raw_tweets

all_data = raw_tweets.load()
all_data = raw_tweets.label(all_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['isTrump'] = [True if x == 'Twitter for Android' else False for x in data['source']]


In [2]:
tweets_only = all_data[all_data.is_retweet ==False]


In [3]:
from features import build_features
    
cleaned = tweets_only['text'].map(build_features.clean_text)
   
cleaned.to_csv('../data/processed/just-tweets.txt', index=False, sep='\t', encoding = 'utf-8')

[nltk_data] Downloading package wordnet to /home/keras/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# do some fasttexting
import fastText

max_features = 100

ft_model = fastText.FastText.train_unsupervised('../data/processed/just-tweets.txt',
                                                wordNgrams=3,
                                                thread=4,
                                                model='cbow',
                                                dim = max_features)


In [5]:
# shape up the training set
import numpy as np

X = []
for text in cleaned:
    vec = ft_model.get_sentence_vector(text)
    X.append(vec)

# reshape a few things
# https://stackoverflow.com/questions/46197493/using-gensim-doc2vec-with-keras-conv1d-valueerror
X = np.array(X)

X = X.reshape((X.shape[0],X.shape[1],1))

X.shape

(3974, 100, 1)

In [6]:
# shape up the labels
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit([True,False])
Y = le.transform(tweets_only['isTrump'])

In [7]:
#now that we have the vectors, we can try to classify. 
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

# set parameters:
batch_size = 128
filters = 250
kernel_size = 2
hidden_dims = 250
epochs = 20



model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
#model.add(Embedding(max_features,
#                    embedding_dims,
#                    input_length=maxlen))


#model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(input_shape = (X.shape[1],X.shape[2]),
                 filters=filters,
                 kernel_size=kernel_size,
                 padding='valid',
                 activation='relu'))

# Use max pooling:
model.add(GlobalMaxPooling1D())

# Add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# Add a vanilla hidden layer:
model.add(Dense(50))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(X, Y,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1)



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Train on 3576 samples, validate on 398 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7efc54a0efd0>

In [8]:
Y

array([0, 1, 1, ..., 0, 0, 1])

In [9]:
X

array([[[ 0.02562395],
        [ 0.12267342],
        [-0.22090374],
        ...,
        [-0.03441325],
        [ 0.01117426],
        [-0.13769613]],

       [[ 0.02631074],
        [ 0.12256388],
        [-0.22172602],
        ...,
        [-0.03418927],
        [ 0.01164399],
        [-0.13741669]],

       [[ 0.02632573],
        [ 0.12237485],
        [-0.22180703],
        ...,
        [-0.03365884],
        [ 0.01159522],
        [-0.13782223]],

       ...,

       [[ 0.02674141],
        [ 0.12294166],
        [-0.22161585],
        ...,
        [-0.0347605 ],
        [ 0.01071497],
        [-0.13757446]],

       [[ 0.02612884],
        [ 0.12224888],
        [-0.22117734],
        ...,
        [-0.03427832],
        [ 0.01144536],
        [-0.13772956]],

       [[ 0.0253894 ],
        [ 0.12241374],
        [-0.22168326],
        ...,
        [-0.03380027],
        [ 0.01163926],
        [-0.1375364 ]]], dtype=float32)

In [10]:
help(fastText.FastText)

Help on module fastText.FastText in fastText:

NAME
    fastText.FastText

DESCRIPTION
    # Copyright (c) 2017-present, Facebook, Inc.
    # All rights reserved.
    #
    # This source code is licensed under the BSD-style license found in the
    # LICENSE file in the root directory of this source tree. An additional grant
    # of patent rights can be found in the PATENTS file in the same directory.

FUNCTIONS
    load_model(path)
        Load a model given a filepath and return a model object.
    
    tokenize(text)
        Given a string of text, tokenize it and return a list of tokens
    
    train_supervised(input, lr=0.1, dim=100, ws=5, epoch=5, minCount=1, minCountLabel=0, minn=0, maxn=0, neg=5, wordNgrams=1, loss='softmax', bucket=2000000, thread=12, lrUpdateRate=100, t=0.0001, label='__label__', verbose=2, pretrainedVectors='')
        Train a supervised model and return a model object.
        
        input must be a filepath. The input text does not need to be tokenized

In [11]:
import fastText.util

In [12]:
help(fastText.util)

Help on package fastText.util in fastText:

NAME
    fastText.util

DESCRIPTION
    # Copyright (c) 2017-present, Facebook, Inc.
    # All rights reserved.
    #
    # This source code is licensed under the BSD-style license found in the
    # LICENSE file in the root directory of this source tree. An additional grant
    # of patent rights can be found in the PATENTS file in the same directory.

PACKAGE CONTENTS
    util

DATA
    absolute_import = _Feature((2, 5, 0, 'alpha', 1), (3, 0, 0, 'alpha', 0...
    division = _Feature((2, 2, 0, 'alpha', 2), (3, 0, 0, 'alpha', 0), 8192...
    print_function = _Feature((2, 6, 0, 'alpha', 2), (3, 0, 0, 'alpha', 0)...
    unicode_literals = _Feature((2, 6, 0, 'alpha', 2), (3, 0, 0, 'alpha', ...

FILE
    /opt/conda/lib/python3.5/site-packages/fastText/util/__init__.py




In [13]:
fastText.util.find_nearest_neighbor("CrookedHillary",ft_model, ban_set='' )

TypeError: Object arrays are not currently supported

In [None]:
help(ft_model)