In [1]:
import pandas as pd
import pickle
import re
stop_words = pickle.load(open("../data/external/stop_words_en.pkl", "rb"))
TRAIN_FILENAME = "../data/external/condensed_2016.json.zip"

all_data = pd.read_json(TRAIN_FILENAME)

all_data = all_data.loc[all_data['source'] != 'Twitter Ads']

# Assume that only Android is authored by Trump. All else is not Trump.
def isTrump(row):
    if row['source'] == 'Twitter for Android' :
      return True
    return False

all_data['isTrump'] = all_data.apply(lambda row: isTrump (row),axis=1)

all_data.groupby('isTrump').count()

Unnamed: 0_level_0,created_at,favorite_count,id_str,in_reply_to_user_id_str,is_retweet,retweet_count,source,text
isTrump,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,2327,2327,2327,2,2327,2327,2327,2327
True,1835,1835,1835,4,1835,1835,1835,1835


In [2]:
tweets_only = all_data[all_data.is_retweet ==False]


In [3]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [4]:

def clean_text(text, remove_stopwords = True):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'"', ' ', text)
    text = re.sub('\n',' ',text)
    
    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        #stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)

    return text

In [5]:
def clean_tweet(line):
    line = line.lower()
    line = re.sub('"','',line)
    line = re.sub('\'','',line)
    line = re.sub('\n',' ',line)
    line = re.sub('\t',' ',line)
    return line
    
cleaned = tweets_only['text'].map(clean_text)
   
cleaned.to_csv('../data/processed/just-tweets.txt', index=False, sep='\t', encoding = 'utf-8')

In [6]:
# do some fasttexting
import fastText

max_features = 100

ft_model = fastText.FastText.train_unsupervised('../data/processed/just-tweets.txt',
                                                wordNgrams=3,
                                                thread=4,
                                                model='cbow',
                                                dim = max_features)


In [7]:
# shape up the training set
import numpy as np

X = []
for text in cleaned:
    vec = ft_model.get_sentence_vector(text)
    X.append(vec)

# reshape a few things
# https://stackoverflow.com/questions/46197493/using-gensim-doc2vec-with-keras-conv1d-valueerror
X = np.array(X)

X = X.reshape((X.shape[0],X.shape[1],1))

X.shape

(3974, 100, 1)

In [8]:
# shape up the labels
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit([True,False])
Y = le.transform(tweets_only['isTrump'])

In [9]:
#now that we have the vectors, we can try to classify. 
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

# set parameters:
batch_size = 128
filters = 250
kernel_size = 2
hidden_dims = 250
epochs = 20



model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
#model.add(Embedding(max_features,
#                    embedding_dims,
#                    input_length=maxlen))


#model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(input_shape = (X.shape[1],X.shape[2]),
                 filters=filters,
                 kernel_size=kernel_size,
                 padding='valid',
                 activation='relu'))

# Use max pooling:
model.add(GlobalMaxPooling1D())

# Add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# Add a vanilla hidden layer:
model.add(Dense(50))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(X, Y,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1)



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Train on 3576 samples, validate on 398 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb7ef464b70>

In [10]:
Y

array([0, 1, 1, ..., 0, 0, 1])

In [11]:
X

array([[[ 0.08728243],
        [-0.1128547 ],
        [-0.00620146],
        ...,
        [-0.11858591],
        [-0.02351624],
        [ 0.00816149]],

       [[ 0.08779234],
        [-0.11284603],
        [-0.00658891],
        ...,
        [-0.11880445],
        [-0.0235959 ],
        [ 0.00803726]],

       [[ 0.0875293 ],
        [-0.1128301 ],
        [-0.00718569],
        ...,
        [-0.11879747],
        [-0.02352853],
        [ 0.0078991 ]],

       ...,

       [[ 0.08820534],
        [-0.11232013],
        [-0.00707519],
        ...,
        [-0.11911884],
        [-0.02436996],
        [ 0.00731871]],

       [[ 0.08758877],
        [-0.11262521],
        [-0.00678988],
        ...,
        [-0.11877411],
        [-0.02384715],
        [ 0.00831338]],

       [[ 0.08845446],
        [-0.1132536 ],
        [-0.00601076],
        ...,
        [-0.11878899],
        [-0.0235492 ],
        [ 0.00741486]]], dtype=float32)

In [12]:
help(fastText.FastText)

Help on module fastText.FastText in fastText:

NAME
    fastText.FastText

DESCRIPTION
    # Copyright (c) 2017-present, Facebook, Inc.
    # All rights reserved.
    #
    # This source code is licensed under the BSD-style license found in the
    # LICENSE file in the root directory of this source tree. An additional grant
    # of patent rights can be found in the PATENTS file in the same directory.

FUNCTIONS
    load_model(path)
        Load a model given a filepath and return a model object.
    
    tokenize(text)
        Given a string of text, tokenize it and return a list of tokens
    
    train_supervised(input, lr=0.1, dim=100, ws=5, epoch=5, minCount=1, minCountLabel=0, minn=0, maxn=0, neg=5, wordNgrams=1, loss='softmax', bucket=2000000, thread=12, lrUpdateRate=100, t=0.0001, label='__label__', verbose=2, pretrainedVectors='')
        Train a supervised model and return a model object.
        
        input must be a filepath. The input text does not need to be tokenized

In [13]:
import fastText.util

In [14]:
help(fastText.util)

Help on package fastText.util in fastText:

NAME
    fastText.util

DESCRIPTION
    # Copyright (c) 2017-present, Facebook, Inc.
    # All rights reserved.
    #
    # This source code is licensed under the BSD-style license found in the
    # LICENSE file in the root directory of this source tree. An additional grant
    # of patent rights can be found in the PATENTS file in the same directory.

PACKAGE CONTENTS
    util

DATA
    absolute_import = _Feature((2, 5, 0, 'alpha', 1), (3, 0, 0, 'alpha', 0...
    division = _Feature((2, 2, 0, 'alpha', 2), (3, 0, 0, 'alpha', 0), 8192...
    print_function = _Feature((2, 6, 0, 'alpha', 2), (3, 0, 0, 'alpha', 0)...
    unicode_literals = _Feature((2, 6, 0, 'alpha', 2), (3, 0, 0, 'alpha', ...

FILE
    /opt/conda/lib/python3.5/site-packages/fastText/util/__init__.py




In [15]:
fastText.util.find_nearest_neighbor("CrookedHillary",ft_model, ban_set='' )

TypeError: Object arrays are not currently supported

In [None]:
help(ft_model)