# Load

In [2]:
import pandas as pd
import pickle
# Stop-word collection consulted by clean_message() when filtering tokens.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open("../data/external/stop_words_en.pkl", "rb") as stop_words_file:
    stop_words = pickle.load(stop_words_file)

# Path of the training data; handed straight to pandas.read_json below.
TRAIN_FILENAME = "../data/external/condensed_2016.json.zip"

all_data = pd.read_json(TRAIN_FILENAME)


In [3]:
# Number of rows for each distinct value of the `source` column.
all_data.groupby('source')['source'].count()

source
Instagram                 2
Media Studio              1
Mobile Web (M5)           1
Periscope                 1
TweetDeck                 2
Twitter Ads              63
Twitter Web Client      340
Twitter for Android    1835
Twitter for iPad         22
Twitter for iPhone     1958
Name: source, dtype: int64

In [4]:
# Drop rows whose source is 'Twitter Ads'. The explicit .copy() makes the
# result an independent frame, so adding columns to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
all_data = all_data.loc[all_data['source'] != 'Twitter Ads'].copy()

In [5]:
# Labelling heuristic: a row counts as Trump-authored only when it was sent
# from the Android client; every other source is treated as not Trump.
def isTrump(row):
    """Return True when ``row['source']`` is 'Twitter for Android', else False."""
    came_from_android = row['source'] == 'Twitter for Android'
    return True if came_from_android else False

# Label every row by handing it to isTrump (axis=1 means one call per row).
trump_flags = all_data.apply(isTrump, axis=1)
all_data['isTrump'] = trump_flags

# Non-null counts of every column, split by label value.
all_data.groupby('isTrump').count()

Unnamed: 0_level_0,created_at,favorite_count,id_str,in_reply_to_user_id_str,is_retweet,retweet_count,source,text
isTrump,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,2327,2327,2327,2,2327,2327,2327,2327
True,1835,1835,1835,4,1835,1835,1835,1835


## feature.py ##

In [6]:
import string
import gensim
from gensim import corpora,models
from gensim.models import Phrases
from string import digits
from nltk.corpus import words
import pandas as pd
import pickle
import nltk
from nltk import word_tokenize
import itertools
import numpy as np
from keras.utils import np_utils

# Shared WordNet lemmatizer instance used by clean_message().
lemma = nltk.wordnet.WordNetLemmatizer()

def clean_message(single_message):
    """Turn one raw message into a list of lemmatized tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace every character that is not an ASCII letter with a space;
      3. tokenize with ``word_tokenize``;
      4. keep tokens longer than two characters that are not in
         ``stop_words`` (the punctuation/digit checks are retained from the
         original filter);
      5. lemmatize each surviving token with the module-level ``lemma``.

    Args:
        single_message: the message text (a ``str``).

    Returns:
        A flat ``list`` of lemmatized token strings.
    """
    # Local import: the notebook only imports `re` in a later cell, so relying
    # on the global name would make this function depend on cell order.
    import re

    letters_only = re.sub("[^a-zA-Z]", " ", single_message.lower())
    return [
        lemma.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in string.punctuation
        and token not in digits
        and len(token) > 2
        and token not in stop_words
    ]



def build_feature(messages, vectorizer, train=True):
    """Convert raw messages into a dense feature matrix via ``vectorizer``.

    Every message is tokenised with ``clean_message`` and its tokens are
    re-joined with single spaces before being passed to the vectorizer.

    Args:
        messages: object exposing ``.map`` (the notebook passes a pandas
            Series of text).
        vectorizer: object with ``fit`` and ``transform``; the transform
            result must support ``.todense()``.
        train: when True, the vectorizer is fitted on these messages and
            pickled to ``../models/vectorizer.pkl`` before transforming.

    Returns:
        ``np.array`` of the dense features when ``train`` is True; otherwise
        the object returned by ``.todense()`` is handed back unchanged.
    """
    tokenised = messages.map(clean_message)
    documents = [' '.join(str(token) for token in tokens) for tokens in tokenised]

    if train:
        # Fit on the training text and persist the fitted vectorizer.
        vectorizer = vectorizer.fit(documents)
        with open('../models/vectorizer.pkl', 'wb') as f:
            pickle.dump(vectorizer, f)

    dense = vectorizer.transform(documents).todense()
    return np.array(dense) if train else dense

def save_encoder(label):
    """Fit a LabelEncoder on ``label`` and save its classes to 'classes.npy'.

    Returns:
        The fitted encoder.
    """
    fitted = preprocessing.LabelEncoder().fit(label)
    np.save('classes.npy', fitted.classes_)
    return fitted
    
    
def one_hot_vec(var):
    """One-hot encode ``var`` using the classes stored in 'classes.npy'.

    A fresh LabelEncoder is given the saved ``classes_`` array, ``var`` is
    mapped to integer codes, and the codes are expanded with
    ``np_utils.to_categorical``.
    """
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.load('classes.npy')
    integer_codes = label_encoder.transform(var)
    return np_utils.to_categorical(integer_codes)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## train.py ##

In [7]:
# Download the 'punkt' and 'wordnet' NLTK packages (presumably required by
# word_tokenize and the WordNet lemmatizer used in clean_message — confirm).
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/keras/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/keras/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from collections import defaultdict
import string
import nltk
from gensim import corpora,models
import gensim
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
#from nltk.corpus import stopwords
#stopwords.words("english")
import re
from nltk.stem import RegexpStemmer
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
#import tflearn




def create_training_set(messages, labels=None):
    """Build the bag-of-words training matrix and its one-hot labels.

    Args:
        messages: the raw message texts, passed on to ``build_feature``.
        labels: the label for each message, aligned with ``messages``.
            Defaults to ``all_data['isTrump']`` so existing single-argument
            calls behave as before.

    Returns:
        A tuple ``(training_set, one_hot_labels, vectorizer)``.

    Raises:
        ValueError: if ``messages`` and ``labels`` differ in length, which
            would silently misalign features and targets.
    """
    if labels is None:
        # Backward-compatible default: the notebook-level label column.
        labels = all_data['isTrump']
    if len(labels) != len(messages):
        raise ValueError(
            "messages and labels must have the same length "
            "(got %d and %d)" % (len(messages), len(labels))
        )

    # Creating Features from a Bag of Words
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=2000,
                                 ngram_range=(1, 2))

    training_set = build_feature(messages, vectorizer, train=True)
    # Persist the label classes first; one_hot_vec reloads them from disk.
    save_encoder(labels)
    return training_set, one_hot_vec(labels), vectorizer
    
# Build the feature matrix, one-hot labels and fitted vectorizer from the text column.
training_set, labels, vectorizer = create_training_set(all_data['text'])


In [9]:
# Inspect the token lists clean_message produces for each row's text.
all_data['text'].map(clean_message)

0       [realdonaldtrump, happy, birthday, donaldjtrum...
1       [happy, birthday, donaldjtrumpjr, http, urxycd...
2       [happy, new, year, including, many, enemy, fou...
3       [russian, playing, cnn, nbcnews, fool, funny, ...
4       [join, american, founded, hall, fame, legend, ...
5        [great, move, delay, putin, always, knew, smart]
6       [administration, follow, two, simple, rule, ht...
7       [economist, say, trump, delivered, hope, http,...
8       [anymore, beginning, end, horrible, iran, deal...
9       [continue, let, israel, treated, total, disdai...
10      [best, disregard, many, inflammatory, presiden...
11      [consumer, confidence, index, december, surged...
12      [president, obama, campaigned, hard, personall...
13      [djt, foundation, unlike, foundation, never, p...
14      [gave, million, dollar, djt, foundation, raise...
15      [world, gloomy, hope, market, nearly, christma...
16      [united, nation, great, potential, right, club...
17      [presi

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold
from keras.models import Sequential
from sklearn.metrics import accuracy_score



def save_model(model):
    """Persist a model's architecture and weights to the working directory.

    The architecture (``model.to_json()``) is written to
    'model_architecture.json' and the weights are saved to 'model_weights.h5',
    overwriting an existing weights file.

    Args:
        model: object providing ``to_json()`` and
            ``save_weights(path, overwrite=...)``.
    """
    json_model = model.to_json()
    # Context manager ensures the file is flushed and closed deterministically.
    with open('model_architecture.json', 'w') as architecture_file:
        architecture_file.write(json_model)
    model.save_weights('model_weights.h5', overwrite=True)

def load_model():
    """Rebuild the model saved by ``save_model`` and compile it.

    Reads the architecture from 'model_architecture.json', loads the weights
    from 'model_weights.h5', and compiles with categorical cross-entropy loss
    and the 'adam' optimizer.

    Returns:
        The compiled model.
    """
    # Imported here because the notebook's import cells never bring
    # model_from_json into scope (only Sequential is imported from keras.models).
    from keras.models import model_from_json

    with open('model_architecture.json') as architecture_file:
        model = model_from_json(architecture_file.read())
    model.load_weights('model_weights.h5')
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model


X = training_set
Y = labels

# Sanity check: number of samples, then per-sample width, for features and labels.
for dataset in (X, Y):
    print(len(dataset))
    print(len(dataset[0]))

def build_model(optimizer='adam', dropout_rate=0.4, init='normal',
                num_features = 1000, num_classes = 19,
                units1 = 400, units2 = 400):
    """Create and compile a two-hidden-layer dense classifier.

    Layout: Dense(units1, relu) -> Dropout -> Dense(units2, relu) -> Dropout
    -> Dense(num_classes, softmax). Compiled with categorical cross-entropy
    loss and an accuracy metric.

    Args:
        optimizer: optimizer passed to ``compile``.
        dropout_rate: rate used by both Dropout layers.
        init: kernel initializer used by every Dense layer.
        num_features: input dimension of the first layer.
        num_classes: width of the softmax output layer.
        units1: width of the first hidden layer.
        units2: width of the second hidden layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential([
        Dense(units1, input_dim=num_features, kernel_initializer=init, activation='relu'),
        Dropout(dropout_rate),
        Dense(units2, kernel_initializer=init, activation='relu'),
        Dropout(dropout_rate),
        Dense(num_classes, kernel_initializer=init, activation='softmax'),
    ])
    network.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return network


# Size the network from the data: input width = features per sample,
# output width = number of one-hot label columns.
model=build_model(num_features = len(X[0]), num_classes = len(Y[0]), dropout_rate=0.4)
# validation_split=0.2 holds out 20% of the samples for validation.
# NOTE(review): in the recorded run val_loss rises every epoch while the
# training loss keeps falling — looks like overfitting; consider fewer epochs
# or early stopping.
model.fit(X, Y, epochs=5, batch_size=64, verbose=2, validation_split=0.2)
save_model(model)

4162
2000
4162
2
Train on 3329 samples, validate on 833 samples
Epoch 1/5
 - 1s - loss: 0.4815 - acc: 0.7516 - val_loss: 0.5581 - val_acc: 0.7203
Epoch 2/5
 - 0s - loss: 0.2737 - acc: 0.8862 - val_loss: 0.5844 - val_acc: 0.7467
Epoch 3/5
 - 0s - loss: 0.1555 - acc: 0.9423 - val_loss: 0.7547 - val_acc: 0.7215
Epoch 4/5
 - 0s - loss: 0.0901 - acc: 0.9694 - val_loss: 0.9299 - val_acc: 0.7275
Epoch 5/5
 - 0s - loss: 0.0447 - acc: 0.9850 - val_loss: 1.0959 - val_acc: 0.7215


## grid search for hyperparameters ##

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV


# Wrap the Keras builder so scikit-learn's search utilities can drive it.
model = KerasClassifier(
    build_fn=build_model,
    verbose=0,
    num_features=len(X[0]),
    num_classes=len(Y[0]),
)

# Candidate values for each hyperparameter explored by the grid search.
optimizers = ['adam']
init = ['normal']
epochs = [50]
batches = [64, 128]
dropout_rate = [0.4]
units1 = [40, 30]
units2 = [40, 30]
seed = 10

param_grid = {
    'optimizer': optimizers,
    'epochs': epochs,
    'batch_size': batches,
    'dropout_rate': dropout_rate,
    'init': init,
    'units1': units1,
    'units2': units2,
}

# 10-fold shuffled cross-validation with a fixed seed for the fold assignment.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold)
grid_result = grid.fit(X, Y)

#grid_result.cv_results_

# Report the best configuration, then the mean/std test score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Display the feature matrix returned by create_training_set.
training_set