In [1]:
import pandas as pd

base_dir = "."

# Read the twitter data file csv
df = pd.read_csv(base_dir + "/preprocessed_tweets.csv", encoding='latin-1')
df = df.drop(columns=['msg_id'])
df

Unnamed: 0,content,label
0,Cotiviti Holdings Incorporated (NYSE:COTV) Ca...,1
1,Boeing hit hard by tariff and trade war headl...,-1
2,<NAME/> <HASHTAG/> Microsoft is a proud spons...,1
3,"<NAME/> <NAME/> It's not fake news, I own Boe...",1
4,<NAME/> Canada should consider slapping 300% ...,-1
5,"'Upwards of 20,000 workers' could lose jobs d...",-1
6,Audi could be hedged thanks to its Mexican fa...,0
7,"$TSLA Short Interest: 28,382,800 vs Prev 28,7...",1
8,The most logical way-forward for <HASHTAG/> S...,-1
9,<NAME/> <NAME/> Or could lead to a monopoly w...,1


In [9]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.5)
print(train.shape)
print(test.shape)

#print('Label:', train['label'][0])
#print('Text:', train['content'][0])

(4800, 2)
(3200, 2)


In [10]:
import re
import string
import os
import matplotlib
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer

# Custom Tokenizer
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

# Plot sentence by lenght
plt.hist([len(tokenize(s)) for s in train['content'].values], bins=50)
plt.title('Tokens per sentence')
plt.xlabel('Len (number of token)')
plt.ylabel('# samples')
plt.show()

<Figure size 640x480 with 1 Axes>

In [11]:
BATCH_SIZE = 128 # Number of examples used in each iteration
EPOCHS = 3 # Number of passes through entire dataset
VOCAB_SIZE = 30000 # Size of vocabulary dictionary
MAX_LEN = 25 # Max length of review (in words)
EMBEDDING_DIM = 40 # Dimension of word embedding vector

In [12]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalMaxPooling1D, Flatten, Conv1D, Dropout, Activation
from keras.preprocessing.text import Tokenizer

import tensorflow as tf
import numpy as np

# For reproducibility
from tensorflow import set_random_seed
from numpy.random import seed

seed(1)
set_random_seed(2)

tweet_tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tweet_tokenizer.fit_on_texts(train['content'].values)

x_train_seq = tweet_tokenizer.texts_to_sequences(train['content'].values)
x_val_seq = tweet_tokenizer.texts_to_sequences(test['content'].values)

x_train = sequence.pad_sequences(x_train_seq, maxlen=MAX_LEN, padding="post", value=0)
x_val = sequence.pad_sequences(x_val_seq, maxlen=MAX_LEN, padding="post", value=0)

y_train, y_val = train['label'].values, test['label'].values

print('First sample before preprocessing: \n', train['content'].values[0], '\n')
print('First sample after preprocessing: \n', x_train[0])

First sample before preprocessing: 
  Hey <NAME/> Uber needs to add an ambulance fleet in developing countries. This can be an unprecendented new growthÉ <LINK/> 

First sample after preprocessing: 
 [2268    2   46  494    5  438   36 6306 3924    9 1420 1896   21   77
   26   36 6307   25 2854    1    0    0    0    0    0]


In [13]:
NUM_FILTERS = 250
KERNEL_SIZE = 3
HIDDEN_DIMS = 250

In [14]:
# CNN Model
print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into EMBEDDING_DIM dimensions
model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LEN))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn NUM_FILTERS filters
model.add(Conv1D(NUM_FILTERS,
                 KERNEL_SIZE,
                 padding='valid',
                 activation='relu',
                 strides=1))

# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(HIDDEN_DIMS))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 25, 40)            1200000   
_________________________________________________________________
dropout_3 (Dropout)          (None, 25, 40)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 23, 250)           30250     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 250)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_4 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_3 (Activation)    (None, 250)               0     

In [15]:
# fit a model
model.fit(x_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.1)

# Evaluate the model
score, acc = model.evaluate(x_val, y_val, batch_size=BATCH_SIZE)
print('\nAccuracy: ', acc*100)

pred = model.predict_classes(x_val)

Train on 4320 samples, validate on 480 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy:  32.9375
