In [1]:
import string
import csv
import re
import math
from random import shuffle

tweets = []
nb_lines = 0

# Patter for regular expression
noUrl_patter = r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'  # Remove links
noTag_pattern = "(?<!\w)@\w+"  # Remove tags


def preprocess_text(original_text):
    text = original_text.lower()
    noUrl_text = re.sub(noUrl_patter, '', text)
    noTag_text = re.sub(noTag_pattern, '', noUrl_text)
    noPunctuation_text = noTag_text.translate(None, string.punctuation)
    noDigit_text = noPunctuation_text.translate(None, string.digits).strip()
    list_words = noDigit_text.split(" ")
    if (" ") in list_words:
        list_words.remove(" ")
    return list_words

dataset = open("Dataset.csv", "rb")
reader = csv.reader(dataset)
reader.next()  # Skip the 1st line of the dataset which is the header
for row in reader:
    nb_lines += 1
    text = row[3]
    preprocessed_text = preprocess_text(text)
    tweets.append([preprocessed_text, int(row[1])])

shuffle(tweets)        
        
nb_learn = int(math.floor(nb_lines * 0.8))
nb_train = int(math.floor((nb_lines-nb_learn)/2))
nb_test = int(math.floor((nb_lines-nb_learn)/2))

In [2]:
print("This model uses " + str(nb_learn) + " data to create the vocabulaire.")
print("This model uses " + str(nb_train) + " data to train and " + str(nb_test) + " to test")

This model uses 1262891 data to create the vocabulaire.
This model uses 157861 data to train and 157861 to test


In [3]:
X_learn = [tweets[i][0] for i in range(nb_learn)]
Y_learn = [tweets[i][1] for i in range(nb_learn)]

all_words = []
for data in X_learn:
    all_words.extend(word for word in data)
    
all_diff_words = list(set(all_words))

nb_words = len(all_diff_words)

vocabulaire = {all_diff_words[i]: [i, 0] for i in range(nb_words)}

def manipulate_vocabulaire(old_vocabulaire):
    # Count the repitition of each word
    for data in X_learn:
        for word in data:
            old_vocabulaire[word][1] += 1
    
    # List all the word which its repitition is smaller than 5
    delete_keys = []
    for key in vocabulaire.keys():
        if old_vocabulaire[key][1] < 5:
            delete_keys.append(key)
            
    # Delete the key in the vocabulaire
    for key in delete_keys:
        old_vocabulaire.pop(key)
        
    # Create the new vocabulaire
    keys = old_vocabulaire.keys()
    size = len(keys)
    new_vocabulaire = {keys[i]: i for i in range(size)}
    
    return new_vocabulaire, size

(vocabulaire,size) = manipulate_vocabulaire(vocabulaire)


In [4]:
print("Size of the vocabulaire is " + str(size))

Size of the vocabulaire is 49881


In [5]:
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM, SimpleRNN, GRU

Using TensorFlow backend.


In [6]:
X_train = [tweets[i + nb_learn][0] for i in range(nb_train)]
Y_train = [tweets[i + nb_learn][1] for i in range(nb_train)]

X_test = [tweets[i + nb_learn + nb_train][0] for i in range(nb_test)]
Y_test = [tweets[i + nb_learn + nb_train][1] for i in range(nb_test)]

In [7]:
print(X_train[101])
print(X_test[101])
print(Y_train[101])
print(Y_test[101])

['is', 'preparing', 'for', 'tests']
['what', 'a', 'super', 'awesome', 'really', 'swell', 'day']
0
0


In [8]:
max_features = size
maxlen = 40  # cut texts after this number of words (among top max_features most common words)
batch_size = 512

print('Loading data...')
for i in range(nb_train):
    X_train[i] = [vocabulaire[j] for j in X_train[i] if vocabulaire.has_key(j)]
    
for i in range(nb_test):
    X_test[i] = [vocabulaire[j] for j in X_test[i] if vocabulaire.has_key(j)]
    
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

Loading data...
157861 train sequences
157861 test sequences


In [9]:
print(X_train[101])
print(X_test[101])
print(Y_train[101])
print(Y_test[101])

[2528, 8245, 45156, 2853]
[35774, 42638, 5746, 28406, 17864, 36413, 46989]
0
0


In [10]:
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen = maxlen)
X_test = sequence.pad_sequences(X_test, maxlen = maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

Pad sequences (samples x time)
X_train shape: (157861, 40)
X_test shape: (157861, 40)


In [11]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
h=model.fit(X_train, Y_train, batch_size = batch_size, nb_epoch = 15,
          validation_data = (X_test, Y_test))
score, acc = model.evaluate(X_test, Y_test, 
                            batch_size = batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Build model...
Train...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 157861 samples, validate on 157861 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 0.617429086997
Test accuracy: 0.772001951112


In [12]:
print(h.history)

{'acc': [0.72174254564366269, 0.7769240027543719, 0.79322315202178695, 0.80646264749523255, 0.81656013836806196, 0.82634089481407635, 0.83505108926375082, 0.84426805861494147, 0.84991859929443403, 0.85685508134337729, 0.86077625250216905, 0.86520419863270615, 0.86999322188130035, 0.87248908853022822, 0.87687269180074645], 'loss': [0.54238053578214507, 0.4708819278992678, 0.44065636519913437, 0.41676526371464645, 0.39617827669489381, 0.37827069717582479, 0.36138953994485379, 0.34402986505157679, 0.33102240165552932, 0.31817650576988971, 0.30764534471742749, 0.29878960991020392, 0.29003809382852352, 0.28330515286311175, 0.27579916351177308], 'val_acc': [0.78259988216538523, 0.78731922387869868, 0.78654639205947363, 0.78728121575763022, 0.78630567398957896, 0.78334104053415077, 0.78441793729775933, 0.7792108246992876, 0.77865970698797027, 0.77759547955991604, 0.77511861703986384, 0.7725910769767238, 0.77259741166960982, 0.76955042730332346, 0.77200195111224401], 'val_loss': [0.46819703915