In [1]:
import json
import csv
import random
import numpy as np

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

## Load data

In [2]:
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/training_cleaned.csv \
    -O /tmp/training_cleaned.csv

--2021-01-25 23:36:35--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/training_cleaned.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 34.64.4.80, 34.64.4.16, 2404:f340:10:1802::2010, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|34.64.4.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 238942690 (228M) [application/octet-stream]
Saving to: ‘/tmp/training_cleaned.csv’


2021-01-25 23:37:56 (2.88 MB/s) - ‘/tmp/training_cleaned.csv’ saved [238942690/238942690]



In [4]:
# Parse the downloaded CSV into `corpus`, a list of [text, label] pairs.
# Column 5 of each row is taken as the text and column 0 as the raw label;
# a raw label of '0' becomes 0 and every other value becomes 1.
#
# newline="" is what the csv module requires of the file object it reads,
# so that line breaks embedded inside quoted fields are parsed correctly.
with open("/tmp/training_cleaned.csv", newline="") as csvfile:
    reader = csv.reader(csvfile, delimiter=",")
    corpus = [[row[5], 0 if row[0] == '0' else 1] for row in reader]

# Exactly one corpus entry is produced per CSV row, so the row count is its length.
num_sentences = len(corpus)

In [5]:
# Sanity check: number of CSV rows that were read into the corpus.
print(num_sentences)

1600000


In [6]:
# Sanity check: corpus length, which should match the row count above.
print(len(corpus))

1600000


In [7]:
# Inspect the corpus entry at index 1: a [text, label] pair.
print(corpus[1])

["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!", 0]


In [8]:
# Inspect the corpus entry at index 0: a [text, label] pair.
print(corpus[0])

["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D", 0]


In [9]:
# Inspect the corpus entry at index 2: a [text, label] pair.
print(corpus[2])

['@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds', 0]


In [10]:
# --- Embedding / sequence settings ---
embedding_dim = 100        # width of each word vector (columns of the embedding matrix)
max_length = 16            # every sequence is padded/truncated to this many tokens
trunc_type = 'post'        # `truncating` argument handed to pad_sequences
padding_type = 'post'      # `padding` argument handed to pad_sequences
oov_tok = "<OOV>"          # placeholder string for out-of-vocabulary words

# --- Dataset sizing ---
training_size = 160000     # number of corpus entries drawn for this experiment
test_portion = 0.1         # fraction of those entries held out as the test split

In [12]:
# Draw `training_size` entries uniformly at random from the corpus, in random order.
#
# A dedicated, seeded generator makes the drawn subset reproducible across
# kernel restarts. Sampling (rather than shuffling `corpus` in place) leaves
# `corpus` untouched, so re-running this cell always produces the same
# `sentences` and `labels`.
sampled = random.Random(42).sample(corpus, training_size)

# Each corpus entry is a [text, label] pair; split them into parallel lists.
sentences = [text for text, _label in sampled]
labels = [label for _text, label in sampled]

## Tokenize
### Training data

In [13]:
# Build the vocabulary from `sentences` and record its size.
# NOTE(review): the tokenizer is fitted before the train/test split, so the
# vocabulary also covers the sentences that later become the test set.
# NOTE(review): `oov_tok` is defined in the config cell but is not passed to
# Tokenizer here — confirm whether that is intentional.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Number of distinct entries in the fitted word index.
vocab_size = len(word_index)
print(vocab_size)

138502


In [14]:
# Map every sentence to its list of integer word ids.
sequences = tokenizer.texts_to_sequences(sentences)

# Pad or truncate every id list to exactly `max_length` entries.
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

### Train/test split

In [15]:
# Index that separates the held-out test rows from the training rows.
split = int(test_portion * training_size)

# The first `split` rows form the test set; rows from `split` up to
# `training_size` form the training set. Sequences and labels are cut at
# the same index so they stay aligned.
test_seq, training_seq = padded[:split], padded[split:training_size]
test_labels, training_labels = labels[:split], labels[split:training_size]

In [16]:
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
    -O /tmp/glove.6B.100d.txt
embeddings_index = {};
with open('/tmp/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

embeddings_matrix = np.zeros((vocab_size+1, embedding_dim));
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word);
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector;

--2021-01-25 23:46:28--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 34.64.4.80, 34.64.4.16, 2404:f340:10:1802::2010, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|34.64.4.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 347116733 (331M) [text/plain]
Saving to: ‘/tmp/glove.6B.100d.txt’


2021-01-25 23:48:26 (2.83 MB/s) - ‘/tmp/glove.6B.100d.txt’ saved [347116733/347116733]



In [17]:
# Sanity check: number of rows in the embedding matrix (vocab_size + 1).
print(len(embeddings_matrix))

138503


In [21]:
# Binary classifier: frozen pre-trained embeddings -> dropout -> 1D convolution
# -> max pooling -> LSTM -> single sigmoid output.
model = tf.keras.models.Sequential([
    # Embedding table initialised from `embeddings_matrix` and kept fixed
    # during training; its size matches the matrix's vocab_size + 1 rows.
    tf.keras.layers.Embedding(
        vocab_size+1,
        embedding_dim,
        input_length=max_length,
        weights=[embeddings_matrix],
        trainable=False,
    ),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    # One sigmoid unit: output is a value in (0, 1) for the binary label.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [22]:
# Configure training: Adam optimizer, binary cross-entropy loss (matching the
# single sigmoid output and 0/1 labels), and accuracy as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [23]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 16, 100)           13850300  
_________________________________________________________________
dropout (Dropout)            (None, 16, 100)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 12, 64)            32064     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 3, 64)             0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                33024     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 13,915,453
Trainable params: 65,153
Non-trainable params: 13,850,300
_______________________________________

In [24]:
# Number of passes over the training data requested from model.fit.
num_epochs = 50

# Convert both splits (sequences and labels) to NumPy arrays for model.fit.
training_padded, training_labels = np.array(training_seq), np.array(training_labels)
testing_padded, testing_labels = np.array(test_seq), np.array(test_labels)

In [25]:
# Train with early stopping. `num_epochs` stays as the upper bound, but
# training halts once validation loss has not improved for `patience` epochs
# and the best weights seen are restored, so the run does not have to be
# interrupted by hand when validation loss stops improving.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# `history` records the per-epoch loss and accuracy for both splits.
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    callbacks=[early_stopping],
    verbose=2,
)


Epoch 1/50
4500/4500 - 18s - loss: 0.5668 - accuracy: 0.7008 - val_loss: 0.5252 - val_accuracy: 0.7346
Epoch 2/50
4500/4500 - 18s - loss: 0.5275 - accuracy: 0.7315 - val_loss: 0.5113 - val_accuracy: 0.7437
Epoch 3/50
4500/4500 - 16s - loss: 0.5116 - accuracy: 0.7428 - val_loss: 0.5048 - val_accuracy: 0.7460
Epoch 4/50
4500/4500 - 18s - loss: 0.5009 - accuracy: 0.7506 - val_loss: 0.5074 - val_accuracy: 0.7480
Epoch 5/50


KeyboardInterrupt: 