In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
import csv

# Load the tweet CSV into `corpus` as [text, label] pairs.
# Column 5 of each row is used as the text and column 0 as the raw label code:
# a raw code of '0' becomes label 0, every other code becomes label 1.
# newline='' is what the csv module documents for file objects handed to
# csv.reader, so quoted fields that contain line breaks are parsed correctly.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm the file's encoding and set it explicitly if needed.
with open("./training_cleaned.csv", newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    corpus = [[row[5], 0 if row[0] == '0' else 1] for row in reader]

# Kept as its own name because a later cell prints it alongside len(corpus).
num_sentences = len(corpus)

In [3]:
# Display the first [text, label] pair as a quick sanity check of the load step.
corpus[0]

["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D",
 0]

In [4]:
# Row count recorded during loading, then the actual length of the list; the two should agree.
print(num_sentences, len(corpus), sep='\n')

1600000
1600000


In [5]:
# --- Text / model hyper-parameters ---
vocab_size = 10000      # first argument of the Embedding layers and row count of the GloVe matrix
max_length = 16         # every sequence is padded/truncated to this many tokens
embedding_dim = 100     # width of each embedding vector
trunc_type = 'post'     # `truncating` mode handed to pad_sequences
padding_type = 'post'   # intended `padding` mode for pad_sequences
oov_token = '<oov>'     # intended placeholder token for out-of-vocabulary words

# --- Data sizing ---
data_fraction = 0.1     # share of the full corpus that is actually used
data_size = int(len(corpus) * data_fraction)
split = 0.9             # share of `data_size` used for training; the rest is validation

In [6]:
import random

# Draw the working subset with a dedicated, seeded generator so that
# Restart & Run All always selects the same `data_size` examples.
# Random.sample returns a new list, so `corpus` keeps its loaded order and
# re-running this cell yields the same result every time.
sample_rng = random.Random(42)
sampled_rows = sample_rng.sample(corpus, data_size)

# Split the sampled [text, label] pairs into two parallel lists.
sentences = [row[0] for row in sampled_rows]
labels = [row[1] for row in sampled_rows]

In [7]:
# Show the first sampled sentence followed by its label.
print(sentences[0], labels[0], sep='\n')

@peterinkal Are you going to the play tonight?  I think i'm going on Saturday but only if you tell me its good 
1


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The first `split` share of the sampled data is used for training,
# the remainder for validation.
training_split = int(data_size * split)
train_sentences = sentences[:training_split]
train_labels = labels[:training_split]
val_sentences = sentences[training_split:]
val_labels = labels[training_split:]

# Cap the vocabulary at `vocab_size` so every emitted token id is a valid row
# of Embedding(vocab_size, ...), and map words outside the cap to `oov_token`
# instead of silently dropping them. The tokenizer is fitted on training text
# only, so validation text does not influence the vocabulary.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(train_sentences)

# word_index still lists every word seen during fitting; consumers must
# ignore entries whose index is >= vocab_size.
word_index = tokenizer.word_index

# Pad/truncate both splits identically, honouring the configured
# `padding_type` and `trunc_type`.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
padded_train_sequences = pad_sequences(train_sequences, maxlen=max_length,
                                       padding=padding_type, truncating=trunc_type)

val_sequences = tokenizer.texts_to_sequences(val_sentences)
padded_val_sequences = pad_sequences(val_sequences, maxlen=max_length,
                                     padding=padding_type, truncating=trunc_type)

# Convert the label lists to NumPy arrays before they are passed to model.fit.
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)

In [9]:
# Shapes of the padded inputs and their label vectors: training pair first, then validation pair.
for array in (padded_train_sequences, train_labels, padded_val_sequences, val_labels):
    print(array.shape)

(144000, 16)
(144000,)
(16000, 16)
(16000,)


In [10]:
# Baseline network: an embedding layer learned from scratch, then
# Dropout -> Conv1D -> MaxPooling1D -> GRU and a single sigmoid output unit.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Conv1D(64, 5, activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(4))
model.add(tf.keras.layers.GRU(64))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 16, 100)           1000000   
                                                                 
 dropout (Dropout)           (None, 16, 100)           0         
                                                                 
 conv1d (Conv1D)             (None, 12, 64)            32064     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 3, 64)            0         
 )                                                               
                                                                 
 gru (GRU)                   (None, 64)                24960     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                        

In [11]:
# Labels are 0/1 and the output is a single sigmoid unit, hence binary cross-entropy.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
from tensorflow.keras.callbacks import EarlyStopping

# Watch validation accuracy: an epoch only counts as an improvement if it
# gains at least 0.01, training stops after 3 epochs without one, and the
# best weights seen are restored afterwards.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.01,
    patience=3,
    restore_best_weights=True,
)

In [13]:
# Train the baseline model for at most 50 epochs. Keras documents `callbacks`
# as a list of callback instances, so the early-stopping callback is wrapped
# in a list rather than passed bare.
history = model.fit(
    padded_train_sequences,
    train_labels,
    validation_data=(padded_val_sequences, val_labels),
    epochs=50,
    callbacks=[early_stopping],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50


In [14]:
# Persist the trained baseline model to 'model.h5' in the working directory.
model.save('model.h5')

In [21]:
def create_glove_embeddings(file_path):
    """Build an embedding matrix for the tokenizer vocabulary from a GloVe text file.

    Every non-blank line of the file is split on whitespace: the first item is
    the token, the remaining items are its vector components. The function
    reads the notebook-level names `word_index`, `vocab_size` and
    `embedding_dim`.

    Args:
        file_path: Path to the GloVe vectors file (read as UTF-8).

    Returns:
        A NumPy array of shape (vocab_size, embedding_dim). Row ``i`` holds
        the vector of the word whose index in `word_index` is ``i``; rows for
        words that are absent from the file stay all-zero, and words whose
        index is >= vocab_size are skipped.
    """
    embeddings_index = {}
    # The context manager closes the file even if a line fails to parse.
    with open(file_path, encoding='utf8') as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # A blank line (e.g. a trailing newline) carries no vector.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    print('Found {} word vectors.'.format(len(embeddings_index)))

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        # Indices at or beyond vocab_size have no row in the matrix.
        if i >= vocab_size:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    print('Embedding matrix shape:', embedding_matrix.shape)
    return embedding_matrix

# Build the matrix from the GloVe file in the working directory.
# NOTE(review): the file name suggests 100-dimensional vectors; the vector width must equal embedding_dim.
embeddings_matrix = create_glove_embeddings(file_path='glove.6B.100d.txt')

Found 400000 word vectors.
Embedding matrix shape: (10000, 100)


In [23]:
# Same Dropout -> Conv1D -> MaxPooling1D -> GRU -> Dense stack as the baseline,
# but the embedding layer is loaded with `embeddings_matrix` and frozen
# (trainable=False).
model2 = tf.keras.models.Sequential()
model2.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length,
                                     weights=[embeddings_matrix], trainable=False))
model2.add(tf.keras.layers.Dropout(0.2))
model2.add(tf.keras.layers.Conv1D(64, 5, activation='relu'))
model2.add(tf.keras.layers.MaxPooling1D(4))
model2.add(tf.keras.layers.GRU(64))
model2.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 16, 100)           1000000   
                                                                 
 dropout_2 (Dropout)         (None, 16, 100)           0         
                                                                 
 conv1d_2 (Conv1D)           (None, 12, 64)            32064     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 3, 64)            0         
 1D)                                                             
                                                                 
 gru_2 (GRU)                 (None, 64)                24960     
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                      

In [24]:
# Same optimizer, loss and metric as the baseline model.
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
# Train the GloVe-initialised model on the same data for at most 50 epochs.
# Keras documents `callbacks` as a list of callback instances, so the
# early-stopping callback is wrapped in a list rather than passed bare.
history2 = model2.fit(
    padded_train_sequences,
    train_labels,
    validation_data=(padded_val_sequences, val_labels),
    epochs=50,
    callbacks=[early_stopping],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50


In [27]:
# Persist the trained GloVe-initialised model to 'model2.h5' in the working directory.
model2.save('model2.h5')