In [1]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import tensorflow as tf
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import numpy as np

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
df = pd.read_csv('datasets/reddit_train.csv', encoding = 'latin-1')
df.shape

(21336, 4)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,X,BODY,REMOVED
0,8756,8877,Always be wary of news articles that cite unpu...,0
1,7330,7432,The problem I have with this is that the artic...,0
2,15711,15944,"This is indicative of a typical power law, and...",0
3,1604,1625,This doesn't make sense. Chess obviously trans...,0
4,13327,13520,1. I dispute that gene engineering is burdenso...,0


In [3]:
X_train = df.loc[:10667, 'BODY'].values
y_train = df.loc[:10667, 'REMOVED'].values
X_test = df.loc[10668:, 'BODY'].values
y_test = df.loc[10668:, 'REMOVED'].values

In [4]:
tokenizer_obj = Tokenizer()
total_comments = X_train + X_test
tokenizer_obj.fit_on_texts(total_comments)

max_length = max([len(s.split()) for s in total_comments])

vocab_size = len(tokenizer_obj.word_index) + 1

In [5]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

comment_lines = list()
lines = df['BODY'].values.tolist()

for line in lines:
    tokens = word_tokenize(line)
    tokens = [w.lower() for w in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    comment_lines.append(words)
len(comment_lines)

21336

In [27]:
comment_lines[1]

['problem',
 'article',
 'appears',
 'credit',
 'plain',
 'packets',
 'decline',
 'smoking',
 'proportion',
 'daily',
 'smokers',
 'australia',
 'dropped',
 'per',
 'cent',
 'record',
 'decline',
 'words',
 'drop',
 'percent',
 'yet',
 'united',
 'states',
 'plain',
 'packaging',
 'rate',
 'dropped',
 'percent',
 'granted',
 'much',
 'australia',
 'decline',
 'indicates',
 'credit',
 'australia',
 'entire',
 'decline',
 'plain',
 'packaging',
 'since',
 'declines',
 'happened',
 'places',
 'without',
 'plain',
 'packaging']

In [6]:
import multiprocessing

from gensim.models import Word2Vec

In [7]:
cores = multiprocessing.cpu_count()

In [59]:
w2v_model = Word2Vec(min_count=1,
                     size = 100,
                     window=5,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007,
                     workers=cores-1)

In [60]:
t = time()

w2v_model.build_vocab(comment_lines, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 15:24:43: collecting all words and their counts
INFO - 15:24:43: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 15:24:43: PROGRESS: at sentence #10000, processed 274410 words, keeping 26815 word types
INFO - 15:24:43: PROGRESS: at sentence #20000, processed 556969 words, keeping 38475 word types
INFO - 15:24:43: collected 39716 word types from a corpus of 592851 raw words and 21336 sentences
INFO - 15:24:43: Loading a fresh vocabulary
INFO - 15:24:43: effective_min_count=1 retains 39716 unique words (100% of original 39716, drops 0)
INFO - 15:24:43: effective_min_count=1 leaves 592851 word corpus (100% of original 592851, drops 0)
INFO - 15:24:43: deleting the raw counts dictionary of 39716 items
INFO - 15:24:43: sample=6e-05 downsamples 1194 most-common words
INFO - 15:24:43: downsampling leaves estimated 392741 word corpus (66.2% of prior 592851)
INFO - 15:24:43: estimated required memory for 39716 words and 300 dimensions: 115176400 bytes
INFO - 15:2

Time to build vocab: 0.12 mins


In [61]:
t = time()

w2v_model.train(comment_lines, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 15:25:15: training model with 11 workers on 39716 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=5 window=5
INFO - 15:25:16: worker thread finished; awaiting finish of 10 more threads
INFO - 15:25:16: worker thread finished; awaiting finish of 9 more threads
INFO - 15:25:16: worker thread finished; awaiting finish of 8 more threads
INFO - 15:25:16: worker thread finished; awaiting finish of 7 more threads
INFO - 15:25:16: worker thread finished; awaiting finish of 6 more threads
INFO - 15:25:16: worker thread finished; awaiting finish of 5 more threads
INFO - 15:25:16: worker thread finished; awaiting finish of 4 more threads
INFO - 15:25:16: worker thread finished; awaiting finish of 3 more threads
INFO - 15:25:16: worker thread finished; awaiting finish of 2 more threads
INFO - 15:25:16: worker thread finished; awaiting finish of 1 more threads
INFO - 15:25:16: worker thread finished; awaiting finish of 0 more threads
INFO - 15:25:16: EPOCH - 1 : training o

INFO - 15:25:19: worker thread finished; awaiting finish of 3 more threads
INFO - 15:25:19: worker thread finished; awaiting finish of 2 more threads
INFO - 15:25:19: worker thread finished; awaiting finish of 1 more threads
INFO - 15:25:19: worker thread finished; awaiting finish of 0 more threads
INFO - 15:25:19: EPOCH - 9 : training on 592851 raw words (393188 effective words) took 0.4s, 1045334 effective words/s
INFO - 15:25:19: worker thread finished; awaiting finish of 10 more threads
INFO - 15:25:19: worker thread finished; awaiting finish of 9 more threads
INFO - 15:25:19: worker thread finished; awaiting finish of 8 more threads
INFO - 15:25:19: worker thread finished; awaiting finish of 7 more threads
INFO - 15:25:19: worker thread finished; awaiting finish of 6 more threads
INFO - 15:25:19: worker thread finished; awaiting finish of 5 more threads
INFO - 15:25:19: worker thread finished; awaiting finish of 4 more threads
INFO - 15:25:19: worker thread finished; awaiting fini

INFO - 15:25:22: worker thread finished; awaiting finish of 7 more threads
INFO - 15:25:22: worker thread finished; awaiting finish of 6 more threads
INFO - 15:25:22: worker thread finished; awaiting finish of 5 more threads
INFO - 15:25:22: worker thread finished; awaiting finish of 4 more threads
INFO - 15:25:22: worker thread finished; awaiting finish of 3 more threads
INFO - 15:25:22: worker thread finished; awaiting finish of 2 more threads
INFO - 15:25:22: worker thread finished; awaiting finish of 1 more threads
INFO - 15:25:22: worker thread finished; awaiting finish of 0 more threads
INFO - 15:25:22: EPOCH - 18 : training on 592851 raw words (392983 effective words) took 0.3s, 1129833 effective words/s
INFO - 15:25:22: worker thread finished; awaiting finish of 10 more threads
INFO - 15:25:22: worker thread finished; awaiting finish of 9 more threads
INFO - 15:25:22: worker thread finished; awaiting finish of 8 more threads
INFO - 15:25:22: worker thread finished; awaiting fin

INFO - 15:25:25: worker thread finished; awaiting finish of 10 more threads
INFO - 15:25:25: worker thread finished; awaiting finish of 9 more threads
INFO - 15:25:25: worker thread finished; awaiting finish of 8 more threads
INFO - 15:25:25: worker thread finished; awaiting finish of 7 more threads
INFO - 15:25:25: worker thread finished; awaiting finish of 6 more threads
INFO - 15:25:25: worker thread finished; awaiting finish of 5 more threads
INFO - 15:25:25: worker thread finished; awaiting finish of 4 more threads
INFO - 15:25:25: worker thread finished; awaiting finish of 3 more threads
INFO - 15:25:25: worker thread finished; awaiting finish of 2 more threads
INFO - 15:25:25: worker thread finished; awaiting finish of 1 more threads
INFO - 15:25:25: worker thread finished; awaiting finish of 0 more threads
INFO - 15:25:25: EPOCH - 27 : training on 592851 raw words (392619 effective words) took 0.4s, 1117429 effective words/s
INFO - 15:25:26: worker thread finished; awaiting fin

Time to train the model: 0.19 mins


In [58]:
words = list(w2v_model.wv.vocab)
print('Vocabulary size: %d' % len(words))

Vocabulary size: 39716


In [68]:
w2v_model.wv.most_similar('banned')

[('cafes', 0.8643336296081543),
 ('subs', 0.8506642580032349),
 ('ban', 0.8423351049423218),
 ('visa', 0.8318437337875366),
 ('welcome', 0.8228675127029419),
 ('puffed', 0.8227126598358154),
 ('scihub', 0.8220528960227966),
 ('devolping', 0.8204940557479858),
 ('admins', 0.8190491199493408),
 ('cp', 0.815090537071228)]

In [88]:
filename = 'reddit_embedding_word2vec.txt'
w2v_model.wv.save_word2vec_format(filename, binary = False)

INFO - 15:48:37: storing 39716x300 projection weights into reddit_embedding_word2vec.txt


In [90]:
import os
embeddings_index = {}
f = open(os.path.join('', 'reddit_embedding_word2vec.txt'), encoding = "utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

In [84]:
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(comment_lines)
sequences = tokenizer_obj.texts_to_sequences(comment_lines)

word_index = tokenizer_obj.word_index
print('Found %s unique tokens.' % len(word_index))

comment_pad = pad_sequences(sequences, maxlen = max_length)
removed = df['REMOVED'].values
print('Shape of comment tensor:', comment_pad.shape)
print('Shape of removed tensor:', removed.shape)

Found 39716 unique tokens.
Shape of comment tensor: (21336, 1558)
Shape of removed tensor: (21336,)


In [92]:
num_words = len(word_index) + 1
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [93]:
print(num_words)

39717


In [95]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

model = Sequential()
embedding_layer = Embedding(num_words,
                           EMBEDDING_DIM,
                           embeddings_initializer = Constant(embedding_matrix),
                           input_length = max_length,
                           trainable = False)
model.add(embedding_layer)
model.add(GRU(units = 32, dropout = 0.2, recurrent_dropout = 0.2))
model.add(Dense(1, activation = 'sigmoid'))

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [98]:
VALIDATION_SPLIT = 0.2
indices = np.arange(comment_pad.shape[0])
np.random.shuffle(indices)
comment_pad = comment_pad[indices]
removed = removed[indices]
num_validation_samples = int(VALIDATION_SPLIT * comment_pad.shape[0])

X_train_pad = comment_pad[:-num_validation_samples]
y_train = removed[:-num_validation_samples]
X_test_pad = comment_pad[-num_validation_samples:]
y_test = removed[-num_validation_samples:]

print('Shape of X_train_pad tensor:', X_train_pad.shape)
print('Shape of y-train tensor:', y_train.shape)

print('Shape of X_test_pad tensor:', X_test_pad.shape)
print('Shape of y-test tensor:', y_test.shape)

Shape of X_train_pad tensor: (17069, 1558)
Shape of y-train tensor: (17069,)
Shape of X_test_pad tensor: (4267, 1558)
Shape of y-test tensor: (4267,)


In [100]:
model.fit(X_train_pad, y_train, batch_size = 128, epochs = 20, validation_data = (X_test_pad, y_test), verbose = 2)

Epoch 1/20
134/134 - 691s - loss: 0.5249 - accuracy: 0.7237 - val_loss: 0.5129 - val_accuracy: 0.7258
Epoch 2/20
134/134 - 755s - loss: 0.5173 - accuracy: 0.7288 - val_loss: 0.5114 - val_accuracy: 0.7251
Epoch 3/20
134/134 - 731s - loss: 0.5113 - accuracy: 0.7339 - val_loss: 0.5090 - val_accuracy: 0.7286
Epoch 4/20
134/134 - 755s - loss: 0.5108 - accuracy: 0.7331 - val_loss: 0.5075 - val_accuracy: 0.7277
Epoch 5/20
134/134 - 1361s - loss: 0.5047 - accuracy: 0.7393 - val_loss: 0.5065 - val_accuracy: 0.7246
Epoch 6/20
134/134 - 770s - loss: 0.5023 - accuracy: 0.7406 - val_loss: 0.5117 - val_accuracy: 0.7274
Epoch 7/20
134/134 - 775s - loss: 0.4987 - accuracy: 0.7427 - val_loss: 0.5129 - val_accuracy: 0.7263
Epoch 8/20
134/134 - 771s - loss: 0.4977 - accuracy: 0.7426 - val_loss: 0.5138 - val_accuracy: 0.7251
Epoch 9/20
134/134 - 758s - loss: 0.4936 - accuracy: 0.7489 - val_loss: 0.5104 - val_accuracy: 0.7291
Epoch 10/20
134/134 - 741s - loss: 0.4922 - accuracy: 0.7493 - val_loss: 0.5116 -

<tensorflow.python.keras.callbacks.History at 0x1c907923a08>

In [8]:
w2v_model2 = Word2Vec(min_count=1,
                     size = 100,
                     window=5,
                     sample=6e-5, 
                     alpha=0.07, 
                     min_alpha=0.0007,
                     workers=cores-1)

In [9]:
t = time()

w2v_model2.build_vocab(comment_lines, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 09:54:53: collecting all words and their counts
INFO - 09:54:53: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 09:54:53: PROGRESS: at sentence #10000, processed 274410 words, keeping 26815 word types
INFO - 09:54:53: PROGRESS: at sentence #20000, processed 556969 words, keeping 38475 word types
INFO - 09:54:53: collected 39716 word types from a corpus of 592851 raw words and 21336 sentences
INFO - 09:54:53: Loading a fresh vocabulary
INFO - 09:54:53: effective_min_count=1 retains 39716 unique words (100% of original 39716, drops 0)
INFO - 09:54:53: effective_min_count=1 leaves 592851 word corpus (100% of original 592851, drops 0)
INFO - 09:54:53: deleting the raw counts dictionary of 39716 items
INFO - 09:54:53: sample=6e-05 downsamples 1194 most-common words
INFO - 09:54:53: downsampling leaves estimated 392741 word corpus (66.2% of prior 592851)
INFO - 09:54:53: estimated required memory for 39716 words and 100 dimensions: 51630800 bytes
INFO - 09:54

Time to build vocab: 0.12 mins


In [13]:
t = time()

w2v_model2.train(comment_lines, total_examples=w2v_model2.corpus_count, epochs=50, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 09:55:44: training model with 11 workers on 39716 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=5 window=5
INFO - 09:55:45: worker thread finished; awaiting finish of 10 more threads
INFO - 09:55:45: worker thread finished; awaiting finish of 9 more threads
INFO - 09:55:45: worker thread finished; awaiting finish of 8 more threads
INFO - 09:55:45: worker thread finished; awaiting finish of 7 more threads
INFO - 09:55:45: worker thread finished; awaiting finish of 6 more threads
INFO - 09:55:45: worker thread finished; awaiting finish of 5 more threads
INFO - 09:55:45: worker thread finished; awaiting finish of 4 more threads
INFO - 09:55:45: worker thread finished; awaiting finish of 3 more threads
INFO - 09:55:45: worker thread finished; awaiting finish of 2 more threads
INFO - 09:55:45: worker thread finished; awaiting finish of 1 more threads
INFO - 09:55:45: worker thread finished; awaiting finish of 0 more threads
INFO - 09:55:45: EPOCH - 1 : training o

INFO - 09:55:47: worker thread finished; awaiting finish of 4 more threads
INFO - 09:55:47: worker thread finished; awaiting finish of 3 more threads
INFO - 09:55:47: worker thread finished; awaiting finish of 2 more threads
INFO - 09:55:47: worker thread finished; awaiting finish of 1 more threads
INFO - 09:55:47: worker thread finished; awaiting finish of 0 more threads
INFO - 09:55:47: EPOCH - 9 : training on 592851 raw words (392719 effective words) took 0.3s, 1451842 effective words/s
INFO - 09:55:47: worker thread finished; awaiting finish of 10 more threads
INFO - 09:55:47: worker thread finished; awaiting finish of 9 more threads
INFO - 09:55:47: worker thread finished; awaiting finish of 8 more threads
INFO - 09:55:47: worker thread finished; awaiting finish of 7 more threads
INFO - 09:55:47: worker thread finished; awaiting finish of 6 more threads
INFO - 09:55:47: worker thread finished; awaiting finish of 5 more threads
INFO - 09:55:47: worker thread finished; awaiting fini

INFO - 09:55:50: worker thread finished; awaiting finish of 8 more threads
INFO - 09:55:50: worker thread finished; awaiting finish of 7 more threads
INFO - 09:55:50: worker thread finished; awaiting finish of 6 more threads
INFO - 09:55:50: worker thread finished; awaiting finish of 5 more threads
INFO - 09:55:50: worker thread finished; awaiting finish of 4 more threads
INFO - 09:55:50: worker thread finished; awaiting finish of 3 more threads
INFO - 09:55:50: worker thread finished; awaiting finish of 2 more threads
INFO - 09:55:50: worker thread finished; awaiting finish of 1 more threads
INFO - 09:55:50: worker thread finished; awaiting finish of 0 more threads
INFO - 09:55:50: EPOCH - 18 : training on 592851 raw words (393340 effective words) took 0.3s, 1201302 effective words/s
INFO - 09:55:50: worker thread finished; awaiting finish of 10 more threads
INFO - 09:55:50: worker thread finished; awaiting finish of 9 more threads
INFO - 09:55:50: worker thread finished; awaiting fin

INFO - 09:55:52: EPOCH - 26 : training on 592851 raw words (392363 effective words) took 0.3s, 1359521 effective words/s
INFO - 09:55:53: worker thread finished; awaiting finish of 10 more threads
INFO - 09:55:53: worker thread finished; awaiting finish of 9 more threads
INFO - 09:55:53: worker thread finished; awaiting finish of 8 more threads
INFO - 09:55:53: worker thread finished; awaiting finish of 7 more threads
INFO - 09:55:53: worker thread finished; awaiting finish of 6 more threads
INFO - 09:55:53: worker thread finished; awaiting finish of 5 more threads
INFO - 09:55:53: worker thread finished; awaiting finish of 4 more threads
INFO - 09:55:53: worker thread finished; awaiting finish of 3 more threads
INFO - 09:55:53: worker thread finished; awaiting finish of 2 more threads
INFO - 09:55:53: worker thread finished; awaiting finish of 1 more threads
INFO - 09:55:53: worker thread finished; awaiting finish of 0 more threads
INFO - 09:55:53: EPOCH - 27 : training on 592851 raw 

INFO - 09:55:55: worker thread finished; awaiting finish of 3 more threads
INFO - 09:55:55: worker thread finished; awaiting finish of 2 more threads
INFO - 09:55:55: worker thread finished; awaiting finish of 1 more threads
INFO - 09:55:55: worker thread finished; awaiting finish of 0 more threads
INFO - 09:55:55: EPOCH - 35 : training on 592851 raw words (392472 effective words) took 0.3s, 1391519 effective words/s
INFO - 09:55:55: worker thread finished; awaiting finish of 10 more threads
INFO - 09:55:55: worker thread finished; awaiting finish of 9 more threads
INFO - 09:55:55: worker thread finished; awaiting finish of 8 more threads
INFO - 09:55:55: worker thread finished; awaiting finish of 7 more threads
INFO - 09:55:55: worker thread finished; awaiting finish of 6 more threads
INFO - 09:55:55: worker thread finished; awaiting finish of 5 more threads
INFO - 09:55:55: worker thread finished; awaiting finish of 4 more threads
INFO - 09:55:55: worker thread finished; awaiting fin

INFO - 09:55:58: worker thread finished; awaiting finish of 7 more threads
INFO - 09:55:58: worker thread finished; awaiting finish of 6 more threads
INFO - 09:55:58: worker thread finished; awaiting finish of 5 more threads
INFO - 09:55:58: worker thread finished; awaiting finish of 4 more threads
INFO - 09:55:58: worker thread finished; awaiting finish of 3 more threads
INFO - 09:55:58: worker thread finished; awaiting finish of 2 more threads
INFO - 09:55:58: worker thread finished; awaiting finish of 1 more threads
INFO - 09:55:58: worker thread finished; awaiting finish of 0 more threads
INFO - 09:55:58: EPOCH - 44 : training on 592851 raw words (392453 effective words) took 0.3s, 1216487 effective words/s
INFO - 09:55:58: worker thread finished; awaiting finish of 10 more threads
INFO - 09:55:58: worker thread finished; awaiting finish of 9 more threads
INFO - 09:55:58: worker thread finished; awaiting finish of 8 more threads
INFO - 09:55:58: worker thread finished; awaiting fin

Time to train the model: 0.26 mins


In [14]:
filename = 'reddit_embedding_word2vec2.txt'
w2v_model2.wv.save_word2vec_format(filename, binary = False)

INFO - 09:56:12: storing 39716x100 projection weights into reddit_embedding_word2vec2.txt


In [15]:
import os
embeddings_index2 = {}
f = open(os.path.join('', 'reddit_embedding_word2vec2.txt'), encoding = "utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index2[word] = coefs
f.close()

In [16]:
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(comment_lines)
sequences = tokenizer_obj.texts_to_sequences(comment_lines)

word_index = tokenizer_obj.word_index
print('Found %s unique tokens.' % len(word_index))

comment_pad = pad_sequences(sequences, maxlen = max_length)
removed = df['REMOVED'].values
print('Shape of comment tensor:', comment_pad.shape)
print('Shape of removed tensor:', removed.shape)

Found 39716 unique tokens.
Shape of comment tensor: (21336, 1558)
Shape of removed tensor: (21336,)


In [17]:
num_words = len(word_index) + 1
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index2.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [18]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

model = Sequential()
embedding_layer = Embedding(num_words,
                           EMBEDDING_DIM,
                           embeddings_initializer = Constant(embedding_matrix),
                           input_length = max_length,
                           trainable = False)
model.add(embedding_layer)
model.add(GRU(units = 32, dropout = 0.2, recurrent_dropout = 0.2))
model.add(Dense(1, activation = 'sigmoid'))

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [19]:
VALIDATION_SPLIT = 0.2
indices = np.arange(comment_pad.shape[0])
np.random.shuffle(indices)
comment_pad = comment_pad[indices]
removed = removed[indices]
num_validation_samples = int(VALIDATION_SPLIT * comment_pad.shape[0])

X_train_pad = comment_pad[:-num_validation_samples]
y_train = removed[:-num_validation_samples]
X_test_pad = comment_pad[-num_validation_samples:]
y_test = removed[-num_validation_samples:]

print('Shape of X_train_pad tensor:', X_train_pad.shape)
print('Shape of y-train tensor:', y_train.shape)

print('Shape of X_test_pad tensor:', X_test_pad.shape)
print('Shape of y-test tensor:', y_test.shape)

Shape of X_train_pad tensor: (17069, 1558)
Shape of y-train tensor: (17069,)
Shape of X_test_pad tensor: (4267, 1558)
Shape of y-test tensor: (4267,)


In [20]:
model.fit(X_train_pad, y_train, batch_size = 128, epochs = 15, validation_data = (X_test_pad, y_test), verbose = 2)

Epoch 1/15
134/134 - 364s - loss: 0.6085 - accuracy: 0.6669 - val_loss: 0.5651 - val_accuracy: 0.7033
Epoch 2/15
134/134 - 385s - loss: 0.5486 - accuracy: 0.7135 - val_loss: 0.5336 - val_accuracy: 0.7190
Epoch 3/15
134/134 - 376s - loss: 0.5248 - accuracy: 0.7265 - val_loss: 0.5286 - val_accuracy: 0.7221
Epoch 4/15
134/134 - 385s - loss: 0.5146 - accuracy: 0.7346 - val_loss: 0.5262 - val_accuracy: 0.7237
Epoch 5/15
134/134 - 385s - loss: 0.5521 - accuracy: 0.7152 - val_loss: 0.5416 - val_accuracy: 0.7174
Epoch 6/15
134/134 - 398s - loss: 0.5136 - accuracy: 0.7364 - val_loss: 0.5235 - val_accuracy: 0.7260
Epoch 7/15
134/134 - 398s - loss: 0.4996 - accuracy: 0.7465 - val_loss: 0.5225 - val_accuracy: 0.7246
Epoch 8/15
134/134 - 401s - loss: 0.4925 - accuracy: 0.7507 - val_loss: 0.5223 - val_accuracy: 0.7239
Epoch 9/15
134/134 - 411s - loss: 0.4895 - accuracy: 0.7524 - val_loss: 0.5233 - val_accuracy: 0.7300
Epoch 10/15
134/134 - 409s - loss: 0.4812 - accuracy: 0.7618 - val_loss: 0.5239 - 

<tensorflow.python.keras.callbacks.History at 0x2219bbe4048>

In [21]:
model.fit(X_train_pad, y_train, batch_size = 64, epochs = 20, validation_data = (X_test_pad, y_test), verbose = 1)

Epoch 1/20
 46/267 [====>.........................] - ETA: 10:43 - loss: 0.4354 - accuracy: 0.7840

KeyboardInterrupt: 