In [40]:
# Execute this prior to running test
import nltk
nltk.download('punkt')
nltk.download('stopwords')
 

[nltk_data] Downloading package punkt to /Users/mac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
PRECOMPILED_WV_FILEPATH = "./glove.twitter.27B.50d.txt"
TRAINING_OFFENSIVE_FILENAME = "./training_offensive.csv"
TRAINING_REGULAR_FILENAME = "./training_regular.csv"
TEST_OFFENSIVE_FILENAME = "./test_offensive.csv"
TEST_REGULAR_FILENAME = "./test_regular.csv"

#
MAX_SEQUENCE_LENGTH = 70
DIMENSIONS = 50

# Model related variables
FILTER_NUM = 32
KERNEL_SIZE = 10
VOCAB_SIZE = 100000



In [42]:
# for the four data sets 
# loop 0 to 35 and if in the sentence, replace it with the word's value from wordtovec
# if no word then put in 0 instead
import os
import pandas
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

TRAINING_OFFENSIVE_FILENAME = "./training_offensive.csv"
TRAINING_REGULAR_FILENAME = "./training_regular.csv"
TEST_OFFENSIVE_FILENAME = "./test_offensive.csv"
TEST_REGULAR_FILENAME = "./test_regular.csv"

off_train_dataframe = pandas.read_csv(TRAINING_OFFENSIVE_FILENAME)
off_test_dataframe = pandas.read_csv(TEST_OFFENSIVE_FILENAME)
reg_train_dataframe = pandas.read_csv(TRAINING_REGULAR_FILENAME)
reg_test_dataframe = pandas.read_csv(TEST_REGULAR_FILENAME)


In [43]:
stop_words = set(stopwords.words('english')) 
non_alphabet_pattern = re.compile("[^a-zA-Z]")
#NULL_VECTOR = fb_model.wv[""]

def get_cleaned_tokens(sentence):    
    word_tokens = word_tokenize(sentence) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = [re.sub(non_alphabet_pattern, "", w) for w in filtered_sentence]
    filtered_sentence = [w for w in filtered_sentence if w != ""]
    return filtered_sentence


In [44]:
all_train_tweets = off_train_dataframe['tweet']
all_train_tweets = all_train_tweets.append(reg_train_dataframe['tweet'])
all_train_tweets = [get_cleaned_tokens(tweet) for tweet in all_train_tweets]
all_train_labels = off_train_dataframe["offensive"]
all_train_labels = all_train_labels.append(reg_train_dataframe["offensive"])

all_test_tweets = off_test_dataframe['tweet']
all_test_tweets = all_test_tweets.append(reg_test_dataframe['tweet'])
all_test_tweets = [get_cleaned_tokens(tweet) for tweet in all_test_tweets]
all_test_labels = off_test_dataframe["offensive"]
all_test_labels = all_test_labels.append(reg_test_dataframe["offensive"])

assert len(all_train_tweets) == len(all_train_labels), f"{len(all_train_tweets)} != {len(all_train_labels)}"
assert len(all_test_tweets) == len(all_test_labels), f"{len(all_test_tweets)} != {len(all_test_labels)}"


In [34]:
# tokenize words 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(100000)
tokenizer.fit_on_texts(all_train_tweets)
sequences_train = tokenizer.texts_to_sequences(all_train_tweets)
sequences_test = tokenizer.texts_to_sequences(all_test_tweets)

print('Found %s unique tokens.' % len(tokenizer.word_index))


Found 33731 unique tokens.


In [35]:
# pad sequences
train_data = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)


In [36]:
embeddings_index = {}
import os
f = open(os.path.join(PRECOMPILED_WV_FILEPATH))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [37]:
from numpy import array
from numpy import asarray
from numpy import zeros
print('Loaded %s word vectors.' % len(embeddings_index))
# create a weight matrix for words in training docs
embedding_matrix = zeros((VOCAB_SIZE, 50)) #DIMENSIONS!
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    

Loaded 1193514 word vectors.


In [38]:
"""
MAIN MODEL
"""
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D

model = Sequential()
e = Embedding(VOCAB_SIZE, 50, weights=[embedding_matrix], 
              input_length=MAX_SEQUENCE_LENGTH, trainable=False)
model.add(e)
model.add(Conv1D(FILTER_NUM, KERNEL_SIZE, activation = 'relu'))
model.add(Flatten()) # must use Flatter or GlobalMax
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(train_data, all_train_labels, epochs=50, verbose=0)
# evaluate the model

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 70, 50)            5000000   
_________________________________________________________________
conv1d (Conv1D)              (None, 61, 32)            16032     
_________________________________________________________________
flatten (Flatten)            (None, 1952)              0         
_________________________________________________________________
dense (Dense)                (None, 10)                19530     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 5,035,573
Trainable params: 35,573
Non-trainable params: 5,000,000
_________________________________________________________________
None
Instructions for updating:
Use tf.cast instead.


<tensorflow.python.keras.callbacks.History at 0x14ccd8470>

In [39]:
loss, accuracy = model.evaluate(test_data, all_test_labels, verbose=0)
print("Loss: {}".format(loss))
print('Accuracy: %f' % (accuracy*100))
loss, accuracy = model.evaluate(train_data, all_train_labels, verbose=0)
print("Loss: {}".format(loss))
print('Accuracy: %f' % (accuracy*100))

Loss: 0.4772871879800131
Accuracy: 93.704599
Loss: 0.004486987130572844
Accuracy: 99.838603


In [324]:
def vectorize_sentence(tokenizer, sentence):
    cleaned_words = get_cleaned_tokens(sentence)
    tokenized_sentence = tokenizer.texts_to_sequences([cleaned_words])
    return pad_sequences(tokenized_sentence, maxlen=MAX_SEQUENCE_LENGTH)

test_sentence = """termite eats bullets"""
vectorized = vectorize_sentence(tokenizer, test_sentence)
model.predict_classes(vectorized)

array([[0]], dtype=int32)

In [191]:
# anything with yo seems to be heavily weighted towards offensive
# Garbage in, garbage out


array([[    0,     0,     0, ...,   718,     8,   899],
       [    0,     0,     0, ...,    33,    13,   319],
       [    0,     0,     0, ...,   598,  4190, 24273],
       ...,
       [    0,     0,     0, ...,    35,   516,  1613],
       [    0,     0,     0, ...,    19,   499,   278],
       [    0,     0,     0, ...,    17,    12,    15]], dtype=int32)

In [None]:


# model = keras.Sequential()
# model.add(keras.layers.Conv1D(filter_num, kernel_size, activation = 'relu'))
# model.add(keras.layers.GlobalMaxPooling1D())
# model.add(keras.layers.Dense(10, activation='relu'))
# model.add(keras.layers.Dense(1, activation='sigmoid'))
# model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
# history = model.fit(offensive_vectors, non_offensive_vectors,
#                     epochs=10,
#                     verbose=False,
#                     validation_data=(offensive_vectors_test, non_offensive_vectors_test),
#                     batch_size=10)
# loss, accuracy = model.evaluate(offensive_vectors, non_offensive_vectors, verbose=False)
# print("Training Accuracy: {:.4f}".format(accuracy))
# loss, accuracy = model.evaluate(offensive_vectors_test, non_offensive_vectors_test, verbose=False)
# print("Testing Accuracy:  {:.4f}".format(accuracy))
# plot_history(history)

In [85]:
# from numpy import array
# from numpy import asarray
# from numpy import zeros
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.layers import Flatten
# from keras.layers import Embedding
# # define documents
# docs = ['Well done!',
# 		'Good work',
# 		'Great effort',
# 		'nice work',
# 		'Excellent!',
# 		'Weak',
# 		'Poor effort!',
# 		'not good',
# 		'poor work',
# 		'Could have done better.']
# # define class labels
# labels = array([1,1,1,1,1,0,0,0,0,0])
# # prepare tokenizer
# t = Tokenizer()
# t.fit_on_texts(docs)
# vocab_size = len(t.word_index) + 1
# # integer encode the documents
# encoded_docs = t.texts_to_sequences(docs)
# print(encoded_docs)
# # pad documents to a max length of 4 words
# max_length = 4
# padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# print(padded_docs)
# # load the whole embedding into memory
# embeddings_index = dict()
# f = open('../glove_data/glove.6B/glove.6B.100d.txt')
# for line in f:
# 	values = line.split()
# 	word = values[0]
# 	coefs = asarray(values[1:], dtype='float32')
# 	embeddings_index[word] = coefs
# f.close()