# About this Notebook
This notebook creates a Keras Sequential ANN that accompanies the other notebook (49 Years of Music - Collection and Analysis). Run this first. You'll need to collect the data from the link in the other notebook. 
This is a standard Bag of Words (BoW) model approach

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import HTML, display
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300
from datetime import datetime
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold
import tabulate
import spacy
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.models import model_from_json
import json
import pickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Functions

In [3]:
def add_spacy_data(dataset, feature_column):
    '''
    Grabs the verb, adverb, noun, and stop word Parts of Speech (POS) 
    tokens and pushes them into a new dataset. returns an 
    enriched dataset'''
    verbs = []
    nouns = []
    adverbs = []
    corpus = []
    nlp = spacy.load('en_core_web_sm')
    ##
    for i in range (0, len(dataset)):
        print("Extracting verbs and topics from record {} of {}".format(i+1, len(dataset)), end = "\r")
        song = dataset.iloc[i][feature_column]
        doc = nlp(song)
        spacy_dataframe = pd.DataFrame()
        for token in doc:
            if token.lemma_ == "-PRON-":
                    lemma = token.text
            else:
                lemma = token.lemma_
            row = {
                "Word": token.text,
                "Lemma": lemma,
                "PoS": token.pos_,
                "Stop Word": token.is_stop
            }
            spacy_dataframe = spacy_dataframe.append(row, ignore_index = True)
        verbs.append(" ".join(spacy_dataframe["Lemma"][spacy_dataframe["PoS"] == "VERB"].values))
        nouns.append(" ".join(spacy_dataframe["Lemma"][spacy_dataframe["PoS"] == "NOUN"].values))
        adverbs.append(" ".join(spacy_dataframe["Lemma"][spacy_dataframe["PoS"] == "ADV"].values))
        corpus_clean = " ".join(spacy_dataframe["Lemma"][spacy_dataframe["Stop Word"] == False].values)
        corpus_clean = re.sub(r'[^A-Za-z0-9]+', ' ', corpus_clean)   
        corpus.append(corpus_clean)
    dataset['Verbs'] = verbs
    dataset['Nouns'] = nouns
    dataset['Adverbs'] = adverbs
    dataset['Corpus'] = corpus
    return dataset

In [4]:
def prep_corpus(raw_string):
    '''Single use of add_spacy_data to enable pipelining 
    data into predictions'''
    verbs = []
    nouns = []
    adverbs = []
    corpus = []
    nlp = spacy.load('en_core_web_sm')

    doc = nlp(raw_string)
    spacy_dataframe = pd.DataFrame()
    for token in doc:
        if token.lemma_ == "-PRON-":
                lemma = token.text
        else:
            lemma = token.lemma_
        row = {
            "Word": token.text,
            "Lemma": lemma,
            "PoS": token.pos_,
            "Stop Word": token.is_stop
        }
        spacy_dataframe = spacy_dataframe.append(row, ignore_index = True)
    verbs.append(" ".join(spacy_dataframe["Lemma"][spacy_dataframe["PoS"] == "VERB"].values))
    nouns.append(" ".join(spacy_dataframe["Lemma"][spacy_dataframe["PoS"] == "NOUN"].values))
    adverbs.append(" ".join(spacy_dataframe["Lemma"][spacy_dataframe["PoS"] == "ADV"].values))
    corpus_clean = " ".join(spacy_dataframe["Lemma"][spacy_dataframe["Stop Word"] == False].values)
    corpus_clean = re.sub(r'[^A-Za-z0-9]+', ' ', corpus_clean)   

    return corpus_clean

# Loading and prepping the training data

In [5]:
training_data = pd.read_json("Dataset for Detection of Cyber-Trolls.json", lines = True)

In [6]:
labels = []
for i in range(0, len(training_data)):
    labels.append("".join(training_data.iloc[i]['annotation']['label']))

In [7]:
training_data['label'] = labels

In [8]:
training_data.head()

Unnamed: 0,annotation,content,extras,label
0,"{'notes': '', 'label': ['1']}",Get fucking real dude.,,1
1,"{'notes': '', 'label': ['1']}",She is as dirty as they come and that crook ...,,1
2,"{'notes': '', 'label': ['1']}",why did you fuck it up. I could do it all day...,,1
3,"{'notes': '', 'label': ['1']}",Dude they dont finish enclosing the fucking s...,,1
4,"{'notes': '', 'label': ['1']}",WTF are you talking about Men? No men thats n...,,1


In [9]:
prepped_training_data = add_spacy_data(training_data, 'content')

Extracting verbs and topics from record 20001 of 20001

In [10]:
prepped_training_data.to_json('prepped_aggression_data.json')

In [11]:
prepped_training_data = pd.read_json('prepped_aggression_data.json')

In [12]:
prepped_training_data.head()

Unnamed: 0,Adverbs,Corpus,Nouns,Verbs,annotation,content,extras,label
0,,get fuck real dude,dude,get fuck,"{'notes': '', 'label': ['1']}",Get fucking real dude.,,1
1,as so,She dirty come crook rengel dems fuck corrupt...,crook joke,be come be fuck be make look,"{'notes': '', 'label': ['1']}",She is as dirty as they come and that crook ...,,1
10,,truth count guy ass product sub par I tell peo...,truth count guy ass product sub par people orjim,be be tell try,"{'notes': '', 'label': ['1']}",truth on both counts that guy is an ass and t...,,1
100,now just not,I listen cow love yahoo slurpee bag what cup I...,cow yahoo slurpee bag what cup cup,can see listen love put do need fuck,"{'notes': '', 'label': ['1']}","I can see it now. ""Listen you cow-loving yaho...",,1
1000,on,oh come on 1 10 that fuck ridiculous be movie ...,what movie shot celphone,come be fuck be give,"{'notes': '', 'label': ['1']}",Oh come ON... 1/10? That is fucking ridiculo...,,1


## Vectorizing and Splitting the Data

In [13]:
cv = CountVectorizer(max_features = 250)
X = cv.fit_transform(prepped_training_data['Corpus']).toarray()
y = prepped_training_data['label'].values
print("Done!")

Done!


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 0)

In [15]:
train_data = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=250)
test_data = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=250)

## Building the Keras Model

In [16]:
model = keras.Sequential()
#model.add(keras.layers.Embedding(128, 64))
model.add(Dense(128, input_shape=(250,)))
model.add(Dropout(0.3))
#model.add(keras.layers.GlobalAveragePooling1D())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(250, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(250, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(250, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='relu'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               32128     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               66048     
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 250)               128250    
__________

In [17]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

## Training

In [18]:
history = model.fit(train_data,
                    y_train,
                    epochs=200,
                    batch_size=512,
                    verbose=2,
                   validation_split=0.1)

Train on 15300 samples, validate on 1700 samples
Epoch 1/200
 - 2s - loss: 0.7104 - acc: 0.6084 - val_loss: 0.6442 - val_acc: 0.6294
Epoch 2/200
 - 1s - loss: 0.6183 - acc: 0.6499 - val_loss: 0.5950 - val_acc: 0.6700
Epoch 3/200
 - 1s - loss: 0.6221 - acc: 0.6735 - val_loss: 0.5877 - val_acc: 0.6524
Epoch 4/200
 - 1s - loss: 0.5780 - acc: 0.6869 - val_loss: 0.6079 - val_acc: 0.6435
Epoch 5/200
 - 1s - loss: 0.5742 - acc: 0.6797 - val_loss: 0.6845 - val_acc: 0.6318
Epoch 6/200
 - 1s - loss: 0.5767 - acc: 0.6728 - val_loss: 0.5517 - val_acc: 0.6671
Epoch 7/200
 - 1s - loss: 0.5446 - acc: 0.6986 - val_loss: 0.5502 - val_acc: 0.6876
Epoch 8/200
 - 1s - loss: 0.5411 - acc: 0.7133 - val_loss: 0.5451 - val_acc: 0.6794
Epoch 9/200
 - 1s - loss: 0.5349 - acc: 0.7226 - val_loss: 0.5713 - val_acc: 0.6929
Epoch 10/200
 - 1s - loss: 0.5301 - acc: 0.7303 - val_loss: 0.6007 - val_acc: 0.6406
Epoch 11/200
 - 1s - loss: 0.5169 - acc: 0.7214 - val_loss: 0.5784 - val_acc: 0.6759
Epoch 12/200
 - 1s - loss

Epoch 97/200
 - 1s - loss: 0.4112 - acc: 0.8027 - val_loss: 0.6083 - val_acc: 0.7876
Epoch 98/200
 - 1s - loss: 0.3796 - acc: 0.8139 - val_loss: 0.6195 - val_acc: 0.7859
Epoch 99/200
 - 1s - loss: 0.3676 - acc: 0.8129 - val_loss: 0.5834 - val_acc: 0.7900
Epoch 100/200
 - 1s - loss: 0.3748 - acc: 0.8156 - val_loss: 0.6029 - val_acc: 0.7659
Epoch 101/200
 - 1s - loss: 0.3859 - acc: 0.8126 - val_loss: 0.5476 - val_acc: 0.7953
Epoch 102/200
 - 1s - loss: 0.3660 - acc: 0.8154 - val_loss: 0.6457 - val_acc: 0.7724
Epoch 103/200
 - 1s - loss: 0.3780 - acc: 0.8067 - val_loss: 0.5834 - val_acc: 0.7835
Epoch 104/200
 - 1s - loss: 0.3595 - acc: 0.8188 - val_loss: 0.5575 - val_acc: 0.7835
Epoch 105/200
 - 1s - loss: 0.3776 - acc: 0.8120 - val_loss: 0.5918 - val_acc: 0.7718
Epoch 106/200
 - 1s - loss: 0.3574 - acc: 0.8214 - val_loss: 0.5712 - val_acc: 0.7876
Epoch 107/200
 - 1s - loss: 0.3584 - acc: 0.8260 - val_loss: 0.5775 - val_acc: 0.7976
Epoch 108/200
 - 1s - loss: 0.3431 - acc: 0.8352 - val_lo

Epoch 193/200
 - 1s - loss: 0.3134 - acc: 0.8062 - val_loss: 0.5806 - val_acc: 0.7841
Epoch 194/200
 - 1s - loss: 0.2999 - acc: 0.8242 - val_loss: 0.5679 - val_acc: 0.8176
Epoch 195/200
 - 1s - loss: 0.2895 - acc: 0.8351 - val_loss: 0.6435 - val_acc: 0.7741
Epoch 196/200
 - 1s - loss: 0.3018 - acc: 0.8210 - val_loss: 0.5924 - val_acc: 0.7924
Epoch 197/200
 - 1s - loss: 0.3027 - acc: 0.8392 - val_loss: 0.6137 - val_acc: 0.8118
Epoch 198/200
 - 1s - loss: 0.3521 - acc: 0.8275 - val_loss: 0.6898 - val_acc: 0.7647
Epoch 199/200
 - 1s - loss: 0.3028 - acc: 0.8069 - val_loss: 0.6266 - val_acc: 0.7829
Epoch 200/200
 - 1s - loss: 0.3024 - acc: 0.8170 - val_loss: 0.6387 - val_acc: 0.7935


In [19]:
results = model.evaluate(test_data, y_test,verbose=1)
results



[0.6718504123392204, 0.7990669777734166]

 ## Testing

In [20]:
test_text = "I hate you"
test_text = prep_corpus(test_text)

In [21]:
test_cv = cv.transform([test_text]).toarray()

In [22]:
display(model.predict([test_cv]))
model.predict_classes([test_cv])

array([[0.62720925]], dtype=float32)

array([[1]], dtype=int32)

## Saving the Model

In [23]:
model_json = model.to_json()
with open("82pct_aggression_detectioon_keras_model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("82pct_aggression_detectioon_keras_weights.h5")

## Saving the fitted Vectorizer

In [24]:
pickle.dump(cv, open("cv.pickle", "wb"))